# HG changeset patch
# User drewp@bigasterisk.com
# Date 1714760468 25200
# Node ID adde35eb477306d17dca1280cfa9c12080745fb8
# Parent 429bfd62e6baeafc10aa81bde8c8a3c3bac26a41
collapse ./next to ./
diff -r 429bfd62e6ba -r adde35eb4773 alert_rules.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alert_rules.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,433 @@
+"""
+pdm run invoke push-config
+
+docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+"Whenever the alert expression results in one or more vector
+elements at a given point in time, the alert counts as active for
+these elements' label sets."
+also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+
+"""
+
+import json
+
+
+def pomRules():
+ return [
+ {
+ "alert": "frequent_upstream_connect_failures",
+ "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
+ },
+ {
+ "alert": "high_logging_pomerium",
+ "for": "3h",
+ "labels": {
+ "severity": "waste"
+ },
+ "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
+ "annotations": {
+ "summary": "high log output rate"
+ },
+ },
+ ]
+
+
+def k8sRules():
+ # from https://awesome-prometheus-alerts.grep.to/rules.html
+ return [
+ {
+ "alert": "metricsTargetMissing",
+ "expr": 'up{job!~"cm-acme-.*"} == 0',
+ 'for': '10m',
+ "labels": {
+ "severity": "critical"
+ },
+ "annotations": {
+ "summary": "metrics target missing (instance {{ $labels.instance }})",
+ "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesMemoryPressure",
+ "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
+ "for": "2m",
+ "labels": {
+ "severity": "critical"
+ },
+ "annotations": {
+ "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
+ "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesDiskPressure",
+ "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
+ "for": "2m",
+ "labels": {
+ "severity": "critical"
+ },
+ "annotations": {
+ "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
+ "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesOutOfDisk",
+ "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
+ "for": "2m",
+ "labels": {
+ "severity": "critical"
+ },
+ "annotations": {
+ "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
+ "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesJobFailed",
+ "expr": "kube_job_status_failed > 0",
+ "labels": {
+ "severity": "warning"
+ },
+ "annotations": {
+ "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
+ "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesPodCrashLooping",
+ "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
+ "for": "2m",
+ "labels": {
+ "severity": "warning"
+ },
+ "annotations": {
+ "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
+ "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "KubernetesClientCertificateExpiresNextWeek",
+ "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
+ "labels": {
+ "severity": "warning"
+ },
+ "annotations": {
+ "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
+ "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
+ },
+ },
+ {
+ "alert": "container_waiting",
+ "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
+ "annotations": {
+ "description": '',
+ "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
+ },
+ "for": "2m",
+ },
+ ]
+
+
+def allRules(ctx):
+ return {
+ "groups": [
+ {
+ "name": "k8s",
+ "interval": "1m",
+ "rules": k8sRules(),
+ },
+ {
+ "name": "pomerium_proxy",
+ "interval": "1m",
+ "rules": pomRules(),
+ },
+ {
+ "name":
+ "Outages",
+ "interval":
+ "1m",
+ "rules": [
+ {
+ "alert": "powereagleStalled",
+ "expr": "rate(house_power_w[100m]) == 0",
+ "for": "0m",
+ "labels": {
+ "severity": "losingData"
+ },
+ "annotations": {
+ "summary": "power eagle data stalled",
+ "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+ },
+ },
+ {
+ "alert": "powereagleAbsent",
+ "expr": "absent_over_time(house_power_w[5m])",
+ "for": "2m",
+ "labels": {
+ "severity": "losingData"
+ },
+ "annotations": {
+ "summary": "power eagle data missing",
+ "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+ },
+ },
+ {
+ "alert": "absent_zigbee",
+ "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
+ },
+ {
+ "alert": "net_routes_sync",
+ "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
+ "for": "10m",
+ "labels": {
+ "severity": "houseUsersAffected"
+ },
+ "annotations": {
+ "summary": "net_routes is not getting regular updates"
+ },
+ },
+ ],
+ },
+ {
+ "name": "disk_errs",
+ "interval": "2d",
+ "rules": [{
+ "alert": "zpool_device_error_increase",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'increase(zpool_device_error_count[3d]) > 0',
+ }, {
+ "alert": "zpool_device_error_count",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'zpool_device_error_count > 0',
+ }],
+ },
+ {
+ "name": "lighting",
+ "interval": "5m",
+ "rules": [{
+ "alert": "light_bridge_no_mqtt",
+ "expr": 'mqtt_connected{job="light-bridge"} != 1',
+ }],
+ },
+ {
+ "name":
+ "front_door",
+ "interval":
+ "5m",
+ "rules": [
+ {
+ "alert": "front_door_reader_esp32_no_mqtt",
+ 'expr': 'hw_connected{job="fingerprint"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ {
+ "alert": "front_door_reader_svc_down",
+ 'expr': 'up{job="fingerprint"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ {
+ "alert": "front_door_reader_svc_reader_no_mqtt",
+ 'expr': 'mqtt_connected{job="fingerprint"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ {
+ "alert": "front_door_lock_svc_down",
+ 'expr': 'up{job="front-door-lock"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ {
+ "alert": "front_door_lock_svc_no_mqtt",
+ 'expr': 'mqtt_connected{job="front-door-lock"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ {
+ "alert": "front_door_lock_esp32_no_mqtt",
+ 'expr': 'hw_connected{job="front-door-lock"} < 1',
+ "annotations": {
+ "summary": "see https://bigasterisk.com/front-door-lock/"
+ },
+ },
+ ],
+ },
+ {
+ "name":
+ "net_routes",
+ "interval":
+ "5m",
+ "rules": [
+ {
+ "alert": "no_house_ip_service",
+ "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
+ },
+ {
+ "alert": "no_net_routes_running",
+ "expr": 'absent(python_info{job="net-routes"})'
+ },
+ {
+ "alert": "allowed_check_never_returned_200",
+ 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
+ },
+ {
+ "alert": "allowed_check_never_returned_403",
+ 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
+ },
+ {
+ 'alert': 'net_route_input_eval_cal_loop_is_down',
+ 'expr': 'eval_cal_up!=1'
+ },
+ {
+ 'alert': 'net_route_input_mongo_loop_is_down',
+ 'expr': 'mongo_to_net_routes_up!=1'
+ },
+ {
+ 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
+ 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
+ },
+ {
+ 'alert': 'gcalendarwatch_current_events_loop_is_down',
+ 'expr': 'current_events_up != 1'
+ },
+ ],
+ },
+ {
+ "name": "http",
+ "interval": "1h",
+ 'rules': [
+ {
+ 'alert': 'old_https_certs',
+ 'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
+ },
+ {
+ 'alert': 'high_500_response_rate',
+ 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
+ },
+ ],
+ },
+ {
+ "name": "ping",
+ "interval": "1m",
+ "rules": [{
+ "alert": "ping_failed",
+ "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
+ }]
+ },
+ {
+ "name":
+ "alerts",
+ "rules": [
+ {
+ "alert": "kube_node_status_bad_condition",
+ "for": "2h",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
+ },
+ {
+ "alert": "housePower",
+ "for": "1h",
+ "labels": {
+ "severity": "waste"
+ },
+ "expr": "house_power_w > 4000",
+ "annotations": {
+ "summary": "house power usage over 4KW"
+ },
+ },
+ {
+ "alert": "host_root_fs_space_low",
+ "for": "20m",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'disk_free{host!="garage",path="/"} < 20G',
+ },
+ {
+ "alert": "zpool_space_low",
+ "for": "20m",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
+ },
+ {
+ "alert": "disk_week_incr",
+ "for": "20m",
+ "labels": {
+ "severity": "warning"
+ },
+ "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
+ "annotations": {
+ "summary": "high mb/week on zfs dir"
+ },
+ },
+ {
+ "alert": "high_logging",
+ "for": "3h",
+ "labels": {
+ "severity": "waste"
+ },
+ "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
+ "annotations": {
+ "summary": "high log output rate"
+ },
+ },
+ {
+ "alert": "stale_process",
+ "for": "1d",
+ "labels": {
+ "severity": "dataRisk"
+ },
+ "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
+ "annotations": {
+ "summary": "process time is old"
+ },
+ },
+ {
+ "alert": "starlette",
+ "for": "1m",
+ "labels": {
+ "severity": "fix"
+ },
+ "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
+ "annotations": {
+ "summary": "set starlette app name"
+ },
+ },
+ {
+ "alert": "ssl_certs_expiring_soon",
+ "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
+ "labels": {
+ "severity": "warning"
+ },
+ "annotations": {
+ "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
+ },
+ },
+ ],
+ },
+ ] + hostsExpectedOnline(ctx)['groups']
+ }
+
+
+def _runJson(ctx, cmd):
+ return json.loads(ctx.run(cmd, hide="stdout").stdout)
+
+
+def hostsExpectedOnline(ctx):
+ return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
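[editor's note] The docstring at the top of alert_rules.py refers to `pdm run invoke push-config`, but the invoke task itself is not part of this changeset. As a rough sketch only: assuming the task lives in a tasks.py and uses the k8s_ops helpers added later in this patch, it could look roughly like the following. The ConfigMap name `victoriametrics-config` and the `rules` key are taken from deploy_vmalert.yaml (which mounts that ConfigMap at /local and passes -rule=/local/rules); everything else here is an assumption, not the author's code.

    # tasks.py -- hypothetical sketch, not part of this changeset
    from invoke import task
    from kubernetes import config as k8s_config

    import alert_rules
    from k8s_ops import firstPodName, hup, refreshPodCmaps, replaceCmap


    @task
    def push_config(ctx):
        k8s_config.load_kube_config()  # k8s_ops expects a configured kubernetes client
        # replaceCmap json.dumps each value; JSON is valid YAML, so vmalert can read
        # the result via -rule=/local/rules (ConfigMap mounted at /local).
        replaceCmap("victoriametrics-config", {"rules": alert_rules.allRules(ctx)})
        # Work around the ConfigMap-volume propagation delay (see k8s_ops.refreshPodCmaps),
        # then HUP vmalert; the HUP is likely optional given -configCheckInterval=5s.
        refreshPodCmaps(firstPodName("app=vmalert"))
        hup(ctx, "deploy/vmalert", "vmalert")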
diff -r 429bfd62e6ba -r adde35eb4773 create_k8s.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create_k8s.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,254 @@
+from pathlib import Path
+from index_page import makeIndexHtml
+from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
+
+
+def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
+ (build / f'{agentFileName}_deploy.yaml').write_text(
+ toJson({
+ "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
+ "spec": {
+ "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
+ "template": {
+ "metadata": {
+ "labels": { "app": agentName },
+ "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
+ },
+ "spec": {
+ "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
+ "serviceAccountName": "victoriametrics",
+ "containers": [{
+ "name": "vmagent",
+ "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
+ "imagePullPolicy": "IfNotPresent",
+ "args": [
+ f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
+ tzArg,
+ f"-promscrape.config=/local/config/{scrapeMapKey}",
+ "-promscrape.configCheckInterval=5s",
+ "-sortLabels",
+ f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
+ "-remoteWrite.showURL",
+ ],
+ "ports": [{ "containerPort": agentPort }],
+ "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
+ }]
+ }
+ }
+ }
+ })) # yapf: disable
+
+
+def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
+ (build / f'{insertFileName}_deploy.yaml').write_text(
+ toJson({
+ "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
+ "spec": {
+ "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
+ "template": {
+ "metadata": {
+ "labels": { "app": insertName },
+ "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+ },
+ "spec": {
+ "serviceAccountName": "victoriametrics",
+ "containers": [{
+ "name": "vminsert",
+ "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
+ "imagePullPolicy": "IfNotPresent",
+ "args": [
+ f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
+ tzArg,
+ f"-storageNode={storageName}",
+ ],
+ "ports": [{ "containerPort": insertPort }]
+ }]
+ }
+ }
+ }
+ })) # yapf: disable
+
+
+def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
+ (build / f'{storageFileName}_2deploy.yaml').write_text(
+ toJson({
+ "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
+ "spec": {
+ "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
+ "template": {
+ "metadata": {
+ "labels": { "app": storageName },
+ "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
+ },
+ "spec": {
+ "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
+ "serviceAccountName": "victoriametrics",
+ "containers": [{
+ "name": "vmstorage",
+ "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
+ "imagePullPolicy": "IfNotPresent",
+ "args": [
+ f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
+ tzArg,
+ f"-retentionPeriod={retention}",
+ f"-storageDataPath=/data/{pipelineName}",
+ ],
+ "ports": [
+ { "containerPort": 8482, "name": "http" },
+ { "containerPort": storageInsertPort, "name": "vminsert" },
+ { "containerPort": storageSelectPort, "name": "vmselect" },
+ ],
+ "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
+ }],
+ "affinity": affinityToNode(localPvHost)
+ }
+ }
+ }
+ })) # yapf: disable
+
+
+def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
+ name = f"{objPrefix}-vmselect"
+ (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
+ toJson({
+ "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+ "spec": {
+ "replicas": 1,
+ "strategy": { "type": "Recreate" },
+ "selector": { "matchLabels": { "app": name } },
+ "template": {
+ "metadata": {
+ "labels": { "app": name },
+ "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+ },
+ "spec": {
+ "serviceAccountName": "victoriametrics",
+ "containers": [{
+ "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
+ "args": [
+ f"-http.pathPrefix={webRoot}/vmselect/",
+ tzArg,
+ ] + [f"-storageNode={n}" for n in storageSvcs],
+ "ports": [{ "containerPort": selectPort }]
+ }]
+ }
+ }
+ }
+ })) # yapf: disable
+
+def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
+ agentName = f"{objPrefix}-{pipelineName}-vmagent"
+ insertName = f"{objPrefix}-{pipelineName}-vminsert"
+ storageName = f"{objPrefix}-{pipelineName}-vmstorage"
+
+ agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
+ insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
+ storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
+
+ localPvHost = "ditto"
+ insertPort = 8480
+ agentPort = 8429
+ storageInsertPort = 8400
+ storageSelectPort = 8401
+ volName = f"{objPrefix}-data-{pipelineName}"
+ request = "50Gi"
+ pipelineWebRoot = f'{webRoot}/{pipelineName}'
+
+ createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
+ createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
+ createPv(storageFileName, volName, request)
+ createPvc(storageFileName, volName, request)
+ createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
+
+ createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
+ createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
+ createSvc(storageFileName,storageName, [
+ {"port": 80, "targetPort": "http", "name": "http"},
+ {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
+ {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
+ ]) # yapf: disable
+
+ return storageName
+
+
+def createIndex(objPrefix, webRoot, html):
+ name = f'{objPrefix}-index'
+ httpServeRoot = '/opt/html'
+
+ (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
+ "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
+ "data": {
+ "index.html": html,
+ "index.js": Path("index.js").read_text(),
+ "index.css": Path("index.css").read_text(),
+ }
+ })) # yapf: disable
+
+ (build / f'{objPrefix}-3index_deploy.yaml').write_text(
+ toJson({
+ "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+ "spec": {
+ "replicas": 1,
+ "selector": { "matchLabels": { "app": name } },
+ "template": {
+ "metadata": { "labels": { "app": name } },
+ "spec": {
+ "containers": [{
+ "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
+ "args": [
+ f'--root={httpServeRoot}',
+ '--directory-listing=true',
+ '--experimental-metrics=true',
+ ],
+ "ports": [{ "containerPort": 80 }],
+ "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
+ }],
+ "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
+ }
+ }
+ }
+ })) # yapf: disable
+ createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
+
+
+def main():
+ tzArg = "-loggerTimezone=America/Los_Angeles"
+ objPrefix = "next-victoriametrics" # prefix on all k8s object names
+ webRoot = "/m/next"
+ vmVersion = "v1.100.1"
+ webHost = 'bigasterisk.com'
+ pipelines = [
+ ('forever', '100y'),
+ ('recent', '90y'),
+ ]
+ storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
+
+ selectPort = 8481
+ createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
+ createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
+
+ ingressPaths = [
+ { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
+ { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
+ ] # yapf: disable
+ for p, _ in pipelines:
+ ingressPaths.extend([
+ { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } },
+ { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } },
+ { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
+ ]) # yapf: disable
+
+ policy = """\
+allow:
+ or:
+ - { email: { is: "drewpca@gmail.com" }}
+ - { email: { is: "kelsimp@gmail.com" }}
+ """
+ createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
+ createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
+
+
+main()
+
+# in vmui, set server url to
+# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
diff -r 429bfd62e6ba -r adde35eb4773 create_scrape_configs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create_scrape_configs.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,170 @@
+from pathlib import Path
+
+from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName
+import private
+
+# previously this used `kubernetes_sd_configs: [{ role: node }]`
+all_hosts = [
+ 'dash',
+ 'ditto',
+ # 'ws-printer',
+ #todo:
+]
+
+smartctl_hosts = [
+ # ideally, all nodes with disks, but many turn off and on
+ 'dash',
+ 'ditto',
+]
+
+ping_hosts = [
+ # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
+ 'printer',
+ # wireguard connection test
+ 'prime5',
+ # after pyinfra or reboot, seems to lose wg0 address
+ 'garage5',
+]
+
+
+forever_jobs = [
+ jobConfig(name='maildir-count', targets=['prime:2500']),
+ jobConfig(name='mongodb', targets=['mongodb:9216']),
+ jobConfig(name='net-traffic', targets=['pipe:8080']),
+ jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True),
+ jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec
+ jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'),
+ jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]),
+ jobConfig(name='wifi', targets=['wifi:80']),
+ jobConfig(name='zfs-exporter', targets=['ditto:9634']),
+ jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
+ jobConfig(name='zpool-exporter', targets=['ditto:9986']),
+ jobConfig(name='octoprint', targets=['octoprint'],
+ metrics_path='/plugin/prometheus_exporter/metrics',
+ params={'apikey' : [private.octoprint_apikey]},
+ ),
+] # yapf: disable
+
+recent_jobs = [
+ jobConfig(name="dnsmasq-log", targets=['pipe:9991']),
+ jobConfig(name="filebeat", targets=[f'{h}:5067' for h in all_hosts]),
+ jobConfig(name="net-routes", targets=['pipe:9999']),
+ jobConfig(name="net-traffic", targets=['pipe:8080']),
+ jobConfig(name="pomerium", targets=['pomerium-metrics.pomerium:9090']),
+ jobConfig(name="telegraf", targets=[f'{h}:9273' for h in all_hosts]),
+ jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'),
+
+ jobConfig(name="next-victoriametrics-forever-vmagent", metrics_path='/m/next/forever/vmagent/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-forever-vminsert", metrics_path='/m/next/forever/vminsert/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName),
+ jobConfig(name="next-victoriametrics-recent-vmagent", metrics_path='/m/next/recent/vmagent/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-recent-vminsert", metrics_path='/m/next/recent/vminsert/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-recent-vmstorage", metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-vmselect", metrics_path='/m/next/vmselect/metrics', targets=FromName),
+ jobConfig(name="next-victoriametrics-index", targets=FromName),
+
+ # todo:
+ # - video-files
+ # - cert-manager
+ # - syncthing(s)
+ # - nvidia runner
+ # - longhorn
+ # - kube-system.metrics-server
+ jobConfig(
+ name="racc",
+ scrape_interval='30s',
+ targets=[
+ # - dash:5150
+ # - dot:5150
+ # - squib:5150
+ # - ashermac:5150
+ ],
+ ),
+] # yapf: disable
+
+
+deploy_doesnt_serve_metrics = [
+ 'apprise',
+ 'bitwarden',
+ 'digi-files',
+ 'digi-pose-predict',
+ 'digi-tts-mimic',
+ 'digi-web',
+ 'dovecot',
+ 'ectoscope',
+ 'front-door-display',
+ 'hass',
+ 'homepage',
+ 'itch150',
+ 'jsregistry',
+ 'kallithea',
+ 'kube-web-view',
+ 'magma',
+ 'megasecond',
+ 'minecraft-build-world',
+ 'minecraft-lake-world',
+ 'minecraft-smp-world',
+ 'mongodb',
+ 'mqtt1',
+ 'mqtt2',
+ 'nodered',
+ 'photoprism',
+ 'plik',
+ 'projects',
+ 'registry-ui',
+ 'registry',
+ 'speakerphone',
+ 'victorialogs-ui',
+ 'video-files',
+ 'video',
+ 'zigbee2mqtt',
+ 'zwave2mqtt',
+]
+
+existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs]
+recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs))
+
+recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | {
+ 'kubernetes_sd_configs': [{
+ 'role': 'endpoints'
+ }],
+ 'relabel_configs': [{
+ 'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'],
+ 'action': 'keep',
+ 'regex': 'default;kubernetes;https'
+ }],
+})
+
+recent_jobs.append(
+ jobConfig(name="kubernetes-nodes", https=True, targets=[]) | {
+ "kubernetes_sd_configs": [{
+ "role": "node"
+ }],
+ "relabel_configs": [{
+ "action": "labeldrop",
+ "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+ }, {
+ "action": "labelmap",
+ "regex": "__meta_kubernetes_node_label_(.+)"
+ }, {
+ "action": "labeldrop",
+ "regex": "kubernetes_io_hostname"
+ }],
+ })
+
+# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
+# for metric definitions
+
+recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | {
+ "kubernetes_sd_configs": [{
+ "role": "node"
+ }],
+ "relabel_configs": [{
+ "action": "labeldrop",
+ "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+ }],
+})
+
+outDir = Path('build/scrape_config')
+writeJobConfigs(outDir, forever_jobs, 'forever')
+writeJobConfigs(outDir, recent_jobs, 'recent')
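[editor's note] scrape_job.py is not included in this changeset, so the exact shape of a jobConfig() entry is not visible here. Judging from how it is used above (each entry carries a 'job_name' key, merges with extra keys via `|`, and is serialized by writeJobConfigs), it presumably builds an ordinary Prometheus/vmagent scrape-config entry. A hypothetical sketch for a simple static job, for illustration only:

    # Hypothetical shape only -- the real jobConfig lives in scrape_job.py,
    # which is not part of this patch.
    def jobConfig_sketch(name, targets, scrape_interval=None, metrics_path='/metrics'):
        job = {
            'job_name': name,
            'metrics_path': metrics_path,
            'static_configs': [{'targets': targets}],
        }
        if scrape_interval is not None:
            job['scrape_interval'] = scrape_interval
        return job

    # e.g. jobConfig_sketch('zfs-exporter', ['ditto:9634']) would yield
    # {'job_name': 'zfs-exporter', 'metrics_path': '/metrics',
    #  'static_configs': [{'targets': ['ditto:9634']}]}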
diff -r 429bfd62e6ba -r adde35eb4773 deploy_alertmanager.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/deploy_alertmanager.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: alertmanager
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: alertmanager
+ template:
+ metadata:
+ labels:
+ app: alertmanager
+ spec:
+ volumes:
+ - name: opt-alertmanager
+ persistentVolumeClaim:
+ claimName: opt-alertmanager
+ serviceAccountName: victoriametrics
+ containers:
+ - name: alertmanager
+ image: docker.io/prom/alertmanager:v0.27.0
+ args:
+ - --config.file=/alertmanager/alertmanager.yml
+ - --web.external-url=https://bigasterisk.com/alertmanager/
+ - --web.route-prefix=/
+ - --log.level=info
+ ports:
+ - containerPort: 9093
+ volumeMounts:
+ - name: opt-alertmanager
+ mountPath: /alertmanager
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: "kubernetes.io/hostname"
+ operator: In
+ values: ["ditto"]
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: alertmanager
+spec:
+ ports:
+ - port: 80
+ targetPort: 9093
+ selector:
+ app: alertmanager
diff -r 429bfd62e6ba -r adde35eb4773 deploy_vmalert.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/deploy_vmalert.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: vmalert
+spec:
+ replicas: 1
+ strategy: { type: Recreate }
+ selector:
+ matchLabels:
+ app: vmalert
+ template:
+ metadata:
+ labels:
+ app: vmalert
+ annotations:
+ prometheus.io/scrape: "true"
+ spec:
+ volumes:
+ - name: config
+ configMap: { name: victoriametrics-config }
+ serviceAccountName: victoriametrics
+ containers:
+ - name: vmalert
+ image: docker.io/victoriametrics/vmalert:v1.91.2
+ args:
+ - -configCheckInterval=5s
+ - -datasource.url=http://victoriametrics/m/
+ - -datasource.queryStep=5m
+ - -evaluationInterval=1m
+ - -external.url=https://bigasterisk.com/vmalert
+ - -loggerLevel=INFO
+ - -loggerTimezone=America/Los_Angeles
+ - -memory.allowedBytes=512MB
+ - -notifier.url=http://alertmanager
+ - -remoteRead.url=http://victoriametrics/m/
+ - -remoteWrite.url=http://victoriametrics/m/
+ - -rule=/local/rules
+ ports:
+ - containerPort: 8880
+ volumeMounts:
+ - { name: config, mountPath: /local }
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: vmalert
+spec:
+ ports:
+ - port: 80
+ targetPort: 8880
+ selector:
+ app: vmalert
diff -r 429bfd62e6ba -r adde35eb4773 index.css
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/index.css Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,8 @@
+section {
+ margin-left: 2em;
+}
+
+h1,
+h2 {
+ border-top: 1px solid lightgray;
+}
diff -r 429bfd62e6ba -r adde35eb4773 index.js
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/index.js Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,4 @@
+function init(serverUrl) {
+  // this defaults to something incorrect, so we fix it here, hopefully before you go to vmui
+ localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl }));
+}
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 index_page.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/index_page.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,75 @@
+def makeIndexHtml(objPrefix, webRoot, webHost):
+ return f"""
+
+
+ {objPrefix}
+
+
+
+ {objPrefix}
+
+
+
+
+
+
+
+
+
+ """
diff -r 429bfd62e6ba -r adde35eb4773 ingress_alertmanager.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ingress_alertmanager.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,55 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: vmalert
+ annotations:
+ cert-manager.io/cluster-issuer: letsencrypt-prod
+ ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+ ingress.pomerium.io/pass_identity_headers: "true"
+ ingress.pomerium.io/preserve_host_header: "true"
+ ingress.pomerium.io/policy: |
+ allow:
+ or:
+ - { email: { is: "drewpca@gmail.com" }}
+ - { email: { is: "kelsimp@gmail.com" }}
+ # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
+spec:
+ ingressClassName: pomerium
+ rules:
+ - host: "bigasterisk.com"
+ http:
+ paths:
+ - pathType: Prefix
+ path: /vmalert/
+ backend: { service: { name: vmalert, port: { number: 80 } } }
+ tls:
+ - hosts: [bigasterisk.com]
+ secretName: bigasterisk.com-tls
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: alertmanager
+ annotations:
+ cert-manager.io/cluster-issuer: letsencrypt-prod
+ ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+ ingress.pomerium.io/pass_identity_headers: "true"
+ ingress.pomerium.io/preserve_host_header: "true"
+ ingress.pomerium.io/policy: |
+ allow:
+ or:
+ - { email: { is: "drewpca@gmail.com" }}
+ - { email: { is: "kelsimp@gmail.com" }}
+ ingress.pomerium.io/prefix_rewrite: "/"
+spec:
+ ingressClassName: pomerium
+ rules:
+ - host: "bigasterisk.com"
+ http:
+ paths:
+ - pathType: Prefix
+ path: /alertmanager/
+ backend: { service: { name: alertmanager, port: { number: 80 } } }
+ tls:
+ - hosts: [bigasterisk.com]
+ secretName: bigasterisk.com-tls
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 k8s_ops.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/k8s_ops.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,50 @@
+import json
+import time
+
+from kubernetes import client
+
+
+def refreshPodCmaps(pod_name, namespace="default"):
+ """
+    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/, it can take a while for
+    k8s to update the ConfigMap volume that a pod sees. The workaround is to edit the
+    pod's annotations, which prompts kubelet to refresh the volume sooner.
+ """
+ api_instance = client.CoreV1Api()
+
+ pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
+ if pod.metadata.annotations is None:
+ pod.metadata.annotations = {}
+ pod.metadata.annotations["force-configmap-update"] = str(time.time())
+ api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
+
+
+def firstPodName(selector):
+ api_instance = client.CoreV1Api()
+ pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
+ return pod_list.items[0].metadata.name
+
+
+def hup(ctx, deployment, process_name):
+ ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
+
+
+def replaceCmap(name, dataObj):
+ api_instance = client.CoreV1Api()
+
+ data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
+
+ try:
+ existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
+ existing_config_map.data.update(data)
+ api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
+ except client.rest.ApiException as e:
+ if e.status == 404:
+ config_map = client.V1ConfigMap()
+ config_map.metadata = client.V1ObjectMeta(name=name)
+ config_map.data = data
+ api_response = api_instance.create_namespaced_config_map('default', config_map)
+ else:
+ raise
+
+ print(f"{name} resource_version is now {api_response.metadata.resource_version}")
diff -r 429bfd62e6ba -r adde35eb4773 next/alert_rules.py
--- a/next/alert_rules.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,433 +0,0 @@
-"""
-pdm run invoke push-config
-
-docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-"Whenever the alert expression results in one or more vector
-elements at a given point in time, the alert counts as active for
-these elements' label sets."
-also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-
-"""
-
-import json
-
-
-def pomRules():
- return [
- {
- "alert": "frequent_upstream_connect_failures",
- "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
- },
- {
- "alert": "high_logging_pomerium",
- "for": "3h",
- "labels": {
- "severity": "waste"
- },
- "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
- "annotations": {
- "summary": "high log output rate"
- },
- },
- ]
-
-
-def k8sRules():
- # from https://awesome-prometheus-alerts.grep.to/rules.html
- return [
- {
- "alert": "metricsTargetMissing",
- "expr": 'up{job!~"cm-acme-.*"} == 0',
- 'for': '10m',
- "labels": {
- "severity": "critical"
- },
- "annotations": {
- "summary": "metrics target missing (instance {{ $labels.instance }})",
- "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesMemoryPressure",
- "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
- "for": "2m",
- "labels": {
- "severity": "critical"
- },
- "annotations": {
- "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
- "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesDiskPressure",
- "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
- "for": "2m",
- "labels": {
- "severity": "critical"
- },
- "annotations": {
- "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
- "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesOutOfDisk",
- "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
- "for": "2m",
- "labels": {
- "severity": "critical"
- },
- "annotations": {
- "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
- "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesJobFailed",
- "expr": "kube_job_status_failed > 0",
- "labels": {
- "severity": "warning"
- },
- "annotations": {
- "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
- "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesPodCrashLooping",
- "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
- "for": "2m",
- "labels": {
- "severity": "warning"
- },
- "annotations": {
- "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
- "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "KubernetesClientCertificateExpiresNextWeek",
- "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
- "labels": {
- "severity": "warning"
- },
- "annotations": {
- "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
- "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
- },
- },
- {
- "alert": "container_waiting",
- "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
- "annotations": {
- "description": '',
- "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
- },
- "for": "2m",
- },
- ]
-
-
-def allRules(ctx):
- return {
- "groups": [
- {
- "name": "k8s",
- "interval": "1m",
- "rules": k8sRules(),
- },
- {
- "name": "pomerium_proxy",
- "interval": "1m",
- "rules": pomRules(),
- },
- {
- "name":
- "Outages",
- "interval":
- "1m",
- "rules": [
- {
- "alert": "powereagleStalled",
- "expr": "rate(house_power_w[100m]) == 0",
- "for": "0m",
- "labels": {
- "severity": "losingData"
- },
- "annotations": {
- "summary": "power eagle data stalled",
- "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
- },
- },
- {
- "alert": "powereagleAbsent",
- "expr": "absent_over_time(house_power_w[5m])",
- "for": "2m",
- "labels": {
- "severity": "losingData"
- },
- "annotations": {
- "summary": "power eagle data missing",
- "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
- },
- },
- {
- "alert": "absent_zigbee",
- "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
- },
- {
- "alert": "net_routes_sync",
- "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
- "for": "10m",
- "labels": {
- "severity": "houseUsersAffected"
- },
- "annotations": {
- "summary": "net_routes is not getting regular updates"
- },
- },
- ],
- },
- {
- "name": "disk_errs",
- "interval": "2d",
- "rules": [{
- "alert": "zpool_device_error_increase",
- "labels": {
- "severity": "warning"
- },
- "expr": 'increase(zpool_device_error_count[3d]) > 0',
- }, {
- "alert": "zpool_device_error_count",
- "labels": {
- "severity": "warning"
- },
- "expr": 'zpool_device_error_count > 0',
- }],
- },
- {
- "name": "lighting",
- "interval": "5m",
- "rules": [{
- "alert": "light_bridge_no_mqtt",
- "expr": 'mqtt_connected{job="light-bridge"} != 1',
- }],
- },
- {
- "name":
- "front_door",
- "interval":
- "5m",
- "rules": [
- {
- "alert": "front_door_reader_esp32_no_mqtt",
- 'expr': 'hw_connected{job="fingerprint"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- {
- "alert": "front_door_reader_svc_down",
- 'expr': 'up{job="fingerprint"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- {
- "alert": "front_door_reader_svc_reader_no_mqtt",
- 'expr': 'mqtt_connected{job="fingerprint"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- {
- "alert": "front_door_lock_svc_down",
- 'expr': 'up{job="front-door-lock"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- {
- "alert": "front_door_lock_svc_no_mqtt",
- 'expr': 'mqtt_connected{job="front-door-lock"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- {
- "alert": "front_door_lock_esp32_no_mqtt",
- 'expr': 'hw_connected{job="front-door-lock"} < 1',
- "annotations": {
- "summary": "see https://bigasterisk.com/front-door-lock/"
- },
- },
- ],
- },
- {
- "name":
- "net_routes",
- "interval":
- "5m",
- "rules": [
- {
- "alert": "no_house_ip_service",
- "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
- },
- {
- "alert": "no_net_routes_running",
- "expr": 'absent(python_info{job="net-routes"})'
- },
- {
- "alert": "allowed_check_never_returned_200",
- 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
- },
- {
- "alert": "allowed_check_never_returned_403",
- 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
- },
- {
- 'alert': 'net_route_input_eval_cal_loop_is_down',
- 'expr': 'eval_cal_up!=1'
- },
- {
- 'alert': 'net_route_input_mongo_loop_is_down',
- 'expr': 'mongo_to_net_routes_up!=1'
- },
- {
- 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
- 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
- },
- {
- 'alert': 'gcalendarwatch_current_events_loop_is_down',
- 'expr': 'current_events_up != 1'
- },
- ],
- },
- {
- "name": "http",
- "interval": "1h",
- 'rules': [
- {
- 'alert': 'old_https_certs',
- 'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
- },
- {
- 'alert': 'high_500_response_rate',
- 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
- },
- ],
- },
- {
- "name": "ping",
- "interval": "1m",
- "rules": [{
- "alert": "ping_failed",
- "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
- }]
- },
- {
- "name":
- "alerts",
- "rules": [
- {
- "alert": "kube_node_status_bad_condition",
- "for": "2h",
- "labels": {
- "severity": "warning"
- },
- "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
- },
- {
- "alert": "housePower",
- "for": "1h",
- "labels": {
- "severity": "waste"
- },
- "expr": "house_power_w > 4000",
- "annotations": {
- "summary": "house power usage over 4KW"
- },
- },
- {
- "alert": "host_root_fs_space_low",
- "for": "20m",
- "labels": {
- "severity": "warning"
- },
- "expr": 'disk_free{host!="garage",path="/"} < 20G',
- },
- {
- "alert": "zpool_space_low",
- "for": "20m",
- "labels": {
- "severity": "warning"
- },
- "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
- },
- {
- "alert": "disk_week_incr",
- "for": "20m",
- "labels": {
- "severity": "warning"
- },
- "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
- "annotations": {
- "summary": "high mb/week on zfs dir"
- },
- },
- {
- "alert": "high_logging",
- "for": "3h",
- "labels": {
- "severity": "waste"
- },
- "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
- "annotations": {
- "summary": "high log output rate"
- },
- },
- {
- "alert": "stale_process",
- "for": "1d",
- "labels": {
- "severity": "dataRisk"
- },
- "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
- "annotations": {
- "summary": "process time is old"
- },
- },
- {
- "alert": "starlette",
- "for": "1m",
- "labels": {
- "severity": "fix"
- },
- "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
- "annotations": {
- "summary": "set starlette app name"
- },
- },
- {
- "alert": "ssl_certs_expiring_soon",
- "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
- "labels": {
- "severity": "warning"
- },
- "annotations": {
- "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
- },
- },
- ],
- },
- ] + hostsExpectedOnline(ctx)['groups']
- }
-
-
-def _runJson(ctx, cmd):
- return json.loads(ctx.run(cmd, hide="stdout").stdout)
-
-
-def hostsExpectedOnline(ctx):
- return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
diff -r 429bfd62e6ba -r adde35eb4773 next/create_k8s.py
--- a/next/create_k8s.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-from pathlib import Path
-from index_page import makeIndexHtml
-from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
-
-
-def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
- (build / f'{agentFileName}_deploy.yaml').write_text(
- toJson({
- "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
- "spec": {
- "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
- "template": {
- "metadata": {
- "labels": { "app": agentName },
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
- },
- "spec": {
- "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
- "serviceAccountName": "victoriametrics",
- "containers": [{
- "name": "vmagent",
- "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
- "imagePullPolicy": "IfNotPresent",
- "args": [
- f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
- tzArg,
- f"-promscrape.config=/local/config/{scrapeMapKey}",
- "-promscrape.configCheckInterval=5s",
- "-sortLabels",
- f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
- "-remoteWrite.showURL",
- ],
- "ports": [{ "containerPort": agentPort }],
- "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
- }]
- }
- }
- }
- })) # yapf: disable
-
-
-def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
- (build / f'{insertFileName}_deploy.yaml').write_text(
- toJson({
- "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
- "spec": {
- "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
- "template": {
- "metadata": {
- "labels": { "app": insertName },
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
- },
- "spec": {
- "serviceAccountName": "victoriametrics",
- "containers": [{
- "name": "vminsert",
- "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
- "imagePullPolicy": "IfNotPresent",
- "args": [
- f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
- tzArg,
- f"-storageNode={storageName}",
- ],
- "ports": [{ "containerPort": insertPort }]
- }]
- }
- }
- }
- })) # yapf: disable
-
-
-def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
- (build / f'{storageFileName}_2deploy.yaml').write_text(
- toJson({
- "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
- "spec": {
- "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
- "template": {
- "metadata": {
- "labels": { "app": storageName },
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
- },
- "spec": {
- "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
- "serviceAccountName": "victoriametrics",
- "containers": [{
- "name": "vmstorage",
- "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
- "imagePullPolicy": "IfNotPresent",
- "args": [
- f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
- tzArg,
- f"-retentionPeriod={retention}",
- f"-storageDataPath=/data/{pipelineName}",
- ],
- "ports": [
- { "containerPort": 8482, "name": "http" },
- { "containerPort": storageInsertPort, "name": "vminsert" },
- { "containerPort": storageSelectPort, "name": "vmselect" },
- ],
- "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
- }],
- "affinity": affinityToNode(localPvHost)
- }
- }
- }
- })) # yapf: disable
-
-
-def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
- name = f"{objPrefix}-vmselect"
- (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
- toJson({
- "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
- "spec": {
- "replicas": 1,
- "strategy": { "type": "Recreate" },
- "selector": { "matchLabels": { "app": name } },
- "template": {
- "metadata": {
- "labels": { "app": name },
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
- },
- "spec": {
- "serviceAccountName": "victoriametrics",
- "containers": [{
- "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
- "args": [
- f"-http.pathPrefix={webRoot}/vmselect/",
- tzArg,
- ] + [f"-storageNode={n}" for n in storageSvcs],
- "ports": [{ "containerPort": selectPort }]
- }]
- }
- }
- }
- })) # yapf: disable
-
-def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
- agentName = f"{objPrefix}-{pipelineName}-vmagent"
- insertName = f"{objPrefix}-{pipelineName}-vminsert"
- storageName = f"{objPrefix}-{pipelineName}-vmstorage"
-
- agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
- insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
- storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
-
- localPvHost = "ditto"
- insertPort = 8480
- agentPort = 8429
- storageInsertPort = 8400
- storageSelectPort = 8401
- volName = f"{objPrefix}-data-{pipelineName}"
- request = "50Gi"
- pipelineWebRoot = f'{webRoot}/{pipelineName}'
-
- createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
- createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
- createPv(storageFileName, volName, request)
- createPvc(storageFileName, volName, request)
- createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
-
- createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
- createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
- createSvc(storageFileName,storageName, [
- {"port": 80, "targetPort": "http", "name": "http"},
- {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
- {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
- ]) # yapf: disable
-
- return storageName
-
-
-def createIndex(objPrefix, webRoot, html):
- name = f'{objPrefix}-index'
- httpServeRoot = '/opt/html'
-
- (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
- "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
- "data": {
- "index.html": html,
- "index.js": Path("index.js").read_text(),
- "index.css": Path("index.css").read_text(),
- }
- })) # yapf: disable
-
- (build / f'{objPrefix}-3index_deploy.yaml').write_text(
- toJson({
- "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
- "spec": {
- "replicas": 1,
- "selector": { "matchLabels": { "app": name } },
- "template": {
- "metadata": { "labels": { "app": name } },
- "spec": {
- "containers": [{
- "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
- "args": [
- f'--root={httpServeRoot}',
- '--directory-listing=true',
- '--experimental-metrics=true',
- ],
- "ports": [{ "containerPort": 80 }],
- "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
- }],
- "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
- }
- }
- }
- })) # yapf: disable
- createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
-
-
-def main():
- tzArg = "-loggerTimezone=America/Los_Angeles"
- objPrefix = "next-victoriametrics" # prefix on all k8s object names
- webRoot = "/m/next"
- vmVersion = "v1.100.1"
- webHost = 'bigasterisk.com'
- pipelines = [
- ('forever', '100y'),
- ('recent', '90y'),
- ]
- storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
-
- selectPort = 8481
- createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
- createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
-
- ingressPaths = [
- { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
- { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
- ] # yapf: disable
- for p, _ in pipelines:
- ingressPaths.extend([
- { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } },
- { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } },
- { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
- ]) # yapf: disable
-
- policy = """\
-allow:
- or:
- - { email: { is: "drewpca@gmail.com" }}
- - { email: { is: "kelsimp@gmail.com" }}
- """
- createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
- createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
-
-
-main()
-
-# in vmui, set server url to
-# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
diff -r 429bfd62e6ba -r adde35eb4773 next/create_scrape_configs.py
--- a/next/create_scrape_configs.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-from pathlib import Path
-
-from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName
-import private
-
-# previously this used `kubernetes_sd_configs: [{ role: node }]`
-all_hosts = [
- 'dash',
- 'ditto',
- # 'ws-printer',
- #todo:
-]
-
-smartctl_hosts = [
- # ideally, all nodes with disks, but many turn off and on
- 'dash',
- 'ditto',
-]
-
-ping_hosts = [
- # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
- 'printer',
- # wireguard connection test
- 'prime5',
- # after pyinfra or reboot, seems to lose wg0 address
- 'garage5',
-]
-
-
-forever_jobs = [
- jobConfig(name='maildir-count', targets=['prime:2500']),
- jobConfig(name='mongodb', targets=['mongodb:9216']),
- jobConfig(name='net-traffic', targets=['pipe:8080']),
- jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True),
- jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec
- jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'),
- jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]),
- jobConfig(name='wifi', targets=['wifi:80']),
- jobConfig(name='zfs-exporter', targets=['ditto:9634']),
- jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
- jobConfig(name='zpool-exporter', targets=['ditto:9986']),
- jobConfig(name='octoprint', targets=['octoprint'],
- metrics_path='/plugin/prometheus_exporter/metrics',
- params={'apikey' : [private.octoprint_apikey]},
- ),
-] # yapf: disable
-
-recent_jobs = [
- jobConfig(name="dnsmasq-log", targets=['pipe:9991']),
- jobConfig(name="filebeat", targets=[f'{h}:5067' for h in all_hosts]),
- jobConfig(name="net-routes", targets=['pipe:9999']),
- jobConfig(name="net-traffic", targets=['pipe:8080']),
- jobConfig(name="pomerium", targets=['pomerium-metrics.pomerium:9090']),
- jobConfig(name="telegraf", targets=[f'{h}:9273' for h in all_hosts]),
- jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'),
-
- jobConfig(name="next-victoriametrics-forever-vmagent", metrics_path='/m/next/forever/vmagent/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-forever-vminsert", metrics_path='/m/next/forever/vminsert/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName),
- jobConfig(name="next-victoriametrics-recent-vmagent", metrics_path='/m/next/recent/vmagent/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-recent-vminsert", metrics_path='/m/next/recent/vminsert/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-recent-vmstorage", metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-vmselect", metrics_path='/m/next/vmselect/metrics', targets=FromName),
- jobConfig(name="next-victoriametrics-index", targets=FromName),
-
- # todo:
- # - video-files
- # - cert-manager
- # - syncthing(s)
- # - nvidia runner
- # - longhorn
- # - kube-system.metrics-server
- jobConfig(
- name="racc",
- scrape_interval='30s',
- targets=[
- # - dash:5150
- # - dot:5150
- # - squib:5150
- # - ashermac:5150
- ],
- ),
-] # yapf: disable
-
-
-deploy_doesnt_serve_metrics = [
- 'apprise',
- 'bitwarden',
- 'digi-files',
- 'digi-pose-predict',
- 'digi-tts-mimic',
- 'digi-web',
- 'dovecot',
- 'ectoscope',
- 'front-door-display',
- 'hass',
- 'homepage',
- 'itch150',
- 'jsregistry',
- 'kallithea',
- 'kube-web-view',
- 'magma',
- 'megasecond',
- 'minecraft-build-world',
- 'minecraft-lake-world',
- 'minecraft-smp-world',
- 'mongodb',
- 'mqtt1',
- 'mqtt2',
- 'nodered',
- 'photoprism',
- 'plik',
- 'projects',
- 'registry-ui',
- 'registry',
- 'speakerphone',
- 'victorialogs-ui',
- 'video-files',
- 'video',
- 'zigbee2mqtt',
- 'zwave2mqtt',
-]
-
-existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs]
-recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs))
-
-recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | {
- 'kubernetes_sd_configs': [{
- 'role': 'endpoints'
- }],
- 'relabel_configs': [{
- 'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'],
- 'action': 'keep',
- 'regex': 'default;kubernetes;https'
- }],
-})
-
-recent_jobs.append(
- jobConfig(name="kubernetes-nodes", https=True, targets=[]) | {
- "kubernetes_sd_configs": [{
- "role": "node"
- }],
- "relabel_configs": [{
- "action": "labeldrop",
- "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
- }, {
- "action": "labelmap",
- "regex": "__meta_kubernetes_node_label_(.+)"
- }, {
- "action": "labeldrop",
- "regex": "kubernetes_io_hostname"
- }],
- })
-
-# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
-# for metric definitions
-
-recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | {
- "kubernetes_sd_configs": [{
- "role": "node"
- }],
- "relabel_configs": [{
- "action": "labeldrop",
- "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
- }],
-})
-
-outDir = Path('build/scrape_config')
-writeJobConfigs(outDir, forever_jobs, 'forever')
-writeJobConfigs(outDir, recent_jobs, 'recent')
diff -r 429bfd62e6ba -r adde35eb4773 next/deploy_alertmanager.yaml
--- a/next/deploy_alertmanager.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
- name: alertmanager
-spec:
- replicas: 1
- selector:
- matchLabels:
- app: alertmanager
- template:
- metadata:
- labels:
- app: alertmanager
- spec:
- volumes:
- - name: opt-alertmanager
- persistentVolumeClaim:
- claimName: opt-alertmanager
- serviceAccountName: victoriametrics
- containers:
- - name: alertmanager
- image: docker.io/prom/alertmanager:v0.27.0
- args:
- - --config.file=/alertmanager/alertmanager.yml
- - --web.external-url=https://bigasterisk.com/alertmanager/
- - --web.route-prefix=/
- - --log.level=info
- ports:
- - containerPort: 9093
- volumeMounts:
- - name: opt-alertmanager
- mountPath: /alertmanager
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: "kubernetes.io/hostname"
- operator: In
- values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
- name: alertmanager
-spec:
- ports:
- - port: 80
- targetPort: 9093
- selector:
- app: alertmanager
diff -r 429bfd62e6ba -r adde35eb4773 next/deploy_vmalert.yaml
--- a/next/deploy_vmalert.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
- name: vmalert
-spec:
- replicas: 1
- strategy: { type: Recreate }
- selector:
- matchLabels:
- app: vmalert
- template:
- metadata:
- labels:
- app: vmalert
- annotations:
- prometheus.io/scrape: "true"
- spec:
- volumes:
- - name: config
- configMap: { name: victoriametrics-config }
- serviceAccountName: victoriametrics
- containers:
- - name: vmalert
- image: docker.io/victoriametrics/vmalert:v1.91.2
- args:
- - -configCheckInterval=5s
- - -datasource.url=http://victoriametrics/m/
- - -datasource.queryStep=5m
- - -evaluationInterval=1m
- - -external.url=https://bigasterisk.com/vmalert
- - -loggerLevel=INFO
- - -loggerTimezone=America/Los_Angeles
- - -memory.allowedBytes=512MB
- - -notifier.url=http://alertmanager
- - -remoteRead.url=http://victoriametrics/m/
- - -remoteWrite.url=http://victoriametrics/m/
- - -rule=/local/rules
- ports:
- - containerPort: 8880
- volumeMounts:
- - { name: config, mountPath: /local }
----
-apiVersion: v1
-kind: Service
-metadata:
- name: vmalert
-spec:
- ports:
- - port: 80
- targetPort: 8880
- selector:
- app: vmalert
diff -r 429bfd62e6ba -r adde35eb4773 next/index.css
--- a/next/index.css Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-section {
- margin-left: 2em;
-}
-
-h1,
-h2 {
- border-top: 1px solid lightgray;
-}
diff -r 429bfd62e6ba -r adde35eb4773 next/index.js
--- a/next/index.js Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
-function init(serverUrl) {
-  // SERVER_URL defaults to something incorrect, so we set it here, hopefully before you open vmui
- localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl }));
-}
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 next/index_page.py
--- a/next/index_page.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-def makeIndexHtml(objPrefix, webRoot, webHost):
- return f"""
-
-
- {objPrefix}
-
-
-
- {objPrefix}
-
-
-
-
-
-
-
-
-
- """
diff -r 429bfd62e6ba -r adde35eb4773 next/ingress_alertmanager.yaml
--- a/next/ingress_alertmanager.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
- name: vmalert
- annotations:
- cert-manager.io/cluster-issuer: letsencrypt-prod
- ingress.pomerium.io/allow_public_unauthenticated_access: "false"
- ingress.pomerium.io/pass_identity_headers: "true"
- ingress.pomerium.io/preserve_host_header: "true"
- ingress.pomerium.io/policy: |
- allow:
- or:
- - { email: { is: "drewpca@gmail.com" }}
- - { email: { is: "kelsimp@gmail.com" }}
- # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
-spec:
- ingressClassName: pomerium
- rules:
- - host: "bigasterisk.com"
- http:
- paths:
- - pathType: Prefix
- path: /vmalert/
- backend: { service: { name: vmalert, port: { number: 80 } } }
- tls:
- - hosts: [bigasterisk.com]
- secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
- name: alertmanager
- annotations:
- cert-manager.io/cluster-issuer: letsencrypt-prod
- ingress.pomerium.io/allow_public_unauthenticated_access: "false"
- ingress.pomerium.io/pass_identity_headers: "true"
- ingress.pomerium.io/preserve_host_header: "true"
- ingress.pomerium.io/policy: |
- allow:
- or:
- - { email: { is: "drewpca@gmail.com" }}
- - { email: { is: "kelsimp@gmail.com" }}
- ingress.pomerium.io/prefix_rewrite: "/"
-spec:
- ingressClassName: pomerium
- rules:
- - host: "bigasterisk.com"
- http:
- paths:
- - pathType: Prefix
- path: /alertmanager/
- backend: { service: { name: alertmanager, port: { number: 80 } } }
- tls:
- - hosts: [bigasterisk.com]
- secretName: bigasterisk.com-tls
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 next/k8s_ops.py
--- a/next/k8s_ops.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-import json
-import time
-
-from kubernetes import client
-
-
-def refreshPodCmaps(pod_name, namespace="default"):
- """
-    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there can be a delay before
-    k8s updates the ConfigMap volume that a pod sees. The workaround is to edit the pod annotations.
- """
- api_instance = client.CoreV1Api()
-
- pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
- if pod.metadata.annotations is None:
- pod.metadata.annotations = {}
- pod.metadata.annotations["force-configmap-update"] = str(time.time())
- api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
-
-
-def firstPodName(selector):
- api_instance = client.CoreV1Api()
- pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
- return pod_list.items[0].metadata.name
-
-
-def hup(ctx, deployment, process_name):
- ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
-
-
-def replaceCmap(name, dataObj):
- api_instance = client.CoreV1Api()
-
- data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
-
- try:
-
- existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
- existing_config_map.data.update(data)
- api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
- except client.rest.ApiException as e:
- if e.status == 404:
- config_map = client.V1ConfigMap()
- config_map.metadata = client.V1ObjectMeta(name=name)
- config_map.data = data
- api_response = api_instance.create_namespaced_config_map('default', config_map)
- else:
- raise
-
- print(f"{name} resource_version is now {api_response.metadata.resource_version}")
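A usage sketch for these helpers, assuming they are called directly rather than via tasks.py; the ConfigMap name, data, and label selector below are hypothetical, and the caller is responsible for loading a kube config first (tasks.py does this at import time):

from kubernetes import config
from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap

config.load_kube_config()
# hypothetical ConfigMap name and contents; values get json.dumps()ed by replaceCmap
replaceCmap("demo-config", {"settings.yaml": {"interval": "1m"}})
# hypothetical selector; per the refreshPodCmaps docstring, touching the pod
# annotations works around the slow ConfigMap volume update
refreshPodCmaps(firstPodName("app=demo"))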
diff -r 429bfd62e6ba -r adde35eb4773 next/output.py
--- a/next/output.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,122 +0,0 @@
-import json
-from pathlib import Path
-
-build = Path('build/k8s_config')
-build.mkdir(parents=True, exist_ok=True)
-
-
-def toJson(d):
- return json.dumps(d, sort_keys=True, indent=2)
-
-
-def createSvc(filename, name, ports):
- (build / f'{filename}_svc.yaml').write_text(toJson({
- "apiVersion": "v1",
- "kind": "Service",
- "metadata": {
- "name": name
- },
- "spec": {
- "ports": ports,
- "selector": {
- "app": name
- }
- },
- }))
-
-
-def createIngress(filename, objName, policy, ingressPaths, host):
-
- (build / filename).write_text(
- toJson({
- "apiVersion": "networking.k8s.io/v1",
- "kind": "Ingress",
- "metadata": {
- "name": objName,
- "annotations": {
- "cert-manager.io/cluster-issuer": "letsencrypt-prod",
- "ingress.pomerium.io/allow_public_unauthenticated_access": "false",
- "ingress.pomerium.io/pass_identity_headers": "true",
- "ingress.pomerium.io/preserve_host_header": "true",
- "ingress.pomerium.io/policy": policy,
- }
- },
- "spec": {
- "ingressClassName": "pomerium",
- "rules": [{
- "host": host,
- "http": {
- "paths": ingressPaths
- }
- },],
- "tls": [{
- "hosts": [host],
- "secretName": f"{host}-tls"
- }]
- }
- }))
-
-
-def createPv(storageFileName, volName, request):
- (build / f'{storageFileName}_0pv.yaml').write_text(
- toJson({
- "apiVersion": "v1",
- "kind": "PersistentVolume",
- "metadata": {
- "name": volName,
- "labels": {
- "type": "local"
- }
- },
- "spec": {
- "storageClassName": "manual",
- "hostPath": {
- "path": f"/opt/{volName}"
- },
- "capacity": {
- "storage": request
- },
- "accessModes": ["ReadWriteMany"],
- "persistentVolumeReclaimPolicy": "Retain",
- "claimRef": {
- "namespace": "default",
- "name": volName
- }
- }
- }))
-
-
-def createPvc(storageFileName, volName, request):
- (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({
- "apiVersion": "v1",
- "kind": "PersistentVolumeClaim",
- "metadata": {
- "name": volName,
- },
- "spec": {
- "storageClassName": "",
- "volumeName": volName,
- "accessModes": ["ReadWriteMany"],
- "resources": {
- "requests": {
- "storage": request
- }
- }
- },
- }))
-
-
-def affinityToNode(node):
- return {
- "nodeAffinity": {
- "requiredDuringSchedulingIgnoredDuringExecution": {
- "nodeSelectorTerms": [{
- "matchExpressions": [{
- "key": "kubernetes.io/hostname",
- "operator": "In",
- "values": [node],
- }],
- }],
- },
- }
- }
diff -r 429bfd62e6ba -r adde35eb4773 next/roles.yaml
--- a/next/roles.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
- name: victoriametrics
-rules:
-- apiGroups: [""]
- resources:
- - nodes
- - nodes/metrics
- - nodes/proxy
- - services
- - endpoints
- - pods
- verbs: ["get", "list", "watch"]
-- apiGroups:
- - extensions
- resources:
- - ingresses
- verbs: ["get", "list", "watch"]
-- nonResourceURLs: ["/metrics"]
- verbs: ["get"]
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
- name: victoriametrics
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
- name: victoriametrics
-roleRef:
- apiGroup: rbac.authorization.k8s.io
- kind: ClusterRole
- name: victoriametrics
-subjects:
-- kind: ServiceAccount
- name: victoriametrics
- namespace: default
-# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
-# - kind: ServiceAccount
-# name: default
-# namespace: default
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 next/scrape_job.py
--- a/next/scrape_job.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-import json
-from pathlib import Path
-import subprocess
-
-class FromName:
- pass
-
-def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False):
- """one scrape job config"""
- ret = {
- "job_name": name,
- "relabel_configs": [
- {
- "target_label": "namespace",
- "replacement": "default"
- },
- {
- "source_labels": ["__meta_kubernetes_pod_node_name"],
- "target_label": "node"
- },
- ]
- }
-
- if targets is FromName:
- targets = [name]
-
- if targets:
- ret["static_configs"] = [{
- "targets": targets,
- }]
-
- if metrics_path:
- ret.setdefault('relabel_configs', []).append({
- "action": "replace",
- "target_label": "__metrics_path__",
- "replacement": metrics_path,
- })
-
- if scrape_interval:
- ret['scrape_interval'] = scrape_interval
-
- if params:
- ret['params'] = params
-
- if ping_job:
- ret['metrics_path'] = '/probe'
- ret['params'] = {'module': ['icmp']}
- ret["relabel_configs"] = [
- {
- "source_labels": ["__address__"],
- "target_label": "__param_target"
- },
- {
- "source_labels": ["__param_target"],
- "target_label": "instance"
- },
- {
- "target_label": "__address__",
- "replacement": "prober"
- },
- ]
-
- if https:
- ret['scheme'] = 'https'
- ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"}
- ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token"
-
- return ret
-
-
-def current_deployments():
- deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
- for deploy in deploys['items']:
- name = deploy['metadata']['name']
- yield name
-
-
-def scrape_deployments(skip_names):
- ret = []
- for name in current_deployments():
- if name in skip_names:
- continue
- targets = [name]
- ret.append(jobConfig(name=name, targets=targets))
- return ret
-
-
-def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
- outDir.mkdir(exist_ok=True, parents=True)
- filenames_written = []
- for job in jobConfs:
- filename = f'job_{job["job_name"]}.yaml'
- (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
- filenames_written.append(filename)
-
- (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
- "global": {
- "scrape_interval": "1m",
- "scrape_timeout": "10s"
- },
- "scrape_config_files": sorted(filenames_written),
- }, indent=2))
diff -r 429bfd62e6ba -r adde35eb4773 next/skaffold.yaml
--- a/next/skaffold.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-apiVersion: skaffold/v3
-kind: Config
-metadata:
- name: victoriametrics
-manifests:
- rawYaml:
- - roles.yaml
- - build/k8s_config/*.yaml
-deploy:
- kubectl: {}
diff -r 429bfd62e6ba -r adde35eb4773 next/tasks.py
--- a/next/tasks.py Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-from pathlib import Path
-
-import yaml
-from invoke import task
-from kubernetes import config
-
-import alert_rules
-from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
-
-config.load_kube_config()
-
-
-def scrapeConfig(fn):
- return yaml.load(open(fn), yaml.FullLoader)
-
-
-@task
-def push_config(ctx):
- # plan:
- # every discovered service may:
- # - be described here as a forever retention - ignore the discovery
- # - be blocked here as a no-metrics service - ignore the discovery
- # - be scraped as 'recent', with possible overrides of port/path
- # all per-node metrics shall be 'recent' (oops, not smartctl!)
- map: dict[str, object] = {
- 'rules': alert_rules.allRules(ctx),
- }
- top = Path('build/scrape_config')
- for p in top.glob('*.yaml'):
- map[str(p.relative_to(top))] = scrapeConfig(p)
- replaceCmap("next-victoriametrics-config", map)
- refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
- refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
diff -r 429bfd62e6ba -r adde35eb4773 next/volumes_alert.yaml
--- a/next/volumes_alert.yaml Fri May 03 11:19:50 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
- name: opt-alertmanager
- labels:
- type: local
-spec:
- storageClassName: manual
- hostPath:
- path: "/opt/alertmanager"
- capacity:
- storage: 50Gi
- accessModes:
- - ReadWriteOnce
- persistentVolumeReclaimPolicy: Retain
- claimRef:
- namespace: default
- name: opt-alertmanager
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
- name: opt-alertmanager
-spec:
- storageClassName: ""
- volumeName: "opt-alertmanager"
- accessModes:
- - ReadWriteOnce
- resources:
- requests:
- storage: 50Gi
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/output.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+
+build = Path('build/k8s_config')
+build.mkdir(parents=True, exist_ok=True)
+
+
+def toJson(d):
+ return json.dumps(d, sort_keys=True, indent=2)
+
+
+def createSvc(filename, name, ports):
+ (build / f'{filename}_svc.yaml').write_text(toJson({
+ "apiVersion": "v1",
+ "kind": "Service",
+ "metadata": {
+ "name": name
+ },
+ "spec": {
+ "ports": ports,
+ "selector": {
+ "app": name
+ }
+ },
+ }))
+
+
+def createIngress(filename, objName, policy, ingressPaths, host):
+
+ (build / filename).write_text(
+ toJson({
+ "apiVersion": "networking.k8s.io/v1",
+ "kind": "Ingress",
+ "metadata": {
+ "name": objName,
+ "annotations": {
+ "cert-manager.io/cluster-issuer": "letsencrypt-prod",
+ "ingress.pomerium.io/allow_public_unauthenticated_access": "false",
+ "ingress.pomerium.io/pass_identity_headers": "true",
+ "ingress.pomerium.io/preserve_host_header": "true",
+ "ingress.pomerium.io/policy": policy,
+ }
+ },
+ "spec": {
+ "ingressClassName": "pomerium",
+ "rules": [{
+ "host": host,
+ "http": {
+ "paths": ingressPaths
+ }
+ },],
+ "tls": [{
+ "hosts": [host],
+ "secretName": f"{host}-tls"
+ }]
+ }
+ }))
+
+
+def createPv(storageFileName, volName, request):
+ (build / f'{storageFileName}_0pv.yaml').write_text(
+ toJson({
+ "apiVersion": "v1",
+ "kind": "PersistentVolume",
+ "metadata": {
+ "name": volName,
+ "labels": {
+ "type": "local"
+ }
+ },
+ "spec": {
+ "storageClassName": "manual",
+ "hostPath": {
+ "path": f"/opt/{volName}"
+ },
+ "capacity": {
+ "storage": request
+ },
+ "accessModes": ["ReadWriteMany"],
+ "persistentVolumeReclaimPolicy": "Retain",
+ "claimRef": {
+ "namespace": "default",
+ "name": volName
+ }
+ }
+ }))
+
+
+def createPvc(storageFileName, volName, request):
+ (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({
+ "apiVersion": "v1",
+ "kind": "PersistentVolumeClaim",
+ "metadata": {
+ "name": volName,
+ },
+ "spec": {
+ "storageClassName": "",
+ "volumeName": volName,
+ "accessModes": ["ReadWriteMany"],
+ "resources": {
+ "requests": {
+ "storage": request
+ }
+ }
+ },
+ }))
+
+
+def affinityToNode(node):
+ return {
+ "nodeAffinity": {
+ "requiredDuringSchedulingIgnoredDuringExecution": {
+ "nodeSelectorTerms": [{
+ "matchExpressions": [{
+ "key": "kubernetes.io/hostname",
+ "operator": "In",
+ "values": [node],
+ }],
+ }],
+ },
+ }
+ }
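A usage sketch of these generators, patterned on how the removed main() earlier in this patch drives them; every name, port, and size below is hypothetical:

from output import createIngress, createPv, createPvc, createSvc

createPv('demo-storage', 'demo-data', '10Gi')    # -> build/k8s_config/demo-storage_0pv.yaml
createPvc('demo-storage', 'demo-data', '10Gi')   # -> build/k8s_config/demo-storage_1pvc.yaml
createSvc('demo-1svc', 'demo', [{'port': 80, 'targetPort': 8080}])
policy = 'allow:\n  or:\n    - { email: { is: "someone@example.com" }}\n'
paths = [{'pathType': 'Prefix', 'path': '/demo/', 'backend': {'service': {'name': 'demo', 'port': {'number': 80}}}}]
createIngress('demo-2ingress.yaml', 'demo', policy, paths, 'bigasterisk.com')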
diff -r 429bfd62e6ba -r adde35eb4773 roles.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/roles.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,43 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: victoriametrics
+rules:
+- apiGroups: [""]
+ resources:
+ - nodes
+ - nodes/metrics
+ - nodes/proxy
+ - services
+ - endpoints
+ - pods
+ verbs: ["get", "list", "watch"]
+- apiGroups:
+ - extensions
+ resources:
+ - ingresses
+ verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]
+ verbs: ["get"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: victoriametrics
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: victoriametrics
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: victoriametrics
+subjects:
+- kind: ServiceAccount
+ name: victoriametrics
+ namespace: default
+# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
+# - kind: ServiceAccount
+# name: default
+# namespace: default
\ No newline at end of file
diff -r 429bfd62e6ba -r adde35eb4773 scrape_job.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scrape_job.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,102 @@
+import json
+from pathlib import Path
+import subprocess
+
+class FromName:
+ pass
+
+def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False):
+ """one scrape job config"""
+ ret = {
+ "job_name": name,
+ "relabel_configs": [
+ {
+ "target_label": "namespace",
+ "replacement": "default"
+ },
+ {
+ "source_labels": ["__meta_kubernetes_pod_node_name"],
+ "target_label": "node"
+ },
+ ]
+ }
+
+ if targets is FromName:
+ targets = [name]
+
+ if targets:
+ ret["static_configs"] = [{
+ "targets": targets,
+ }]
+
+ if metrics_path:
+ ret.setdefault('relabel_configs', []).append({
+ "action": "replace",
+ "target_label": "__metrics_path__",
+ "replacement": metrics_path,
+ })
+
+ if scrape_interval:
+ ret['scrape_interval'] = scrape_interval
+
+ if params:
+ ret['params'] = params
+
+ if ping_job:
+ ret['metrics_path'] = '/probe'
+ ret['params'] = {'module': ['icmp']}
+ ret["relabel_configs"] = [
+ {
+ "source_labels": ["__address__"],
+ "target_label": "__param_target"
+ },
+ {
+ "source_labels": ["__param_target"],
+ "target_label": "instance"
+ },
+ {
+ "target_label": "__address__",
+ "replacement": "prober"
+ },
+ ]
+
+ if https:
+ ret['scheme'] = 'https'
+ ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"}
+ ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+
+ return ret
+
+
+def current_deployments():
+ deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
+ for deploy in deploys['items']:
+ name = deploy['metadata']['name']
+ yield name
+
+
+def scrape_deployments(skip_names):
+ ret = []
+ for name in current_deployments():
+ if name in skip_names:
+ continue
+ targets = [name]
+ ret.append(jobConfig(name=name, targets=targets))
+ return ret
+
+
+def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
+ outDir.mkdir(exist_ok=True, parents=True)
+ filenames_written = []
+ for job in jobConfs:
+ filename = f'job_{job["job_name"]}.yaml'
+ (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
+ filenames_written.append(filename)
+
+ (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
+ "global": {
+ "scrape_interval": "1m",
+ "scrape_timeout": "10s"
+ },
+ "scrape_config_files": sorted(filenames_written),
+ }, indent=2))
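A usage sketch, mirroring how the removed next/create_scrape_configs.py used these helpers; the job names and targets here are hypothetical:

from pathlib import Path

from scrape_job import FromName, jobConfig, writeJobConfigs

jobs = [
    jobConfig(name='mongodb', targets=['mongodb:9216']),
    jobConfig(name='ping', targets=['printer'], scrape_interval='2m', ping_job=True),
    jobConfig(name='demo-exporter', targets=FromName, metrics_path='/demo/metrics'),
]
writeJobConfigs(Path('build/scrape_config'), jobs, 'recent')  # emits job_*.yaml plus scrape_recent.yaml listing them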
diff -r 429bfd62e6ba -r adde35eb4773 skaffold.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/skaffold.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,10 @@
+apiVersion: skaffold/v3
+kind: Config
+metadata:
+ name: victoriametrics
+manifests:
+ rawYaml:
+ - roles.yaml
+ - build/k8s_config/*.yaml
+deploy:
+ kubectl: {}
diff -r 429bfd62e6ba -r adde35eb4773 tasks.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tasks.py Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import yaml
+from invoke import task
+from kubernetes import config
+
+import alert_rules
+from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
+
+config.load_kube_config()
+
+
+def scrapeConfig(fn):
+ return yaml.load(open(fn), yaml.FullLoader)
+
+
+@task
+def push_config(ctx):
+ # plan:
+ # every discovered service may:
+ # - be described here as a forever retention - ignore the discovery
+ # - be blocked here as a no-metrics service - ignore the discovery
+ # - be scraped as 'recent', with possible overrides of port/path
+ # all per-node metrics shall be 'recent' (oops, not smartctl!)
+ map: dict[str, object] = {
+ 'rules': alert_rules.allRules(ctx),
+ }
+ top = Path('build/scrape_config')
+ for p in top.glob('*.yaml'):
+ map[str(p.relative_to(top))] = scrapeConfig(p)
+ replaceCmap("next-victoriametrics-config", map)
+ refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
+ refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
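For reference, a rough sketch of the ConfigMap data that push_config assembles (key names follow writeJobConfigs and the glob above; job names are hypothetical and values are abbreviated):

data = {
    'rules': [...],                # alert_rules.allRules(ctx)
    'scrape_forever.yaml': {...},  # global scrape settings plus the scrape_config_files list
    'scrape_recent.yaml': {...},
    'job_mongodb.yaml': [...],     # one key per generated job_<name>.yaml
    'job_ping.yaml': [...],
}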
diff -r 429bfd62e6ba -r adde35eb4773 volumes_alert.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/volumes_alert.yaml Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: opt-alertmanager
+ labels:
+ type: local
+spec:
+ storageClassName: manual
+ hostPath:
+ path: "/opt/alertmanager"
+ capacity:
+ storage: 50Gi
+ accessModes:
+ - ReadWriteOnce
+ persistentVolumeReclaimPolicy: Retain
+ claimRef:
+ namespace: default
+ name: opt-alertmanager
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: opt-alertmanager
+spec:
+ storageClassName: ""
+ volumeName: "opt-alertmanager"
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 50Gi
\ No newline at end of file