# HG changeset patch # User drewp@bigasterisk.com # Date 1714760468 25200 # Node ID adde35eb477306d17dca1280cfa9c12080745fb8 # Parent 429bfd62e6baeafc10aa81bde8c8a3c3bac26a41 collapse ./next to ./ diff -r 429bfd62e6ba -r adde35eb4773 alert_rules.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alert_rules.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,433 @@ +""" +pdm run invoke push-config + +docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +"Whenever the alert expression results in one or more vector +elements at a given point in time, the alert counts as active for +these elements' label sets." +also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics + +""" + +import json + + +def pomRules(): + return [ + { + "alert": "frequent_upstream_connect_failures", + "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0" + }, + { + "alert": "high_logging_pomerium", + "for": "3h", + "labels": { + "severity": "waste" + }, + "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k', + "annotations": { + "summary": "high log output rate" + }, + }, + ] + + +def k8sRules(): + # from https://awesome-prometheus-alerts.grep.to/rules.html + return [ + { + "alert": "metricsTargetMissing", + "expr": 'up{job!~"cm-acme-.*"} == 0', + 'for': '10m', + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "metrics target missing (instance {{ $labels.instance }})", + "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesMemoryPressure", + "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesDiskPressure", + "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesOutOfDisk", + "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesJobFailed", + "expr": "kube_job_status_failed > 0", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", + "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesPodCrashLooping", + "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", + "for": "2m", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", + "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", + }, + }, + { + "alert": 
"KubernetesClientCertificateExpiresNextWeek", + "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", + "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", + }, + }, + { + "alert": "container_waiting", + "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", + "annotations": { + "description": '', + "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", + }, + "for": "2m", + }, + ] + + +def allRules(ctx): + return { + "groups": [ + { + "name": "k8s", + "interval": "1m", + "rules": k8sRules(), + }, + { + "name": "pomerium_proxy", + "interval": "1m", + "rules": pomRules(), + }, + { + "name": + "Outages", + "interval": + "1m", + "rules": [ + { + "alert": "powereagleStalled", + "expr": "rate(house_power_w[100m]) == 0", + "for": "0m", + "labels": { + "severity": "losingData" + }, + "annotations": { + "summary": "power eagle data stalled", + "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", + }, + }, + { + "alert": "powereagleAbsent", + "expr": "absent_over_time(house_power_w[5m])", + "for": "2m", + "labels": { + "severity": "losingData" + }, + "annotations": { + "summary": "power eagle data missing", + "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", + }, + }, + { + "alert": "absent_zigbee", + "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', + }, + { + "alert": "net_routes_sync", + "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', + "for": "10m", + "labels": { + "severity": "houseUsersAffected" + }, + "annotations": { + "summary": "net_routes is not getting regular updates" + }, + }, + ], + }, + { + "name": "disk_errs", + "interval": "2d", + "rules": [{ + "alert": "zpool_device_error_increase", + "labels": { + "severity": "warning" + }, + "expr": 'increase(zpool_device_error_count[3d]) > 0', + }, { + "alert": "zpool_device_error_count", + "labels": { + "severity": "warning" + }, + "expr": 'zpool_device_error_count > 0', + }], + }, + { + "name": "lighting", + "interval": "5m", + "rules": [{ + "alert": "light_bridge_no_mqtt", + "expr": 'mqtt_connected{job="light-bridge"} != 1', + }], + }, + { + "name": + "front_door", + "interval": + "5m", + "rules": [ + { + "alert": "front_door_reader_esp32_no_mqtt", + 'expr': 'hw_connected{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_reader_svc_down", + 'expr': 'up{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_reader_svc_reader_no_mqtt", + 'expr': 'mqtt_connected{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_lock_svc_down", + 'expr': 'up{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": 
"front_door_lock_svc_no_mqtt", + 'expr': 'mqtt_connected{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_lock_esp32_no_mqtt", + 'expr': 'hw_connected{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + ], + }, + { + "name": + "net_routes", + "interval": + "5m", + "rules": [ + { + "alert": "no_house_ip_service", + "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})' + }, + { + "alert": "no_net_routes_running", + "expr": 'absent(python_info{job="net-routes"})' + }, + { + "alert": "allowed_check_never_returned_200", + 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1' + }, + { + "alert": "allowed_check_never_returned_403", + 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1' + }, + { + 'alert': 'net_route_input_eval_cal_loop_is_down', + 'expr': 'eval_cal_up!=1' + }, + { + 'alert': 'net_route_input_mongo_loop_is_down', + 'expr': 'mongo_to_net_routes_up!=1' + }, + { + 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests', + 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1' + }, + { + 'alert': 'gcalendarwatch_current_events_loop_is_down', + 'expr': 'current_events_up != 1' + }, + ], + }, + { + "name": "http", + "interval": "1h", + 'rules': [ + { + 'alert': 'old_https_certs', + 'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15', + }, + { + 'alert': 'high_500_response_rate', + 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02', + }, + ], + }, + { + "name": "ping", + "interval": "1m", + "rules": [{ + "alert": "ping_failed", + "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1', + }] + }, + { + "name": + "alerts", + "rules": [ + { + "alert": "kube_node_status_bad_condition", + "for": "2h", + "labels": { + "severity": "warning" + }, + "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', + }, + { + "alert": "housePower", + "for": "1h", + "labels": { + "severity": "waste" + }, + "expr": "house_power_w > 4000", + "annotations": { + "summary": "house power usage over 4KW" + }, + }, + { + "alert": "host_root_fs_space_low", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'disk_free{host!="garage",path="/"} < 20G', + }, + { + "alert": "zpool_space_low", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', + }, + { + "alert": "disk_week_incr", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', + "annotations": { + "summary": "high mb/week on zfs dir" + }, + }, + { + "alert": "high_logging", + "for": "3h", + "labels": { + "severity": "waste" + }, + "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k', + "annotations": { + "summary": "high log output rate" + }, + }, + { + "alert": "stale_process", + "for": "1d", + "labels": { + "severity": "dataRisk" + }, + "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", + "annotations": { + "summary": "process time is old" + }, + }, + { + 
"alert": "starlette", + "for": "1m", + "labels": { + "severity": "fix" + }, + "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', + "annotations": { + "summary": "set starlette app name" + }, + }, + { + "alert": "ssl_certs_expiring_soon", + "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" + }, + }, + ], + }, + ] + hostsExpectedOnline(ctx)['groups'] + } + + +def _runJson(ctx, cmd): + return json.loads(ctx.run(cmd, hide="stdout").stdout) + + +def hostsExpectedOnline(ctx): + return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") diff -r 429bfd62e6ba -r adde35eb4773 create_k8s.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create_k8s.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,254 @@ +from pathlib import Path +from index_page import makeIndexHtml +from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc + + +def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix): + (build / f'{agentFileName}_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } }, + "template": { + "metadata": { + "labels": { "app": agentName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }], + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmagent", + "image": f"docker.io/victoriametrics/vmagent:{vmVersion}", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vmagent/", + tzArg, + f"-promscrape.config=/local/config/{scrapeMapKey}", + "-promscrape.configCheckInterval=5s", + "-sortLabels", + f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write", + "-remoteWrite.showURL", + ], + "ports": [{ "containerPort": agentPort }], + "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }] + }] + } + } + } + })) # yapf: disable + + +def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort): + (build / f'{insertFileName}_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } }, + "template": { + "metadata": { + "labels": { "app": insertName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vminsert", + "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vminsert/", + tzArg, + f"-storageNode={storageName}", + ], + "ports": [{ "containerPort": insertPort }] + }] + } + } + } + })) # yapf: disable + + +def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, 
retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort): + (build / f'{storageFileName}_2deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } }, + "template": { + "metadata": { + "labels": { "app": storageName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }], + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmstorage", + "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vmstorage/", + tzArg, + f"-retentionPeriod={retention}", + f"-storageDataPath=/data/{pipelineName}", + ], + "ports": [ + { "containerPort": 8482, "name": "http" }, + { "containerPort": storageInsertPort, "name": "vminsert" }, + { "containerPort": storageSelectPort, "name": "vmselect" }, + ], + "volumeMounts": [{ "name": "data", "mountPath": "/data" }] + }], + "affinity": affinityToNode(localPvHost) + } + } + } + })) # yapf: disable + + +def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort): + name = f"{objPrefix}-vmselect" + (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, + "spec": { + "replicas": 1, + "strategy": { "type": "Recreate" }, + "selector": { "matchLabels": { "app": name } }, + "template": { + "metadata": { + "labels": { "app": name }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={webRoot}/vmselect/", + tzArg, + ] + [f"-storageNode={n}" for n in storageSvcs], + "ports": [{ "containerPort": selectPort }] + }] + } + } + } + })) # yapf: disable + +def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention): + agentName = f"{objPrefix}-{pipelineName}-vmagent" + insertName = f"{objPrefix}-{pipelineName}-vminsert" + storageName = f"{objPrefix}-{pipelineName}-vmstorage" + + agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent" + insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert" + storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage" + + localPvHost = "ditto" + insertPort = 8480 + agentPort = 8429 + storageInsertPort = 8400 + storageSelectPort = 8401 + volName = f"{objPrefix}-data-{pipelineName}" + request = "50Gi" + pipelineWebRoot = f'{webRoot}/{pipelineName}' + + createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix) + createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort) + createPv(storageFileName, volName, request) + createPvc(storageFileName, volName, request) + createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort) + + createSvc(agentFileName, 
agentName, [{"port": 80, "targetPort": agentPort}]) + createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}]) + createSvc(storageFileName,storageName, [ + {"port": 80, "targetPort": "http", "name": "http"}, + {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"}, + {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"}, + ]) # yapf: disable + + return storageName + + +def createIndex(objPrefix, webRoot, html): + name = f'{objPrefix}-index' + httpServeRoot = '/opt/html' + + (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({ + "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name }, + "data": { + "index.html": html, + "index.js": Path("index.js").read_text(), + "index.css": Path("index.css").read_text(), + } + })) # yapf: disable + + (build / f'{objPrefix}-3index_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, + "spec": { + "replicas": 1, + "selector": { "matchLabels": { "app": name } }, + "template": { + "metadata": { "labels": { "app": name } }, + "spec": { + "containers": [{ + "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent", + "args": [ + f'--root={httpServeRoot}', + '--directory-listing=true', + '--experimental-metrics=true', + ], + "ports": [{ "containerPort": 80 }], + "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }] + }], + "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }] + } + } + } + })) # yapf: disable + createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}]) + + +def main(): + tzArg = "-loggerTimezone=America/Los_Angeles" + objPrefix = "next-victoriametrics" # prefix on all k8s object names + webRoot = "/m/next" + vmVersion = "v1.100.1" + webHost = 'bigasterisk.com' + pipelines = [ + ('forever', '100y'), + ('recent', '90y'), + ] + storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines] + + selectPort = 8481 + createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort) + createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}]) + + ingressPaths = [ + { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } }, + ] # yapf: disable + for p, _ in pipelines: + ingressPaths.extend([ + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } }, + ]) # yapf: disable + + policy = """\ +allow: + or: + - { email: { is: "drewpca@gmail.com" }} + - { email: { is: "kelsimp@gmail.com" }} + """ + createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost) + createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost)) + + +main() + +# in vmui, set server url to +# 
https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus diff -r 429bfd62e6ba -r adde35eb4773 create_scrape_configs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create_scrape_configs.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,170 @@ +from pathlib import Path + +from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName +import private + +# previously this used `kubernetes_sd_configs: [{ role: node }]` +all_hosts = [ + 'dash', + 'ditto', + # 'ws-printer', + #todo: +] + +smartctl_hosts = [ + # ideally, all nodes with disks, but many turn off and on + 'dash', + 'ditto', +] + +ping_hosts = [ + # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1 + 'printer', + # wireguard connection test + 'prime5', + # after pyinfra or reboot, seems to lose wg0 address + 'garage5', +] + + +forever_jobs = [ + jobConfig(name='maildir-count', targets=['prime:2500']), + jobConfig(name='mongodb', targets=['mongodb:9216']), + jobConfig(name='net-traffic', targets=['pipe:8080']), + jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True), + jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec + jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'), + jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]), + jobConfig(name='wifi', targets=['wifi:80']), + jobConfig(name='zfs-exporter', targets=['ditto:9634']), + jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']), + jobConfig(name='zpool-exporter', targets=['ditto:9986']), + jobConfig(name='octoprint', targets=['octoprint'], + metrics_path='/plugin/prometheus_exporter/metrics', + params={'apikey' : [private.octoprint_apikey]}, + ), +] # yapf: disable + +recent_jobs = [ + jobConfig(name="dnsmasq-log", targets=['pipe:9991']), + jobConfig(name="filebeat", targets=[f'{h}:5067' for h in all_hosts]), + jobConfig(name="net-routes", targets=['pipe:9999']), + jobConfig(name="net-traffic", targets=['pipe:8080']), + jobConfig(name="pomerium", targets=['pomerium-metrics.pomerium:9090']), + jobConfig(name="telegraf", targets=[f'{h}:9273' for h in all_hosts]), + jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'), + + jobConfig(name="next-victoriametrics-forever-vmagent", metrics_path='/m/next/forever/vmagent/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-forever-vminsert", metrics_path='/m/next/forever/vminsert/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName), + jobConfig(name="next-victoriametrics-recent-vmagent", metrics_path='/m/next/recent/vmagent/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-recent-vminsert", metrics_path='/m/next/recent/vminsert/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-recent-vmstorage", metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-vmselect", metrics_path='/m/next/vmselect/metrics', targets=FromName), + jobConfig(name="next-victoriametrics-index", targets=FromName), + + # todo: + # - video-files + # - cert-manager + # - syncthing(s) + # - nvidia runner + # - longhorn + # - kube-system.metrics-server + jobConfig( + name="racc", + scrape_interval='30s', + targets=[ + # - dash:5150 + # - dot:5150 + # 
- squib:5150 + # - ashermac:5150 + ], + ), +] # yapf: disable + + +deploy_doesnt_serve_metrics = [ + 'apprise', + 'bitwarden', + 'digi-files', + 'digi-pose-predict', + 'digi-tts-mimic', + 'digi-web', + 'dovecot', + 'ectoscope', + 'front-door-display', + 'hass', + 'homepage', + 'itch150', + 'jsregistry', + 'kallithea', + 'kube-web-view', + 'magma', + 'megasecond', + 'minecraft-build-world', + 'minecraft-lake-world', + 'minecraft-smp-world', + 'mongodb', + 'mqtt1', + 'mqtt2', + 'nodered', + 'photoprism', + 'plik', + 'projects', + 'registry-ui', + 'registry', + 'speakerphone', + 'victorialogs-ui', + 'video-files', + 'video', + 'zigbee2mqtt', + 'zwave2mqtt', +] + +existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs] +recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs)) + +recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | { + 'kubernetes_sd_configs': [{ + 'role': 'endpoints' + }], + 'relabel_configs': [{ + 'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'], + 'action': 'keep', + 'regex': 'default;kubernetes;https' + }], +}) + +recent_jobs.append( + jobConfig(name="kubernetes-nodes", https=True, targets=[]) | { + "kubernetes_sd_configs": [{ + "role": "node" + }], + "relabel_configs": [{ + "action": "labeldrop", + "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*" + }, { + "action": "labelmap", + "regex": "__meta_kubernetes_node_label_(.+)" + }, { + "action": "labeldrop", + "regex": "kubernetes_io_hostname" + }], + }) + +# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md +# for metric definitions + +recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | { + "kubernetes_sd_configs": [{ + "role": "node" + }], + "relabel_configs": [{ + "action": "labeldrop", + "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*" + }], +}) + +outDir = Path('build/scrape_config') +writeJobConfigs(outDir, forever_jobs, 'forever') +writeJobConfigs(outDir, recent_jobs, 'recent') diff -r 429bfd62e6ba -r adde35eb4773 deploy_alertmanager.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deploy_alertmanager.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager +spec: + replicas: 1 + selector: + matchLabels: + app: alertmanager + template: + metadata: + labels: + app: alertmanager + spec: + volumes: + - name: opt-alertmanager + persistentVolumeClaim: + claimName: opt-alertmanager + serviceAccountName: victoriametrics + containers: + - name: alertmanager + image: docker.io/prom/alertmanager:v0.27.0 + args: + - --config.file=/alertmanager/alertmanager.yml + - --web.external-url=https://bigasterisk.com/alertmanager/ + - --web.route-prefix=/ + - --log.level=info + ports: + - containerPort: 9093 + volumeMounts: + - name: opt-alertmanager + mountPath: /alertmanager + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "kubernetes.io/hostname" + operator: In + values: ["ditto"] +--- +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - port: 80 + 
targetPort: 9093 + selector: + app: alertmanager diff -r 429bfd62e6ba -r adde35eb4773 deploy_vmalert.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deploy_vmalert.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vmalert +spec: + replicas: 1 + strategy: { type: Recreate } + selector: + matchLabels: + app: vmalert + template: + metadata: + labels: + app: vmalert + annotations: + prometheus.io/scrape: "true" + spec: + volumes: + - name: config + configMap: { name: victoriametrics-config } + serviceAccountName: victoriametrics + containers: + - name: vmalert + image: docker.io/victoriametrics/vmalert:v1.91.2 + args: + - -configCheckInterval=5s + - -datasource.url=http://victoriametrics/m/ + - -datasource.queryStep=5m + - -evaluationInterval=1m + - -external.url=https://bigasterisk.com/vmalert + - -loggerLevel=INFO + - -loggerTimezone=America/Los_Angeles + - -memory.allowedBytes=512MB + - -notifier.url=http://alertmanager + - -remoteRead.url=http://victoriametrics/m/ + - -remoteWrite.url=http://victoriametrics/m/ + - -rule=/local/rules + ports: + - containerPort: 8880 + volumeMounts: + - { name: config, mountPath: /local } +--- +apiVersion: v1 +kind: Service +metadata: + name: vmalert +spec: + ports: + - port: 80 + targetPort: 8880 + selector: + app: vmalert diff -r 429bfd62e6ba -r adde35eb4773 index.css --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/index.css Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,8 @@ +section { + margin-left: 2em; +} + +h1, +h2 { + border-top: 1px solid lightgray; +} diff -r 429bfd62e6ba -r adde35eb4773 index.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/index.js Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,4 @@ +function init(serverUrl) { + // this defaults to something incorrect, so we fix it hopefully before you go to vmui + localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl })); +} \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 index_page.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/index_page.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,75 @@ +def makeIndexHtml(objPrefix, webRoot, webHost): + return f""" + + + {objPrefix} + + + +

+ [index page HTML body; markup not recovered. Recoverable structure: heading "{objPrefix}"; "Retentions" with "recent" and "forever" sections, each a table of vmagent (metrics, targets), vminsert (metrics), vmstorage (metrics); a "vmselect" table (metrics); and a "vmui" table linking to]
+ + + + + """ diff -r 429bfd62e6ba -r adde35eb4773 ingress_alertmanager.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ingress_alertmanager.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,55 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vmalert + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + ingress.pomerium.io/allow_public_unauthenticated_access: "false" + ingress.pomerium.io/pass_identity_headers: "true" + ingress.pomerium.io/preserve_host_header: "true" + ingress.pomerium.io/policy: | + allow: + or: + - { email: { is: "drewpca@gmail.com" }} + - { email: { is: "kelsimp@gmail.com" }} + # ingress.pomerium.io/prefix_rewrite: "/vmalert/" +spec: + ingressClassName: pomerium + rules: + - host: "bigasterisk.com" + http: + paths: + - pathType: Prefix + path: /vmalert/ + backend: { service: { name: vmalert, port: { number: 80 } } } + tls: + - hosts: [bigasterisk.com] + secretName: bigasterisk.com-tls +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: alertmanager + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + ingress.pomerium.io/allow_public_unauthenticated_access: "false" + ingress.pomerium.io/pass_identity_headers: "true" + ingress.pomerium.io/preserve_host_header: "true" + ingress.pomerium.io/policy: | + allow: + or: + - { email: { is: "drewpca@gmail.com" }} + - { email: { is: "kelsimp@gmail.com" }} + ingress.pomerium.io/prefix_rewrite: "/" +spec: + ingressClassName: pomerium + rules: + - host: "bigasterisk.com" + http: + paths: + - pathType: Prefix + path: /alertmanager/ + backend: { service: { name: alertmanager, port: { number: 80 } } } + tls: + - hosts: [bigasterisk.com] + secretName: bigasterisk.com-tls \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 k8s_ops.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/k8s_ops.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,50 @@ +import json +import time + +from kubernetes import client + + +def refreshPodCmaps(pod_name, namespace="default"): + """ + Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while + until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations. 
+ """ + api_instance = client.CoreV1Api() + + pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace) + if pod.metadata.annotations is None: + pod.metadata.annotations = {} + pod.metadata.annotations["force-configmap-update"] = str(time.time()) + api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod) + + +def firstPodName(selector): + api_instance = client.CoreV1Api() + pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector) + return pod_list.items[0].metadata.name + + +def hup(ctx, deployment, process_name): + ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}") + + +def replaceCmap(name, dataObj): + api_instance = client.CoreV1Api() + + data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items()) + + try: + + existing_config_map = api_instance.read_namespaced_config_map(name, 'default') + existing_config_map.data.update(data) + api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map) + except client.rest.ApiException as e: + if e.status == 404: + config_map = client.V1ConfigMap() + config_map.metadata = client.V1ObjectMeta(name=name) + config_map.data = data + api_response = api_instance.create_namespaced_config_map('default', config_map) + else: + raise + + print(f"{name} resource_version is now {api_response.metadata.resource_version}") diff -r 429bfd62e6ba -r adde35eb4773 next/alert_rules.py --- a/next/alert_rules.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,433 +0,0 @@ -""" -pdm run invoke push-config - -docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ -"Whenever the alert expression results in one or more vector -elements at a given point in time, the alert counts as active for -these elements' label sets." -also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics - -""" - -import json - - -def pomRules(): - return [ - { - "alert": "frequent_upstream_connect_failures", - "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0" - }, - { - "alert": "high_logging_pomerium", - "for": "3h", - "labels": { - "severity": "waste" - }, - "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k', - "annotations": { - "summary": "high log output rate" - }, - }, - ] - - -def k8sRules(): - # from https://awesome-prometheus-alerts.grep.to/rules.html - return [ - { - "alert": "metricsTargetMissing", - "expr": 'up{job!~"cm-acme-.*"} == 0', - 'for': '10m', - "labels": { - "severity": "critical" - }, - "annotations": { - "summary": "metrics target missing (instance {{ $labels.instance }})", - "description": "A metrics target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesMemoryPressure", - "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', - "for": "2m", - "labels": { - "severity": "critical" - }, - "annotations": { - "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", - "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesDiskPressure", - "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', - "for": "2m", - "labels": { - "severity": "critical" - }, - "annotations": { - "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", - "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesOutOfDisk", - "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', - "for": "2m", - "labels": { - "severity": "critical" - }, - "annotations": { - "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", - "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesJobFailed", - "expr": "kube_job_status_failed > 0", - "labels": { - "severity": "warning" - }, - "annotations": { - "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", - "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesPodCrashLooping", - "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", - "for": "2m", - "labels": { - "severity": "warning" - }, - "annotations": { - "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", - "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", - }, - }, - { - "alert": "KubernetesClientCertificateExpiresNextWeek", - "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', - "labels": { - "severity": "warning" - }, - "annotations": { - "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", - "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", - }, - }, - { - "alert": "container_waiting", - "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", - "annotations": { - "description": '', - "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", - }, - "for": "2m", - }, - ] - - -def allRules(ctx): - return { - "groups": [ - { - "name": "k8s", - "interval": "1m", - "rules": k8sRules(), - }, - { - "name": "pomerium_proxy", - "interval": "1m", - "rules": pomRules(), - }, - { - "name": - "Outages", - "interval": - "1m", - "rules": [ - { - "alert": "powereagleStalled", - "expr": "rate(house_power_w[100m]) == 0", - "for": "0m", - "labels": { - "severity": "losingData" - }, - "annotations": { - "summary": "power eagle data stalled", - "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", - }, - }, - { - "alert": "powereagleAbsent", - "expr": "absent_over_time(house_power_w[5m])", - "for": "2m", - "labels": { - "severity": "losingData" - }, - "annotations": { - 
"summary": "power eagle data missing", - "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", - }, - }, - { - "alert": "absent_zigbee", - "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', - }, - { - "alert": "net_routes_sync", - "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', - "for": "10m", - "labels": { - "severity": "houseUsersAffected" - }, - "annotations": { - "summary": "net_routes is not getting regular updates" - }, - }, - ], - }, - { - "name": "disk_errs", - "interval": "2d", - "rules": [{ - "alert": "zpool_device_error_increase", - "labels": { - "severity": "warning" - }, - "expr": 'increase(zpool_device_error_count[3d]) > 0', - }, { - "alert": "zpool_device_error_count", - "labels": { - "severity": "warning" - }, - "expr": 'zpool_device_error_count > 0', - }], - }, - { - "name": "lighting", - "interval": "5m", - "rules": [{ - "alert": "light_bridge_no_mqtt", - "expr": 'mqtt_connected{job="light-bridge"} != 1', - }], - }, - { - "name": - "front_door", - "interval": - "5m", - "rules": [ - { - "alert": "front_door_reader_esp32_no_mqtt", - 'expr': 'hw_connected{job="fingerprint"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - { - "alert": "front_door_reader_svc_down", - 'expr': 'up{job="fingerprint"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - { - "alert": "front_door_reader_svc_reader_no_mqtt", - 'expr': 'mqtt_connected{job="fingerprint"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - { - "alert": "front_door_lock_svc_down", - 'expr': 'up{job="front-door-lock"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - { - "alert": "front_door_lock_svc_no_mqtt", - 'expr': 'mqtt_connected{job="front-door-lock"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - { - "alert": "front_door_lock_esp32_no_mqtt", - 'expr': 'hw_connected{job="front-door-lock"} < 1', - "annotations": { - "summary": "see https://bigasterisk.com/front-door-lock/" - }, - }, - ], - }, - { - "name": - "net_routes", - "interval": - "5m", - "rules": [ - { - "alert": "no_house_ip_service", - "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})' - }, - { - "alert": "no_net_routes_running", - "expr": 'absent(python_info{job="net-routes"})' - }, - { - "alert": "allowed_check_never_returned_200", - 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1' - }, - { - "alert": "allowed_check_never_returned_403", - 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1' - }, - { - 'alert': 'net_route_input_eval_cal_loop_is_down', - 'expr': 'eval_cal_up!=1' - }, - { - 'alert': 'net_route_input_mongo_loop_is_down', - 'expr': 'mongo_to_net_routes_up!=1' - }, - { - 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests', - 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1' - }, - { - 'alert': 'gcalendarwatch_current_events_loop_is_down', - 'expr': 'current_events_up != 1' - }, - ], - }, - { - "name": "http", - "interval": "1h", - 'rules': [ - { - 'alert': 'old_https_certs', - 'expr': 'min by (source) 
(x509_cert_enddate - now())/86400 < 15', - }, - { - 'alert': 'high_500_response_rate', - 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02', - }, - ], - }, - { - "name": "ping", - "interval": "1m", - "rules": [{ - "alert": "ping_failed", - "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1', - }] - }, - { - "name": - "alerts", - "rules": [ - { - "alert": "kube_node_status_bad_condition", - "for": "2h", - "labels": { - "severity": "warning" - }, - "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', - }, - { - "alert": "housePower", - "for": "1h", - "labels": { - "severity": "waste" - }, - "expr": "house_power_w > 4000", - "annotations": { - "summary": "house power usage over 4KW" - }, - }, - { - "alert": "host_root_fs_space_low", - "for": "20m", - "labels": { - "severity": "warning" - }, - "expr": 'disk_free{host!="garage",path="/"} < 20G', - }, - { - "alert": "zpool_space_low", - "for": "20m", - "labels": { - "severity": "warning" - }, - "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', - }, - { - "alert": "disk_week_incr", - "for": "20m", - "labels": { - "severity": "warning" - }, - "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', - "annotations": { - "summary": "high mb/week on zfs dir" - }, - }, - { - "alert": "high_logging", - "for": "3h", - "labels": { - "severity": "waste" - }, - "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k', - "annotations": { - "summary": "high log output rate" - }, - }, - { - "alert": "stale_process", - "for": "1d", - "labels": { - "severity": "dataRisk" - }, - "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", - "annotations": { - "summary": "process time is old" - }, - }, - { - "alert": "starlette", - "for": "1m", - "labels": { - "severity": "fix" - }, - "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', - "annotations": { - "summary": "set starlette app name" - }, - }, - { - "alert": "ssl_certs_expiring_soon", - "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", - "labels": { - "severity": "warning" - }, - "annotations": { - "summary": "cert expiring soon. 
See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" - }, - }, - ], - }, - ] + hostsExpectedOnline(ctx)['groups'] - } - - -def _runJson(ctx, cmd): - return json.loads(ctx.run(cmd, hide="stdout").stdout) - - -def hostsExpectedOnline(ctx): - return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") diff -r 429bfd62e6ba -r adde35eb4773 next/create_k8s.py --- a/next/create_k8s.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,254 +0,0 @@ -from pathlib import Path -from index_page import makeIndexHtml -from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc - - -def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix): - (build / f'{agentFileName}_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } }, - "template": { - "metadata": { - "labels": { "app": agentName }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }], - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmagent", - "image": f"docker.io/victoriametrics/vmagent:{vmVersion}", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vmagent/", - tzArg, - f"-promscrape.config=/local/config/{scrapeMapKey}", - "-promscrape.configCheckInterval=5s", - "-sortLabels", - f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write", - "-remoteWrite.showURL", - ], - "ports": [{ "containerPort": agentPort }], - "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }] - }] - } - } - } - })) # yapf: disable - - -def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort): - (build / f'{insertFileName}_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } }, - "template": { - "metadata": { - "labels": { "app": insertName }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vminsert", - "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vminsert/", - tzArg, - f"-storageNode={storageName}", - ], - "ports": [{ "containerPort": insertPort }] - }] - } - } - } - })) # yapf: disable - - -def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort): - (build / f'{storageFileName}_2deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } }, - "template": { - "metadata": { - "labels": { "app": storageName }, 
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }], - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmstorage", - "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vmstorage/", - tzArg, - f"-retentionPeriod={retention}", - f"-storageDataPath=/data/{pipelineName}", - ], - "ports": [ - { "containerPort": 8482, "name": "http" }, - { "containerPort": storageInsertPort, "name": "vminsert" }, - { "containerPort": storageSelectPort, "name": "vmselect" }, - ], - "volumeMounts": [{ "name": "data", "mountPath": "/data" }] - }], - "affinity": affinityToNode(localPvHost) - } - } - } - })) # yapf: disable - - -def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort): - name = f"{objPrefix}-vmselect" - (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, - "spec": { - "replicas": 1, - "strategy": { "type": "Recreate" }, - "selector": { "matchLabels": { "app": name } }, - "template": { - "metadata": { - "labels": { "app": name }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={webRoot}/vmselect/", - tzArg, - ] + [f"-storageNode={n}" for n in storageSvcs], - "ports": [{ "containerPort": selectPort }] - }] - } - } - } - })) # yapf: disable - -def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention): - agentName = f"{objPrefix}-{pipelineName}-vmagent" - insertName = f"{objPrefix}-{pipelineName}-vminsert" - storageName = f"{objPrefix}-{pipelineName}-vmstorage" - - agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent" - insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert" - storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage" - - localPvHost = "ditto" - insertPort = 8480 - agentPort = 8429 - storageInsertPort = 8400 - storageSelectPort = 8401 - volName = f"{objPrefix}-data-{pipelineName}" - request = "50Gi" - pipelineWebRoot = f'{webRoot}/{pipelineName}' - - createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix) - createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort) - createPv(storageFileName, volName, request) - createPvc(storageFileName, volName, request) - createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort) - - createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}]) - createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}]) - createSvc(storageFileName,storageName, [ - {"port": 80, "targetPort": "http", "name": "http"}, - {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"}, - {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"}, - ]) # yapf: disable - - return storageName - - -def 
createIndex(objPrefix, webRoot, html): - name = f'{objPrefix}-index' - httpServeRoot = '/opt/html' - - (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({ - "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name }, - "data": { - "index.html": html, - "index.js": Path("index.js").read_text(), - "index.css": Path("index.css").read_text(), - } - })) # yapf: disable - - (build / f'{objPrefix}-3index_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, - "spec": { - "replicas": 1, - "selector": { "matchLabels": { "app": name } }, - "template": { - "metadata": { "labels": { "app": name } }, - "spec": { - "containers": [{ - "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent", - "args": [ - f'--root={httpServeRoot}', - '--directory-listing=true', - '--experimental-metrics=true', - ], - "ports": [{ "containerPort": 80 }], - "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }] - }], - "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }] - } - } - } - })) # yapf: disable - createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}]) - - -def main(): - tzArg = "-loggerTimezone=America/Los_Angeles" - objPrefix = "next-victoriametrics" # prefix on all k8s object names - webRoot = "/m/next" - vmVersion = "v1.100.1" - webHost = 'bigasterisk.com' - pipelines = [ - ('forever', '100y'), - ('recent', '90y'), - ] - storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines] - - selectPort = 8481 - createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort) - createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}]) - - ingressPaths = [ - { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } }, - ] # yapf: disable - for p, _ in pipelines: - ingressPaths.extend([ - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } }, - ]) # yapf: disable - - policy = """\ -allow: - or: - - { email: { is: "drewpca@gmail.com" }} - - { email: { is: "kelsimp@gmail.com" }} - """ - createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost) - createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost)) - - -main() - -# in vmui, set server url to -# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus diff -r 429bfd62e6ba -r adde35eb4773 next/create_scrape_configs.py --- a/next/create_scrape_configs.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,170 +0,0 @@ -from pathlib import Path - -from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName -import private - -# previously this used `kubernetes_sd_configs: [{ role: node }]` -all_hosts = [ - 
'dash', - 'ditto', - # 'ws-printer', - #todo: -] - -smartctl_hosts = [ - # ideally, all nodes with disks, but many turn off and on - 'dash', - 'ditto', -] - -ping_hosts = [ - # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1 - 'printer', - # wireguard connection test - 'prime5', - # after pyinfra or reboot, seems to lose wg0 address - 'garage5', -] - - -forever_jobs = [ - jobConfig(name='maildir-count', targets=['prime:2500']), - jobConfig(name='mongodb', targets=['mongodb:9216']), - jobConfig(name='net-traffic', targets=['pipe:8080']), - jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True), - jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec - jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'), - jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]), - jobConfig(name='wifi', targets=['wifi:80']), - jobConfig(name='zfs-exporter', targets=['ditto:9634']), - jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']), - jobConfig(name='zpool-exporter', targets=['ditto:9986']), - jobConfig(name='octoprint', targets=['octoprint'], - metrics_path='/plugin/prometheus_exporter/metrics', - params={'apikey' : [private.octoprint_apikey]}, - ), -] # yapf: disable - -recent_jobs = [ - jobConfig(name="dnsmasq-log", targets=['pipe:9991']), - jobConfig(name="filebeat", targets=[f'{h}:5067' for h in all_hosts]), - jobConfig(name="net-routes", targets=['pipe:9999']), - jobConfig(name="net-traffic", targets=['pipe:8080']), - jobConfig(name="pomerium", targets=['pomerium-metrics.pomerium:9090']), - jobConfig(name="telegraf", targets=[f'{h}:9273' for h in all_hosts]), - jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'), - - jobConfig(name="next-victoriametrics-forever-vmagent", metrics_path='/m/next/forever/vmagent/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-forever-vminsert", metrics_path='/m/next/forever/vminsert/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName), - jobConfig(name="next-victoriametrics-recent-vmagent", metrics_path='/m/next/recent/vmagent/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-recent-vminsert", metrics_path='/m/next/recent/vminsert/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-recent-vmstorage", metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-vmselect", metrics_path='/m/next/vmselect/metrics', targets=FromName), - jobConfig(name="next-victoriametrics-index", targets=FromName), - - # todo: - # - video-files - # - cert-manager - # - syncthing(s) - # - nvidia runner - # - longhorn - # - kube-system.metrics-server - jobConfig( - name="racc", - scrape_interval='30s', - targets=[ - # - dash:5150 - # - dot:5150 - # - squib:5150 - # - ashermac:5150 - ], - ), -] # yapf: disable - - -deploy_doesnt_serve_metrics = [ - 'apprise', - 'bitwarden', - 'digi-files', - 'digi-pose-predict', - 'digi-tts-mimic', - 'digi-web', - 'dovecot', - 'ectoscope', - 'front-door-display', - 'hass', - 'homepage', - 'itch150', - 'jsregistry', - 'kallithea', - 'kube-web-view', - 'magma', - 'megasecond', - 'minecraft-build-world', - 'minecraft-lake-world', - 'minecraft-smp-world', - 'mongodb', - 
'mqtt1', - 'mqtt2', - 'nodered', - 'photoprism', - 'plik', - 'projects', - 'registry-ui', - 'registry', - 'speakerphone', - 'victorialogs-ui', - 'video-files', - 'video', - 'zigbee2mqtt', - 'zwave2mqtt', -] - -existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs] -recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs)) - -recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | { - 'kubernetes_sd_configs': [{ - 'role': 'endpoints' - }], - 'relabel_configs': [{ - 'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'], - 'action': 'keep', - 'regex': 'default;kubernetes;https' - }], -}) - -recent_jobs.append( - jobConfig(name="kubernetes-nodes", https=True, targets=[]) | { - "kubernetes_sd_configs": [{ - "role": "node" - }], - "relabel_configs": [{ - "action": "labeldrop", - "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*" - }, { - "action": "labelmap", - "regex": "__meta_kubernetes_node_label_(.+)" - }, { - "action": "labeldrop", - "regex": "kubernetes_io_hostname" - }], - }) - -# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md -# for metric definitions - -recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | { - "kubernetes_sd_configs": [{ - "role": "node" - }], - "relabel_configs": [{ - "action": "labeldrop", - "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*" - }], -}) - -outDir = Path('build/scrape_config') -writeJobConfigs(outDir, forever_jobs, 'forever') -writeJobConfigs(outDir, recent_jobs, 'recent') diff -r 429bfd62e6ba -r adde35eb4773 next/deploy_alertmanager.yaml --- a/next/deploy_alertmanager.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: alertmanager -spec: - replicas: 1 - selector: - matchLabels: - app: alertmanager - template: - metadata: - labels: - app: alertmanager - spec: - volumes: - - name: opt-alertmanager - persistentVolumeClaim: - claimName: opt-alertmanager - serviceAccountName: victoriametrics - containers: - - name: alertmanager - image: docker.io/prom/alertmanager:v0.27.0 - args: - - --config.file=/alertmanager/alertmanager.yml - - --web.external-url=https://bigasterisk.com/alertmanager/ - - --web.route-prefix=/ - - --log.level=info - ports: - - containerPort: 9093 - volumeMounts: - - name: opt-alertmanager - mountPath: /alertmanager - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: "kubernetes.io/hostname" - operator: In - values: ["ditto"] ---- -apiVersion: v1 -kind: Service -metadata: - name: alertmanager -spec: - ports: - - port: 80 - targetPort: 9093 - selector: - app: alertmanager diff -r 429bfd62e6ba -r adde35eb4773 next/deploy_vmalert.yaml --- a/next/deploy_vmalert.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vmalert -spec: - replicas: 1 - strategy: { type: Recreate } - selector: - matchLabels: - app: vmalert - template: - metadata: - labels: - app: vmalert - 
annotations: - prometheus.io/scrape: "true" - spec: - volumes: - - name: config - configMap: { name: victoriametrics-config } - serviceAccountName: victoriametrics - containers: - - name: vmalert - image: docker.io/victoriametrics/vmalert:v1.91.2 - args: - - -configCheckInterval=5s - - -datasource.url=http://victoriametrics/m/ - - -datasource.queryStep=5m - - -evaluationInterval=1m - - -external.url=https://bigasterisk.com/vmalert - - -loggerLevel=INFO - - -loggerTimezone=America/Los_Angeles - - -memory.allowedBytes=512MB - - -notifier.url=http://alertmanager - - -remoteRead.url=http://victoriametrics/m/ - - -remoteWrite.url=http://victoriametrics/m/ - - -rule=/local/rules - ports: - - containerPort: 8880 - volumeMounts: - - { name: config, mountPath: /local } ---- -apiVersion: v1 -kind: Service -metadata: - name: vmalert -spec: - ports: - - port: 80 - targetPort: 8880 - selector: - app: vmalert diff -r 429bfd62e6ba -r adde35eb4773 next/index.css --- a/next/index.css Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -section { - margin-left: 2em; -} - -h1, -h2 { - border-top: 1px solid lightgray; -} diff -r 429bfd62e6ba -r adde35eb4773 next/index.js --- a/next/index.js Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -function init(serverUrl) { - // this defaults to something incorrect, so we fix it hopefully before you go to vmui - localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl })); -} \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 next/index_page.py --- a/next/index_page.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -def makeIndexHtml(objPrefix, webRoot, webHost): - return f""" - - - {objPrefix} - - - -

-    [HTML body not recovered verbatim; outline of the page content:]
-      h1: {objPrefix}
-      h2: Retentions
-        section "recent":  table rows vmagent (metrics, targets), vminsert (metrics), vmstorage (metrics)
-        section "forever": table rows vmagent (metrics, targets), vminsert (metrics), vmstorage (metrics)
-      section "vmselect": table row vmselect (metrics)
-      section "vmui": vmui
- - - - - """ diff -r 429bfd62e6ba -r adde35eb4773 next/ingress_alertmanager.yaml --- a/next/ingress_alertmanager.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: vmalert - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - ingress.pomerium.io/allow_public_unauthenticated_access: "false" - ingress.pomerium.io/pass_identity_headers: "true" - ingress.pomerium.io/preserve_host_header: "true" - ingress.pomerium.io/policy: | - allow: - or: - - { email: { is: "drewpca@gmail.com" }} - - { email: { is: "kelsimp@gmail.com" }} - # ingress.pomerium.io/prefix_rewrite: "/vmalert/" -spec: - ingressClassName: pomerium - rules: - - host: "bigasterisk.com" - http: - paths: - - pathType: Prefix - path: /vmalert/ - backend: { service: { name: vmalert, port: { number: 80 } } } - tls: - - hosts: [bigasterisk.com] - secretName: bigasterisk.com-tls ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: alertmanager - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - ingress.pomerium.io/allow_public_unauthenticated_access: "false" - ingress.pomerium.io/pass_identity_headers: "true" - ingress.pomerium.io/preserve_host_header: "true" - ingress.pomerium.io/policy: | - allow: - or: - - { email: { is: "drewpca@gmail.com" }} - - { email: { is: "kelsimp@gmail.com" }} - ingress.pomerium.io/prefix_rewrite: "/" -spec: - ingressClassName: pomerium - rules: - - host: "bigasterisk.com" - http: - paths: - - pathType: Prefix - path: /alertmanager/ - backend: { service: { name: alertmanager, port: { number: 80 } } } - tls: - - hosts: [bigasterisk.com] - secretName: bigasterisk.com-tls \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 next/k8s_ops.py --- a/next/k8s_ops.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -import json -import time - -from kubernetes import client - - -def refreshPodCmaps(pod_name, namespace="default"): - """ - Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while - until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations. 
- """ - api_instance = client.CoreV1Api() - - pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace) - if pod.metadata.annotations is None: - pod.metadata.annotations = {} - pod.metadata.annotations["force-configmap-update"] = str(time.time()) - api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod) - - -def firstPodName(selector): - api_instance = client.CoreV1Api() - pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector) - return pod_list.items[0].metadata.name - - -def hup(ctx, deployment, process_name): - ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}") - - -def replaceCmap(name, dataObj): - api_instance = client.CoreV1Api() - - data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items()) - - try: - - existing_config_map = api_instance.read_namespaced_config_map(name, 'default') - existing_config_map.data.update(data) - api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map) - except client.rest.ApiException as e: - if e.status == 404: - config_map = client.V1ConfigMap() - config_map.metadata = client.V1ObjectMeta(name=name) - config_map.data = data - api_response = api_instance.create_namespaced_config_map('default', config_map) - else: - raise - - print(f"{name} resource_version is now {api_response.metadata.resource_version}") diff -r 429bfd62e6ba -r adde35eb4773 next/output.py --- a/next/output.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ -import json -from pathlib import Path - -build = Path('build/k8s_config') -build.mkdir(parents=True, exist_ok=True) - - -def toJson(d): - return json.dumps(d, sort_keys=True, indent=2) - - -def createSvc(filename, name, ports): - (build / f'{filename}_svc.yaml').write_text(toJson({ - "apiVersion": "v1", - "kind": "Service", - "metadata": { - "name": name - }, - "spec": { - "ports": ports, - "selector": { - "app": name - } - }, - })) - - -def createIngress(filename, objName, policy, ingressPaths, host): - - (build / filename).write_text( - toJson({ - "apiVersion": "networking.k8s.io/v1", - "kind": "Ingress", - "metadata": { - "name": objName, - "annotations": { - "cert-manager.io/cluster-issuer": "letsencrypt-prod", - "ingress.pomerium.io/allow_public_unauthenticated_access": "false", - "ingress.pomerium.io/pass_identity_headers": "true", - "ingress.pomerium.io/preserve_host_header": "true", - "ingress.pomerium.io/policy": policy, - } - }, - "spec": { - "ingressClassName": "pomerium", - "rules": [{ - "host": host, - "http": { - "paths": ingressPaths - } - },], - "tls": [{ - "hosts": [host], - "secretName": f"{host}-tls" - }] - } - })) - - -def createPv(storageFileName, volName, request): - (build / f'{storageFileName}_0pv.yaml').write_text( - toJson({ - "apiVersion": "v1", - "kind": "PersistentVolume", - "metadata": { - "name": volName, - "labels": { - "type": "local" - } - }, - "spec": { - "storageClassName": "manual", - "hostPath": { - "path": f"/opt/{volName}" - }, - "capacity": { - "storage": request - }, - "accessModes": ["ReadWriteMany"], - "persistentVolumeReclaimPolicy": "Retain", - "claimRef": { - "namespace": "default", - "name": volName - } - } - })) - - -def createPvc(storageFileName, volName, request): - (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({ - "apiVersion": "v1", - "kind": "PersistentVolumeClaim", - "metadata": { - "name": volName, - }, - "spec": { - "storageClassName": "", - "volumeName": volName, - "accessModes": 
["ReadWriteMany"], - "resources": { - "requests": { - "storage": request - } - } - }, - })) - - -def affinityToNode(node): - return { - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [{ - "matchExpressions": [{ - "key": "kubernetes.io/hostname", - "operator": "In", - "values": [node], - }], - }], - }, - } - } diff -r 429bfd62e6ba -r adde35eb4773 next/roles.yaml --- a/next/roles.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: victoriametrics -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - nodes/proxy - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: - - extensions - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: victoriametrics ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: victoriametrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: victoriametrics -subjects: -- kind: ServiceAccount - name: victoriametrics - namespace: default -# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account -# - kind: ServiceAccount -# name: default -# namespace: default \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 next/scrape_job.py --- a/next/scrape_job.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ -import json -from pathlib import Path -import subprocess - -class FromName: - pass - -def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False): - """one scrape job config""" - ret = { - "job_name": name, - "relabel_configs": [ - { - "target_label": "namespace", - "replacement": "default" - }, - { - "source_labels": ["__meta_kubernetes_pod_node_name"], - "target_label": "node" - }, - ] - } - - if targets is FromName: - targets = [name] - - if targets: - ret["static_configs"] = [{ - "targets": targets, - }] - - if metrics_path: - ret.setdefault('relabel_configs', []).append({ - "action": "replace", - "target_label": "__metrics_path__", - "replacement": metrics_path, - }) - - if scrape_interval: - ret['scrape_interval'] = scrape_interval - - if params: - ret['params'] = params - - if ping_job: - ret['metrics_path'] = '/probe' - ret['params'] = {'module': ['icmp']} - ret["relabel_configs"] = [ - { - "source_labels": ["__address__"], - "target_label": "__param_target" - }, - { - "source_labels": ["__param_target"], - "target_label": "instance" - }, - { - "target_label": "__address__", - "replacement": "prober" - }, - ] - - if https: - ret['scheme'] = 'https' - ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"} - ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token" - - return ret - - -def current_deployments(): - deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json'])) - for deploy in deploys['items']: - name = deploy['metadata']['name'] - yield name - - -def scrape_deployments(skip_names): - ret = [] - for name in current_deployments(): - if name in skip_names: - continue - targets = [name] - ret.append(jobConfig(name=name, targets=targets)) - return ret - - -def writeJobConfigs(outDir: Path, jobConfs: list, retention: str): - 
outDir.mkdir(exist_ok=True, parents=True) - filenames_written = [] - for job in jobConfs: - filename = f'job_{job["job_name"]}.yaml' - (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True)) - filenames_written.append(filename) - - (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({ - "global": { - "scrape_interval": "1m", - "scrape_timeout": "10s" - }, - "scrape_config_files": sorted(filenames_written), - }, indent=2)) diff -r 429bfd62e6ba -r adde35eb4773 next/skaffold.yaml --- a/next/skaffold.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -apiVersion: skaffold/v3 -kind: Config -metadata: - name: victoriametrics -manifests: - rawYaml: - - roles.yaml - - build/k8s_config/*.yaml -deploy: - kubectl: {} diff -r 429bfd62e6ba -r adde35eb4773 next/tasks.py --- a/next/tasks.py Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -from pathlib import Path - -import yaml -from invoke import task -from kubernetes import config - -import alert_rules -from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap - -config.load_kube_config() - - -def scrapeConfig(fn): - return yaml.load(open(fn), yaml.FullLoader) - - -@task -def push_config(ctx): - # plan: - # every discovered service may: - # - be described here as a forever retention - ignore the discovery - # - be blocked here as a no-metrics service - ignore the discovery - # - be scraped as 'recent', with possible overrides of port/path - # all per-node metrics shall be 'recent' (oops, not smartctl!) - map: dict[str, object] = { - 'rules': alert_rules.allRules(ctx), - } - top = Path('build/scrape_config') - for p in top.glob('*.yaml'): - map[str(p.relative_to(top))] = scrapeConfig(p) - replaceCmap("next-victoriametrics-config", map) - refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent")) - refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent")) diff -r 429bfd62e6ba -r adde35eb4773 next/volumes_alert.yaml --- a/next/volumes_alert.yaml Fri May 03 11:19:50 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: opt-alertmanager - labels: - type: local -spec: - storageClassName: manual - hostPath: - path: "/opt/alertmanager" - capacity: - storage: 50Gi - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain - claimRef: - namespace: default - name: opt-alertmanager ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: opt-alertmanager -spec: - storageClassName: "" - volumeName: "opt-alertmanager" - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 output.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/output.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,122 @@ +import json +from pathlib import Path + +build = Path('build/k8s_config') +build.mkdir(parents=True, exist_ok=True) + + +def toJson(d): + return json.dumps(d, sort_keys=True, indent=2) + + +def createSvc(filename, name, ports): + (build / f'{filename}_svc.yaml').write_text(toJson({ + "apiVersion": "v1", + "kind": "Service", + "metadata": { + "name": name + }, + "spec": { + "ports": ports, + "selector": { + "app": name + } + }, + })) + + +def createIngress(filename, objName, policy, ingressPaths, host): + + (build / filename).write_text( + toJson({ + "apiVersion": "networking.k8s.io/v1", + "kind": "Ingress", + "metadata": { + "name": objName, + 
"annotations": { + "cert-manager.io/cluster-issuer": "letsencrypt-prod", + "ingress.pomerium.io/allow_public_unauthenticated_access": "false", + "ingress.pomerium.io/pass_identity_headers": "true", + "ingress.pomerium.io/preserve_host_header": "true", + "ingress.pomerium.io/policy": policy, + } + }, + "spec": { + "ingressClassName": "pomerium", + "rules": [{ + "host": host, + "http": { + "paths": ingressPaths + } + },], + "tls": [{ + "hosts": [host], + "secretName": f"{host}-tls" + }] + } + })) + + +def createPv(storageFileName, volName, request): + (build / f'{storageFileName}_0pv.yaml').write_text( + toJson({ + "apiVersion": "v1", + "kind": "PersistentVolume", + "metadata": { + "name": volName, + "labels": { + "type": "local" + } + }, + "spec": { + "storageClassName": "manual", + "hostPath": { + "path": f"/opt/{volName}" + }, + "capacity": { + "storage": request + }, + "accessModes": ["ReadWriteMany"], + "persistentVolumeReclaimPolicy": "Retain", + "claimRef": { + "namespace": "default", + "name": volName + } + } + })) + + +def createPvc(storageFileName, volName, request): + (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({ + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": volName, + }, + "spec": { + "storageClassName": "", + "volumeName": volName, + "accessModes": ["ReadWriteMany"], + "resources": { + "requests": { + "storage": request + } + } + }, + })) + + +def affinityToNode(node): + return { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "kubernetes.io/hostname", + "operator": "In", + "values": [node], + }], + }], + }, + } + } diff -r 429bfd62e6ba -r adde35eb4773 roles.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/roles.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,43 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: victoriametrics +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: victoriametrics +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: victoriametrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: victoriametrics +subjects: +- kind: ServiceAccount + name: victoriametrics + namespace: default +# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account +# - kind: ServiceAccount +# name: default +# namespace: default \ No newline at end of file diff -r 429bfd62e6ba -r adde35eb4773 scrape_job.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scrape_job.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,102 @@ +import json +from pathlib import Path +import subprocess + +class FromName: + pass + +def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False): + """one scrape job config""" + ret = { + "job_name": name, + "relabel_configs": [ + { + "target_label": "namespace", + "replacement": "default" + }, + { + "source_labels": ["__meta_kubernetes_pod_node_name"], + "target_label": "node" + }, + ] + } + + if targets is FromName: + targets = [name] + + if targets: + ret["static_configs"] = [{ + "targets": targets, + }] + + if metrics_path: + 
ret.setdefault('relabel_configs', []).append({ + "action": "replace", + "target_label": "__metrics_path__", + "replacement": metrics_path, + }) + + if scrape_interval: + ret['scrape_interval'] = scrape_interval + + if params: + ret['params'] = params + + if ping_job: + ret['metrics_path'] = '/probe' + ret['params'] = {'module': ['icmp']} + ret["relabel_configs"] = [ + { + "source_labels": ["__address__"], + "target_label": "__param_target" + }, + { + "source_labels": ["__param_target"], + "target_label": "instance" + }, + { + "target_label": "__address__", + "replacement": "prober" + }, + ] + + if https: + ret['scheme'] = 'https' + ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"} + ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + return ret + + +def current_deployments(): + deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json'])) + for deploy in deploys['items']: + name = deploy['metadata']['name'] + yield name + + +def scrape_deployments(skip_names): + ret = [] + for name in current_deployments(): + if name in skip_names: + continue + targets = [name] + ret.append(jobConfig(name=name, targets=targets)) + return ret + + +def writeJobConfigs(outDir: Path, jobConfs: list, retention: str): + outDir.mkdir(exist_ok=True, parents=True) + filenames_written = [] + for job in jobConfs: + filename = f'job_{job["job_name"]}.yaml' + (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True)) + filenames_written.append(filename) + + (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({ + "global": { + "scrape_interval": "1m", + "scrape_timeout": "10s" + }, + "scrape_config_files": sorted(filenames_written), + }, indent=2)) diff -r 429bfd62e6ba -r adde35eb4773 skaffold.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/skaffold.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,10 @@ +apiVersion: skaffold/v3 +kind: Config +metadata: + name: victoriametrics +manifests: + rawYaml: + - roles.yaml + - build/k8s_config/*.yaml +deploy: + kubectl: {} diff -r 429bfd62e6ba -r adde35eb4773 tasks.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tasks.py Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,33 @@ +from pathlib import Path + +import yaml +from invoke import task +from kubernetes import config + +import alert_rules +from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap + +config.load_kube_config() + + +def scrapeConfig(fn): + return yaml.load(open(fn), yaml.FullLoader) + + +@task +def push_config(ctx): + # plan: + # every discovered service may: + # - be described here as a forever retention - ignore the discovery + # - be blocked here as a no-metrics service - ignore the discovery + # - be scraped as 'recent', with possible overrides of port/path + # all per-node metrics shall be 'recent' (oops, not smartctl!) 
+ map: dict[str, object] = { + 'rules': alert_rules.allRules(ctx), + } + top = Path('build/scrape_config') + for p in top.glob('*.yaml'): + map[str(p.relative_to(top))] = scrapeConfig(p) + replaceCmap("next-victoriametrics-config", map) + refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent")) + refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent")) diff -r 429bfd62e6ba -r adde35eb4773 volumes_alert.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/volumes_alert.yaml Fri May 03 11:21:08 2024 -0700 @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: opt-alertmanager + labels: + type: local +spec: + storageClassName: manual + hostPath: + path: "/opt/alertmanager" + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + claimRef: + namespace: default + name: opt-alertmanager +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: opt-alertmanager +spec: + storageClassName: "" + volumeName: "opt-alertmanager" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi \ No newline at end of file
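As a worked example of the scrape_job.py generator (a minimal sketch of the expected output, condensed for readability; not literal file contents): the ping job in forever_jobs is built by jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True), which swaps the default relabel_configs for the prober rewrites, and writeJobConfigs(outDir, forever_jobs, 'forever') serializes it to build/scrape_config/job_ping.yaml as JSON (which is also valid YAML), roughly:

    [
      {
        "job_name": "ping",
        "metrics_path": "/probe",
        "params": {"module": ["icmp"]},
        "relabel_configs": [
          {"source_labels": ["__address__"], "target_label": "__param_target"},
          {"source_labels": ["__param_target"], "target_label": "instance"},
          {"replacement": "prober", "target_label": "__address__"}
        ],
        "scrape_interval": "2m",
        "static_configs": [{"targets": ["printer", "prime5", "garage5"]}]
      }
    ]

scrape_forever.yaml then lists job_ping.yaml alongside the other forever job files under scrape_config_files. On push-config, tasks.py loads every build/scrape_config/*.yaml plus the alert rules into the next-victoriametrics-config ConfigMap via replaceCmap, and refreshPodCmaps bumps an annotation on the forever and recent vmagent pods so the updated ConfigMap volume is picked up without waiting for the kubelet's normal sync delay.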