# HG changeset patch # User drewp@bigasterisk.com # Date 1696902636 25200 # Node ID 3b91d52b007d864b3d43fab1b863b87826ec23b2 # Parent ce7ae4d2b24b083471a6402c5f53cae072249eb8 rules tuning diff -r ce7ae4d2b24b -r 3b91d52b007d alert_rules.py --- a/alert_rules.py Fri Sep 01 23:19:56 2023 -0700 +++ b/alert_rules.py Mon Oct 09 18:50:36 2023 -0700 @@ -16,14 +16,14 @@ # from https://awesome-prometheus-alerts.grep.to/rules.html return [ { - "alert": "PrometheusTargetMissing", - "expr": "up == 0", + "alert": "metricsTargetMissing", + "expr": 'up{job!~"cm-acme-.*"} == 0', "labels": { "severity": "critical" }, "annotations": { - "summary": "Prometheus target missing (instance {{ $labels.instance }})", - "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", + "summary": "metrics target missing (instance {{ $labels.instance }})", + "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", }, }, { @@ -100,7 +100,11 @@ }, { "alert": "container_waiting", - "expr": "sum by (container)(kube_pod_container_status_waiting!=0)", + "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", + "annotations": { + "description": '', + "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", + }, "for": "2m", }, ] @@ -177,6 +181,22 @@ }, { "name": + "front_door", + "interval": + "5m", + "rules": [ + { + "alert": "service_disconnected_from_mqtt", + "expr": "mqtt_connected < 1" + }, + { + "alert": "esp_not_connected_to_mqtt", + "expr": "hw_connected < 1", + }, + ] + }, + { + "name": "alerts", "rules": [ { @@ -204,7 +224,7 @@ "labels": { "severity": "warning" }, - "expr": 'disk_free{path="/"} < 20G', + "expr": 'disk_free{host!="garage",path="/"} < 20G', }, { "alert": "zpool_space_low", diff -r ce7ae4d2b24b -r 3b91d52b007d config/scrape_main.yaml --- a/config/scrape_main.yaml Fri Sep 01 23:19:56 2023 -0700 +++ 
b/config/scrape_main.yaml Mon Oct 09 18:50:36 2023 -0700 @@ -109,8 +109,9 @@ - {if: '{job="pomerium-proxy", __port_number="8080"}', action: drop} - {if: '{job="pomerium-proxy", __port_number="8443"}', action: drop} - {if: '{job="video-files", __port_number="8003"}', action: drop} + - {if: '{job=~"cm-acme-.*"}', action: drop} - # Assume all 8001/8002 port discoveries are redundant with an nginnx proxy + # Assume all 8001/8002 port discoveries are redundant with an nginx proxy - {if: '{__port_number="8001"}', action: drop} - {if: '{__port_number="8002"}', action: drop} diff -r ce7ae4d2b24b -r 3b91d52b007d tasks.py --- a/tasks.py Fri Sep 01 23:19:56 2023 -0700 +++ b/tasks.py Mon Oct 09 18:50:36 2023 -0700 @@ -15,6 +15,13 @@ replaceCmap("victoriametrics-config", {"scrape_main": configObj, "rules": rulesObj}) + # NOTE: these steps don't report errors in the rules format; the old rules + # are quietly kept instead. Use `skaffold run` to see the errors. + # + # Alternatively, check the rules via these validation flags (Go flag definitions, presumably from vmalert -- confirm): + # validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates") + # validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine") + refreshPodCmaps(firstPodName("app=victoriametrics")) refreshPodCmaps(firstPodName("app=vmalert"))