changeset 34:3b91d52b007d

rules tuning
author drewp@bigasterisk.com
date Mon, 09 Oct 2023 18:50:36 -0700
parents ce7ae4d2b24b
children 80e275ab2f88
files alert_rules.py config/scrape_main.yaml tasks.py
diffstat 3 files changed, 35 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/alert_rules.py	Fri Sep 01 23:19:56 2023 -0700
+++ b/alert_rules.py	Mon Oct 09 18:50:36 2023 -0700
@@ -16,14 +16,14 @@
     # from https://awesome-prometheus-alerts.grep.to/rules.html
     return [
         {
-            "alert": "PrometheusTargetMissing",
-            "expr": "up == 0",
+            "alert": "metricsTargetMissing",
+            "expr": 'up{job!~"cm-acme-.*"} == 0',
             "labels": {
                 "severity": "critical"
             },
             "annotations": {
-                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
-                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
+                "summary": "metrics target missing (instance {{ $labels.instance }})",
+                "description": "A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
             },
         },
         {
@@ -100,7 +100,11 @@
         },
         {
             "alert": "container_waiting",
-            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
+            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
+            "annotations": {
+                "description": '',
+                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
+            },
             "for": "2m",
         },
     ]
@@ -177,6 +181,22 @@
             },
             {
                 "name":
+                    "front_door",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "service_disconnected_from_mqtt",
+                        "expr": "mqtt_connected < 1"
+                    },
+                    {
+                        "alert": "esp_not_connected_to_mqtt",
+                        "expr": "hw_connected < 1",
+                    },
+                ]
+            },
+            {
+                "name":
                     "alerts",
                 "rules": [
                     {
@@ -204,7 +224,7 @@
                         "labels": {
                             "severity": "warning"
                         },
-                        "expr": 'disk_free{path="/"} < 20G',
+                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
                     },
                     {
                         "alert": "zpool_space_low",
--- a/config/scrape_main.yaml	Fri Sep 01 23:19:56 2023 -0700
+++ b/config/scrape_main.yaml	Mon Oct 09 18:50:36 2023 -0700
@@ -109,8 +109,9 @@
       - {if: '{job="pomerium-proxy",    __port_number="8080"}', action: drop}
       - {if: '{job="pomerium-proxy",    __port_number="8443"}', action: drop}
       - {if: '{job="video-files",       __port_number="8003"}', action: drop}
+      - {if: '{job=~"cm-acme-.*"}', action: drop}
 
-      # Assume all 8001/8002 port discoveries are redundant with an nginnx proxy
+      # Assume all 8001/8002 port discoveries are redundant with an nginx proxy
       - {if: '{__port_number="8001"}', action: drop}
       - {if: '{__port_number="8002"}', action: drop}
       
--- a/tasks.py	Fri Sep 01 23:19:56 2023 -0700
+++ b/tasks.py	Mon Oct 09 18:50:36 2023 -0700
@@ -15,6 +15,13 @@
 
     replaceCmap("victoriametrics-config", {"scrape_main": configObj, "rules": rulesObj})
 
+    # NOTE: these refreshes don't report rule-format errors; on a bad rules file
+    # the old rules are silently kept. Use `skaffold run` to see the errors.
+    #
+    # or run with these validation flags on (presumably vmalert's; both default to true):
+    #  validateTemplates   = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
+    #  validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
+
     refreshPodCmaps(firstPodName("app=victoriametrics"))
     refreshPodCmaps(firstPodName("app=vmalert"))