changeset 28:e114edff93dc

More explicit evaluation intervals. Try to get a single day of notification out of a disk error count increase.
author drewp@bigasterisk.com
date Wed, 19 Jul 2023 21:17:22 -0700
parents eec015e90818
children a4c49fa01c9d
files alert_rules.py deploy_vmalert.yaml
diffstat 2 files changed, 16 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/alert_rules.py	Thu Jun 29 14:12:22 2023 -0700
+++ b/alert_rules.py	Wed Jul 19 21:17:22 2023 -0700
@@ -18,7 +18,6 @@
         {
             "alert": "PrometheusTargetMissing",
             "expr": "up == 0",
-            "for": "0m",
             "labels": {"severity": "critical"},
             "annotations": {
                 "summary": "Prometheus target missing (instance {{ $labels.instance }})",
@@ -58,7 +57,6 @@
         {
             "alert": "KubernetesJobFailed",
             "expr": "kube_job_status_failed > 0",
-            "for": "0m",
             "labels": {"severity": "warning"},
             "annotations": {
                 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
@@ -78,7 +76,6 @@
         {
             "alert": "KubernetesClientCertificateExpiresNextWeek",
             "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
-            "for": "0m",
             "labels": {"severity": "warning"},
             "annotations": {
                 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
@@ -98,12 +95,14 @@
         "groups": [
             {
                 "name": "k8s",
+                "interval": "1m",
                 "rules": k8sRules(),
             },
             #
             # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
             {
                 "name": "Outages",
+                "interval": "1m",
                 "rules": [
                     {
                         "alert": "powereagleStalled",
@@ -141,6 +140,17 @@
                 ],
             },
             {
+                "name": "disk_errs",
+                "interval": "2d",
+                "rules": [
+                    {
+                        "alert": "zpool_device_error_count",
+                        "labels": {"severity": "warning"},
+                        "expr": 'increase(zpool_device_error_count[3d]) > 0',
+                    },
+                ],
+            },
+            {
                 "name": "alerts",
                 "rules": [
                     {
@@ -151,7 +161,7 @@
                     },
                     {
                         "alert": "housePower",
-                        "for": "24h",
+                        "for": "1h",
                         "labels": {"severity": "waste"},
                         "expr": "house_power_w > 4000",
                         "annotations": {"summary": "house power usage over 4KW"},
@@ -169,12 +179,6 @@
                         "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                     },
                     {
-                        "alert": "zpool_device_error_count",
-                        "for": "20m",
-                        "labels": {"severity": "warning"},
-                        "expr": 'increase(zpool_device_error_count[2h]) > 0',
-                    },
-                    {
                         "alert": "disk_week_incr",
                         "for": "20m",
                         "labels": {"severity": "warning"},
--- a/deploy_vmalert.yaml	Thu Jun 29 14:12:22 2023 -0700
+++ b/deploy_vmalert.yaml	Wed Jul 19 21:17:22 2023 -0700
@@ -25,6 +25,8 @@
           args:
             - -configCheckInterval=5s
             - -datasource.url=http://victoriametrics/m/
+            - -datasource.queryStep=5m
+            - -evaluationInterval=1m
             - -external.url=https://bigasterisk.com/vmalert
             - -loggerLevel=INFO
             - -loggerTimezone=America/Los_Angeles