changeset 23:ffa013944200

mv
author drewp@bigasterisk.com
date Sat, 24 Jun 2023 23:02:17 -0700
parents cd115f1ca2a8
children d7aa409ebf78
files alert_rules.py rules/rules_k8s.yaml rules/rules_main.yaml
diffstat 3 files changed, 254 insertions(+), 174 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alert_rules.py	Sat Jun 24 23:02:17 2023 -0700
@@ -0,0 +1,254 @@
+"""
+pdm run invoke push-config
+
+docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+"Whenever the alert expression results in one or more vector
+elements at a given point in time, the alert counts as active for
+these elements' label sets."
+also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+
+"""
+
+import json
+
+import yaml
+
+
+def k8sRules():
+    # from https://awesome-prometheus-alerts.grep.to/rules.html
+    return [
+        {
+            "alert": "PrometheusTargetMissing",
+            "expr": "up == 0",
+            "for": "0m",
+            "labels": {"severity": "critical"},
+            "annotations": {
+                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
+                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesMemoryPressure",
+            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {"severity": "critical"},
+            "annotations": {
+                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesDiskPressure",
+            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {"severity": "critical"},
+            "annotations": {
+                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesOutOfDisk",
+            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
+            "for": "2m",
+            "labels": {"severity": "critical"},
+            "annotations": {
+                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesJobFailed",
+            "expr": "kube_job_status_failed > 0",
+            "for": "0m",
+            "labels": {"severity": "warning"},
+            "annotations": {
+                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
+                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesPodCrashLooping",
+            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
+            "for": "2m",
+            "labels": {"severity": "warning"},
+            "annotations": {
+                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
+                "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesClientCertificateExpiresNextWeek",
+            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
+            "for": "0m",
+            "labels": {"severity": "warning"},
+            "annotations": {
+                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
+                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "container_waiting",
+            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
+            "for": "2m",
+        },
+    ]
+
+
+def allRules():
+    return {
+        "groups": [
+            {
+                "name": "k8s",
+                "rules": k8sRules(),
+            },
+            #
+            # Any presence of a sample like
+            # starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09
+            # means someone forgot to set the app name.
+            {
+                "name": "Outages",
+                "rules": [
+                    {
+                        "alert": "powereagleStalled",
+                        "expr": "rate(house_power_w[100m]) == 0",
+                        "for": "0m",
+                        "labels": {"severity": "losingData"},
+                        "annotations": {
+                            "summary": "power eagle data stalled",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "powereagleAbsent",
+                        "expr": "absent_over_time(house_power_w[5m])",
+                        "for": "2m",
+                        "labels": {"severity": "losingData"},
+                        "annotations": {
+                            "summary": "power eagle data missing",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "absent_zigbee",
+                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
+                    },
+                    {
+                        "alert": "net_routes_sync",
+                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
+                        "for": "10m",
+                        "labels": {"severity": "houseUsersAffected"},
+                        "annotations": {
+                            "summary": "net_routes is not getting regular updates"
+                        },
+                    },
+                ],
+            },
+            {
+                "name": "alerts",
+                "rules": [
+                    {
+                        "alert": "kube_node_status_bad_condition",
+                        "for": "2h",
+                        "labels": {"severity": "warning"},
+                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
+                    },
+                    {
+                        "alert": "housePower",
+                        "for": "24h",
+                        "labels": {"severity": "waste"},
+                        "expr": "house_power_w > 4000",
+                        "annotations": {"summary": "house power usage over 4 kW"},
+                    },
+                    {
+                        "alert": "host_root_fs_space_low",
+                        "for": "20m",
+                        "labels": {"severity": "warning"},
+                        "expr": 'disk_free{path="/"} < 20G',
+                        "annotations": {"summary": "low disk_free"},
+                    },
+                    {
+                        "alert": "zpool_space_low",
+                        "for": "20m",
+                        "labels": {"severity": "warning"},
+                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
+                        "annotations": {"summary": "low disk_free"},
+                    },
+                    {
+                        "alert": "zpool_device_error_count",
+                        "for": "20m",
+                        "labels": {"severity": "warning"},
+                        "expr": 'increase(zpool_device_error_count[1d]) > 0',
+                        "annotations": {"summary": "zpool device errors increasing"},
+                    },
+                    {
+                        "alert": "disk_week_incr",
+                        "for": "20m",
+                        "labels": {"severity": "warning"},
+                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
+                        "annotations": {"summary": "high mb/week on zfs dir"},
+                    },
+                    {
+                        "alert": "high_logging",
+                        "for": "20m",
+                        "labels": {"severity": "waste"},
+                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
+                        "annotations": {"summary": "high log output rate"},
+                    },
+                    {
+                        "alert": "stale_process",
+                        "for": "1d",
+                        "labels": {"severity": "dataRisk"},
+                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
+                        "annotations": {"summary": "process time is old"},
+                    },
+                    {
+                        "alert": "starlette",
+                        "for": "1m",
+                        "labels": {"severity": "fix"},
+                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
+                        "annotations": {"summary": "set starlette app name"},
+                    },
+                    {
+                        "alert": "ssl_certs_expiring_soon",
+                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
+                        "labels": {"severity": "warning"},
+                        "annotations": {
+                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
+                        },
+                    },
+                ],
+            },
+        ]
+    }
+
+
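+# A hedged sketch, not part of the original changeset: the `yaml` import above
+# is otherwise unused in this file, which suggests these rule dicts get
+# serialized into a Prometheus-format rules file somewhere in the
+# `pdm run invoke push-config` flow. A minimal writer could look like this;
+# the function name is an illustrative assumption.
+def writeRulesFileSketch(rulesDict, outPath):
+    """Illustrative only: dump a rules dict as Prometheus-format YAML."""
+    with open(outPath, "w") as f:
+        yaml.safe_dump(rulesDict, f, sort_keys=False)
+
+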
+def _runJson(ctx, cmd):
+    return json.loads(ctx.run(cmd, hide="stdout").stdout)
+
+
+def hostsExpectedOnline(ctx):
+    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
+
+
+def expectedK8sNodes(ctx):
+    getNode = _runJson(ctx, "kubectl get node -o json")
+    hosts = [item["metadata"]["name"] for item in getNode["items"]]
+    optionalHosts = {"slash"}
+    return {
+        "groups": [
+            {
+                "name": "k8s_expected_nodes",
+                "rules": [
+                    {
+                        "alert": "kube_node_log_size_report_" + h,
+                        "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
+                        % h,
+                        "for": "1h",
+                        "annotations": {
+                            "summary": f"no recent k8s log size report from host {h}"
+                        },
+                    }
+                    for h in hosts if h not in optionalHosts
+                ],
+            }
+        ]
+    }
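+
+
+# A hedged usage sketch (an assumption, not the project's actual tasks.py):
+# `pdm run invoke push-config` presumably combines the static groups from
+# allRules() with the kubectl-derived group from expectedK8sNodes(ctx) before
+# pushing them. The function below, its output path, and the promtool check
+# are illustrative only.
+def pushConfigSketch(ctx, outPath="build/rules.yaml"):
+    rules = allRules()
+    rules["groups"].extend(expectedK8sNodes(ctx)["groups"])
+    writeRulesFileSketch(rules, outPath)
+    # Optional sanity check, assuming promtool is available on PATH.
+    ctx.run(f"promtool check rules {outPath}")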
--- a/rules/rules_k8s.yaml	Sat Jun 24 23:02:04 2023 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-groups: 
-  - name: k8s
-    rules:
-      # from https://awesome-prometheus-alerts.grep.to/rules.html
-      - alert: PrometheusTargetMissing
-        expr: up == 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Prometheus target missing (instance {{ $labels.instance }})
-          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesMemoryPressure
-        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesDiskPressure
-        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesOutOfDisk
-        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes out of disk (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesJobFailed
-        expr: kube_job_status_failed > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Job failed (instance {{ $labels.instance }})
-          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPodCrashLooping
-        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
-          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesClientCertificateExpiresNextWeek
-        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
-          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: container_waiting
-        expr: sum by (container)(kube_pod_container_status_waiting!=0)
-        for: 2m
--- a/rules/rules_main.yaml	Sat Jun 24 23:02:04 2023 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-groups:
-  # docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-  # "Whenever the alert expression results in one or more vector
-  # elements at a given point in time, the alert counts as active for
-  # these elements' label sets."
-
-  # also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-  #
-  # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
-
-  # - name: webcam
-  #   rules:
-    # waiting for twinscam revival
-      # - alert: twinscam_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
-
-      # - alert: cam_garagehall_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      # #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
-
-      # - alert: cam_pipeline_stopped
-      #   expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
-      #   for: 10m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
-
-      # - alert: cam_not_advancing
-      #   expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
-      #   for: 10m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "cam output bytes is advancing too slowly. {{ $labels }}"
-
-      # - alert: webcam_indexer_stalled
-      #   expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
-      #   for: 10m
-      #   labels:
-      #     severity: webcamUsersAffected
-      #   annotations:
-      #     summary: "webcam indexer update loop is stalled"
-
-  - name: Outages
-    rules:
-      - alert: powereagleStalled
-        expr: rate(house_power_w[100m]) == 0
-        for: 0m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data stalled"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      - alert: powereagleAbsent
-        expr: absent_over_time(house_power_w[5m])
-        for: 2m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data missing"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      # - alert: absent_mitmproxy
-      #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
-      #   labels:
-      #     severity: houseUsersAffected
-      #   annotations:
-      #     summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
-
-      - alert: absent_zigbee_dash
-        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
-
-      - alert: net_routes_sync
-        expr: rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70
-        for: 10m
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "net_routes is not getting regular updates"
-
-      
-  - name: alerts
-    rules:
-      - {alert: housePower,     for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
-      - {alert: disk1,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk2,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk3,          for: 20m, labels: {severity: warning}, expr: '1 > 2',                                                                               annotations: {summary: "unused"}}
-      - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000',          annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
-        # - {alert: oom,        for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
-      - {alert: high_logging,   for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k',     annotations: {summary: "high log output rate {{ $labels }}"}}
-      - {alert: stale_process,  for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
-      - {alert: starlette,      for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                    annotations: {summary: "set starlette app name {{ $labels }}"}}
-      - alert: ssl_certs_expiring_soon
-        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
-        labels:
-          severity: warning
-        annotations:
-          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-# williamperttula.com needs to serve real cert, not self-signed