diff rules/rules_main.yaml @ 9:17db5e8e7a2f

big rules and scrape config updates
author drewp@bigasterisk.com
date Sun, 04 Dec 2022 02:08:08 -0800
parents config/rules_main.yaml@1eb6e6a2b9b6
children b6720e379d5b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rules/rules_main.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -0,0 +1,115 @@
+groups:
+  # docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+  # "Whenever the alert expression results in one or more vector
+  # elements at a given point in time, the alert counts as active for
+  # these elements' label sets."
+
+  # also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+  #
+  # Any series like starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} means someone forgot to set the app name.
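+  #
+  # A hedged example (assuming these apps use starlette_exporter): passing
+  # app_name to the middleware is what prevents the default app_name="starlette":
+  #   from starlette_exporter import PrometheusMiddleware, handle_metrics
+  #   app.add_middleware(PrometheusMiddleware, app_name="net_routes")  # example name
+  #   app.add_route("/metrics", handle_metrics)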
+
+  # - name: webcam
+  #   rules:
+    # waiting for twinscam revival
+      # - alert: twinscam_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_garagehall_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_pipeline_stopped
+      #   expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
+      #   for: 10m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
+
+      # - alert: cam_not_advancing
+      #   expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
+      #   for: 10m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "cam output bytes is advancing too slowly. {{ $labels }}"
+
+      # - alert: webcam_indexer_stalled
+      #   expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
+      #   for: 10m
+      #   labels:
+      #     severity: webcamUsersAffected
+      #   annotations:
+      #     summary: "webcam indexer update loop is stalled"
+
+  - name: Outages
+    rules:
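+      # Two complementary checks on the power meter data: powereagleStalled
+      # catches a series that is still being scraped but no longer changing
+      # (rate over 100m is exactly 0), while powereagleAbsent catches the
+      # series disappearing from the scrape entirely.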
+      - alert: powereagleStalled
+        expr: rate(house_power_w[100m]) == 0
+        for: 0m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data stalled"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: powereagleAbsent
+        expr: absent_over_time(house_power_w[5m])
+        for: 2m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data missing"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      # - alert: wifi_scrape_errors
+      #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "errors getting wifi users list"
+
+      # - alert: absent_mitmproxy
+      #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
+
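+      # Fires when container_last_seen (from cAdvisor/kubelet) stops being
+      # reported for the zigbee2mqtt-dash container; no for: delay, so it
+      # alerts on the first evaluation that finds the series absent.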
+      - alert: absent_zigbee_dash
+        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
+
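+      # 1/70 per second: alert if /routes is being hit less than about once
+      # every 70 seconds, averaged over 5m.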
+      - alert: net_routes_sync
+        expr: rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70
+        for: 10m
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "net_routes is not getting regular updates"
+
+  - name: alerts
+    rules:
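+      # Note: shorthand suffixes like 20G, 100G and 1M in the expressions below
+      # parse under VictoriaMetrics' MetricsQL (assumed to be the evaluator
+      # here); vanilla Prometheus PromQL has no such unit suffixes.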
+      - {alert: housePower,   for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 4KW {{ $labels }}"}}
+      - {alert: disk1,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3,        for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500',           annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+      - {alert: oom,          for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging, for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette,    for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                       annotations: {summary: "set starlette app name {{ $labels }}"}}
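+      # probe_ssl_earliest_cert_expiry (from blackbox_exporter probes) is an
+      # absolute unix timestamp; subtracting time() and dividing by 86400
+      # converts it to days-until-expiry, alerting when under 10 days.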
+      - alert: ssl_certs_expiring_soon
+        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
+        labels:
+          severity: warning
+        annotations:
+          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n  LABELS = {{ $labels }}"