diff rules/rules_main.yaml @ 12:b6720e379d5b

config updates
author drewp@bigasterisk.com
date Tue, 14 Mar 2023 20:04:06 -0700
parents 17db5e8e7a2f
children 89a351ec7abf
line wrap: on
line diff
--- a/rules/rules_main.yaml	Mon Jan 16 01:05:31 2023 -0800
+++ b/rules/rules_main.yaml	Tue Mar 14 20:04:06 2023 -0700
@@ -71,13 +71,6 @@
           summary: "power eagle data missing"
           description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
 
-      # - alert: wifi_scrape_errors
-      #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
-      #   labels:
-      #     severity: houseUsersAffected
-      #   annotations:
-      #     summary: "errors getting wifi users list"
-
       # - alert: absent_mitmproxy
       #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
       #   labels:
@@ -99,14 +92,15 @@
       
   - name: alerts
     rules:
-      - {alert: housePower,   for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
-      - {alert: disk1,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk2,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk3,        for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500',           annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
-      - {alert: oom,          for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
-      - {alert: high_logging, for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
-      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
-      - {alert: starlette,    for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                       annotations: {summary: "set starlette app name {{ $labels }}"}}
+      - {alert: housePower,     for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 4KW {{ $labels }}"}}
+      - {alert: disk1,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3,          for: 20m, labels: {severity: warning}, expr: '1 > 2',                                                                               annotations: {summary: "unused"}}
+      - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000',          annotations: {summary: "high mb/day on zfs dir {{ $labels }}"}}
+        # - {alert: oom,        for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging,   for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k',     annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process,  for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette,      for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                    annotations: {summary: "set starlette app name {{ $labels }}"}}
       - alert: ssl_certs_expiring_soon
         expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
         labels: