alert_rules.py @ 39:84a00d1b87b3 (maildir-count target)
author:   drewp@bigasterisk.com
date:     Tue, 09 Jan 2024 19:56:22 -0800
parents:  6e27d280b598
children: 24daba56722c
"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""
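# Note on the "missing metrics" gotcha linked above: an `up{...} == 0` rule
# only produces a value for targets that are still present in the scrape
# config / service discovery. A series that disappears entirely matches
# nothing, which is why some rules below also use absent() /
# absent_over_time() (e.g. absent_over_time(house_power_w[5m])) to catch
# data that simply stops arriving.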

import json


def k8sRules():
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "metricsTargetMissing",
            "expr": 'up{job!~"cm-acme-.*"} == 0',
            "for": "10m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "metrics target missing (instance {{ $labels.instance }})",
                "description": "A metrics target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
            "annotations": {
                "description": "",
                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
            },
            "for": "2m",
        },
    ]

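# allRules() assembles every group above plus two generated ones:
# expectedK8sNodesGroup() (an absent() rule per current k8s node, minus
# optionalHosts) and hostsExpectedOnline() (groups emitted by the lanscape
# hosts_expected_online.py script). To eyeball the merged output, given an
# invoke context ctx (illustrative only, not called anywhere in this file):
#
#     print(json.dumps(allRules(ctx), indent=2))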
def allRules(ctx):
    return {
        "groups": [
            {
                "name": "k8s",
                "interval": "1m",
                "rules": k8sRules(),
            },
            expectedK8sNodesGroup(ctx),
            #
            # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
            {
                "name": "Outages",
                "interval": "1m",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {"severity": "houseUsersAffected"},
                        "annotations": {"summary": "net_routes is not getting regular updates"},
                    },
                ],
            },
            {
                "name": "disk_errs",
                "interval": "2d",
                "rules": [
                    {
                        "alert": "zpool_device_error_increase",
                        "labels": {"severity": "warning"},
                        "expr": 'increase(zpool_device_error_count[3d]) > 0',
                    },
                    {
                        "alert": "zpool_device_error_count",
                        "labels": {"severity": "warning"},
                        "expr": 'zpool_device_error_count > 0',
                    },
                ],
            },
            {
                "name": "front_door",
                "interval": "5m",
                "rules": [
                    {
                        "alert": "front_door_reader_esp32_no_mqtt",
                        "expr": 'hw_connected{job="fingerprint"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                    {
                        "alert": "front_door_reader_svc_down",
                        "expr": 'up{job="fingerprint"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                    {
                        "alert": "front_door_reader_svc_no_mqtt",
                        "expr": 'mqtt_connected{job="fingerprint"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                    {
                        "alert": "front_door_lock_svc_down",
                        "expr": 'up{job="front-door-lock"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                    {
                        "alert": "front_door_lock_svc_no_mqtt",
                        "expr": 'mqtt_connected{job="front-door-lock"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                    {
                        "alert": "front_door_lock_esp32_no_mqtt",
                        "expr": 'hw_connected{job="front-door-lock"} < 1',
                        "annotations": {"summary": "see https://bigasterisk.com/front-door-lock/"},
                    },
                ],
            },
            {
                "name": "https_certs",
                "interval": "1h",
                "rules": [
                    {
                        "alert": "old_https_certs",
                        "expr": 'min by (source) (x509_cert_enddate - now())/86400 < 45',
                    },
                ],
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {"severity": "warning"},
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "1h",
                        "labels": {"severity": "waste"},
                        "expr": "house_power_w > 4000",
                        "annotations": {"summary": "house power usage over 4KW"},
                    },
                    {
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {"summary": "high mb/week on zfs dir"},
                    },
                    {
                        "alert": "high_logging",
                        "for": "3h",
                        "labels": {"severity": "waste"},
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {"summary": "high log output rate"},
                    },
                    {
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {"severity": "dataRisk"},
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {"summary": "process time is old"},
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {"severity": "fix"},
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {"summary": "set starlette app name"},
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {"severity": "warning"},
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ] + hostsExpectedOnline(ctx)["groups"]
    }


def _runJson(ctx, cmd):
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodesGroup(ctx):
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    # nodes in optionalHosts get no absent() alert below
    optionalHosts = {"slash"}
    return {
        "name": "k8s_expected_nodes",
        "rules": [{
            "alert": "kube_node_log_size_report_" + h,
            "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % h,
            "for": "1h",
            "annotations": {"summary": f"no recent k8s log size report from host {h}"},
        } for h in hosts if h not in optionalHosts],
    }
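

# Sketch of the consuming side, for context. Only the task name comes from the
# docstring's `pdm run invoke push-config`; the actual tasks.py is not shown
# here, and the output path and body below are illustrative assumptions:
#
#     import json
#     from invoke import task
#     import alert_rules
#
#     @task
#     def push_config(ctx):
#         with open("build/alert_rules.json", "w") as f:
#             json.dump(alert_rules.allRules(ctx), f, indent=2)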