"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json


def pomRules():
    return [
        {
            "alert": "frequent_upstream_connect_failures",
19 "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0" |
        },
        {
            "alert": "high_logging_pomerium",
            "for": "3h",
            "labels": {
                "severity": "waste"
            },
            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[1h])) > 50k',
            "annotations": {
                "summary": "high log output rate"
            },
        },
    ]


def k8sRules():
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "metricsTargetMissing",
            "expr": 'up{job!~"cm-acme-.*"} == 0',
            'for': '10m',
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "metrics target missing (instance {{ $labels.instance }})",
                "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
            "annotations": {
                "description": '',
                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
            },
            "for": "10m",
        },
    ]

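# Groups in the Prometheus/vmalert rules-file layout. hostsExpectedOnline()
# (defined at the bottom of this file) contributes additional generated groups,
# which allRules() merges into the same list.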
def allRules(ctx):
    return {
        "groups": [
            {
                "name": "k8s",
137 "interval": "1m", |
23 | 138 "rules": k8sRules(), |
139 }, | |
140 { | |
49 | 141 "name": "pomerium_proxy", |
142 "interval": "1m", | |
143 "rules": pomRules(), | |
144 }, | |
145 { | |
31 | 146 "name": |
147 "Outages", | |
148 "interval": | |
149 "1m", | |
23 | 150 "rules": [ |
151 { | |
152 "alert": "powereagleStalled", | |
153 "expr": "rate(house_power_w[100m]) == 0", | |
154 "for": "0m", | |
31 | 155 "labels": { |
156 "severity": "losingData" | |
157 }, | |
23 | 158 "annotations": { |
159 "summary": "power eagle data stalled", | |
160 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", | |
161 }, | |
162 }, | |
163 { | |
164 "alert": "powereagleAbsent", | |
165 "expr": "absent_over_time(house_power_w[5m])", | |
166 "for": "2m", | |
31 | 167 "labels": { |
168 "severity": "losingData" | |
169 }, | |
23 | 170 "annotations": { |
171 "summary": "power eagle data missing", | |
172 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", | |
173 }, | |
174 }, | |
175 { | |
176 "alert": "absent_zigbee", | |
177 "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', | |
81 | 178 "for": "10m", |
23 | 179 }, |
180 { | |
181 "alert": "net_routes_sync", | |
182 "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', | |
183 "for": "10m", | |
31 | 184 "labels": { |
185 "severity": "houseUsersAffected" | |
186 }, | |
23 | 187 "annotations": { |
80 | 188 "summary": "net_routes is not getting regular updates from net_routes_input", |
23 | 189 }, |
190 }, | |
191 ], | |
192 }, | |
193 { | |
48 | 194 "name": "disk_errs", |
195 "interval": "2d", | |
31 | 196 "rules": [{ |
36 | 197 "alert": "zpool_device_error_increase", |
198 "labels": { | |
199 "severity": "warning" | |
200 }, | |
201 "expr": 'increase(zpool_device_error_count[3d]) > 0', | |
202 }, { | |
31 | 203 "alert": "zpool_device_error_count", |
204 "labels": { | |
205 "severity": "warning" | |
                    },
                    "expr": 'zpool_device_error_count > 0',
                }],
            },
            {
                "name": "lighting",
                "interval": "5m",
                "rules": [{
                    "alert": "light_bridge_no_mqtt",
                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
                }],
            },
            {
                "name": "front_door",
                "interval": "5m",
                "rules": [
                    {
                        "alert": "front_door_reader_esp32_no_mqtt",
                        'expr': 'hw_connected{job="fingerprint"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                    {
                        "alert": "front_door_reader_svc_down",
                        'expr': 'up{job="fingerprint"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                    {
                        "alert": "front_door_reader_svc_reader_no_mqtt",
                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                    {
                        "alert": "front_door_lock_svc_down",
                        'expr': 'up{job="front-door-lock"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                    {
                        "alert": "front_door_lock_svc_no_mqtt",
                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                    {
                        "alert": "front_door_lock_esp32_no_mqtt",
                        'expr': 'hw_connected{job="front-door-lock"} < 1',
                        "annotations": {
                            "summary": "see https://bigasterisk.com/front-door-lock/"
                        },
                    },
                ],
            },
            {
                "name": "net_routes",
                "interval": "5m",
                "rules": [
                    {
                        "alert": "no_house_ip_service",
                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
                    },
                    {
                        "alert": "no_net_routes_running",
                        "expr": 'absent(python_info{job="net-routes"})'
                    },
                    {
                        "alert": "allowed_check_never_returned_200",
                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
                    },
                    {
                        "alert": "allowed_check_never_returned_403",
                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
                    },
                    {
                        'alert': 'net_route_input_eval_cal_loop_is_down',
                        'expr': 'eval_cal_up!=1'
                    },
                    {
                        'alert': 'net_route_input_mongo_loop_is_down',
                        'expr': 'mongo_to_net_routes_up!=1'
                    },
                    {
                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
                    },
                    {
                        'alert': 'gcalendarwatch_current_events_loop_is_down',
                        'expr': 'current_events_up != 1'
                    },
                ],
            },
            {
                "name": "http",
                "interval": "1h",
                'rules': [
                    {
                        'alert': 'old_https_certs',
                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
                    },
                    {
                        'alert': 'high_500_response_rate',
                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
                    },
                ],
            },
            {
                "name": "ping",
                "interval": "1m",
                "rules": [{
                    "alert": "ping_failed",
                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
                    "for": "10m",
                }]
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {
                            "severity": "warning"
                        },
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
345 "for": "1h", |
31 | 346 "labels": { |
347 "severity": "waste" | |
348 }, | |
23 | 349 "expr": "house_power_w > 4000", |
31 | 350 "annotations": { |
351 "summary": "house power usage over 4KW" | |
352 }, | |
23 | 353 }, |
354 { | |
355 "alert": "host_root_fs_space_low", | |
356 "for": "20m", | |
31 | 357 "labels": { |
358 "severity": "warning" | |
359 }, | |
82 | 360 "expr": 'disk_used_percent{path="/"} > 85', |
23 | 361 }, |
362 { | |
363 "alert": "zpool_space_low", | |
364 "for": "20m", | |
31 | 365 "labels": { |
366 "severity": "warning" | |
367 }, | |
23 | 368 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', |
369 }, | |
370 { | |
371 "alert": "disk_week_incr", | |
372 "for": "20m", | |
31 | 373 "labels": { |
374 "severity": "warning" | |
375 }, | |
23 | 376 "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', |
31 | 377 "annotations": { |
378 "summary": "high mb/week on zfs dir" | |
379 }, | |
23 | 380 }, |
381 { | |
382 "alert": "high_logging", | |
31 | 383 "for": "3h", |
384 "labels": { | |
385 "severity": "waste" | |
386 }, | |
387 "expr": 'sum by (namespace, pod, container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 10k', |
31 | 388 "annotations": { |
389 "summary": "high log output rate" | |
390 }, | |
23 | 391 }, |
392 { | |
393 "alert": "stale_process", | |
394 "for": "1d", | |
31 | 395 "labels": { |
396 "severity": "dataRisk" | |
397 }, | |
23 | 398 "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", |
31 | 399 "annotations": { |
400 "summary": "process time is old" | |
401 }, | |
23 | 402 }, |
403 { | |
404 "alert": "starlette", | |
405 "for": "1m", | |
31 | 406 "labels": { |
407 "severity": "fix" | |
408 }, | |
23 | 409 "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', |
31 | 410 "annotations": { |
411 "summary": "set starlette app name" | |
412 }, | |
23 | 413 }, |
414 { | |
415 "alert": "ssl_certs_expiring_soon", | |
416 "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", | |
31 | 417 "labels": { |
418 "severity": "warning" | |
419 }, | |
23 | 420 "annotations": { |
421 "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" | |
422 }, | |
423 }, | |
424 ], | |
425 }, | |
        ] + hostsExpectedOnline(ctx)['groups']
    }


def _runJson(ctx, cmd):
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
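
# A minimal sketch of how allRules() could feed the `pdm run invoke push-config`
# step mentioned in the docstring (the real task presumably lives elsewhere in
# the repo; the output path and reload step here are assumptions):
#
#   from invoke import task
#
#   @task
#   def push_config(ctx):
#       rules = allRules(ctx)
#       with open("build/alert_rules.json", "w") as f:
#           json.dump(rules, f, indent=2)
#       # ...then reload/apply the generated config in the cluster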