comparison alert_rules.py @ 67:adde35eb4773

collapse ./next to ./
author drewp@bigasterisk.com
date Fri, 03 May 2024 11:21:08 -0700
parents next/alert_rules.py@8134cd480817
children 009527a145d0
comparison
equal deleted inserted replaced
66:429bfd62e6ba 67:adde35eb4773
1 """
2 pdm run invoke push-config
3
4 docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
5 "Whenever the alert expression results in one or more vector
6 elements at a given point in time, the alert counts as active for
7 these elements' label sets."
8 also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
9
10 """
11
12 import json
13
14
def pomRules():
    """Return the list of alert rule dicts used for the pomerium proxy group.

    A fresh list is built on every call. Each rule is a plain dict with an
    "alert" name and an "expr", plus optional "for", "labels" and
    "annotations" entries.
    """
    upstreamConnectFailures = {
        "alert": "frequent_upstream_connect_failures",
        "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0",
    }

    # NOTE(review): the `8k` suffix is not plain PromQL; presumably these
    # rules are evaluated by a MetricsQL-compatible engine -- confirm.
    pomeriumLogVolume = {
        "alert": "high_logging_pomerium",
        "for": "3h",
        "labels": {
            "severity": "waste",
        },
        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
        "annotations": {
            "summary": "high log output rate",
        },
    }

    rules = []
    rules.append(upstreamConnectFailures)
    rules.append(pomeriumLogVolume)
    return rules
33
34
def k8sRules():
    """Return the list of alert rule dicts used for the "k8s" group.

    Most of these rules come from
    https://awesome-prometheus-alerts.grep.to/rules.html

    A fresh list of plain dicts is built on every call.
    """

    def nodeCondition(condition, phrase):
        # The node-condition rules differ only in the condition name and in
        # the lower-case wording used in the summary.
        return {
            "alert": "Kubernetes" + condition,
            "expr": 'kube_node_status_condition{condition="' + condition + '",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical",
            },
            "annotations": {
                "summary": "Kubernetes " + phrase + " (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has " + condition + " condition\n VALUE = {{ $value }}",
            },
        }

    targetMissing = {
        "alert": "metricsTargetMissing",
        "expr": 'up{job!~"cm-acme-.*"} == 0',
        "for": "10m",
        "labels": {
            "severity": "critical",
        },
        "annotations": {
            "summary": "metrics target missing (instance {{ $labels.instance }})",
            "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
        },
    }

    jobFailed = {
        "alert": "KubernetesJobFailed",
        "expr": "kube_job_status_failed > 0",
        "labels": {
            "severity": "warning",
        },
        "annotations": {
            "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
            "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
        },
    }

    crashLooping = {
        "alert": "KubernetesPodCrashLooping",
        "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
        "for": "2m",
        "labels": {
            "severity": "warning",
        },
        "annotations": {
            "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
            "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
        },
    }

    clientCertExpiring = {
        "alert": "KubernetesClientCertificateExpiresNextWeek",
        "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
        "labels": {
            "severity": "warning",
        },
        "annotations": {
            "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
            "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
        },
    }

    # Unlike the rules above, this one has no labels and lists "for" last.
    containerWaiting = {
        "alert": "container_waiting",
        "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
        "annotations": {
            "description": "",
            "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
        },
        "for": "2m",
    }

    return [
        targetMissing,
        nodeCondition("MemoryPressure", "memory pressure"),
        nodeCondition("DiskPressure", "disk pressure"),
        nodeCondition("OutOfDisk", "out of disk"),
        jobFailed,
        crashLooping,
        clientCertExpiring,
        containerWaiting,
    ]
130
131
def _outageRules():
    """Rules for the "Outages" group: stalled or missing data feeds."""
    powerEagleLogs = "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
    return [
        {
            "alert": "powereagleStalled",
            "expr": "rate(house_power_w[100m]) == 0",
            "for": "0m",
            "labels": {
                "severity": "losingData"
            },
            "annotations": {
                "summary": "power eagle data stalled",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "powereagleAbsent",
            "expr": "absent_over_time(house_power_w[5m])",
            "for": "2m",
            "labels": {
                "severity": "losingData"
            },
            "annotations": {
                "summary": "power eagle data missing",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "absent_zigbee",
            "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
        },
        {
            "alert": "net_routes_sync",
            "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
            "for": "10m",
            "labels": {
                "severity": "houseUsersAffected"
            },
            "annotations": {
                "summary": "net_routes is not getting regular updates"
            },
        },
    ]


def _diskErrRules():
    """Rules for the "disk_errs" group: zpool device error counters."""
    return [
        {
            "alert": "zpool_device_error_increase",
            "labels": {
                "severity": "warning"
            },
            "expr": 'increase(zpool_device_error_count[3d]) > 0',
        },
        {
            "alert": "zpool_device_error_count",
            "labels": {
                "severity": "warning"
            },
            "expr": 'zpool_device_error_count > 0',
        },
    ]


def _lightingRules():
    """Rules for the "lighting" group."""
    return [{
        "alert": "light_bridge_no_mqtt",
        "expr": 'mqtt_connected{job="light-bridge"} != 1',
    }]


def _frontDoorRules():
    """Rules for the "front_door" group; every rule carries the same summary link."""
    checks = [
        ("front_door_reader_esp32_no_mqtt", 'hw_connected{job="fingerprint"} < 1'),
        ("front_door_reader_svc_down", 'up{job="fingerprint"} < 1'),
        ("front_door_reader_svc_reader_no_mqtt", 'mqtt_connected{job="fingerprint"} < 1'),
        ("front_door_lock_svc_down", 'up{job="front-door-lock"} < 1'),
        ("front_door_lock_svc_no_mqtt", 'mqtt_connected{job="front-door-lock"} < 1'),
        ("front_door_lock_esp32_no_mqtt", 'hw_connected{job="front-door-lock"} < 1'),
    ]
    # The annotations dict is created inside the comprehension so each rule
    # owns its own copy, as in a hand-written literal.
    return [{
        "alert": alert,
        "expr": expr,
        "annotations": {
            "summary": "see https://bigasterisk.com/front-door-lock/"
        },
    } for alert, expr in checks]


def _netRoutesRules():
    """Rules for the "net_routes" group; each is just an alert name and an expr."""
    checks = [
        ("no_house_ip_service", 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'),
        ("no_net_routes_running", 'absent(python_info{job="net-routes"})'),
        ("allowed_check_never_returned_200", 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'),
        ("allowed_check_never_returned_403", 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'),
        ("net_route_input_eval_cal_loop_is_down", 'eval_cal_up!=1'),
        ("net_route_input_mongo_loop_is_down", 'mongo_to_net_routes_up!=1'),
        ("gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests", 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'),
        ("gcalendarwatch_current_events_loop_is_down", 'current_events_up != 1'),
    ]
    return [{"alert": alert, "expr": expr} for alert, expr in checks]


def _httpRules():
    """Rules for the "http" group: certificate age and 5xx response rate."""
    return [
        {
            "alert": "old_https_certs",
            "expr": 'min by (source) (x509_cert_enddate - now())/86400 < 15',
        },
        {
            "alert": "high_500_response_rate",
            "expr": 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
        },
    ]


def _pingRules():
    """Rules for the "ping" group."""
    return [{
        "alert": "ping_failed",
        "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
    }]


def _generalRules():
    """Rules for the catch-all "alerts" group."""
    # NOTE(review): suffixes such as 20G / 100G / 1M / 4k are not plain
    # PromQL; presumably a MetricsQL-compatible engine evaluates these -- confirm.
    return [
        {
            "alert": "kube_node_status_bad_condition",
            "for": "2h",
            "labels": {
                "severity": "warning"
            },
            "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
        },
        {
            "alert": "housePower",
            "for": "1h",
            "labels": {
                "severity": "waste"
            },
            "expr": "house_power_w > 4000",
            "annotations": {
                "summary": "house power usage over 4KW"
            },
        },
        {
            "alert": "host_root_fs_space_low",
            "for": "20m",
            "labels": {
                "severity": "warning"
            },
            "expr": 'disk_free{host!="garage",path="/"} < 20G',
        },
        {
            "alert": "zpool_space_low",
            "for": "20m",
            "labels": {
                "severity": "warning"
            },
            "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
        },
        {
            # NOTE(review): the expr looks at a 1d increase while the summary
            # says mb/week -- confirm which one is intended.
            "alert": "disk_week_incr",
            "for": "20m",
            "labels": {
                "severity": "warning"
            },
            "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
            "annotations": {
                "summary": "high mb/week on zfs dir"
            },
        },
        {
            "alert": "high_logging",
            "for": "3h",
            "labels": {
                "severity": "waste"
            },
            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
            "annotations": {
                "summary": "high log output rate"
            },
        },
        {
            "alert": "stale_process",
            "for": "1d",
            "labels": {
                "severity": "dataRisk"
            },
            "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
            "annotations": {
                "summary": "process time is old"
            },
        },
        {
            "alert": "starlette",
            "for": "1m",
            "labels": {
                "severity": "fix"
            },
            "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
            "annotations": {
                "summary": "set starlette app name"
            },
        },
        {
            "alert": "ssl_certs_expiring_soon",
            "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
            },
        },
    ]


def allRules(ctx):
    """Return the complete alerting config as ``{"groups": [...]}``.

    The statically defined groups come first, in a fixed order, followed by
    whatever groups ``hostsExpectedOnline(ctx)`` reports under its "groups"
    key.

    Args:
        ctx: passed through unchanged to ``hostsExpectedOnline``.

    Returns:
        A dict with a single "groups" key holding a list of group dicts. Each
        static group has "name", "rules" and (except for "alerts") "interval".
    """
    staticGroups = [
        {
            "name": "k8s",
            "interval": "1m",
            "rules": k8sRules(),
        },
        {
            "name": "pomerium_proxy",
            "interval": "1m",
            "rules": pomRules(),
        },
        {
            "name": "Outages",
            "interval": "1m",
            "rules": _outageRules(),
        },
        {
            "name": "disk_errs",
            "interval": "2d",
            "rules": _diskErrRules(),
        },
        {
            "name": "lighting",
            "interval": "5m",
            "rules": _lightingRules(),
        },
        {
            "name": "front_door",
            "interval": "5m",
            "rules": _frontDoorRules(),
        },
        {
            "name": "net_routes",
            "interval": "5m",
            "rules": _netRoutesRules(),
        },
        {
            "name": "http",
            "interval": "1h",
            "rules": _httpRules(),
        },
        {
            "name": "ping",
            "interval": "1m",
            "rules": _pingRules(),
        },
        {
            # This group sets no "interval" of its own.
            "name": "alerts",
            "rules": _generalRules(),
        },
    ]
    # The static groups are built before the external lookup runs, keeping
    # the same evaluation order as a single inline literal would have.
    return {"groups": staticGroups + hostsExpectedOnline(ctx)['groups']}
426
427
def _runJson(ctx, cmd):
    """Run `cmd` via `ctx.run` with stdout hidden and return that stdout parsed as JSON.

    `ctx` only needs a `run(cmd, hide=...)` method whose result has a `stdout`
    attribute (presumably an invoke Context -- confirm against the caller).
    """
    result = ctx.run(cmd, hide="stdout")
    captured = result.stdout
    return json.loads(captured)
430
431
def hostsExpectedOnline(ctx):
    """Run the lanscape `hosts_expected_online.py` script and return its JSON output.

    The command runs through `_runJson`, so the script's stdout must be JSON.
    The caller in this file reads a "groups" key from the result.
    """
    command = "cd /my/serv/lanscape; pdm run python hosts_expected_online.py"
    return _runJson(ctx, command)