Mercurial > code > home > repos > victoriametrics
comparison alert_rules.py @ 49:febc20caabcb
more alerts
author | drewp@bigasterisk.com |
---|---|
date | Sun, 10 Mar 2024 14:49:32 -0700 |
parents | daa0df13bf06 |
children | df44473de6a1 |
comparison
equal
deleted
inserted
replaced
48:daa0df13bf06 | 49:febc20caabcb |
---|---|
10 """ | 10 """ |
11 | 11 |
12 import json | 12 import json |
13 | 13 |
14 | 14 |
15 def pomRules(): | |
16 return [ | |
17 { | |
18 "alert": "frequent_upstream_connect_failures", | |
19 "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[6h]) > 0" | |
20 }, | |
21 { | |
22 "alert": "high_logging_pomerium", | |
23 "for": "3h", | |
24 "labels": { | |
25 "severity": "waste" | |
26 }, | |
27 "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k', | |
28 "annotations": { | |
29 "summary": "high log output rate" | |
30 }, | |
31 }, | |
32 ] | |
33 | |
34 | |
15 def k8sRules(): | 35 def k8sRules(): |
16 # from https://awesome-prometheus-alerts.grep.to/rules.html | 36 # from https://awesome-prometheus-alerts.grep.to/rules.html |
17 return [ | 37 return [ |
18 { | 38 { |
19 "alert": "metricsTargetMissing", | 39 "alert": "metricsTargetMissing", |
114 "groups": [ | 134 "groups": [ |
115 { | 135 { |
116 "name": "k8s", | 136 "name": "k8s", |
117 "interval": "1m", | 137 "interval": "1m", |
118 "rules": k8sRules(), | 138 "rules": k8sRules(), |
139 }, | |
140 { | |
141 "name": "pomerium_proxy", | |
142 "interval": "1m", | |
143 "rules": pomRules(), | |
119 }, | 144 }, |
120 { | 145 { |
121 "name": | 146 "name": |
122 "Outages", | 147 "Outages", |
123 "interval": | 148 "interval": |
180 }, | 205 }, |
181 "expr": 'zpool_device_error_count > 0', | 206 "expr": 'zpool_device_error_count > 0', |
182 }], | 207 }], |
183 }, | 208 }, |
184 { | 209 { |
210 "name": "lighting", | |
211 "interval": "5m", | |
212 "rules": [{ | |
213 "alert": "light_bridge_no_mqtt", | |
214 "expr": 'mqtt_connected{job="light-bridge"} != 1', | |
215 }], | |
216 }, | |
217 { | |
185 "name": | 218 "name": |
186 "front_door", | 219 "front_door", |
187 "interval": | 220 "interval": |
188 "5m", | 221 "5m", |
189 "rules": [ | 222 "rules": [ |
226 "alert": "front_door_lock_esp32_no_mqtt", | 259 "alert": "front_door_lock_esp32_no_mqtt", |
227 'expr': 'hw_connected{job="front-door-lock"} < 1', | 260 'expr': 'hw_connected{job="front-door-lock"} < 1', |
228 "annotations": { | 261 "annotations": { |
229 "summary": "see https://bigasterisk.com/front-door-lock/" | 262 "summary": "see https://bigasterisk.com/front-door-lock/" |
230 }, | 263 }, |
264 }, | |
265 ], | |
266 }, | |
267 { | |
268 "name": | |
269 "net_routes", | |
270 "interval": | |
271 "5m", | |
272 "rules": [ | |
273 { | |
274 "alert": "no_house_ip_service", | |
275 "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})' | |
276 }, | |
277 { | |
278 "alert": "no_net_routes_running", | |
279 "expr": 'absent(python_info{job="net-routes"})' | |
280 }, | |
281 { | |
282 "alert": "allowed_check_never_returned_200", | |
283 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1' | |
284 }, | |
285 { | |
286 "alert": "allowed_check_never_returned_403", | |
287 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1' | |
288 }, | |
289 { | |
290 'alert': 'net_route_input_eval_cal_loop_is_down', | |
291 'expr': 'eval_cal_up!=1' | |
292 }, | |
293 { | |
294 'alert': 'net_route_input_mongo_loop_is_down', | |
295 'expr': 'mongo_to_net_routes_up!=1' | |
296 }, | |
297 { | |
298 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests', | |
299 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1' | |
300 }, | |
301 { | |
302 'alert': 'gcalendarwatch_current_events_loop_is_down', | |
303 'expr': 'current_events_up != 1' | |
231 }, | 304 }, |
232 ], | 305 ], |
233 }, | 306 }, |
234 { | 307 { |
235 "name": "http", | 308 "name": "http", |
307 "alert": "high_logging", | 380 "alert": "high_logging", |
308 "for": "3h", | 381 "for": "3h", |
309 "labels": { | 382 "labels": { |
310 "severity": "waste" | 383 "severity": "waste" |
311 }, | 384 }, |
312 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", | 385 "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k', |
313 "annotations": { | 386 "annotations": { |
314 "summary": "high log output rate" | 387 "summary": "high log output rate" |
315 }, | 388 }, |
316 }, | 389 }, |
317 { | 390 { |