comparison alert_rules.py @ 49:febc20caabcb

more alerts
author drewp@bigasterisk.com
date Sun, 10 Mar 2024 14:49:32 -0700
parents daa0df13bf06
children df44473de6a1
comparison
equal deleted inserted replaced
48:daa0df13bf06 49:febc20caabcb
10 """ 10 """
11 11
12 import json 12 import json
13 13
14 14
15 def pomRules():
16 return [
17 {
18 "alert": "frequent_upstream_connect_failures",
19 "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[6h]) > 0"
20 },
21 {
22 "alert": "high_logging_pomerium",
23 "for": "3h",
24 "labels": {
25 "severity": "waste"
26 },
27 "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
28 "annotations": {
29 "summary": "high log output rate"
30 },
31 },
32 ]
33
34
15 def k8sRules(): 35 def k8sRules():
16 # from https://awesome-prometheus-alerts.grep.to/rules.html 36 # from https://awesome-prometheus-alerts.grep.to/rules.html
17 return [ 37 return [
18 { 38 {
19 "alert": "metricsTargetMissing", 39 "alert": "metricsTargetMissing",
114 "groups": [ 134 "groups": [
115 { 135 {
116 "name": "k8s", 136 "name": "k8s",
117 "interval": "1m", 137 "interval": "1m",
118 "rules": k8sRules(), 138 "rules": k8sRules(),
139 },
140 {
141 "name": "pomerium_proxy",
142 "interval": "1m",
143 "rules": pomRules(),
119 }, 144 },
120 { 145 {
121 "name": 146 "name":
122 "Outages", 147 "Outages",
123 "interval": 148 "interval":
180 }, 205 },
181 "expr": 'zpool_device_error_count > 0', 206 "expr": 'zpool_device_error_count > 0',
182 }], 207 }],
183 }, 208 },
184 { 209 {
210 "name": "lighting",
211 "interval": "5m",
212 "rules": [{
213 "alert": "light_bridge_no_mqtt",
214 "expr": 'mqtt_connected{job="light-bridge"} != 1',
215 }],
216 },
217 {
185 "name": 218 "name":
186 "front_door", 219 "front_door",
187 "interval": 220 "interval":
188 "5m", 221 "5m",
189 "rules": [ 222 "rules": [
226 "alert": "front_door_lock_esp32_no_mqtt", 259 "alert": "front_door_lock_esp32_no_mqtt",
227 'expr': 'hw_connected{job="front-door-lock"} < 1', 260 'expr': 'hw_connected{job="front-door-lock"} < 1',
228 "annotations": { 261 "annotations": {
229 "summary": "see https://bigasterisk.com/front-door-lock/" 262 "summary": "see https://bigasterisk.com/front-door-lock/"
230 }, 263 },
264 },
265 ],
266 },
267 {
268 "name":
269 "net_routes",
270 "interval":
271 "5m",
272 "rules": [
273 {
274 "alert": "no_house_ip_service",
275 "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
276 },
277 {
278 "alert": "no_net_routes_running",
279 "expr": 'absent(python_info{job="net-routes"})'
280 },
281 {
282 "alert": "allowed_check_never_returned_200",
283 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
284 },
285 {
286 "alert": "allowed_check_never_returned_403",
287 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
288 },
289 {
290 'alert': 'net_route_input_eval_cal_loop_is_down',
291 'expr': 'eval_cal_up!=1'
292 },
293 {
294 'alert': 'net_route_input_mongo_loop_is_down',
295 'expr': 'mongo_to_net_routes_up!=1'
296 },
297 {
298 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
299 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
300 },
301 {
302 'alert': 'gcalendarwatch_current_events_loop_is_down',
303 'expr': 'current_events_up != 1'
231 }, 304 },
232 ], 305 ],
233 }, 306 },
234 { 307 {
235 "name": "http", 308 "name": "http",
307 "alert": "high_logging", 380 "alert": "high_logging",
308 "for": "3h", 381 "for": "3h",
309 "labels": { 382 "labels": {
310 "severity": "waste" 383 "severity": "waste"
311 }, 384 },
312 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", 385 "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
313 "annotations": { 386 "annotations": {
314 "summary": "high log output rate" 387 "summary": "high log output rate"
315 }, 388 },
316 }, 389 },
317 { 390 {