Mercurial > code > home > repos > victoriametrics
diff alert_rules.py @ 49:febc20caabcb
more alerts
author | drewp@bigasterisk.com |
---|---|
date | Sun, 10 Mar 2024 14:49:32 -0700 |
parents | daa0df13bf06 |
children | df44473de6a1 |
line wrap: on
line diff
--- a/alert_rules.py Sun Mar 10 14:48:55 2024 -0700
+++ b/alert_rules.py Sun Mar 10 14:49:32 2024 -0700
@@ -12,6 +12,26 @@
 import json
 
 
+def pomRules():
+    return [
+        {
+            "alert": "frequent_upstream_connect_failures",
+            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[6h]) > 0"
+        },
+        {
+            "alert": "high_logging_pomerium",
+            "for": "3h",
+            "labels": {
+                "severity": "waste"
+            },
+            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
+            "annotations": {
+                "summary": "high log output rate"
+            },
+        },
+    ]
+
+
 def k8sRules():
     # from https://awesome-prometheus-alerts.grep.to/rules.html
     return [
@@ -118,6 +138,11 @@
                 "rules": k8sRules(),
             },
             {
+                "name": "pomerium_proxy",
+                "interval": "1m",
+                "rules": pomRules(),
+            },
+            {
                 "name":
                     "Outages",
                 "interval":
@@ -182,6 +207,14 @@
                 }],
             },
             {
+                "name": "lighting",
+                "interval": "5m",
+                "rules": [{
+                    "alert": "light_bridge_no_mqtt",
+                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
+                }],
+            },
+            {
                 "name":
                     "front_door",
                 "interval":
@@ -232,6 +265,46 @@
                 ],
             },
             {
+                "name":
+                    "net_routes",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "no_house_ip_service",
+                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
+                    },
+                    {
+                        "alert": "no_net_routes_running",
+                        "expr": 'absent(python_info{job="net-routes"})'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_200",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_403",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
+                    },
+                    {
+                        'alert': 'net_route_input_eval_cal_loop_is_down',
+                        'expr': 'eval_cal_up!=1'
+                    },
+                    {
+                        'alert': 'net_route_input_mongo_loop_is_down',
+                        'expr': 'mongo_to_net_routes_up!=1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
+                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_current_events_loop_is_down',
+                        'expr': 'current_events_up != 1'
+                    },
+                ],
+            },
+            {
                 "name": "http",
                 "interval": "1h",
                 'rules': [
@@ -309,7 +382,7 @@
                         "labels": {
                             "severity": "waste"
                         },
-                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
+                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
                         "annotations": {
                             "summary": "high log output rate"
                         },