diff alert_rules.py @ 49:febc20caabcb

more alerts
author drewp@bigasterisk.com
date Sun, 10 Mar 2024 14:49:32 -0700
parents daa0df13bf06
children df44473de6a1
line wrap: on
line diff
--- a/alert_rules.py	Sun Mar 10 14:48:55 2024 -0700
+++ b/alert_rules.py	Sun Mar 10 14:49:32 2024 -0700
@@ -12,6 +12,26 @@
 import json
 
 
+def pomRules():  # Build the alert rules consumed by the "pomerium_proxy" rule group.
+    return [  # Returns a list of two rule dicts; each has "alert" and "expr", the second also "for"/"labels"/"annotations".
+        {  # Rule 1: max over 6h of the connect-fail rate, summed per envoy_cluster_name, compared against 0.
+            "alert": "frequent_upstream_connect_failures",  # NOTE(review): rate() over an aggregate and a bare [6h] on a function result are not stock PromQL; presumably a MetricsQL-style engine evaluates this -- confirm.
+            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[6h]) > 0"
+        },
+        {  # Rule 2: log-output rate restricted to container="pomerium", with its own 8k threshold.
+            "alert": "high_logging_pomerium",  # The generic log-rate rule changed in this same diff excludes container="pomerium" and uses 4k instead.
+            "for": "3h",  # presumably the condition must hold this long before the alert fires -- confirm against the rule engine.
+            "labels": {
+                "severity": "waste"
+            },
+            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
+            "annotations": {
+                "summary": "high log output rate"
+            },
+        },
+    ]
+
+
 def k8sRules():
     # from https://awesome-prometheus-alerts.grep.to/rules.html
     return [
@@ -118,6 +138,11 @@
                 "rules": k8sRules(),
             },
             {
+                "name": "pomerium_proxy",
+                "interval": "1m",
+                "rules": pomRules(),
+            },
+            {
                 "name":
                     "Outages",
                 "interval":
@@ -182,6 +207,14 @@
                 }],
             },
             {
+                "name": "lighting",
+                "interval": "5m",
+                "rules": [{
+                    "alert": "light_bridge_no_mqtt",
+                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
+                }],
+            },
+            {
                 "name":
                     "front_door",
                 "interval":
@@ -232,6 +265,46 @@
                 ],
             },
             {
+                "name":
+                    "net_routes",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "no_house_ip_service",
+                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
+                    },
+                    {
+                        "alert": "no_net_routes_running",
+                        "expr": 'absent(python_info{job="net-routes"})'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_200",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_403",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
+                    },
+                    {
+                        'alert': 'net_route_input_eval_cal_loop_is_down',
+                        'expr': 'eval_cal_up!=1'
+                    },
+                    {
+                        'alert': 'net_route_input_mongo_loop_is_down',
+                        'expr': 'mongo_to_net_routes_up!=1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
+                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_current_events_loop_is_down',
+                        'expr': 'current_events_up != 1'
+                    },
+                ],
+            },
+            {
                 "name": "http",
                 "interval": "1h",
                 'rules': [
@@ -309,7 +382,7 @@
                         "labels": {
                             "severity": "waste"
                         },
-                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
+                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
                         "annotations": {
                             "summary": "high log output rate"
                         },