changeset 31:d39a8038227b

reformat alert_rules.py and add .flake8 config; also changes the high_logging alert's "for" from 20m to 3h
author drewp@bigasterisk.com
date Wed, 19 Jul 2023 21:27:46 -0700
parents 4165f4fa6ccf
children eb1de82c93aa
files .flake8 alert_rules.py
diffstat 2 files changed, 92 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.flake8	Wed Jul 19 21:27:46 2023 -0700
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length=160
+ignore=E722,W503,E741,E201,E202,E241,E231
--- a/alert_rules.py	Wed Jul 19 21:17:57 2023 -0700
+++ b/alert_rules.py	Wed Jul 19 21:27:46 2023 -0700
@@ -18,7 +18,9 @@
         {
             "alert": "PrometheusTargetMissing",
             "expr": "up == 0",
-            "labels": {"severity": "critical"},
+            "labels": {
+                "severity": "critical"
+            },
             "annotations": {
                 "summary": "Prometheus target missing (instance {{ $labels.instance }})",
                 "description": "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
@@ -28,7 +30,9 @@
             "alert": "KubernetesMemoryPressure",
             "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
             "for": "2m",
-            "labels": {"severity": "critical"},
+            "labels": {
+                "severity": "critical"
+            },
             "annotations": {
                 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                 "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
@@ -38,7 +42,9 @@
             "alert": "KubernetesDiskPressure",
             "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
             "for": "2m",
-            "labels": {"severity": "critical"},
+            "labels": {
+                "severity": "critical"
+            },
             "annotations": {
                 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                 "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
@@ -48,7 +54,9 @@
             "alert": "KubernetesOutOfDisk",
             "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
             "for": "2m",
-            "labels": {"severity": "critical"},
+            "labels": {
+                "severity": "critical"
+            },
             "annotations": {
                 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                 "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
@@ -57,7 +65,9 @@
         {
             "alert": "KubernetesJobFailed",
             "expr": "kube_job_status_failed > 0",
-            "labels": {"severity": "warning"},
+            "labels": {
+                "severity": "warning"
+            },
             "annotations": {
                 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
@@ -67,16 +77,22 @@
             "alert": "KubernetesPodCrashLooping",
             "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
             "for": "2m",
-            "labels": {"severity": "warning"},
+            "labels": {
+                "severity": "warning"
+            },
             "annotations": {
                 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                 "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
             },
         },
         {
-            "alert": "KubernetesClientCertificateExpiresNextWeek",
-            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
-            "labels": {"severity": "warning"},
+            "alert":
+                "KubernetesClientCertificateExpiresNextWeek",
+            "expr":
+                'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
+            "labels": {
+                "severity": "warning"
+            },
             "annotations": {
                 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
@@ -101,14 +117,18 @@
             #
             # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
             {
-                "name": "Outages",
-                "interval": "1m",
+                "name":
+                    "Outages",
+                "interval":
+                    "1m",
                 "rules": [
                     {
                         "alert": "powereagleStalled",
                         "expr": "rate(house_power_w[100m]) == 0",
                         "for": "0m",
-                        "labels": {"severity": "losingData"},
+                        "labels": {
+                            "severity": "losingData"
+                        },
                         "annotations": {
                             "summary": "power eagle data stalled",
                             "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
@@ -118,7 +138,9 @@
                         "alert": "powereagleAbsent",
                         "expr": "absent_over_time(house_power_w[5m])",
                         "for": "2m",
-                        "labels": {"severity": "losingData"},
+                        "labels": {
+                            "severity": "losingData"
+                        },
                         "annotations": {
                             "summary": "power eagle data missing",
                             "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
@@ -132,7 +154,9 @@
                         "alert": "net_routes_sync",
                         "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                         "for": "10m",
-                        "labels": {"severity": "houseUsersAffected"},
+                        "labels": {
+                            "severity": "houseUsersAffected"
+                        },
                         "annotations": {
                             "summary": "net_routes is not getting regular updates"
                         },
@@ -142,74 +166,103 @@
             {
                 "name": "disk_errs",
                 "interval": "2d",
-                "rules": [
-                    {
-                        "alert": "zpool_device_error_count",
-                        "labels": {"severity": "warning"},
-                        "expr": 'increase(zpool_device_error_count[3d]) > 0',
+                "rules": [{
+                    "alert": "zpool_device_error_count",
+                    "labels": {
+                        "severity": "warning"
                     },
-                ],
+                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
+                }],
             },
             {
-                "name": "alerts",
+                "name":
+                    "alerts",
                 "rules": [
                     {
                         "alert": "kube_node_status_bad_condition",
                         "for": "2h",
-                        "labels": {"severity": "warning"},
+                        "labels": {
+                            "severity": "warning"
+                        },
                         "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                     },
                     {
                         "alert": "housePower",
                         "for": "1h",
-                        "labels": {"severity": "waste"},
+                        "labels": {
+                            "severity": "waste"
+                        },
                         "expr": "house_power_w > 4000",
-                        "annotations": {"summary": "house power usage over 4KW"},
+                        "annotations": {
+                            "summary": "house power usage over 4KW"
+                        },
                     },
                     {
                         "alert": "host_root_fs_space_low",
                         "for": "20m",
-                        "labels": {"severity": "warning"},
+                        "labels": {
+                            "severity": "warning"
+                        },
                         "expr": 'disk_free{path="/"} < 20G',
                     },
                     {
                         "alert": "zpool_space_low",
                         "for": "20m",
-                        "labels": {"severity": "warning"},
+                        "labels": {
+                            "severity": "warning"
+                        },
                         "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                     },
                     {
                         "alert": "disk_week_incr",
                         "for": "20m",
-                        "labels": {"severity": "warning"},
+                        "labels": {
+                            "severity": "warning"
+                        },
                         "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
-                        "annotations": {"summary": "high mb/week on zfs dir"},
+                        "annotations": {
+                            "summary": "high mb/week on zfs dir"
+                        },
                     },
                     {
                         "alert": "high_logging",
-                        "for": "20m",
-                        "labels": {"severity": "waste"},
+                        "for": "3h",
+                        "labels": {
+                            "severity": "waste"
+                        },
                         "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
-                        "annotations": {"summary": "high log output rate"},
+                        "annotations": {
+                            "summary": "high log output rate"
+                        },
                     },
                     {
                         "alert": "stale_process",
                         "for": "1d",
-                        "labels": {"severity": "dataRisk"},
+                        "labels": {
+                            "severity": "dataRisk"
+                        },
                         "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
-                        "annotations": {"summary": "process time is old"},
+                        "annotations": {
+                            "summary": "process time is old"
+                        },
                     },
                     {
                         "alert": "starlette",
                         "for": "1m",
-                        "labels": {"severity": "fix"},
+                        "labels": {
+                            "severity": "fix"
+                        },
                         "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
-                        "annotations": {"summary": "set starlette app name"},
+                        "annotations": {
+                            "summary": "set starlette app name"
+                        },
                     },
                     {
                         "alert": "ssl_certs_expiring_soon",
                         "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
-                        "labels": {"severity": "warning"},
+                        "labels": {
+                            "severity": "warning"
+                        },
                         "annotations": {
                             "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                         },