view alert_rules.py @ 74:177c0a2eab1e

prom annotations are ignored
author drewp@bigasterisk.com
date Fri, 03 May 2024 15:00:40 -0700
parents adde35eb4773
children 009527a145d0
line wrap: on
line source

"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json


def pomRules():
    """Return the rules used for the "pomerium_proxy" group.

    A new list of two rule dicts is built on every call.
    """
    connectFailures = {
        'alert': 'frequent_upstream_connect_failures',
        'expr': 'max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0',
    }
    noisyLogging = {
        'alert': 'high_logging_pomerium',
        'for': '3h',
        'labels': {'severity': 'waste'},
        # Only the pomerium container is selected here; allRules has a
        # separate high_logging rule that excludes it.
        'expr': 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
        'annotations': {'summary': 'high log output rate'},
    }
    return [connectFailures, noisyLogging]


def k8sRules():
    """Return the rules used for the "k8s" group, as a new list of rule dicts.

    Source noted by the original author:
    https://awesome-prometheus-alerts.grep.to/rules.html
    """

    def nodeCondition(condition, phrase):
        # One rule per kube_node_status_condition value; `phrase` is the
        # lower-case wording that goes into the summary line.
        return {
            'alert': 'Kubernetes' + condition,
            'expr': 'kube_node_status_condition{condition="%s",status="true"} == 1' % condition,
            'for': '2m',
            'labels': {'severity': 'critical'},
            'annotations': {
                'summary': 'Kubernetes %s (instance {{ $labels.instance }})' % phrase,
                'description': '{{ $labels.node }} has %s condition\n  VALUE = {{ $value }}' % condition,
            },
        }

    targetMissing = {
        'alert': 'metricsTargetMissing',
        'expr': 'up{job!~"cm-acme-.*"} == 0',
        'for': '10m',
        'labels': {'severity': 'critical'},
        'annotations': {
            'summary': 'metrics target missing (instance {{ $labels.instance }})',
            'description': 'A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}',
        },
    }

    jobFailed = {
        'alert': 'KubernetesJobFailed',
        'expr': 'kube_job_status_failed > 0',
        'labels': {'severity': 'warning'},
        'annotations': {
            'summary': 'Kubernetes Job failed (instance {{ $labels.instance }})',
            'description': 'Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}',
        },
    }

    crashLooping = {
        'alert': 'KubernetesPodCrashLooping',
        'expr': 'increase(kube_pod_container_status_restarts_total[1m]) > 3',
        'for': '2m',
        'labels': {'severity': 'warning'},
        'annotations': {
            'summary': 'Kubernetes pod crash looping (instance {{ $labels.instance }})',
            'description': 'Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}',
        },
    }

    clientCertExpiring = {
        'alert': 'KubernetesClientCertificateExpiresNextWeek',
        'expr': 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
        'labels': {'severity': 'warning'},
        'annotations': {
            'summary': 'Kubernetes client certificate expires next week (instance {{ $labels.instance }})',
            'description': 'A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}',
        },
    }

    containerWaiting = {
        'alert': 'container_waiting',
        'expr': 'sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)',
        'annotations': {
            'description': '',
            'dashboard': 'https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}',
        },
        'for': '2m',
    }

    return [
        targetMissing,
        nodeCondition('MemoryPressure', 'memory pressure'),
        nodeCondition('DiskPressure', 'disk pressure'),
        nodeCondition('OutOfDisk', 'out of disk'),
        jobFailed,
        crashLooping,
        clientCertExpiring,
        containerWaiting,
    ]


def allRules(ctx):
    """Build the complete rules document: a dict with the single key "groups".

    The groups defined in this function come first, in a fixed order, and are
    followed by the list found under "groups" in hostsExpectedOnline(ctx).

    ctx is not used here directly; it is only handed to hostsExpectedOnline.
    """

    def group(name, interval, rules):
        # Keys are inserted in name / interval / rules order.
        return {'name': name, 'interval': interval, 'rules': rules}

    def sustained(alert, duration, severity, expr, summary=None):
        # Rule shaped as alert / for / labels / expr, plus an optional
        # annotations.summary appended last.
        rule = {
            'alert': alert,
            'for': duration,
            'labels': {'severity': severity},
            'expr': expr,
        }
        if summary is not None:
            rule['annotations'] = {'summary': summary}
        return rule

    powerEagleLogs = 'logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs'
    frontDoorNote = 'see https://bigasterisk.com/front-door-lock/'

    outageRules = [
        {
            'alert': 'powereagleStalled',
            'expr': 'rate(house_power_w[100m]) == 0',
            'for': '0m',
            'labels': {'severity': 'losingData'},
            'annotations': {
                'summary': 'power eagle data stalled',
                'description': powerEagleLogs,
            },
        },
        {
            'alert': 'powereagleAbsent',
            'expr': 'absent_over_time(house_power_w[5m])',
            'for': '2m',
            'labels': {'severity': 'losingData'},
            'annotations': {
                'summary': 'power eagle data missing',
                'description': powerEagleLogs,
            },
        },
        {
            'alert': 'absent_zigbee',
            'expr': 'absent(container_last_seen{container="zigbee2mqtt"})',
        },
        {
            'alert': 'net_routes_sync',
            'expr': 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
            'for': '10m',
            'labels': {'severity': 'houseUsersAffected'},
            'annotations': {'summary': 'net_routes is not getting regular updates'},
        },
    ]

    diskErrorRules = [
        {
            'alert': 'zpool_device_error_increase',
            'labels': {'severity': 'warning'},
            'expr': 'increase(zpool_device_error_count[3d]) > 0',
        },
        {
            'alert': 'zpool_device_error_count',
            'labels': {'severity': 'warning'},
            'expr': 'zpool_device_error_count > 0',
        },
    ]

    lightingRules = [
        {
            'alert': 'light_bridge_no_mqtt',
            'expr': 'mqtt_connected{job="light-bridge"} != 1',
        },
    ]

    # Every front door rule carries the same summary annotation; each rule
    # gets its own annotations dict.
    frontDoorChecks = [
        ('front_door_reader_esp32_no_mqtt', 'hw_connected{job="fingerprint"} < 1'),
        ('front_door_reader_svc_down', 'up{job="fingerprint"} < 1'),
        ('front_door_reader_svc_reader_no_mqtt', 'mqtt_connected{job="fingerprint"} < 1'),
        ('front_door_lock_svc_down', 'up{job="front-door-lock"} < 1'),
        ('front_door_lock_svc_no_mqtt', 'mqtt_connected{job="front-door-lock"} < 1'),
        ('front_door_lock_esp32_no_mqtt', 'hw_connected{job="front-door-lock"} < 1'),
    ]
    frontDoorRules = [{
        'alert': alertName,
        'expr': expression,
        'annotations': {'summary': frontDoorNote},
    } for alertName, expression in frontDoorChecks]

    netRoutesChecks = [
        ('no_house_ip_service', 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'),
        ('no_net_routes_running', 'absent(python_info{job="net-routes"})'),
        ('allowed_check_never_returned_200', 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'),
        ('allowed_check_never_returned_403', 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'),
        ('net_route_input_eval_cal_loop_is_down', 'eval_cal_up!=1'),
        ('net_route_input_mongo_loop_is_down', 'mongo_to_net_routes_up!=1'),
        ('gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
         'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'),
        ('gcalendarwatch_current_events_loop_is_down', 'current_events_up != 1'),
    ]
    netRoutesRules = [{'alert': alertName, 'expr': expression} for alertName, expression in netRoutesChecks]

    httpRules = [
        {
            'alert': 'old_https_certs',
            'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
        },
        {
            'alert': 'high_500_response_rate',
            'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
        },
    ]

    pingRules = [
        {
            'alert': 'ping_failed',
            'expr': 'max_over_time(probe_success{job="ping"}[1m]) < 1',
        },
    ]

    generalRules = [
        sustained('kube_node_status_bad_condition', '2h', 'warning',
                  'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0'),
        sustained('housePower', '1h', 'waste', 'house_power_w > 4000', 'house power usage over 4KW'),
        sustained('host_root_fs_space_low', '20m', 'warning', 'disk_free{host!="garage",path="/"} < 20G'),
        sustained('zpool_space_low', '20m', 'warning', 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G'),
        sustained('disk_week_incr', '20m', 'warning', 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                  'high mb/week on zfs dir'),
        sustained('high_logging', '3h', 'waste',
                  'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
                  'high log output rate'),
        sustained('stale_process', '1d', 'dataRisk', 'round((time() - filestat_modification_time/1e9) / 86400) > 14',
                  'process time is old'),
        sustained('starlette', '1m', 'fix', 'starlette_request_duration_seconds_created{app_name="starlette"}',
                  'set starlette app name'),
        {
            'alert': 'ssl_certs_expiring_soon',
            'expr': 'min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10',
            'labels': {'severity': 'warning'},
            'annotations': {
                'summary': 'cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}',
            },
        },
    ]

    staticGroups = [
        group('k8s', '1m', k8sRules()),
        group('pomerium_proxy', '1m', pomRules()),
        group('Outages', '1m', outageRules),
        group('disk_errs', '2d', diskErrorRules),
        group('lighting', '5m', lightingRules),
        group('front_door', '5m', frontDoorRules),
        group('net_routes', '5m', netRoutesRules),
        group('http', '1h', httpRules),
        group('ping', '1m', pingRules),
        # This group sets no "interval" key.
        {'name': 'alerts', 'rules': generalRules},
    ]
    return {'groups': staticGroups + hostsExpectedOnline(ctx)['groups']}


def _runJson(ctx, cmd):
    """Run cmd via ctx.run (with hide="stdout") and return its stdout parsed as JSON."""
    result = ctx.run(cmd, hide="stdout")
    return json.loads(result.stdout)


def hostsExpectedOnline(ctx):
    """Run hosts_expected_online.py in /my/serv/lanscape and return its JSON output, parsed.

    allRules reads the "groups" key of the returned value.
    """
    command = "cd /my/serv/lanscape; pdm run python hosts_expected_online.py"
    return _runJson(ctx, command)