view alert_rules.py @ 31:d39a8038227b

reformat
author drewp@bigasterisk.com
date Wed, 19 Jul 2023 21:27:46 -0700
parents e114edff93dc
children eb1de82c93aa
line wrap: on
line source

"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json


def k8sRules():
    """Return the generic Kubernetes/Prometheus alert rules as a list of dicts.

    The rules are adapted from
    https://awesome-prometheus-alerts.grep.to/rules.html. A fresh list is
    built on every call.
    """

    def rule(alert, expr, severity, summary, description, hold=None):
        # Keys are inserted as alert, expr, [for], labels, annotations so the
        # layout of any serialized output stays stable.
        entry = {"alert": alert, "expr": expr}
        if hold is not None:
            entry["for"] = hold
        entry["labels"] = {"severity": severity}
        entry["annotations"] = {"summary": summary, "description": description}
        return entry

    rules = [
        rule(
            alert="PrometheusTargetMissing",
            expr="up == 0",
            severity="critical",
            summary="Prometheus target missing (instance {{ $labels.instance }})",
            description="A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesMemoryPressure",
            expr='kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            hold="2m",
            severity="critical",
            summary="Kubernetes memory pressure (instance {{ $labels.instance }})",
            description="{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesDiskPressure",
            expr='kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            hold="2m",
            severity="critical",
            summary="Kubernetes disk pressure (instance {{ $labels.instance }})",
            description="{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesOutOfDisk",
            expr='kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            hold="2m",
            severity="critical",
            summary="Kubernetes out of disk (instance {{ $labels.instance }})",
            description="{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesJobFailed",
            expr="kube_job_status_failed > 0",
            severity="warning",
            summary="Kubernetes Job failed (instance {{ $labels.instance }})",
            description="Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesPodCrashLooping",
            expr="increase(kube_pod_container_status_restarts_total[1m]) > 3",
            hold="2m",
            severity="warning",
            summary="Kubernetes pod crash looping (instance {{ $labels.instance }})",
            description="Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
        ),
        rule(
            alert="KubernetesClientCertificateExpiresNextWeek",
            expr='apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            severity="warning",
            summary="Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
            description="A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
        ),
    ]

    # The final rule has neither labels nor annotations, so it is spelled out
    # directly instead of going through rule().
    rules.append({
        "alert": "container_waiting",
        "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
        "for": "2m",
    })
    return rules


def allRules():
    """Assemble the complete alerting-rules document.

    Returns a dict with a single "groups" key holding four rule groups, in
    this order: "k8s" (built by k8sRules()), "Outages", "disk_errs" and
    "alerts".
    """
    # Both power-eagle rules point the reader at the same deployment logs.
    powerEagleLogs = "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"

    k8sGroup = {"name": "k8s", "interval": "1m", "rules": k8sRules()}

    outageRules = [
        {
            "alert": "powereagleStalled",
            "expr": "rate(house_power_w[100m]) == 0",
            "for": "0m",
            "labels": {"severity": "losingData"},
            "annotations": {
                "summary": "power eagle data stalled",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "powereagleAbsent",
            "expr": "absent_over_time(house_power_w[5m])",
            "for": "2m",
            "labels": {"severity": "losingData"},
            "annotations": {
                "summary": "power eagle data missing",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "absent_zigbee",
            "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
        },
        {
            "alert": "net_routes_sync",
            "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
            "for": "10m",
            "labels": {"severity": "houseUsersAffected"},
            "annotations": {"summary": "net_routes is not getting regular updates"},
        },
    ]
    outagesGroup = {"name": "Outages", "interval": "1m", "rules": outageRules}

    diskErrsGroup = {
        "name": "disk_errs",
        "interval": "2d",
        "rules": [
            {
                "alert": "zpool_device_error_count",
                "labels": {"severity": "warning"},
                "expr": 'increase(zpool_device_error_count[3d]) > 0',
            },
        ],
    }

    # NOTE(review): numeric suffixes such as 20G, 100G, 1M and 4k below are not
    # plain PromQL; presumably these rules are evaluated by an engine that
    # accepts them (e.g. VictoriaMetrics' MetricsQL) -- confirm.
    generalRules = [
        {
            "alert": "kube_node_status_bad_condition",
            "for": "2h",
            "labels": {"severity": "warning"},
            "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
        },
        {
            "alert": "housePower",
            "for": "1h",
            "labels": {"severity": "waste"},
            "expr": "house_power_w > 4000",
            "annotations": {"summary": "house power usage over 4KW"},
        },
        {
            "alert": "host_root_fs_space_low",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'disk_free{path="/"} < 20G',
        },
        {
            "alert": "zpool_space_low",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
        },
        {
            # NOTE(review): the alert name and summary say "week" but the
            # window is [1d] -- confirm which one is intended.
            "alert": "disk_week_incr",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
            "annotations": {"summary": "high mb/week on zfs dir"},
        },
        {
            "alert": "high_logging",
            "for": "3h",
            "labels": {"severity": "waste"},
            "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
            "annotations": {"summary": "high log output rate"},
        },
        {
            "alert": "stale_process",
            "for": "1d",
            "labels": {"severity": "dataRisk"},
            "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
            "annotations": {"summary": "process time is old"},
        },
        {
            # any presence of
            # starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09
            # means someone forgot to set app name
            "alert": "starlette",
            "for": "1m",
            "labels": {"severity": "fix"},
            "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
            "annotations": {"summary": "set starlette app name"},
        },
        {
            "alert": "ssl_certs_expiring_soon",
            "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
            },
        },
    ]
    # This group sets no "interval" key.
    alertsGroup = {"name": "alerts", "rules": generalRules}

    return {"groups": [k8sGroup, outagesGroup, diskErrsGroup, alertsGroup]}


def _runJson(ctx, cmd):
    """Run `cmd` via `ctx.run` and return its stdout decoded as JSON.

    `ctx.run` is called with hide="stdout" and must return an object with a
    `.stdout` string (presumably an invoke Context/Result pair -- confirm).
    Raises whatever `json.loads` raises if the output is not valid JSON.
    """
    result = ctx.run(cmd, hide="stdout")
    return json.loads(result.stdout)


def hostsExpectedOnline(ctx):
    """Return the parsed JSON printed by lanscape's hosts_expected_online.py.

    The script is run through `ctx` from the /my/serv/lanscape directory.
    """
    command = "cd /my/serv/lanscape; pdm run python hosts_expected_online.py"
    return _runJson(ctx, command)


def expectedK8sNodes(ctx):
    """Build a rule group that alerts when a k8s node stops reporting log sizes.

    Node names are read from `kubectl get node -o json` (run through `ctx`).
    One absent()-based rule is emitted per node, except for nodes listed in
    `optionalHosts`. Returns a dict with a single "groups" entry named
    "k8s_expected_nodes".
    """
    # Nodes that are allowed to be missing without raising an alert.
    optionalHosts = {'slash'}

    nodeList = _runJson(ctx, "kubectl get node -o json")
    nodeNames = [item["metadata"]["name"] for item in nodeList["items"]]

    rules = []
    for host in nodeNames:
        if host in optionalHosts:
            continue
        rules.append({
            "alert": "kube_node_log_size_report_" + host,
            "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % host,
            "for": "1h",
            "annotations": {
                "summary": f"no recent k8s log size report from host {host}"
            },
        })

    return {"groups": [{"name": "k8s_expected_nodes", "rules": rules}]}