Mercurial > code > home > repos > victoriametrics
view alert_rules.py @ 36:2bc188c4117a
rules updates, incl front door group
author | drewp@bigasterisk.com |
---|---|
date | Sun, 15 Oct 2023 18:34:29 -0700 |
parents | 3b91d52b007d |
children | 6e27d280b598 |
line wrap: on
line source
""" pdm run invoke push-config docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ "Whenever the alert expression results in one or more vector elements at a given point in time, the alert counts as active for these elements' label sets." also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics """ import json def k8sRules(): # from https://awesome-prometheus-alerts.grep.to/rules.html return [ { "alert": "metricsTargetMissing", "expr": 'up{job!~"cm-acme-.*"} == 0', 'for': '10m', "labels": { "severity": "critical" }, "annotations": { "summary": "metrics target missing (instance {{ $labels.instance }})", "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesMemoryPressure", "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesDiskPressure", "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesOutOfDisk", "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesJobFailed", "expr": "kube_job_status_failed > 0", "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesPodCrashLooping", "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", "for": "2m", "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesClientCertificateExpiresNextWeek", "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", }, }, { "alert": "container_waiting", "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", "annotations": { "description": '', "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", }, "for": "2m", }, ] def allRules(ctx): return { "groups": [ { "name": "k8s", "interval": "1m", "rules": k8sRules(), }, expectedK8sNodesGroup(ctx), # # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name { "name": "Outages", "interval": "1m", "rules": [ { "alert": "powereagleStalled", "expr": "rate(house_power_w[100m]) == 0", "for": "0m", "labels": { "severity": "losingData" }, "annotations": { "summary": "power eagle data stalled", "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", }, }, { "alert": "powereagleAbsent", "expr": "absent_over_time(house_power_w[5m])", "for": "2m", "labels": { "severity": "losingData" }, "annotations": { "summary": "power eagle data missing", "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", }, }, { "alert": "absent_zigbee", "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', }, { "alert": "net_routes_sync", "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', "for": "10m", "labels": { "severity": "houseUsersAffected" }, "annotations": { "summary": "net_routes is not getting regular updates" }, }, ], }, { "name": "disk_errs", "interval": "2d", "rules": [{ "alert": "zpool_device_error_increase", "labels": { "severity": "warning" }, "expr": 'increase(zpool_device_error_count[3d]) > 0', }, { "alert": "zpool_device_error_count", "labels": { "severity": "warning" }, "expr": 'zpool_device_error_count > 0', }], }, { "name": "front_door", "interval": "5m", "rules": [ { "alert": "front_door_reader_esp32_no_mqtt", 'expr': 'hw_connected{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "fronr_door_reader_svc_down", 'expr': 'up{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "fronr_door_reader_svc_reader_no_mqtt", 'expr': 'mqtt_connected{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_svc_down", 'expr': 'up{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_svc_no_mqtt", 'expr': 'mqtt_connected{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_esp32_no_mqtt", 'expr': 'hw_connected{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, ], }, { "name": "alerts", "rules": [ { "alert": "kube_node_status_bad_condition", "for": "2h", "labels": { "severity": "warning" }, "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', }, { "alert": "housePower", "for": "1h", "labels": { "severity": "waste" }, "expr": "house_power_w > 4000", "annotations": { "summary": "house power usage over 4KW" }, }, { "alert": "host_root_fs_space_low", "for": "20m", "labels": { "severity": "warning" }, "expr": 'disk_free{host!="garage",path="/"} < 20G', }, { "alert": "zpool_space_low", "for": "20m", "labels": { "severity": "warning" }, "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', }, { "alert": "disk_week_incr", "for": "20m", "labels": { "severity": "warning" }, "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', "annotations": { "summary": "high mb/week on zfs dir" }, }, { "alert": "high_logging", "for": "3h", "labels": { "severity": "waste" }, "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", "annotations": { "summary": "high log output rate" }, }, { "alert": "stale_process", "for": "1d", "labels": { "severity": "dataRisk" }, "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", "annotations": { "summary": "process time is old" }, }, { "alert": "starlette", "for": "1m", "labels": { "severity": "fix" }, "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', "annotations": { "summary": "set starlette app name" }, }, { "alert": "ssl_certs_expiring_soon", "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", "labels": { "severity": "warning" }, "annotations": { "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" }, }, ], }, ] + hostsExpectedOnline(ctx)['groups'] } def _runJson(ctx, cmd): return json.loads(ctx.run(cmd, hide="stdout").stdout) def hostsExpectedOnline(ctx): return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") def expectedK8sNodesGroup(ctx): getNode = _runJson(ctx, "kubectl get node -o json") hosts = [item["metadata"]["name"] for item in getNode["items"]] optionalHosts = {'slash'} return { "name": "k8s_expected_nodes", "rules": [{ "alert": "kube_node_log_size_report_" + h, "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % h, "for": "1h", "annotations": { "summary": f"no recent k8s log size report from host {h}" }, } for h in hosts if h not in optionalHosts], }