Mercurial > code > home > repos > victoriametrics
diff alert_rules.py @ 23:ffa013944200
mv
author | drewp@bigasterisk.com |
---|---|
date | Sat, 24 Jun 2023 23:02:17 -0700 |
parents | |
children | b15cfe483964 |
line wrap: on
line diff
"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json

import yaml


def k8sRules():
    """Return alerting rules for general Kubernetes cluster health.

    Each element is one Prometheus/VictoriaMetrics alerting rule dict
    (keys: alert, expr, for, labels, annotations).
    """
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "PrometheusTargetMissing",
            "expr": "up == 0",
            "for": "0m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        # Fires for any container stuck in a waiting state (no severity label).
        {
            "alert": "container_waiting",
            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
            "for": "2m",
        },
    ]


def allRules():
    """Return the full rule-group document to be pushed to the alerting config.

    Top-level shape matches the Prometheus rule-file format:
    {"groups": [{"name": ..., "rules": [...]}, ...]}.
    """
    return {
        "groups": [
            {
                "name": "k8s",
                "rules": k8sRules(),
            },
            #
            # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
            {
                "name": "Outages",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {"severity": "houseUsersAffected"},
                        "annotations": {
                            "summary": "net_routes is not getting regular updates"
                        },
                    },
                ],
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {"severity": "warning"},
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "24h",
                        "labels": {"severity": "waste"},
                        "expr": "house_power_w > 4000",
                        "annotations": {"summary": "house power usage over 4KW"},
                    },
                    {
                        # 20G / 100G / 1M / 4k below are MetricsQL short numeric
                        # suffixes (VictoriaMetrics extension), not plain PromQL.
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'disk_free{path="/"} < 20G',
                        "annotations": {"summary": "low disk_free"},
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                        "annotations": {"summary": "low disk_free"},
                    },
                    {
                        "alert": "zpool_device_error_count",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'increase(zpool_device_error_count[1d]) > 0',
                        "annotations": {"summary": "low disk_free"},
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {"summary": "high mb/week on zfs dir"},
                    },
                    {
                        "alert": "high_logging",
                        "for": "20m",
                        "labels": {"severity": "waste"},
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {"summary": "high log output rate"},
                    },
                    {
                        # filestat_modification_time is in nanoseconds; /1e9
                        # converts to seconds before the days calculation.
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {"severity": "dataRisk"},
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {"summary": "process time is old"},
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {"severity": "fix"},
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {"summary": "set starlette app name"},
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {"severity": "warning"},
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ]
    }


def _runJson(ctx, cmd):
    """Run *cmd* through the invoke context and parse its stdout as JSON."""
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    """Fetch the expected-online hosts data from the lanscape checkout."""
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodes(ctx):
    """Build a rule group alerting when a current k8s node stops reporting log sizes.

    Queries `kubectl get node` for the live node list and emits one
    absent()-based alert per node, skipping hosts in optionalHosts.
    """
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    # Nodes that are allowed to be offline without alerting.
    optionalHosts = {'slash'}
    return {
        "groups": [
            {
                "name": "k8s_expected_nodes",
                "rules": [
                    {
                        "alert": "kube_node_log_size_report_" + h,
                        "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
                                % h,
                        "for": "1h",
                        "annotations": {
                            "summary": f"no recent k8s log size report from host {h}"
                        },
                    }
                    # fix: `not h in ...` -> idiomatic `h not in ...`
                    for h in hosts if h not in optionalHosts
                ],
            }
        ]
    }