Mercurial > code > home > repos > victoriametrics
diff next/alert_rules.py @ 62:8134cd480817
make next/ a complete standalone setup dir- no deps on ./
author | drewp@bigasterisk.com |
---|---|
date | Thu, 02 May 2024 20:33:29 -0700 |
parents | alert_rules.py@df44473de6a1 |
children |
line wrap: on
line diff
"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

NOTE: several exprs use VictoriaMetrics MetricsQL extensions (number
suffixes like 8k/20G/1M, rate() without a lookbehind window, subqueries
without an explicit step) -- they are not valid vanilla PromQL.
"""

import json


def pomRules():
    """Rules for the pomerium proxy: envoy connect failures and its log volume."""
    return [
        {
            "alert": "frequent_upstream_connect_failures",
            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
        },
        {
            "alert": "high_logging_pomerium",
            "for": "3h",
            "labels": {
                "severity": "waste"
            },
            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
            "annotations": {
                "summary": "high log output rate"
            },
        },
    ]


def k8sRules():
    """Cluster-health rules, mostly from https://awesome-prometheus-alerts.grep.to/rules.html"""
    return [
        {
            "alert": "metricsTargetMissing",
            # cm-acme-* pods are short-lived cert-manager solvers; their
            # targets legitimately vanish, so they are excluded here.
            "expr": 'up{job!~"cm-acme-.*"} == 0',
            "for": "10m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "metrics target missing (instance {{ $labels.instance }})",
                "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {
                "severity": "critical"
            },
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "labels": {
                "severity": "warning"
            },
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
            "annotations": {
                "description": '',
                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
            },
            "for": "2m",
        },
    ]


def _outagesGroup():
    """Data-collection outages: stalled/absent feeds that mean we are losing history."""
    return {
        "name": "Outages",
        "interval": "1m",
        "rules": [
            {
                "alert": "powereagleStalled",
                "expr": "rate(house_power_w[100m]) == 0",
                "for": "0m",
                "labels": {
                    "severity": "losingData"
                },
                "annotations": {
                    "summary": "power eagle data stalled",
                    "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                },
            },
            {
                "alert": "powereagleAbsent",
                "expr": "absent_over_time(house_power_w[5m])",
                "for": "2m",
                "labels": {
                    "severity": "losingData"
                },
                "annotations": {
                    "summary": "power eagle data missing",
                    "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                },
            },
            {
                "alert": "absent_zigbee",
                "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
            },
            {
                "alert": "net_routes_sync",
                # /routes should be polled at least every ~70s
                "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                "for": "10m",
                "labels": {
                    "severity": "houseUsersAffected"
                },
                "annotations": {
                    "summary": "net_routes is not getting regular updates"
                },
            },
        ],
    }


def _diskErrsGroup():
    """zpool device error counters, checked on a slow cadence."""
    return {
        "name": "disk_errs",
        "interval": "2d",
        "rules": [
            {
                "alert": "zpool_device_error_increase",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'increase(zpool_device_error_count[3d]) > 0',
            },
            {
                "alert": "zpool_device_error_count",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'zpool_device_error_count > 0',
            },
        ],
    }


def _lightingGroup():
    """light-bridge service connectivity."""
    return {
        "name": "lighting",
        "interval": "5m",
        "rules": [{
            "alert": "light_bridge_no_mqtt",
            "expr": 'mqtt_connected{job="light-bridge"} != 1',
        }],
    }


def _frontDoorGroup():
    """Front-door lock/reader stack: every rule shares the same runbook link,
    so the rules are generated from an (alert, expr) table."""
    runbook = "see https://bigasterisk.com/front-door-lock/"
    checks = [
        ("front_door_reader_esp32_no_mqtt", 'hw_connected{job="fingerprint"} < 1'),
        ("front_door_reader_svc_down", 'up{job="fingerprint"} < 1'),
        ("front_door_reader_svc_reader_no_mqtt", 'mqtt_connected{job="fingerprint"} < 1'),
        ("front_door_lock_svc_down", 'up{job="front-door-lock"} < 1'),
        ("front_door_lock_svc_no_mqtt", 'mqtt_connected{job="front-door-lock"} < 1'),
        ("front_door_lock_esp32_no_mqtt", 'hw_connected{job="front-door-lock"} < 1'),
    ]
    return {
        "name": "front_door",
        "interval": "5m",
        "rules": [{
            "alert": alert,
            "expr": expr,
            "annotations": {
                "summary": runbook
            },
        } for alert, expr in checks],
    }


def _netRoutesGroup():
    """net-routes / net_route_input / gcalendarwatch liveness checks."""
    return {
        "name": "net_routes",
        "interval": "5m",
        "rules": [
            {
                "alert": "no_house_ip_service",
                "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
            },
            {
                "alert": "no_net_routes_running",
                "expr": 'absent(python_info{job="net-routes"})'
            },
            {
                "alert": "allowed_check_never_returned_200",
                "expr": 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
            },
            {
                "alert": "allowed_check_never_returned_403",
                "expr": 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
            },
            {
                "alert": "net_route_input_eval_cal_loop_is_down",
                "expr": 'eval_cal_up!=1'
            },
            {
                "alert": "net_route_input_mongo_loop_is_down",
                "expr": 'mongo_to_net_routes_up!=1'
            },
            {
                "alert": "gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests",
                "expr": 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
            },
            {
                "alert": "gcalendarwatch_current_events_loop_is_down",
                "expr": 'current_events_up != 1'
            },
        ],
    }


def _httpGroup():
    """HTTPS cert expiry and 5xx response rate."""
    return {
        "name": "http",
        "interval": "1h",
        "rules": [
            {
                "alert": "old_https_certs",
                "expr": 'min by (source) (x509_cert_enddate - now())/86400 < 15',
            },
            {
                "alert": "high_500_response_rate",
                "expr": 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
            },
        ],
    }


def _pingGroup():
    """blackbox-style ping probes."""
    return {
        "name": "ping",
        "interval": "1m",
        "rules": [{
            "alert": "ping_failed",
            "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
        }],
    }


def _miscAlertsGroup():
    """Grab-bag group (no explicit interval): node conditions, power, disk
    space, logging volume, stale processes, app misconfig, cert expiry."""
    return {
        "name": "alerts",
        "rules": [
            {
                "alert": "kube_node_status_bad_condition",
                "for": "2h",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
            },
            {
                "alert": "housePower",
                "for": "1h",
                "labels": {
                    "severity": "waste"
                },
                "expr": "house_power_w > 4000",
                "annotations": {
                    "summary": "house power usage over 4KW"
                },
            },
            {
                "alert": "host_root_fs_space_low",
                "for": "20m",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'disk_free{host!="garage",path="/"} < 20G',
            },
            {
                "alert": "zpool_space_low",
                "for": "20m",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
            },
            {
                "alert": "disk_week_incr",
                "for": "20m",
                "labels": {
                    "severity": "warning"
                },
                "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                "annotations": {
                    "summary": "high mb/week on zfs dir"
                },
            },
            {
                "alert": "high_logging",
                "for": "3h",
                "labels": {
                    "severity": "waste"
                },
                # pomerium has its own, higher-threshold rule in pomRules()
                "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
                "annotations": {
                    "summary": "high log output rate"
                },
            },
            {
                "alert": "stale_process",
                "for": "1d",
                "labels": {
                    "severity": "dataRisk"
                },
                # filestat_modification_time is in ns; compare in whole days
                "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                "annotations": {
                    "summary": "process time is old"
                },
            },
            {
                "alert": "starlette",
                "for": "1m",
                "labels": {
                    "severity": "fix"
                },
                # fires when an app is still reporting the default app name
                "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                "annotations": {
                    "summary": "set starlette app name"
                },
            },
            {
                "alert": "ssl_certs_expiring_soon",
                "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                "labels": {
                    "severity": "warning"
                },
                "annotations": {
                    "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                },
            },
        ],
    }


def allRules(ctx):
    """Build the complete vmalert rules document.

    Static groups are assembled from the helpers above; the generated
    per-host online checks are appended via hostsExpectedOnline (which
    shells out through the invoke context `ctx`).
    """
    return {
        "groups": [
            {
                "name": "k8s",
                "interval": "1m",
                "rules": k8sRules(),
            },
            {
                "name": "pomerium_proxy",
                "interval": "1m",
                "rules": pomRules(),
            },
            _outagesGroup(),
            _diskErrsGroup(),
            _lightingGroup(),
            _frontDoorGroup(),
            _netRoutesGroup(),
            _httpGroup(),
            _pingGroup(),
            _miscAlertsGroup(),
        ] + hostsExpectedOnline(ctx)['groups']
    }


def _runJson(ctx, cmd):
    """Run `cmd` through the invoke context and parse its stdout as JSON."""
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    """Generated rule groups for hosts that should be pingable, produced by
    the lanscape project's hosts_expected_online.py script."""
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")