Mercurial > code > home > repos > victoriametrics
view alert_rules.py @ 85:b5cea75d68dd
metrics updates
author | drewp@bigasterisk.com |
---|---|
date | Wed, 14 Aug 2024 10:20:48 -0700 |
parents | 5a526531305f |
children | ce7c4a918832 |
line wrap: on
line source
""" pdm run invoke push-config docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ "Whenever the alert expression results in one or more vector elements at a given point in time, the alert counts as active for these elements' label sets." also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics """ import json def pomRules(): return [ { "alert": "frequent_upstream_connect_failures", "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0" }, { "alert": "high_logging_pomerium", "for": "3h", "labels": { "severity": "waste" }, "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[1h])) > 50k', "annotations": { "summary": "high log output rate" }, }, ] def k8sRules(): # from https://awesome-prometheus-alerts.grep.to/rules.html return [ { "alert": "metricsTargetMissing", "expr": 'up{job!~"cm-acme-.*"} == 0', 'for': '10m', "labels": { "severity": "critical" }, "annotations": { "summary": "metrics target missing (instance {{ $labels.instance }})", "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesMemoryPressure", "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesDiskPressure", "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesOutOfDisk", "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', "for": "2m", "labels": { "severity": "critical" }, "annotations": { "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesJobFailed", "expr": "kube_job_status_failed > 0", "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesPodCrashLooping", "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", "for": "2m", "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", }, }, { "alert": "KubernetesClientCertificateExpiresNextWeek", "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', "labels": { "severity": "warning" }, "annotations": { "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", }, }, { "alert": "container_waiting", "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", "annotations": { "description": '', "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", }, "for": "10m", }, ] def allRules(ctx): return { "groups": [ { "name": "k8s", "interval": "1m", "rules": k8sRules(), }, { "name": "pomerium_proxy", "interval": "1m", "rules": pomRules(), }, { "name": "Outages", "interval": "1m", "rules": [ { "alert": "powereagleStalled", "expr": "rate(house_power_w[100m]) == 0", "for": "0m", "labels": { "severity": "losingData" }, "annotations": { "summary": "power eagle data stalled", "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", }, }, { "alert": "powereagleAbsent", "expr": "absent_over_time(house_power_w[5m])", "for": "2m", "labels": { "severity": "losingData" }, "annotations": { "summary": "power eagle data missing", "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", }, }, { "alert": "absent_zigbee", "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', "for": "10m", }, { "alert": "net_routes_sync", "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', "for": "10m", "labels": { "severity": "houseUsersAffected" }, "annotations": { "summary": "net_routes is not getting regular updates from net_routes_input", }, }, ], }, { "name": "disk_errs", "interval": "2d", "rules": [{ "alert": "zpool_device_error_increase", "labels": { "severity": "warning" }, "expr": 'increase(zpool_device_error_count[3d]) > 0', }, { "alert": "zpool_device_error_count", "labels": { "severity": "warning" }, "expr": 'zpool_device_error_count > 0', }], }, { "name": "lighting", "interval": "5m", "rules": [{ "alert": "light_bridge_no_mqtt", "expr": 'mqtt_connected{job="light-bridge"} != 1', }], }, { "name": "front_door", "interval": "5m", "rules": [ { "alert": "front_door_reader_esp32_no_mqtt", 'expr': 'hw_connected{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_reader_svc_down", 'expr': 'up{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_reader_svc_reader_no_mqtt", 'expr': 'mqtt_connected{job="fingerprint"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_svc_down", 'expr': 'up{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_svc_no_mqtt", 'expr': 'mqtt_connected{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, { "alert": "front_door_lock_esp32_no_mqtt", 'expr': 'hw_connected{job="front-door-lock"} < 1', "annotations": { "summary": "see https://bigasterisk.com/front-door-lock/" }, }, ], }, { "name": "net_routes", "interval": "5m", "rules": [ { "alert": "no_house_ip_service", "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})' }, { "alert": "no_net_routes_running", "expr": 'absent(python_info{job="net-routes"})' }, { "alert": "allowed_check_never_returned_200", 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1' }, { "alert": "allowed_check_never_returned_403", 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1' }, { 'alert': 'net_route_input_eval_cal_loop_is_down', 'expr': 'eval_cal_up!=1' }, { 'alert': 'net_route_input_mongo_loop_is_down', 'expr': 'mongo_to_net_routes_up!=1' }, { 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests', 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1' }, { 'alert': 'gcalendarwatch_current_events_loop_is_down', 'expr': 'current_events_up != 1' }, ], }, { "name": "http", "interval": "1h", 'rules': [ { 'alert': 'old_https_certs', 'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15', }, { 'alert': 'high_500_response_rate', 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02', }, ], }, { "name": "ping", "interval": "1m", "rules": [{ "alert": "ping_failed", "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1', "for": "10m", }] }, { "name": "alerts", "rules": [ { "alert": "kube_node_status_bad_condition", "for": "2h", "labels": { "severity": "warning" }, "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', }, { "alert": "housePower", "for": "1h", "labels": { "severity": "waste" }, "expr": "house_power_w > 4000", "annotations": { "summary": "house power usage over 4KW" }, }, { "alert": "host_root_fs_space_low", "for": "20m", "labels": { "severity": "warning" }, "expr": 'disk_used_percent{path="/"} > 85', }, { "alert": "zpool_space_low", "for": "20m", "labels": { "severity": "warning" }, "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', }, { "alert": "disk_week_incr", "for": "20m", "labels": { "severity": "warning" }, "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', "annotations": { "summary": "high mb/week on zfs dir" }, }, { "alert": "high_logging", "for": "3h", "labels": { "severity": "waste" }, "expr": 'sum by (namespace, pod, container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 10k', "annotations": { "summary": "high log output rate" }, }, { "alert": "stale_process", "for": "1d", "labels": { "severity": "dataRisk" }, "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", "annotations": { "summary": "process time is old" }, }, { "alert": "starlette", "for": "1m", "labels": { "severity": "fix" }, "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', "annotations": { "summary": "set starlette app name" }, }, { "alert": "ssl_certs_expiring_soon", "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", "labels": { "severity": "warning" }, "annotations": { "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" }, }, ], }, ] + hostsExpectedOnline(ctx)['groups'] } def _runJson(ctx, cmd): return json.loads(ctx.run(cmd, hide="stdout").stdout) def hostsExpectedOnline(ctx): return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")