Mercurial > code > home > repos > victoriametrics
changeset 23:ffa013944200
mv
author | drewp@bigasterisk.com |
---|---|
date | Sat, 24 Jun 2023 23:02:17 -0700 |
parents | cd115f1ca2a8 |
children | d7aa409ebf78 |
files | alert_rules.py rules/rules_k8s.yaml rules/rules_main.yaml |
diffstat | 3 files changed, 254 insertions(+), 174 deletions(-) [+] |
line wrap: on
line diff
"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json

import yaml  # NOTE(review): appears unused in this module — confirm before removing


def k8sRules():
    """Return the kubernetes-level alerting rules as a list of dicts in
    Prometheus/vmalert rule format (alert/expr/for/labels/annotations)."""
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "PrometheusTargetMissing",
            "expr": "up == 0",
            "for": "0m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
            "for": "2m",
        },
    ]


def allRules():
    """Return the complete rule-group document (the structure that gets
    serialized and pushed as the vmalert/Prometheus rules config)."""
    return {
        "groups": [
            {
                "name": "k8s",
                "rules": k8sRules(),
            },
            #
            # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
            {
                "name": "Outages",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        # expected update period is ~70s, so <1 update per 70s means sync is stuck
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {"severity": "houseUsersAffected"},
                        "annotations": {
                            "summary": "net_routes is not getting regular updates"
                        },
                    },
                ],
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {"severity": "warning"},
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "24h",
                        "labels": {"severity": "waste"},
                        "expr": "house_power_w > 4000",
                        "annotations": {"summary": "house power usage over 4KW"},
                    },
                    {
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'disk_free{path="/"} < 20G',
                        "annotations": {"summary": "low disk_free"},
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                        # was "low disk_free" — copy-paste from host_root_fs_space_low
                        "annotations": {"summary": "low zpool free space"},
                    },
                    {
                        "alert": "zpool_device_error_count",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'increase(zpool_device_error_count[1d]) > 0',
                        # was "low disk_free" — copy-paste bug; this alert is about device errors
                        "annotations": {"summary": "zpool device error count increased"},
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {"summary": "high mb/week on zfs dir"},
                    },
                    {
                        "alert": "high_logging",
                        "for": "20m",
                        "labels": {"severity": "waste"},
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {"summary": "high log output rate"},
                    },
                    {
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {"severity": "dataRisk"},
                        # filestat_modification_time is in ns; convert to days since modified
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {"summary": "process time is old"},
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {"severity": "fix"},
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {"summary": "set starlette app name"},
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {"severity": "warning"},
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ]
    }


def _runJson(ctx, cmd):
    """Run *cmd* via the invoke context *ctx* and parse its stdout as JSON."""
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    """Return the hosts-expected-online data produced by the lanscape project
    (shape defined by that script, not here)."""
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodes(ctx):
    """Build a rule group with one absent()-based alert per current k8s node,
    firing when a node stops reporting log-size metrics for 1h.

    Queries the live cluster via kubectl, so the output varies with cluster
    membership; hosts in optionalHosts are allowed to be offline.
    """
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    optionalHosts = {'slash'}
    return {
        "groups": [
            {
                "name": "k8s_expected_nodes",
                "rules": [
                    {
                        "alert": "kube_node_log_size_report_" + h,
                        "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
                        % h,
                        "for": "1h",
                        "annotations": {
                            "summary": f"no recent k8s log size report from host {h}"
                        },
                    }
                    for h in hosts if h not in optionalHosts
                ],
            }
        ]
    }
--- a/rules/rules_k8s.yaml Sat Jun 24 23:02:04 2023 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -groups: - - name: k8s - rules: - # from https://awesome-prometheus-alerts.grep.to/rules.html - - alert: PrometheusTargetMissing - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes memory pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes disk pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Kubernetes out of disk (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesJobFailed - expr: kube_job_status_failed > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes Job failed (instance {{ $labels.instance }}) - description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: KubernetesPodCrashLooping - expr: increase(kube_pod_container_status_restarts_total[1m]) 
> 3 - for: 2m - labels: - severity: warning - annotations: - summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) - description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesClientCertificateExpiresNextWeek - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 - for: 0m - labels: - severity: warning - annotations: - summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) - description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: container_waiting - expr: sum by (container)(kube_pod_container_status_waiting!=0) - for: 2m
--- a/rules/rules_main.yaml Sat Jun 24 23:02:04 2023 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -groups: - # docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - # "Whenever the alert expression results in one or more vector - # elements at a given point in time, the alert counts as active for - # these elements' label sets." - - # also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics - # - # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name - - # - name: webcam - # rules: - # waiting for twinscam revival - # - alert: twinscam_not_reporting - # expr: absent(cam_pipeline_state{job="webcam-record-twinscam"}) - # for: 2m - # labels: - # severity: losingData - # annotations: - # summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}" - - # - alert: cam_garagehall_not_reporting - # expr: absent(cam_pipeline_state{job="webcam-record-garagehall"}) - # for: 2m - # labels: - # severity: losingData - # annotations: - # # summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}" - - # - alert: cam_pipeline_stopped - # expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1 - # for: 10m - # labels: - # severity: losingData - # annotations: - # summary: "webcam-record gst pipeline is not state=playing {{ $labels }}" - - # - alert: cam_not_advancing - # expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2 - # for: 10m - # labels: - # severity: losingData - # annotations: - # summary: "cam output bytes is advancing too slowly. 
{{ $labels }}" - - # - alert: webcam_indexer_stalled - # expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01 - # for: 10m - # labels: - # severity: webcamUsersAffected - # annotations: - # summary: "webcam indexer update loop is stalled" - - - name: Outages - rules: - - alert: powereagleStalled - expr: rate(house_power_w[100m]) == 0 - for: 0m - labels: - severity: losingData - annotations: - summary: "power eagle data stalled" - description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs" - - - alert: powereagleAbsent - expr: absent_over_time(house_power_w[5m]) - for: 2m - labels: - severity: losingData - annotations: - summary: "power eagle data missing" - description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs" - - # - alert: absent_mitmproxy - # expr: absent(process_resident_memory_bytes{job="mitmproxy"}) - # labels: - # severity: houseUsersAffected - # annotations: - # summary: "mitmproxy metrics not responding. 
See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)" - - - alert: absent_zigbee_dash - expr: absent(container_last_seen{container="zigbee2mqtt-dash"}) - - - alert: net_routes_sync - expr: rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70 - for: 10m - labels: - severity: houseUsersAffected - annotations: - summary: "net_routes is not getting regular updates" - - - - name: alerts - rules: - - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}} - - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}} - - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}} - - {alert: disk3, for: 20m, labels: {severity: warning}, expr: '1 > 2', annotations: {summary: "unused"}} - - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}} - # - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}} - - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k', annotations: {summary: "high log output rate {{ $labels }}"}} - - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}} - - {alert: 
starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}} - - alert: ssl_certs_expiring_soon - expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10 - labels: - severity: warning - annotations: - summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n LABELS = {{ $labels }}" -# williamperttula.com needs to serve real cert, not self-signed