Mercurial > code > home > repos > victoriametrics
changeset 31:d39a8038227b
reformat
author | drewp@bigasterisk.com |
---|---|
date | Wed, 19 Jul 2023 21:27:46 -0700 |
parents | 4165f4fa6ccf |
children | eb1de82c93aa |
files | .flake8 alert_rules.py |
diffstat | 2 files changed, 92 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.flake8 Wed Jul 19 21:27:46 2023 -0700
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length=160
+ignore=E722,W503,E741,E201,E202,E241,E231
--- a/alert_rules.py Wed Jul 19 21:17:57 2023 -0700 +++ b/alert_rules.py Wed Jul 19 21:27:46 2023 -0700 @@ -18,7 +18,9 @@ { "alert": "PrometheusTargetMissing", "expr": "up == 0", - "labels": {"severity": "critical"}, + "labels": { + "severity": "critical" + }, "annotations": { "summary": "Prometheus target missing (instance {{ $labels.instance }})", "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", @@ -28,7 +30,9 @@ "alert": "KubernetesMemoryPressure", "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', "for": "2m", - "labels": {"severity": "critical"}, + "labels": { + "severity": "critical" + }, "annotations": { "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", @@ -38,7 +42,9 @@ "alert": "KubernetesDiskPressure", "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', "for": "2m", - "labels": {"severity": "critical"}, + "labels": { + "severity": "critical" + }, "annotations": { "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", @@ -48,7 +54,9 @@ "alert": "KubernetesOutOfDisk", "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', "for": "2m", - "labels": {"severity": "critical"}, + "labels": { + "severity": "critical" + }, "annotations": { "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", @@ -57,7 +65,9 @@ { "alert": "KubernetesJobFailed", "expr": "kube_job_status_failed > 0", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "annotations": { "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", "description": "Job 
{{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", @@ -67,16 +77,22 @@ "alert": "KubernetesPodCrashLooping", "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", "for": "2m", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "annotations": { "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", }, }, { - "alert": "KubernetesClientCertificateExpiresNextWeek", - "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', - "labels": {"severity": "warning"}, + "alert": + "KubernetesClientCertificateExpiresNextWeek", + "expr": + 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', + "labels": { + "severity": "warning" + }, "annotations": { "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", @@ -101,14 +117,18 @@ # # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name { - "name": "Outages", - "interval": "1m", + "name": + "Outages", + "interval": + "1m", "rules": [ { "alert": "powereagleStalled", "expr": "rate(house_power_w[100m]) == 0", "for": "0m", - "labels": {"severity": "losingData"}, + "labels": { + "severity": "losingData" + }, "annotations": { "summary": "power eagle data stalled", "description": "logs at 
https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", @@ -118,7 +138,9 @@ "alert": "powereagleAbsent", "expr": "absent_over_time(house_power_w[5m])", "for": "2m", - "labels": {"severity": "losingData"}, + "labels": { + "severity": "losingData" + }, "annotations": { "summary": "power eagle data missing", "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", @@ -132,7 +154,9 @@ "alert": "net_routes_sync", "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', "for": "10m", - "labels": {"severity": "houseUsersAffected"}, + "labels": { + "severity": "houseUsersAffected" + }, "annotations": { "summary": "net_routes is not getting regular updates" }, @@ -142,74 +166,103 @@ { "name": "disk_errs", "interval": "2d", - "rules": [ - { - "alert": "zpool_device_error_count", - "labels": {"severity": "warning"}, - "expr": 'increase(zpool_device_error_count[3d]) > 0', + "rules": [{ + "alert": "zpool_device_error_count", + "labels": { + "severity": "warning" }, - ], + "expr": 'increase(zpool_device_error_count[3d]) > 0', + }], }, { - "name": "alerts", + "name": + "alerts", "rules": [ { "alert": "kube_node_status_bad_condition", "for": "2h", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', }, { "alert": "housePower", "for": "1h", - "labels": {"severity": "waste"}, + "labels": { + "severity": "waste" + }, "expr": "house_power_w > 4000", - "annotations": {"summary": "house power usage over 4KW"}, + "annotations": { + "summary": "house power usage over 4KW" + }, }, { "alert": "host_root_fs_space_low", "for": "20m", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "expr": 'disk_free{path="/"} < 20G', }, { "alert": "zpool_space_low", "for": "20m", - "labels": {"severity": "warning"}, + "labels": { + 
"severity": "warning" + }, "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', }, { "alert": "disk_week_incr", "for": "20m", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', - "annotations": {"summary": "high mb/week on zfs dir"}, + "annotations": { + "summary": "high mb/week on zfs dir" + }, }, { "alert": "high_logging", - "for": "20m", - "labels": {"severity": "waste"}, + "for": "3h", + "labels": { + "severity": "waste" + }, "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", - "annotations": {"summary": "high log output rate"}, + "annotations": { + "summary": "high log output rate" + }, }, { "alert": "stale_process", "for": "1d", - "labels": {"severity": "dataRisk"}, + "labels": { + "severity": "dataRisk" + }, "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", - "annotations": {"summary": "process time is old"}, + "annotations": { + "summary": "process time is old" + }, }, { "alert": "starlette", "for": "1m", - "labels": {"severity": "fix"}, + "labels": { + "severity": "fix" + }, "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', - "annotations": {"summary": "set starlette app name"}, + "annotations": { + "summary": "set starlette app name" + }, }, { "alert": "ssl_certs_expiring_soon", "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", - "labels": {"severity": "warning"}, + "labels": { + "severity": "warning" + }, "annotations": { "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" },