Mercurial > code > home > repos > victoriametrics
changeset 28:e114edff93dc
more explicit intervals. try to get a single day of notification out of a disk err increase
author | drewp@bigasterisk.com |
---|---|
date | Wed, 19 Jul 2023 21:17:22 -0700 |
parents | eec015e90818 |
children | a4c49fa01c9d |
files | alert_rules.py deploy_vmalert.yaml |
diffstat | 2 files changed, 16 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/alert_rules.py Thu Jun 29 14:12:22 2023 -0700
+++ b/alert_rules.py Wed Jul 19 21:17:22 2023 -0700
@@ -18,7 +18,6 @@
 {
 "alert": "PrometheusTargetMissing",
 "expr": "up == 0",
- "for": "0m",
 "labels": {"severity": "critical"},
 "annotations": {
 "summary": "Prometheus target missing (instance {{ $labels.instance }})",
@@ -58,7 +57,6 @@
 {
 "alert": "KubernetesJobFailed",
 "expr": "kube_job_status_failed > 0",
- "for": "0m",
 "labels": {"severity": "warning"},
 "annotations": {
 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
@@ -78,7 +76,6 @@
 {
 "alert": "KubernetesClientCertificateExpiresNextWeek",
 "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
- "for": "0m",
 "labels": {"severity": "warning"},
 "annotations": {
 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
@@ -98,12 +95,14 @@
 "groups": [
 {
 "name": "k8s",
+ "interval": "1m",
 "rules": k8sRules(),
 },
 #
 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
 {
 "name": "Outages",
+ "interval": "1m",
 "rules": [
 {
 "alert": "powereagleStalled",
@@ -141,6 +140,17 @@
 ],
 },
 {
+ "name": "disk_errs",
+ "interval": "2d",
+ "rules": [
+ {
+ "alert": "zpool_device_error_count",
+ "labels": {"severity": "warning"},
+ "expr": 'increase(zpool_device_error_count[3d]) > 0',
+ },
+ ],
+ },
+ {
 "name": "alerts",
 "rules": [
 {
@@ -151,7 +161,7 @@
 },
 {
 "alert": "housePower",
- "for": "24h",
+ "for": "1h",
 "labels": {"severity": "waste"},
 "expr": "house_power_w > 4000",
 "annotations": {"summary": "house power usage over 4KW"},
@@ -169,12 +179,6 @@
 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
 },
 {
- "alert": "zpool_device_error_count",
- "for": "20m",
- "labels": {"severity": "warning"},
- "expr": 'increase(zpool_device_error_count[2h]) > 0',
- },
- {
 "alert": "disk_week_incr",
 "for": "20m",
 "labels": {"severity": "warning"},
--- a/deploy_vmalert.yaml Thu Jun 29 14:12:22 2023 -0700
+++ b/deploy_vmalert.yaml Wed Jul 19 21:17:22 2023 -0700
@@ -25,6 +25,8 @@
 args:
 - -configCheckInterval=5s
 - -datasource.url=http://victoriametrics/m/
+ - -datasource.queryStep=5m
+ - -evaluationInterval=1m
 - -external.url=https://bigasterisk.com/vmalert
 - -loggerLevel=INFO
 - -loggerTimezone=America/Los_Angeles