annotate alert_rules.py @ 34:3b91d52b007d
"rules tuning"

author:   drewp@bigasterisk.com
date:     Mon, 09 Oct 2023 18:50:36 -0700
parents:  eb1de82c93aa
children: 2bc188c4117a
23 | 1 """ |
2 pdm run invoke push-config | |
3 | |
4 docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ | |
5 "Whenever the alert expression results in one or more vector | |
6 elements at a given point in time, the alert counts as active for | |
7 these elements' label sets." | |
8 also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics | |
9 | |
10 """ | |
11 | |
12 import json | |
13 | |
14 | |
15 def k8sRules(): | |
16 # from https://awesome-prometheus-alerts.grep.to/rules.html | |
17 return [ | |
18 { | |
34 | 19 "alert": "metricsTargetMissing", |
20 "expr": 'up{job!~"cm-acme-.*"} == 0', | |
31 | 21 "labels": { |
22 "severity": "critical" | |
23 }, | |
23 | 24 "annotations": { |
34 | 25 "summary": "metrics target missing (instance {{ $labels.instance }})", |
26 "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", | |
23 | 27 }, |
28 }, | |
29 { | |
30 "alert": "KubernetesMemoryPressure", | |
31 "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', | |
32 "for": "2m", | |
31 | 33 "labels": { |
34 "severity": "critical" | |
35 }, | |
23 | 36 "annotations": { |
37 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", | |
38 "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", | |
39 }, | |
40 }, | |
41 { | |
42 "alert": "KubernetesDiskPressure", | |
43 "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', | |
44 "for": "2m", | |
31 | 45 "labels": { |
46 "severity": "critical" | |
47 }, | |
23 | 48 "annotations": { |
49 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", | |
50 "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", | |
51 }, | |
52 }, | |
53 { | |
54 "alert": "KubernetesOutOfDisk", | |
55 "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', | |
56 "for": "2m", | |
31 | 57 "labels": { |
58 "severity": "critical" | |
59 }, | |
23 | 60 "annotations": { |
61 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", | |
62 "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", | |
63 }, | |
64 }, | |
65 { | |
66 "alert": "KubernetesJobFailed", | |
67 "expr": "kube_job_status_failed > 0", | |
31 | 68 "labels": { |
69 "severity": "warning" | |
70 }, | |
23 | 71 "annotations": { |
72 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", | |
73 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", | |
74 }, | |
75 }, | |
76 { | |
77 "alert": "KubernetesPodCrashLooping", | |
78 "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", | |
79 "for": "2m", | |
31 | 80 "labels": { |
81 "severity": "warning" | |
82 }, | |
23 | 83 "annotations": { |
84 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", | |
85 "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", | |
86 }, | |
87 }, | |
88 { | |
31 | 89 "alert": |
90 "KubernetesClientCertificateExpiresNextWeek", | |
91 "expr": | |
92 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', | |
93 "labels": { | |
94 "severity": "warning" | |
95 }, | |
23 | 96 "annotations": { |
97 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", | |
98 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", | |
99 }, | |
100 }, | |
101 { | |
102 "alert": "container_waiting", | |
34 | 103 "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", |
104 "annotations": { | |
105 "description": '', | |
106 "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", | |
107 }, | |
23 | 108 "for": "2m", |
109 }, | |
110 ] | |
111 | |
112 | |
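# For reference: each dict above becomes one vmalert/Prometheus alerting rule
# when serialized. The first one renders as rules-file YAML like:
#
#   - alert: metricsTargetMissing
#     expr: up{job!~"cm-acme-.*"} == 0
#     labels:
#       severity: critical
#     annotations:
#       summary: metrics target missing (instance {{ $labels.instance }})
#
# Per the docstring's quote, if that expr returns vectors for several
# instances at once, one alert per label set becomes active.
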
def allRules(ctx):
    return {
        "groups": [
            {
                "name": "k8s",
                "interval": "1m",
                "rules": k8sRules(),
            },
            expectedK8sNodesGroup(ctx),
            #
            # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
            {
                "name": "Outages",
                "interval": "1m",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {
                            "severity": "losingData"
                        },
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {
                            "severity": "losingData"
                        },
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {
                            "severity": "houseUsersAffected"
                        },
                        "annotations": {
                            "summary": "net_routes is not getting regular updates"
                        },
                    },
                ],
            },
            {
                "name": "disk_errs",
                # per commit e114edff93dc: "more explicit intervals. try to get
                # a single day of notification out of a disk err increase"
                "interval": "2d",
                "rules": [{
                    "alert": "zpool_device_error_count",
                    "labels": {
                        "severity": "warning"
                    },
                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
                }],
            },
            {
                "name": "front_door",
                "interval": "5m",
                "rules": [
                    {
                        "alert": "service_disconnected_from_mqtt",
                        "expr": "mqtt_connected < 1"
                    },
                    {
                        "alert": "esp_not_connected_to_mqtt",
                        "expr": "hw_connected < 1",
                    },
                ]
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {
                            "severity": "warning"
                        },
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "1h",
                        "labels": {
                            "severity": "waste"
                        },
                        "expr": "house_power_w > 4000",
                        "annotations": {
                            "summary": "house power usage over 4 kW"
                        },
                    },
                    {
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {
                            "severity": "warning"
                        },
                        # 20G, 100G, 1M, 4k below are MetricsQL (VictoriaMetrics)
                        # short numeric suffixes; plain PromQL would need e.g. 20e9
                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {
                            "severity": "warning"
                        },
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {
                            "severity": "warning"
                        },
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {
                            "summary": "high MB/day growth on zfs dir"
                        },
                    },
                    {
                        "alert": "high_logging",
                        "for": "3h",
                        "labels": {
                            "severity": "waste"
                        },
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {
                            "summary": "high log output rate"
                        },
                    },
                    {
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {
                            "severity": "dataRisk"
                        },
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {
                            "summary": "a watched file's modification time is over 14 days old"
                        },
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {
                            "severity": "fix"
                        },
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {
                            "summary": "set starlette app name"
                        },
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {
                            "severity": "warning"
                        },
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ] + hostsExpectedOnline(ctx)['groups']
    }


def _runJson(ctx, cmd):
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    # hosts_expected_online.py must print JSON shaped like {"groups": [...]},
    # since allRules() splices its 'groups' list in above
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodesGroup(ctx):
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    # nodes that are allowed to be offline without alerting
    optionalHosts = {'slash'}
    return {
        "name": "k8s_expected_nodes",
        "rules": [{
            "alert": "kube_node_log_size_report_" + h,
            "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % h,
            "for": "1h",
            "annotations": {
                "summary": f"no recent k8s log size report from host {h}"
            },
        } for h in hosts if h not in optionalHosts],
    }
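

def writeRules(ctx, outPath="rules/alert_rules.json"):
    # Minimal sketch of the serialization step behind `pdm run invoke
    # push-config` (see module docstring). The function name and output path
    # are assumptions, not part of the original file. JSON is valid YAML, so
    # a YAML-reading consumer such as vmalert should accept the dumped file.
    with open(outPath, "w") as f:
        json.dump(allRules(ctx), f, indent=2)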