"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics

"""

import json

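# Each rule below is a plain dict in the Prometheus/vmalert alerting-rule shape.
# An illustrative (made-up) example of the fields used throughout this file:
#   {"alert": "SomethingDown", "expr": "up == 0", "for": "2m",
#    "labels": {"severity": "warning"},
#    "annotations": {"summary": "what broke and where to look"}}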
def k8sRules():
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "PrometheusTargetMissing",
            "expr": "up == 0",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
            "for": "2m",
        },
    ]

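# Illustrative sketch (not called anywhere in this repo): a minimal local check
# that each generated rule carries the fields vmalert needs before the config
# gets pushed.
def _exampleCheckRules(rules):
    for rule in rules:
        assert "alert" in rule and "expr" in rule, rule
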
def allRules():
    return {
        "groups": [
            {
                "name": "k8s",
                "interval": "1m",
                "rules": k8sRules(),
            },
            #
            # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
            {
                "name": "Outages",
                "interval": "1m",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {"severity": "houseUsersAffected"},
                        "annotations": {
                            "summary": "net_routes is not getting regular updates"
                        },
                    },
                ],
            },
            {
                "name": "disk_errs",
                "interval": "2d",
                "rules": [
                    {
                        "alert": "zpool_device_error_count",
                        "labels": {"severity": "warning"},
                        "expr": 'increase(zpool_device_error_count[3d]) > 0',
                    },
                ],
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {"severity": "warning"},
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "1h",
                        "labels": {"severity": "waste"},
                        "expr": "house_power_w > 4000",
                        "annotations": {"summary": "house power usage over 4KW"},
                    },
                    {
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'disk_free{path="/"} < 20G',
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {"summary": "high mb/week on zfs dir"},
                    },
                    {
                        "alert": "high_logging",
                        "for": "20m",
                        "labels": {"severity": "waste"},
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {"summary": "high log output rate"},
                    },
                    {
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {"severity": "dataRisk"},
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {"summary": "process time is old"},
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {"severity": "fix"},
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {"summary": "set starlette app name"},
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {"severity": "warning"},
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ]
    }

def _runJson(ctx, cmd):
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodes(ctx):
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    optionalHosts = {'slash'}
    return {
        "groups": [
            {
                "name": "k8s_expected_nodes",
                "rules": [
                    {
                        "alert": "kube_node_log_size_report_" + h,
                        "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
                        % h,
                        "for": "1h",
                        "annotations": {
                            "summary": f"no recent k8s log size report from host {h}"
                        },
                    }
                    for h in hosts if h not in optionalHosts
                ],
            }
        ]
    }
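

# Hypothetical sketch only: the real deploy path is the "pdm run invoke
# push-config" task mentioned in the module docstring, which is not defined in
# this file. This just illustrates how the groups above could be combined and
# written out as one rules file (the "rules.json" filename is made up; JSON is
# also valid YAML, so a YAML-based rules loader should accept it).
def _exampleWriteRules(ctx, outPath="rules.json"):
    doc = allRules()
    doc["groups"].extend(expectedK8sNodes(ctx)["groups"])
    with open(outPath, "w") as f:
        json.dump(doc, f, indent=2)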