Mercurial > code > home > repos > victoriametrics
comparison alert_rules.py @ 31:d39a8038227b
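reformat (note: besides layout changes, this changeset also changes the `high_logging` alert's "for" from "20m" to "3h")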
author | drewp@bigasterisk.com |
---|---|
date | Wed, 19 Jul 2023 21:27:46 -0700 |
parents | e114edff93dc |
children | eb1de82c93aa |
comparison
equal
deleted
inserted
replaced
30:4165f4fa6ccf | 31:d39a8038227b |
---|---|
16 # from https://awesome-prometheus-alerts.grep.to/rules.html | 16 # from https://awesome-prometheus-alerts.grep.to/rules.html |
17 return [ | 17 return [ |
18 { | 18 { |
19 "alert": "PrometheusTargetMissing", | 19 "alert": "PrometheusTargetMissing", |
20 "expr": "up == 0", | 20 "expr": "up == 0", |
21 "labels": {"severity": "critical"}, | 21 "labels": { |
22 "severity": "critical" | |
23 }, | |
22 "annotations": { | 24 "annotations": { |
23 "summary": "Prometheus target missing (instance {{ $labels.instance }})", | 25 "summary": "Prometheus target missing (instance {{ $labels.instance }})", |
24 "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", | 26 "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", |
25 }, | 27 }, |
26 }, | 28 }, |
27 { | 29 { |
28 "alert": "KubernetesMemoryPressure", | 30 "alert": "KubernetesMemoryPressure", |
29 "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', | 31 "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', |
30 "for": "2m", | 32 "for": "2m", |
31 "labels": {"severity": "critical"}, | 33 "labels": { |
34 "severity": "critical" | |
35 }, | |
32 "annotations": { | 36 "annotations": { |
33 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", | 37 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", |
34 "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", | 38 "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", |
35 }, | 39 }, |
36 }, | 40 }, |
37 { | 41 { |
38 "alert": "KubernetesDiskPressure", | 42 "alert": "KubernetesDiskPressure", |
39 "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', | 43 "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', |
40 "for": "2m", | 44 "for": "2m", |
41 "labels": {"severity": "critical"}, | 45 "labels": { |
46 "severity": "critical" | |
47 }, | |
42 "annotations": { | 48 "annotations": { |
43 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", | 49 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", |
44 "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", | 50 "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", |
45 }, | 51 }, |
46 }, | 52 }, |
47 { | 53 { |
48 "alert": "KubernetesOutOfDisk", | 54 "alert": "KubernetesOutOfDisk", |
49 "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', | 55 "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', |
50 "for": "2m", | 56 "for": "2m", |
51 "labels": {"severity": "critical"}, | 57 "labels": { |
58 "severity": "critical" | |
59 }, | |
52 "annotations": { | 60 "annotations": { |
53 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", | 61 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", |
54 "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", | 62 "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", |
55 }, | 63 }, |
56 }, | 64 }, |
57 { | 65 { |
58 "alert": "KubernetesJobFailed", | 66 "alert": "KubernetesJobFailed", |
59 "expr": "kube_job_status_failed > 0", | 67 "expr": "kube_job_status_failed > 0", |
60 "labels": {"severity": "warning"}, | 68 "labels": { |
69 "severity": "warning" | |
70 }, | |
61 "annotations": { | 71 "annotations": { |
62 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", | 72 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", |
63 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", | 73 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", |
64 }, | 74 }, |
65 }, | 75 }, |
66 { | 76 { |
67 "alert": "KubernetesPodCrashLooping", | 77 "alert": "KubernetesPodCrashLooping", |
68 "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", | 78 "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", |
69 "for": "2m", | 79 "for": "2m", |
70 "labels": {"severity": "warning"}, | 80 "labels": { |
81 "severity": "warning" | |
82 }, | |
71 "annotations": { | 83 "annotations": { |
72 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", | 84 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", |
73 "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", | 85 "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", |
74 }, | 86 }, |
75 }, | 87 }, |
76 { | 88 { |
77 "alert": "KubernetesClientCertificateExpiresNextWeek", | 89 "alert": |
78 "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', | 90 "KubernetesClientCertificateExpiresNextWeek", |
79 "labels": {"severity": "warning"}, | 91 "expr": |
92 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', | |
93 "labels": { | |
94 "severity": "warning" | |
95 }, | |
80 "annotations": { | 96 "annotations": { |
81 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", | 97 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", |
82 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", | 98 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", |
83 }, | 99 }, |
84 }, | 100 }, |
99 "rules": k8sRules(), | 115 "rules": k8sRules(), |
100 }, | 116 }, |
101 # | 117 # |
102 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name | 118 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name |
103 { | 119 { |
104 "name": "Outages", | 120 "name": |
105 "interval": "1m", | 121 "Outages", |
122 "interval": | |
123 "1m", | |
106 "rules": [ | 124 "rules": [ |
107 { | 125 { |
108 "alert": "powereagleStalled", | 126 "alert": "powereagleStalled", |
109 "expr": "rate(house_power_w[100m]) == 0", | 127 "expr": "rate(house_power_w[100m]) == 0", |
110 "for": "0m", | 128 "for": "0m", |
111 "labels": {"severity": "losingData"}, | 129 "labels": { |
130 "severity": "losingData" | |
131 }, | |
112 "annotations": { | 132 "annotations": { |
113 "summary": "power eagle data stalled", | 133 "summary": "power eagle data stalled", |
114 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", | 134 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", |
115 }, | 135 }, |
116 }, | 136 }, |
117 { | 137 { |
118 "alert": "powereagleAbsent", | 138 "alert": "powereagleAbsent", |
119 "expr": "absent_over_time(house_power_w[5m])", | 139 "expr": "absent_over_time(house_power_w[5m])", |
120 "for": "2m", | 140 "for": "2m", |
121 "labels": {"severity": "losingData"}, | 141 "labels": { |
142 "severity": "losingData" | |
143 }, | |
122 "annotations": { | 144 "annotations": { |
123 "summary": "power eagle data missing", | 145 "summary": "power eagle data missing", |
124 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", | 146 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", |
125 }, | 147 }, |
126 }, | 148 }, |
130 }, | 152 }, |
131 { | 153 { |
132 "alert": "net_routes_sync", | 154 "alert": "net_routes_sync", |
133 "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', | 155 "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', |
134 "for": "10m", | 156 "for": "10m", |
135 "labels": {"severity": "houseUsersAffected"}, | 157 "labels": { |
158 "severity": "houseUsersAffected" | |
159 }, | |
136 "annotations": { | 160 "annotations": { |
137 "summary": "net_routes is not getting regular updates" | 161 "summary": "net_routes is not getting regular updates" |
138 }, | 162 }, |
139 }, | 163 }, |
140 ], | 164 ], |
141 }, | 165 }, |
142 { | 166 { |
143 "name": "disk_errs", | 167 "name": "disk_errs", |
144 "interval": "2d", | 168 "interval": "2d", |
145 "rules": [ | 169 "rules": [{ |
146 { | 170 "alert": "zpool_device_error_count", |
147 "alert": "zpool_device_error_count", | 171 "labels": { |
148 "labels": {"severity": "warning"}, | 172 "severity": "warning" |
149 "expr": 'increase(zpool_device_error_count[3d]) > 0', | 173 }, |
150 }, | 174 "expr": 'increase(zpool_device_error_count[3d]) > 0', |
151 ], | 175 }], |
152 }, | 176 }, |
153 { | 177 { |
154 "name": "alerts", | 178 "name": |
179 "alerts", | |
155 "rules": [ | 180 "rules": [ |
156 { | 181 { |
157 "alert": "kube_node_status_bad_condition", | 182 "alert": "kube_node_status_bad_condition", |
158 "for": "2h", | 183 "for": "2h", |
159 "labels": {"severity": "warning"}, | 184 "labels": { |
185 "severity": "warning" | |
186 }, | |
160 "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', | 187 "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', |
161 }, | 188 }, |
162 { | 189 { |
163 "alert": "housePower", | 190 "alert": "housePower", |
164 "for": "1h", | 191 "for": "1h", |
165 "labels": {"severity": "waste"}, | 192 "labels": { |
193 "severity": "waste" | |
194 }, | |
166 "expr": "house_power_w > 4000", | 195 "expr": "house_power_w > 4000", |
167 "annotations": {"summary": "house power usage over 4KW"}, | 196 "annotations": { |
197 "summary": "house power usage over 4KW" | |
198 }, | |
168 }, | 199 }, |
169 { | 200 { |
170 "alert": "host_root_fs_space_low", | 201 "alert": "host_root_fs_space_low", |
171 "for": "20m", | 202 "for": "20m", |
172 "labels": {"severity": "warning"}, | 203 "labels": { |
204 "severity": "warning" | |
205 }, | |
173 "expr": 'disk_free{path="/"} < 20G', | 206 "expr": 'disk_free{path="/"} < 20G', |
174 }, | 207 }, |
175 { | 208 { |
176 "alert": "zpool_space_low", | 209 "alert": "zpool_space_low", |
177 "for": "20m", | 210 "for": "20m", |
178 "labels": {"severity": "warning"}, | 211 "labels": { |
212 "severity": "warning" | |
213 }, | |
179 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', | 214 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', |
180 }, | 215 }, |
181 { | 216 { |
182 "alert": "disk_week_incr", | 217 "alert": "disk_week_incr", |
183 "for": "20m", | 218 "for": "20m", |
184 "labels": {"severity": "warning"}, | 219 "labels": { |
220 "severity": "warning" | |
221 }, | |
185 "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', | 222 "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', |
186 "annotations": {"summary": "high mb/week on zfs dir"}, | 223 "annotations": { |
224 "summary": "high mb/week on zfs dir" | |
225 }, | |
187 }, | 226 }, |
188 { | 227 { |
189 "alert": "high_logging", | 228 "alert": "high_logging", |
190 "for": "20m", | 229 "for": "3h", |
191 "labels": {"severity": "waste"}, | 230 "labels": { |
231 "severity": "waste" | |
232 }, | |
192 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", | 233 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", |
193 "annotations": {"summary": "high log output rate"}, | 234 "annotations": { |
235 "summary": "high log output rate" | |
236 }, | |
194 }, | 237 }, |
195 { | 238 { |
196 "alert": "stale_process", | 239 "alert": "stale_process", |
197 "for": "1d", | 240 "for": "1d", |
198 "labels": {"severity": "dataRisk"}, | 241 "labels": { |
242 "severity": "dataRisk" | |
243 }, | |
199 "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", | 244 "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", |
200 "annotations": {"summary": "process time is old"}, | 245 "annotations": { |
246 "summary": "process time is old" | |
247 }, | |
201 }, | 248 }, |
202 { | 249 { |
203 "alert": "starlette", | 250 "alert": "starlette", |
204 "for": "1m", | 251 "for": "1m", |
205 "labels": {"severity": "fix"}, | 252 "labels": { |
253 "severity": "fix" | |
254 }, | |
206 "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', | 255 "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', |
207 "annotations": {"summary": "set starlette app name"}, | 256 "annotations": { |
257 "summary": "set starlette app name" | |
258 }, | |
208 }, | 259 }, |
209 { | 260 { |
210 "alert": "ssl_certs_expiring_soon", | 261 "alert": "ssl_certs_expiring_soon", |
211 "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", | 262 "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", |
212 "labels": {"severity": "warning"}, | 263 "labels": { |
264 "severity": "warning" | |
265 }, | |
213 "annotations": { | 266 "annotations": { |
214 "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" | 267 "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" |
215 }, | 268 }, |
216 }, | 269 }, |
217 ], | 270 ], |