comparison alert_rules.py @ 31:d39a8038227b

reformat
author drewp@bigasterisk.com
date Wed, 19 Jul 2023 21:27:46 -0700
parents e114edff93dc
children eb1de82c93aa
comparison
equal deleted inserted replaced
30:4165f4fa6ccf 31:d39a8038227b
16 # from https://awesome-prometheus-alerts.grep.to/rules.html 16 # from https://awesome-prometheus-alerts.grep.to/rules.html
17 return [ 17 return [
18 { 18 {
19 "alert": "PrometheusTargetMissing", 19 "alert": "PrometheusTargetMissing",
20 "expr": "up == 0", 20 "expr": "up == 0",
21 "labels": {"severity": "critical"}, 21 "labels": {
22 "severity": "critical"
23 },
22 "annotations": { 24 "annotations": {
23 "summary": "Prometheus target missing (instance {{ $labels.instance }})", 25 "summary": "Prometheus target missing (instance {{ $labels.instance }})",
24 "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}", 26 "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
25 }, 27 },
26 }, 28 },
27 { 29 {
28 "alert": "KubernetesMemoryPressure", 30 "alert": "KubernetesMemoryPressure",
29 "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', 31 "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
30 "for": "2m", 32 "for": "2m",
31 "labels": {"severity": "critical"}, 33 "labels": {
34 "severity": "critical"
35 },
32 "annotations": { 36 "annotations": {
33 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", 37 "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
34 "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", 38 "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
35 }, 39 },
36 }, 40 },
37 { 41 {
38 "alert": "KubernetesDiskPressure", 42 "alert": "KubernetesDiskPressure",
39 "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', 43 "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
40 "for": "2m", 44 "for": "2m",
41 "labels": {"severity": "critical"}, 45 "labels": {
46 "severity": "critical"
47 },
42 "annotations": { 48 "annotations": {
43 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", 49 "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
44 "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", 50 "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
45 }, 51 },
46 }, 52 },
47 { 53 {
48 "alert": "KubernetesOutOfDisk", 54 "alert": "KubernetesOutOfDisk",
49 "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', 55 "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
50 "for": "2m", 56 "for": "2m",
51 "labels": {"severity": "critical"}, 57 "labels": {
58 "severity": "critical"
59 },
52 "annotations": { 60 "annotations": {
53 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", 61 "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
54 "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", 62 "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
55 }, 63 },
56 }, 64 },
57 { 65 {
58 "alert": "KubernetesJobFailed", 66 "alert": "KubernetesJobFailed",
59 "expr": "kube_job_status_failed > 0", 67 "expr": "kube_job_status_failed > 0",
60 "labels": {"severity": "warning"}, 68 "labels": {
69 "severity": "warning"
70 },
61 "annotations": { 71 "annotations": {
62 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", 72 "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
63 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", 73 "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
64 }, 74 },
65 }, 75 },
66 { 76 {
67 "alert": "KubernetesPodCrashLooping", 77 "alert": "KubernetesPodCrashLooping",
68 "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", 78 "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
69 "for": "2m", 79 "for": "2m",
70 "labels": {"severity": "warning"}, 80 "labels": {
81 "severity": "warning"
82 },
71 "annotations": { 83 "annotations": {
72 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", 84 "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
73 "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", 85 "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
74 }, 86 },
75 }, 87 },
76 { 88 {
77 "alert": "KubernetesClientCertificateExpiresNextWeek", 89 "alert":
78 "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', 90 "KubernetesClientCertificateExpiresNextWeek",
79 "labels": {"severity": "warning"}, 91 "expr":
92 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
93 "labels": {
94 "severity": "warning"
95 },
80 "annotations": { 96 "annotations": {
81 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", 97 "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
82 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", 98 "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
83 }, 99 },
84 }, 100 },
99 "rules": k8sRules(), 115 "rules": k8sRules(),
100 }, 116 },
101 # 117 #
102 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name 118 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
103 { 119 {
104 "name": "Outages", 120 "name":
105 "interval": "1m", 121 "Outages",
122 "interval":
123 "1m",
106 "rules": [ 124 "rules": [
107 { 125 {
108 "alert": "powereagleStalled", 126 "alert": "powereagleStalled",
109 "expr": "rate(house_power_w[100m]) == 0", 127 "expr": "rate(house_power_w[100m]) == 0",
110 "for": "0m", 128 "for": "0m",
111 "labels": {"severity": "losingData"}, 129 "labels": {
130 "severity": "losingData"
131 },
112 "annotations": { 132 "annotations": {
113 "summary": "power eagle data stalled", 133 "summary": "power eagle data stalled",
114 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", 134 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
115 }, 135 },
116 }, 136 },
117 { 137 {
118 "alert": "powereagleAbsent", 138 "alert": "powereagleAbsent",
119 "expr": "absent_over_time(house_power_w[5m])", 139 "expr": "absent_over_time(house_power_w[5m])",
120 "for": "2m", 140 "for": "2m",
121 "labels": {"severity": "losingData"}, 141 "labels": {
142 "severity": "losingData"
143 },
122 "annotations": { 144 "annotations": {
123 "summary": "power eagle data missing", 145 "summary": "power eagle data missing",
124 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", 146 "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
125 }, 147 },
126 }, 148 },
130 }, 152 },
131 { 153 {
132 "alert": "net_routes_sync", 154 "alert": "net_routes_sync",
133 "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', 155 "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
134 "for": "10m", 156 "for": "10m",
135 "labels": {"severity": "houseUsersAffected"}, 157 "labels": {
158 "severity": "houseUsersAffected"
159 },
136 "annotations": { 160 "annotations": {
137 "summary": "net_routes is not getting regular updates" 161 "summary": "net_routes is not getting regular updates"
138 }, 162 },
139 }, 163 },
140 ], 164 ],
141 }, 165 },
142 { 166 {
143 "name": "disk_errs", 167 "name": "disk_errs",
144 "interval": "2d", 168 "interval": "2d",
145 "rules": [ 169 "rules": [{
146 { 170 "alert": "zpool_device_error_count",
147 "alert": "zpool_device_error_count", 171 "labels": {
148 "labels": {"severity": "warning"}, 172 "severity": "warning"
149 "expr": 'increase(zpool_device_error_count[3d]) > 0', 173 },
150 }, 174 "expr": 'increase(zpool_device_error_count[3d]) > 0',
151 ], 175 }],
152 }, 176 },
153 { 177 {
154 "name": "alerts", 178 "name":
179 "alerts",
155 "rules": [ 180 "rules": [
156 { 181 {
157 "alert": "kube_node_status_bad_condition", 182 "alert": "kube_node_status_bad_condition",
158 "for": "2h", 183 "for": "2h",
159 "labels": {"severity": "warning"}, 184 "labels": {
185 "severity": "warning"
186 },
160 "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', 187 "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
161 }, 188 },
162 { 189 {
163 "alert": "housePower", 190 "alert": "housePower",
164 "for": "1h", 191 "for": "1h",
165 "labels": {"severity": "waste"}, 192 "labels": {
193 "severity": "waste"
194 },
166 "expr": "house_power_w > 4000", 195 "expr": "house_power_w > 4000",
167 "annotations": {"summary": "house power usage over 4KW"}, 196 "annotations": {
197 "summary": "house power usage over 4KW"
198 },
168 }, 199 },
169 { 200 {
170 "alert": "host_root_fs_space_low", 201 "alert": "host_root_fs_space_low",
171 "for": "20m", 202 "for": "20m",
172 "labels": {"severity": "warning"}, 203 "labels": {
204 "severity": "warning"
205 },
173 "expr": 'disk_free{path="/"} < 20G', 206 "expr": 'disk_free{path="/"} < 20G',
174 }, 207 },
175 { 208 {
176 "alert": "zpool_space_low", 209 "alert": "zpool_space_low",
177 "for": "20m", 210 "for": "20m",
178 "labels": {"severity": "warning"}, 211 "labels": {
212 "severity": "warning"
213 },
179 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', 214 "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
180 }, 215 },
181 { 216 {
182 "alert": "disk_week_incr", 217 "alert": "disk_week_incr",
183 "for": "20m", 218 "for": "20m",
184 "labels": {"severity": "warning"}, 219 "labels": {
220 "severity": "warning"
221 },
185 "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', 222 "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
186 "annotations": {"summary": "high mb/week on zfs dir"}, 223 "annotations": {
224 "summary": "high mb/week on zfs dir"
225 },
187 }, 226 },
188 { 227 {
189 "alert": "high_logging", 228 "alert": "high_logging",
190 "for": "20m", 229 "for": "3h",
191 "labels": {"severity": "waste"}, 230 "labels": {
231 "severity": "waste"
232 },
192 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k", 233 "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
193 "annotations": {"summary": "high log output rate"}, 234 "annotations": {
235 "summary": "high log output rate"
236 },
194 }, 237 },
195 { 238 {
196 "alert": "stale_process", 239 "alert": "stale_process",
197 "for": "1d", 240 "for": "1d",
198 "labels": {"severity": "dataRisk"}, 241 "labels": {
242 "severity": "dataRisk"
243 },
199 "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", 244 "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
200 "annotations": {"summary": "process time is old"}, 245 "annotations": {
246 "summary": "process time is old"
247 },
201 }, 248 },
202 { 249 {
203 "alert": "starlette", 250 "alert": "starlette",
204 "for": "1m", 251 "for": "1m",
205 "labels": {"severity": "fix"}, 252 "labels": {
253 "severity": "fix"
254 },
206 "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', 255 "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
207 "annotations": {"summary": "set starlette app name"}, 256 "annotations": {
257 "summary": "set starlette app name"
258 },
208 }, 259 },
209 { 260 {
210 "alert": "ssl_certs_expiring_soon", 261 "alert": "ssl_certs_expiring_soon",
211 "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", 262 "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
212 "labels": {"severity": "warning"}, 263 "labels": {
264 "severity": "warning"
265 },
213 "annotations": { 266 "annotations": {
214 "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" 267 "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
215 }, 268 },
216 }, 269 },
217 ], 270 ],