annotate alert_rules.py @ 31:d39a8038227b

reformat
author drewp@bigasterisk.com
date Wed, 19 Jul 2023 21:27:46 -0700
parents e114edff93dc
children eb1de82c93aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
23
drewp@bigasterisk.com
parents:
diff changeset
1 """
drewp@bigasterisk.com
parents:
diff changeset
2 pdm run invoke push-config
drewp@bigasterisk.com
parents:
diff changeset
3
drewp@bigasterisk.com
parents:
diff changeset
4 docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
drewp@bigasterisk.com
parents:
diff changeset
5 "Whenever the alert expression results in one or more vector
drewp@bigasterisk.com
parents:
diff changeset
6 elements at a given point in time, the alert counts as active for
drewp@bigasterisk.com
parents:
diff changeset
7 these elements' label sets."
drewp@bigasterisk.com
parents:
diff changeset
8 also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
drewp@bigasterisk.com
parents:
diff changeset
9
drewp@bigasterisk.com
parents:
diff changeset
10 """
drewp@bigasterisk.com
parents:
diff changeset
11
drewp@bigasterisk.com
parents:
diff changeset
12 import json
drewp@bigasterisk.com
parents:
diff changeset
13
drewp@bigasterisk.com
parents:
diff changeset
14
drewp@bigasterisk.com
parents:
diff changeset
def k8sRules():
    """Return the generic Kubernetes alert rules as a list of rule dicts.

    Every entry has "alert" and "expr"; "for", "labels" and "annotations"
    are present only on the rules that define them.  The rules were taken
    from https://awesome-prometheus-alerts.grep.to/rules.html
    """
    # Text shared by most summaries / descriptions below.
    instanceSuffix = " (instance {{ $labels.instance }})"
    valueSuffix = "\n VALUE = {{ $value }}"

    def rule(alert, expr, holdFor=None, severity=None, summary=None, description=None):
        # Keys are inserted in a fixed order (alert, expr, for, labels,
        # annotations); an optional key is left out entirely when unset.
        built = {"alert": alert, "expr": expr}
        if holdFor is not None:
            built["for"] = holdFor
        if severity is not None:
            built["labels"] = {"severity": severity}
        if summary is not None:
            built["annotations"] = {
                "summary": summary + instanceSuffix,
                "description": description + valueSuffix,
            }
        return built

    return [
        rule(
            "PrometheusTargetMissing",
            "up == 0",
            severity="critical",
            summary="Prometheus target missing",
            description="A Prometheus target has disappeared. An exporter might be crashed.",
        ),
        rule(
            "KubernetesMemoryPressure",
            'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            holdFor="2m",
            severity="critical",
            summary="Kubernetes memory pressure",
            description="{{ $labels.node }} has MemoryPressure condition",
        ),
        rule(
            "KubernetesDiskPressure",
            'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            holdFor="2m",
            severity="critical",
            summary="Kubernetes disk pressure",
            description="{{ $labels.node }} has DiskPressure condition",
        ),
        rule(
            "KubernetesOutOfDisk",
            'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            holdFor="2m",
            severity="critical",
            summary="Kubernetes out of disk",
            description="{{ $labels.node }} has OutOfDisk condition",
        ),
        rule(
            "KubernetesJobFailed",
            "kube_job_status_failed > 0",
            severity="warning",
            summary="Kubernetes Job failed",
            description="Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete",
        ),
        rule(
            "KubernetesPodCrashLooping",
            "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            holdFor="2m",
            severity="warning",
            summary="Kubernetes pod crash looping",
            description="Pod {{ $labels.pod }} is crash looping",
        ),
        rule(
            "KubernetesClientCertificateExpiresNextWeek",
            'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            severity="warning",
            summary="Kubernetes client certificate expires next week",
            description="A client certificate used to authenticate to the apiserver is expiring next week.",
        ),
        # This one carries no labels or annotations.
        rule(
            "container_waiting",
            "sum by (container)(kube_pod_container_status_waiting!=0)",
            holdFor="2m",
        ),
    ]
drewp@bigasterisk.com
parents:
diff changeset
107
drewp@bigasterisk.com
parents:
diff changeset
108
drewp@bigasterisk.com
parents:
diff changeset
def allRules():
    """Assemble all statically defined alert rule groups.

    Returns a dict with one key, "groups", holding four groups in order:
    "k8s" (rules from k8sRules()), "Outages", "disk_errs" and "alerts".
    """
    powerEagleLogs = "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"

    outageRules = [
        {
            "alert": "powereagleStalled",
            "expr": "rate(house_power_w[100m]) == 0",
            "for": "0m",
            "labels": {"severity": "losingData"},
            "annotations": {
                "summary": "power eagle data stalled",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "powereagleAbsent",
            "expr": "absent_over_time(house_power_w[5m])",
            "for": "2m",
            "labels": {"severity": "losingData"},
            "annotations": {
                "summary": "power eagle data missing",
                "description": powerEagleLogs,
            },
        },
        {
            "alert": "absent_zigbee",
            "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
        },
        {
            "alert": "net_routes_sync",
            "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
            "for": "10m",
            "labels": {"severity": "houseUsersAffected"},
            "annotations": {"summary": "net_routes is not getting regular updates"},
        },
    ]

    diskErrRules = [
        {
            "alert": "zpool_device_error_count",
            "labels": {"severity": "warning"},
            "expr": 'increase(zpool_device_error_count[3d]) > 0',
        },
    ]

    # NOTE(review): several thresholds below use unit suffixes (20G, 100G, 1M,
    # 4k); presumably the evaluating engine accepts them -- confirm.
    generalRules = [
        {
            "alert": "kube_node_status_bad_condition",
            "for": "2h",
            "labels": {"severity": "warning"},
            "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
        },
        {
            "alert": "housePower",
            "for": "1h",
            "labels": {"severity": "waste"},
            "expr": "house_power_w > 4000",
            "annotations": {"summary": "house power usage over 4KW"},
        },
        {
            "alert": "host_root_fs_space_low",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'disk_free{path="/"} < 20G',
        },
        {
            "alert": "zpool_space_low",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
        },
        {
            # NOTE(review): the summary says mb/week but the window is [1d];
            # confirm which one is intended.
            "alert": "disk_week_incr",
            "for": "20m",
            "labels": {"severity": "warning"},
            "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
            "annotations": {"summary": "high mb/week on zfs dir"},
        },
        {
            "alert": "high_logging",
            "for": "3h",
            "labels": {"severity": "waste"},
            "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
            "annotations": {"summary": "high log output rate"},
        },
        {
            "alert": "stale_process",
            "for": "1d",
            "labels": {"severity": "dataRisk"},
            "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
            "annotations": {"summary": "process time is old"},
        },
        {
            # Any presence of
            # starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09
            # means someone forgot to set the app name.
            "alert": "starlette",
            "for": "1m",
            "labels": {"severity": "fix"},
            "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
            "annotations": {"summary": "set starlette app name"},
        },
        {
            "alert": "ssl_certs_expiring_soon",
            "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
            },
        },
    ]

    groups = [
        {"name": "k8s", "interval": "1m", "rules": k8sRules()},
        {"name": "Outages", "interval": "1m", "rules": outageRules},
        {"name": "disk_errs", "interval": "2d", "rules": diskErrRules},
        # The general group sets no explicit interval.
        {"name": "alerts", "rules": generalRules},
    ]
    return {"groups": groups}
drewp@bigasterisk.com
parents:
diff changeset
274
drewp@bigasterisk.com
parents:
diff changeset
275
drewp@bigasterisk.com
parents:
diff changeset
def _runJson(ctx, cmd):
    """Run `cmd` via `ctx.run` and return its stdout parsed as JSON.

    The command is run with hide="stdout"; the `stdout` attribute of the
    returned result is then decoded with json.loads.
    """
    result = ctx.run(cmd, hide="stdout")
    return json.loads(result.stdout)
drewp@bigasterisk.com
parents:
diff changeset
278
drewp@bigasterisk.com
parents:
diff changeset
279
drewp@bigasterisk.com
parents:
diff changeset
def hostsExpectedOnline(ctx):
    """Return the parsed JSON output of lanscape's hosts_expected_online.py.

    The script is run with `pdm run python` from /my/serv/lanscape.  The two
    shell commands are joined with `&&` rather than `;` so that a failed `cd`
    stops the command instead of running the script from the wrong directory.
    """
    return _runJson(ctx, "cd /my/serv/lanscape && pdm run python hosts_expected_online.py")
drewp@bigasterisk.com
parents:
diff changeset
282
drewp@bigasterisk.com
parents:
diff changeset
283
drewp@bigasterisk.com
parents:
diff changeset
def expectedK8sNodes(ctx):
    """Build a rule group with one log-size-report alert per k8s node.

    Node names are read from `kubectl get node -o json`.  For every node not
    listed in `optionalHosts`, a rule is emitted that fires when
    kubelet_container_log_filesystem_used_bytes has been absent for that
    instance for 1h.

    Returns a dict with a single "groups" entry named "k8s_expected_nodes".
    """
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    # Hosts that get no alert when their report is missing.
    optionalHosts = {'slash'}

    rules = [
        {
            "alert": f"kube_node_log_size_report_{h}",
            # Doubled braces are literal { } in the selector.
            "expr": f'absent(kubelet_container_log_filesystem_used_bytes{{instance="{h}"}})',
            "for": "1h",
            "annotations": {
                "summary": f"no recent k8s log size report from host {h}"
            },
        }
        for h in hosts
        if h not in optionalHosts
    ]
    return {
        "groups": [
            {
                "name": "k8s_expected_nodes",
                "rules": rules,
            }
        ]
    }