"""
pdm run invoke push-config

docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
"Whenever the alert expression results in one or more vector
elements at a given point in time, the alert counts as active for
these elements' label sets."
also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
"""
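
# Example of the quoted semantics: an expr like `up == 0` (first rule below)
# yields one vector element per down target, so one alert becomes active per
# matching label set, e.g. {instance="web:9100", job="node"}.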

import json


def k8sRules():
    # from https://awesome-prometheus-alerts.grep.to/rules.html
    return [
        {
            "alert": "PrometheusTargetMissing",
            "expr": "up == 0",
            "for": "0m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Prometheus target missing (instance {{ $labels.instance }})",
                "description": "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesMemoryPressure",
            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesDiskPressure",
            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesOutOfDisk",
            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
            "for": "2m",
            "labels": {"severity": "critical"},
            "annotations": {
                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesJobFailed",
            "expr": "kube_job_status_failed > 0",
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesPodCrashLooping",
            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
            "for": "2m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "KubernetesClientCertificateExpiresNextWeek",
            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
            "for": "0m",
            "labels": {"severity": "warning"},
            "annotations": {
                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
            },
        },
        {
            "alert": "container_waiting",
            "expr": "sum by (container)(kube_pod_container_status_waiting!=0)",
            "for": "2m",
        },
    ]
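

# For reference, the first dict above corresponds to this Prometheus rule YAML:
#
#   - alert: PrometheusTargetMissing
#     expr: up == 0
#     for: 0m
#     labels:
#       severity: critical
#     annotations:
#       summary: Prometheus target missing (instance {{ $labels.instance }})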


def allRules():
    return {
        "groups": [
            {
                "name": "k8s",
                "rules": k8sRules(),
            },
            # Any presence of a series like
            #   starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"}
            # means someone forgot to set the app name; the "starlette" rule in
            # the "alerts" group below catches this.
            {
                "name": "Outages",
                "rules": [
                    {
                        "alert": "powereagleStalled",
                        "expr": "rate(house_power_w[100m]) == 0",
                        "for": "0m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data stalled",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "powereagleAbsent",
                        "expr": "absent_over_time(house_power_w[5m])",
                        "for": "2m",
                        "labels": {"severity": "losingData"},
                        "annotations": {
                            "summary": "power eagle data missing",
                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
                        },
                    },
                    {
                        "alert": "absent_zigbee",
                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
                    },
                    {
                        "alert": "net_routes_sync",
                        # fires when updates arrive slower than one per ~70s
                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
                        "for": "10m",
                        "labels": {"severity": "houseUsersAffected"},
                        "annotations": {
                            "summary": "net_routes is not getting regular updates"
                        },
                    },
                ],
            },
            {
                "name": "alerts",
                "rules": [
                    {
                        "alert": "kube_node_status_bad_condition",
                        "for": "2h",
                        "labels": {"severity": "warning"},
                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
                    },
                    {
                        "alert": "housePower",
                        "for": "24h",
                        "labels": {"severity": "waste"},
                        "expr": "house_power_w > 4000",
                        "annotations": {"summary": "house power usage over 4kW"},
                    },
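                    # NOTE (assumption): short numeric suffixes such as 20G,
                    # 100G, 1M, and 4k in the exprs below are MetricsQL
                    # (VictoriaMetrics/vmalert) extensions; stock Prometheus
                    # PromQL does not accept them.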
                    {
                        "alert": "host_root_fs_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'disk_free{path="/"} < 20G',
                    },
                    {
                        "alert": "zpool_space_low",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
                    },
                    {
                        "alert": "zpool_device_error_count",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": "increase(zpool_device_error_count[2h]) > 0",
                    },
                    {
                        "alert": "disk_week_incr",
                        "for": "20m",
                        "labels": {"severity": "warning"},
                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
                        "annotations": {"summary": "high MB/week on zfs dir"},
                    },
                    {
                        "alert": "high_logging",
                        "for": "20m",
                        "labels": {"severity": "waste"},
                        "expr": "sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k",
                        "annotations": {"summary": "high log output rate"},
                    },
                    {
                        "alert": "stale_process",
                        "for": "1d",
                        "labels": {"severity": "dataRisk"},
                        # filestat_modification_time is in ns; round to days old
                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
                        "annotations": {"summary": "process time is old"},
                    },
                    {
                        "alert": "starlette",
                        "for": "1m",
                        "labels": {"severity": "fix"},
                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
                        "annotations": {"summary": "set starlette app name"},
                    },
                    {
                        "alert": "ssl_certs_expiring_soon",
                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
                        "labels": {"severity": "warning"},
                        "annotations": {
                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
                        },
                    },
                ],
            },
        ]
    }
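

# Sketch (assumption; not part of the original file): how these dicts might be
# written out for the `pdm run invoke push-config` step mentioned in the module
# docstring. Prometheus rule files are YAML, and JSON is a valid subset of
# YAML, so a plain JSON dump loads fine. The output path is illustrative.
def writeRules(path="rules/all.yaml"):
    with open(path, "w") as f:
        json.dump(allRules(), f, indent=2)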


def _runJson(ctx, cmd):
    # Run `cmd` via the invoke context and parse its stdout as JSON.
    return json.loads(ctx.run(cmd, hide="stdout").stdout)


def hostsExpectedOnline(ctx):
    # Ask the lanscape project's script which hosts should be online right now.
    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")


def expectedK8sNodes(ctx):
    getNode = _runJson(ctx, "kubectl get node -o json")
    hosts = [item["metadata"]["name"] for item in getNode["items"]]
    optionalHosts = {"slash"}  # nodes excluded from the expected-online rules
    return {
        "groups": [
            {
                "name": "k8s_expected_nodes",
                "rules": [
                    {
                        "alert": "kube_node_log_size_report_" + h,
                        "expr": f'absent(kubelet_container_log_filesystem_used_bytes{{instance="{h}"}})',
                        "for": "1h",
                        "annotations": {
                            "summary": f"no recent k8s log size report from host {h}"
                        },
                    }
                    for h in hosts
                    if h not in optionalHosts
                ],
            }
        ]
    }
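

# For a hypothetical node named "ditto", expectedK8sNodes() emits a rule whose
# expr is:
#   absent(kubelet_container_log_filesystem_used_bytes{instance="ditto"})
# so the alert fires after an hour with no log-size metrics from that host.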
|