Mercurial > code > home > repos > victoriametrics
comparison alert_rules.py @ 43:f05be84926e2
Drop the k8s_expected_nodes kubelet rule group because kubelet_container_log_filesystem_used_bytes no longer seems to carry an 'instance' label
author | drewp@bigasterisk.com |
---|---|
date | Wed, 14 Feb 2024 19:10:49 -0800 |
parents | 2f87ecd2a754 |
children | e1db51416e73 |
comparison
equal
deleted
inserted
replaced
42:2f87ecd2a754 | 43:f05be84926e2 |
---|---|
117 { | 117 { |
118 "name": "k8s", | 118 "name": "k8s", |
119 "interval": "1m", | 119 "interval": "1m", |
120 "rules": k8sRules(), | 120 "rules": k8sRules(), |
121 }, | 121 }, |
122 expectedK8sNodesGroup(ctx), | |
123 # | 122 # |
124 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name | 123 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name |
125 { | 124 { |
126 "name": | 125 "name": |
127 "Outages", | 126 "Outages", |
364 | 363 |
365 def hostsExpectedOnline(ctx): | 364 def hostsExpectedOnline(ctx): |
366 return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") | 365 return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") |
367 | 366 |
368 | 367 |
369 def expectedK8sNodesGroup(ctx): | |
370 getNode = _runJson(ctx, "kubectl get node -o json") | |
371 hosts = [item["metadata"]["name"] for item in getNode["items"]] | |
372 optionalHosts = {'slash'} | |
373 return { | |
374 "name": | |
375 "k8s_expected_nodes", | |
376 "rules": [{ | |
377 "alert": "kube_node_log_size_report_" + h, | |
378 "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % h, | |
379 "for": "1h", | |
380 "annotations": { | |
381 "summary": f"no recent k8s log size report from host {h}" | |
382 }, | |
383 } for h in hosts if h not in optionalHosts], | |
384 } |