comparison alert_rules.py @ 43:f05be84926e2

Drop this kubelet rule because the kubelet_container_log_filesystem_used_bytes metric no longer seems to carry an 'instance' label.
author drewp@bigasterisk.com
date Wed, 14 Feb 2024 19:10:49 -0800
parents 2f87ecd2a754
children e1db51416e73
comparison
equal deleted inserted replaced
42:2f87ecd2a754 43:f05be84926e2
117 { 117 {
118 "name": "k8s", 118 "name": "k8s",
119 "interval": "1m", 119 "interval": "1m",
120 "rules": k8sRules(), 120 "rules": k8sRules(),
121 }, 121 },
122 expectedK8sNodesGroup(ctx),
123 # 122 #
124 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name 123 # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
125 { 124 {
126 "name": 125 "name":
127 "Outages", 126 "Outages",
364 363
365 def hostsExpectedOnline(ctx): 364 def hostsExpectedOnline(ctx):
366 return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") 365 return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
367 366
368 367
369 def expectedK8sNodesGroup(ctx):
370 getNode = _runJson(ctx, "kubectl get node -o json")
371 hosts = [item["metadata"]["name"] for item in getNode["items"]]
372 optionalHosts = {'slash'}
373 return {
374 "name":
375 "k8s_expected_nodes",
376 "rules": [{
377 "alert": "kube_node_log_size_report_" + h,
378 "expr": 'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})' % h,
379 "for": "1h",
380 "annotations": {
381 "summary": f"no recent k8s log size report from host {h}"
382 },
383 } for h in hosts if h not in optionalHosts],
384 }