comparison config/rules_k8s.yaml @ 4:1eb6e6a2b9b6

version control configs finally; use configmaps to present them to VM
author drewp@bigasterisk.com
date Sun, 12 Jun 2022 17:08:31 -0700
parents
children
comparison
equal deleted inserted replaced
3:6056f2e2aba5 4:1eb6e6a2b9b6
1 groups:
2 - name: k8s
3 rules:
4 # from https://awesome-prometheus-alerts.grep.to/rules.html
# PrometheusTargetMissing: matches every `up` series whose value is 0
# (Prometheus records up=0 for a target whose scrape failed).
# `for: 0m` sets no hold period, so the alert fires on the first rule
# evaluation where the expression matches. Labelled severity=critical.
5 - alert: PrometheusTargetMissing
6 expr: up == 0
7 for: 0m
8 labels:
9 severity: critical
10 annotations:
11 summary: Prometheus target missing (instance {{ $labels.instance }})
12 description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# KubernetesMemoryPressure: matches kube_node_status_condition series with
# condition="MemoryPressure" and status="true" whose value is 1.
# The match has to persist for 2m before the alert fires (severity=critical).
# The description names the affected node through the `node` label.
13 - alert: KubernetesMemoryPressure
14 expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
15 for: 2m
16 labels:
17 severity: critical
18 annotations:
19 summary: Kubernetes memory pressure (instance {{ $labels.instance }})
20 description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# KubernetesDiskPressure: matches kube_node_status_condition series with
# condition="DiskPressure" and status="true" whose value is 1.
# The match has to persist for 2m before the alert fires (severity=critical).
# The description names the affected node through the `node` label.
21 - alert: KubernetesDiskPressure
22 expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
23 for: 2m
24 labels:
25 severity: critical
26 annotations:
27 summary: Kubernetes disk pressure (instance {{ $labels.instance }})
28 description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# KubernetesOutOfDisk: matches kube_node_status_condition series with
# condition="OutOfDisk" and status="true" whose value is 1.
# The match has to persist for 2m before the alert fires (severity=critical).
# NOTE(review): recent Kubernetes releases reportedly no longer publish an
# OutOfDisk node condition (DiskPressure covers it) - confirm this rule can
# still match anything on the clusters being monitored.
29 - alert: KubernetesOutOfDisk
30 expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
31 for: 2m
32 labels:
33 severity: critical
34 annotations:
35 summary: Kubernetes out of disk (instance {{ $labels.instance }})
36 description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# KubernetesJobFailed: matches kube_job_status_failed series with a value
# above 0. `for: 0m` sets no hold period, so the alert fires on the first
# rule evaluation where the expression matches. Labelled severity=warning.
# The description builds "<namespace>/<exported_job>" from series labels.
# NOTE(review): presumably `exported_job` holds the Kubernetes Job name after
# Prometheus renamed a colliding `job` label at scrape time - verify against
# the scrape config (with honor_labels the label would not exist).
37 - alert: KubernetesJobFailed
38 expr: kube_job_status_failed > 0
39 for: 0m
40 labels:
41 severity: warning
42 annotations:
43 summary: Kubernetes Job failed (instance {{ $labels.instance }})
44 description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
45
# KubernetesPodCrashLooping: matches when the increase of
# kube_pod_container_status_restarts_total over a 1m range is above 3.
# The match has to persist for 2m before the alert fires (severity=warning).
# The description names the pod through the `pod` label.
# NOTE(review): increase() needs at least two samples inside the range, so a
# 1m window only yields a value if the scrape interval is well under one
# minute - confirm against the scrape config.
46 - alert: KubernetesPodCrashLooping
47 expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
48 for: 2m
49 labels:
50 severity: warning
51 annotations:
52 summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
53 description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# KubernetesClientCertificateExpiresNextWeek: warns when client certificates
# presented to the apiserver are close to expiry.
# Left side: apiserver_client_certificate_expiration_seconds_count series for
# job="apiserver" with a value above 0.
# Right side: the 0.01 quantile of the expiration histogram (5m rate, summed
# by job and le) is below 7*24*60*60 = 604800 seconds, i.e. seven days.
# `on(job)` limits the `and` match to the job label: after `sum by (job, le)`
# and histogram_quantile the right side carries only `job`, while the left
# side keeps its full label set. Without it `and` demands identical label
# sets on both sides and the expression returns nothing.
# The result keeps the left side's labels, so {{ $labels.instance }} in the
# summary still resolves.
# `for: 0m` sets no hold period; labelled severity=warning.
54 - alert: KubernetesClientCertificateExpiresNextWeek
55 expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
56 for: 0m
57 labels:
58 severity: warning
59 annotations:
60 summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
61 description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
62 - alert: container_waiting
63 expr: sum by (container)(kube_pod_container_status_waiting!=0)
64 for: 2m