Mercurial > code > home > repos > victoriametrics
comparison config/rules_k8s.yaml @ 4:1eb6e6a2b9b6
version control configs finally; use configmaps to present them to VM
author | drewp@bigasterisk.com |
---|---|
date | Sun, 12 Jun 2022 17:08:31 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:6056f2e2aba5 | 4:1eb6e6a2b9b6 |
---|---|
1 groups: | |
2 - name: k8s | |
3 rules: | |
4 # from https://awesome-prometheus-alerts.grep.to/rules.html | |
5 - alert: PrometheusTargetMissing | |
6 expr: up == 0 | |
7 for: 0m | |
8 labels: | |
9 severity: critical | |
10 annotations: | |
11 summary: Prometheus target missing (instance {{ $labels.instance }}) | |
12 description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
13 - alert: KubernetesMemoryPressure | |
14 expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 | |
15 for: 2m | |
16 labels: | |
17 severity: critical | |
18 annotations: | |
19 summary: Kubernetes memory pressure (instance {{ $labels.instance }}) | |
20 description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
21 - alert: KubernetesDiskPressure | |
22 expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 | |
23 for: 2m | |
24 labels: | |
25 severity: critical | |
26 annotations: | |
27 summary: Kubernetes disk pressure (instance {{ $labels.instance }}) | |
28 description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
29 - alert: KubernetesOutOfDisk | |
30 expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 | |
31 for: 2m | |
32 labels: | |
33 severity: critical | |
34 annotations: | |
35 summary: Kubernetes out of disk (instance {{ $labels.instance }}) | |
36 description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
37 - alert: KubernetesJobFailed | |
38 expr: kube_job_status_failed > 0 | |
39 for: 0m | |
40 labels: | |
41 severity: warning | |
42 annotations: | |
43 summary: Kubernetes Job failed (instance {{ $labels.instance }}) | |
44 description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
45 | |
46 - alert: KubernetesPodCrashLooping | |
47 expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 | |
48 for: 2m | |
49 labels: | |
50 severity: warning | |
51 annotations: | |
52 summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) | |
53 description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
54 - alert: KubernetesClientCertificateExpiresNextWeek | |
55 expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 | |
56 for: 0m | |
57 labels: | |
58 severity: warning | |
59 annotations: | |
60 summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) | |
61 description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | |
62 - alert: container_waiting | |
63 expr: sum by (container)(kube_pod_container_status_waiting!=0) | |
64 for: 2m |