Mercurial > code > home > repos > victoriametrics
diff config/rules_k8s.yaml @ 4:1eb6e6a2b9b6
version control configs finally; use configmaps to present them to VM
| author | drewp@bigasterisk.com |
|---|---|
| date | Sun, 12 Jun 2022 17:08:31 -0700 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/config/rules_k8s.yaml Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,64 @@
+groups:
+  - name: k8s
+    rules:
+      # from https://awesome-prometheus-alerts.grep.to/rules.html
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes out of disk (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed (instance {{ $labels.instance }})
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: container_waiting
+        expr: sum by (container)(kube_pod_container_status_waiting!=0)
+        for: 2m