changeset 9:17db5e8e7a2f

big rules and scrape config updates
author drewp@bigasterisk.com
date Sun, 04 Dec 2022 02:08:08 -0800
parents e393b24f0e01
children 2023a6ce7bc0
files config/rules_k8s.yaml config/rules_main.yaml config/scrape_main.yaml rules/rules_k8s.yaml rules/rules_main.yaml
diffstat 5 files changed, 223 insertions(+), 208 deletions(-)
--- a/config/rules_k8s.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-groups: 
-  - name: k8s
-    rules:
-      # from https://awesome-prometheus-alerts.grep.to/rules.html
-      - alert: PrometheusTargetMissing
-        expr: up == 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Prometheus target missing (instance {{ $labels.instance }})
-          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesMemoryPressure
-        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesDiskPressure
-        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesOutOfDisk
-        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes out of disk (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesJobFailed
-        expr: kube_job_status_failed > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Job failed (instance {{ $labels.instance }})
-          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPodCrashLooping
-        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
-          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: KubernetesClientCertificateExpiresNextWeek
-        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
-          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: container_waiting
-        expr: sum by (container)(kube_pod_container_status_waiting!=0)
-        for: 2m
--- a/config/rules_main.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-# docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-# "Whenever the alert expression results in one or more vector
-# elements at a given point in time, the alert counts as active for
-# these elements' label sets."
-
-# also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-#
-# any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
-
-groups:
-
-  - name: webcam
-    rules:
-    # waiting for twinscam revival
-      # - alert: twinscam_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
-
-      # - alert: cam_garagehall_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
-
-      - alert: cam_pipeline_stopped
-        expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
-        for: 10m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
-
-      - alert: cam_not_advancing
-        expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
-        for: 10m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "cam output bytes is advancing too slowly. {{ $labels }}"
-
-      - alert: webcam_indexer_stalled
-        expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
-        for: 10m
-        labels:
-          severity: webcamUsersAffected
-        annotations:
-          summary: "webcam indexer update loop is stalled"
-
-  - name: Outages
-    rules:
-      - alert: powereagleStalled
-        expr: rate(house_power_w[100m]) == 0
-        for: 0m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data stalled"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      - alert: powereagleAbsent
-        expr: absent_over_time(house_power_w[5m])
-        for: 2m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data missing"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      - alert: wifi_scrape_errors
-        expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "errors getting wifi users list"
-
-      - alert: absent_mitmproxy
-        expr: absent(process_resident_memory_bytes{job="mitmproxy"})
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
-
-      # also belongs on frontbed but nothing is submitting container_last_seen on there
-      - alert: absent_zigbee_dash
-        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
-
-      - alert: net_routes_sync
-        expr: min(sync_is_up{job="net-routes"}) != 1
-        for: 30m
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "mitmproxy not syncing. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/net-routes"
-
-      
-  - name: alerts
-    rules:
-      - { alert: housePower, expr: "house_power_w > 3000", for: 20m, labels: { severity: waste }, annotations: { summary: "house power usage over 3KW" } }
-      - alert: ssl_certs_expiring_soon
-        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
-        labels:
-          severity: futureUsersAffected
-        annotations:
-          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/config/scrape_main.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ b/config/scrape_main.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -1,25 +1,21 @@
+# see https://relabeler.promlabs.com/
+
 global:
   scrape_interval: 1m
   scrape_timeout: 10s
 
-scrape_config_files:
-  - scrape_ssl.yaml
+# scrape_config_files:
+#   - build/scrape_ssl.yaml
 # These can even be urls: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#loading-scrape-configs-from-multiple-files
 
 scrape_configs:
   # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
 
-  - job_name: "victoriametrics"
-    metrics_path: /m/metrics
-    static_configs:
-      - targets:
-          - victoriametrics.default.svc.cluster.local
-
-  - job_name: "vmalert"
-    metrics_path: /vmalert/metrics
-    static_configs:
-      - targets:
-          - vmalert.default.svc.cluster.local
+  # - job_name: "vmalert"
+  #   metrics_path: /vmalert/metrics
+  #   static_configs:
+  #     - targets:
+  #         - vmalert.default.svc.cluster.local
 
   - job_name: "kubernetes-apiservers"
     scheme: https
@@ -64,35 +60,44 @@
       # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
       #   annotations: { prometheus.io/scrape: "false" }
       - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
-        regex: false
-        action: drop
-
-      - source_labels: [__meta_kubernetes_service_name]
-        regex: kubernetes
+        regex: "false"
         action: drop
 
       - source_labels: [__meta_kubernetes_namespace]
         regex: default
         action: keep
 
-      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_pod_container_port_number]
-        regex: "mitmproxy;1008[01]"
-        action: drop
-
+      # promote these to display
       - source_labels: [__meta_kubernetes_service_name]
         target_label: job
 
       - source_labels: [__meta_kubernetes_pod_node_name]
         target_label: node
 
-      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
-        action: drop
-        regex: jsregistry;4873
+      # for convenience in this config
+      - source_labels: [__meta_kubernetes_pod_container_port_number]
+        target_label: __port_number
+
+      # period tweaks
+      - if: '{job="power-eagle"}'
+        action: replace
+        target_label: __scrape_interval__
+        # from powerEagle/private_config.periodSec
+        replacement: 8s
 
-      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
-        action: drop
-        regex: mosquitto-ext;1883
+      # path tweaks
+      - if: '{job="victoriametrics",__port_number="8428"}'
+        action: replace
+        target_label: "__metrics_path__"
+        replacement: "/m/metrics"
 
+      # discovery is matching extra ports that don't serve metrics; remove these targets
+      - {if: '{job="kubernetes"}', action: drop}
+      - {if: '{job="mongodb",           __port_number="27017"}', action: drop}
+      - {if: '{job="mosquitto-ext",     __port_number="1883"}', action: drop}
+      - {if: '{job="filesync-syncthing",__port_number="8384"}', action: drop}
+      - {if: '{job="jsregistry",        __port_number="4873"}', action: drop}
+  
   # # seems like this would match more stuff, but all I get is coredns
   # - job_name: 'old_coredns'
   #   kubernetes_sd_configs: [{role: pod}]
@@ -116,7 +121,12 @@
   - job_name: "net-routes"
     static_configs:
       - targets:
-          - 10.2.0.3:10001
+          - pipe:9999
+
+  - job_name: "net-traffic"
+    static_configs:
+      - targets:
+          - pipe:8080
 
   - job_name: "ping"
     scrape_interval: 2m
@@ -126,14 +136,14 @@
     static_configs:
       - targets:
           # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://10.2.0.37/general/status.html?pageid=1
-          - 10.2.0.37
+          - printer014032ED
           # frontbed, for monitoring
           - 10.5.0.17
+          # asher bulb, not sure why it sleeps so long
+          - bulb1
 
     relabel_configs:
-      - source_labels: [__address__]
-        target_label: __param_target
-      - source_labels: [__param_target]
-        target_label: instance
+      - {source_labels: [__address__],    target_label: __param_target}
+      - {source_labels: [__param_target], target_label: instance}
       - target_label: __address__
         replacement: prober
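
The drop rules above rely on two details worth keeping in mind: the "if" selector in relabel_configs is a VictoriaMetrics/vmagent extension (plain Prometheus would need source_labels plus a regex instead), and __port_number is only a scratch label, since any label starting with __ is discarded once relabeling finishes and never reaches stored series. A minimal sketch of the same pattern for some other hypothetical service that exposes a non-metrics port (the job name and port number are illustrative, not from this changeset):

      # copy the discovered container port into a scratch label for matching
      - source_labels: [__meta_kubernetes_pod_container_port_number]
        target_label: __port_number
      # drop the target when discovery picked up a port that serves no metrics
      - {if: '{job="someservice", __port_number="9000"}', action: drop}
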
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rules/rules_k8s.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -0,0 +1,64 @@
+groups: 
+  - name: k8s
+    rules:
+      # from https://awesome-prometheus-alerts.grep.to/rules.html
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes out of disk (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed (instance {{ $labels.instance }})
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: container_waiting
+        expr: sum by (container)(kube_pod_container_status_waiting!=0)
+        for: 2m
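
The container_waiting rule above (carried over unchanged from config/rules_k8s.yaml) has no labels or annotations, so it fires with only the alert name and the container label kept by the sum by (container). If more context is wanted later, a sketch in the style of the surrounding rules; the severity value and wording are illustrative, not part of this changeset:

      - alert: container_waiting
        expr: sum by (container)(kube_pod_container_status_waiting != 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "container {{ $labels.container }} is stuck in a waiting state"
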
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rules/rules_main.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -0,0 +1,115 @@
+groups:
+  # docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+  # "Whenever the alert expression results in one or more vector
+  # elements at a given point in time, the alert counts as active for
+  # these elements' label sets."
+
+  # also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+  #
+  # any series like starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} means someone forgot to set the app name
+
+  # - name: webcam
+  #   rules:
+    # waiting for twinscam revival
+      # - alert: twinscam_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_garagehall_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_pipeline_stopped
+      #   expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
+      #   for: 10m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
+
+      # - alert: cam_not_advancing
+      #   expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
+      #   for: 10m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "cam output bytes is advancing too slowly. {{ $labels }}"
+
+      # - alert: webcam_indexer_stalled
+      #   expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
+      #   for: 10m
+      #   labels:
+      #     severity: webcamUsersAffected
+      #   annotations:
+      #     summary: "webcam indexer update loop is stalled"
+
+  - name: Outages
+    rules:
+      - alert: powereagleStalled
+        expr: rate(house_power_w[100m]) == 0
+        for: 0m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data stalled"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: powereagleAbsent
+        expr: absent_over_time(house_power_w[5m])
+        for: 2m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data missing"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      # - alert: wifi_scrape_errors
+      #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "errors getting wifi users list"
+
+      # - alert: absent_mitmproxy
+      #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
+
+      - alert: absent_zigbee_dash
+        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
+
+      - alert: net_routes_sync
+        expr: rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70
+        for: 10m
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "net_routes is not getting regular updates"
+
+      
+  - name: alerts
+    rules:
+      - {alert: housePower,   for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 4KW {{ $labels }}"}}
+      - {alert: disk1,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3,        for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500',           annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+      - {alert: oom,          for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging, for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette,    for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                       annotations: {summary: "set starlette app name {{ $labels }}"}}
+      - alert: ssl_certs_expiring_soon
+        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
+        labels:
+          severity: warning
+        annotations:
+          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n  LABELS = {{ $labels }}"