changeset 4:1eb6e6a2b9b6

put the configs under version control, finally; use configmaps to present them to the victoriametrics and vmalert pods
author drewp@bigasterisk.com
date Sun, 12 Jun 2022 17:08:31 -0700
parents 6056f2e2aba5
children 862e79fbbf14
files config/rules_k8s.yaml config/rules_main.yaml config/scrape_config.yaml config/scrape_ssl.yaml deploy.yaml tasks.py vmalert.yaml
diffstat 7 files changed, 391 insertions(+), 2 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/rules_k8s.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,64 @@
+groups: 
+  - name: k8s
+    rules:
+      # from https://awesome-prometheus-alerts.grep.to/rules.html
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes out of disk (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed (instance {{ $labels.instance }})
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: container_waiting
+        expr: sum by (container)(kube_pod_container_status_waiting!=0)
+        for: 2m
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/rules_main.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,114 @@
+# docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+# "Whenever the alert expression results in one or more vector
+# elements at a given point in time, the alert counts as active for
+# these elements' label sets."
+
+# also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+#
+# any series like starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} means someone forgot to set the app name (it defaulted to "starlette")
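+# a rule like this (sketch) would flag it:
+#   - alert: starlette_app_name_unset
+#     expr: starlette_request_duration_seconds_created{app_name="starlette"}
+#     for: 0m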
+
+groups:
+
+  - name: webcam
+    rules:
+    # waiting for twinscam revival
+      # - alert: twinscam_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_garagehall_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
+
+      - alert: cam_pipeline_stopped
+        expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
+        for: 10m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
+
+      - alert: cam_not_advancing
+        expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
+        for: 10m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "cam output bytes is advancing too slowly. {{ $labels }}"
+
+      - alert: webcam_indexer_stalled
+        expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
+        for: 10m
+        labels:
+          severity: webcamUsersAffected
+        annotations:
+          summary: "webcam indexer update loop is stalled"
+
+  - name: Outages
+    rules:
+      - alert: powereagleStalled
+        expr: rate(house_power_w[100m]) == 0
+        for: 0m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data stalled"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: powereagleAbsent
+        expr: absent_over_time(house_power_w[5m])
+        for: 2m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data missing"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: wifi_scrape_errors
+        expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "errors getting wifi users list"
+
+      - alert: absent_mitmproxy
+        expr: absent(process_resident_memory_bytes{job="mitmproxy"})
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
+
+      # this also belongs on frontbed, but nothing there is submitting container_last_seen
+      - alert: absent_zigbee_dash
+        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
+
+      - alert: net_routes_sync
+        expr: min(sync_is_up{job="net-routes"}) != 1
+        for: 30m
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "mitmproxy not syncing. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/net-routes"
+
+
+  - name: alerts
+    rules:
+      - { alert: housePower, expr: "house_power_w > 3000", for: 20m, labels: { severity: waste }, annotations: { summary: "house power usage over 3KW" } }
+      - alert: ssl_certs_expiring_soon
+        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
+        labels:
+          severity: futureUsersAffected
+        annotations:
+          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/scrape_config.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,156 @@
+global:
+  scrape_interval: 1m
+  scrape_timeout: 10s
+
+scrape_config_files:
+  - scrape_ssl.yaml
+# These can even be urls: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#loading-scrape-configs-from-multiple-files
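+#   e.g. a remote entry (hypothetical URL) would look like:
+#   - https://config-host.example/scrape_extra.yaml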
+
+scrape_configs:
+  # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
+
+  - job_name: "victoriametrics"
+    metrics_path: /m/metrics
+    static_configs:
+      - targets:
+          - victoriametrics.default.svc.cluster.local
+
+  - job_name: "vmalert"
+    metrics_path: /vmalert/metrics
+    static_configs:
+      - targets:
+          - vmalert.default.svc.cluster.local
+
+  - job_name: "kubernetes-apiservers"
+    scheme: https
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: endpoints }]
+
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+        action: keep
+        regex: default;kubernetes;https
+
+  - job_name: "kubernetes-nodes"
+    scheme: https
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: node }]
+
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+
+  # see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
+  # for metric definitions
+  - job_name: "kubernetes-cadvisor"
+    scheme: https
+    metrics_path: /metrics/cadvisor
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: node }]
+
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+
+  - job_name: "kube-state-metrics"
+    static_configs:
+      - targets:
+          - kube-state-metrics.kube-system.svc.cluster.local:8080
+          - kube-state-metrics.kube-system.svc.cluster.local:8081
+
+  - job_name: "k8services"
+    kubernetes_sd_configs: [{ role: endpoints }]
+    relabel_configs:
+      # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
+      #   annotations: { prometheus.io/scrape: "false" }
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        regex: false
+        action: drop
+
+      - source_labels: [__meta_kubernetes_service_name]
+        regex: kubernetes
+        action: drop
+
+      - source_labels: [__meta_kubernetes_namespace]
+        regex: default
+        action: keep
+
+      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_pod_container_port_number]
+        regex: "mitmproxy;1008[01]"
+        action: drop
+
+      - source_labels: [__meta_kubernetes_service_name]
+        target_label: job
+
+      - source_labels: [__meta_kubernetes_pod_node_name]
+        target_label: node
+
+      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
+        action: drop
+        regex: jsregistry;4873
+
+      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
+        action: drop
+        regex: mosquitto-ext;1883
+
+  # # seems like this would match more stuff, but all I get is coredns
+  # - job_name: 'old_coredns'
+  #   kubernetes_sd_configs: [{role: pod}]
+  #   relabel_configs:
+  #     - source_labels: [__meta_kubernetes_pod_container_port_name]
+  #       regex: metrics
+  #       action: keep
+  #     - source_labels: [__meta_kubernetes_pod_container_name]
+  #       target_label: job
+
+  - job_name: "telegraf"
+    scheme: http
+    kubernetes_sd_configs: [{ role: node }]
+    relabel_configs:
+      - source_labels: [__address__]
+        regex: "(.*):(\\d+)"
+        target_label: __address__
+        replacement: "${1}:9273"
+        action: replace
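+        # e.g. rewrites a discovered kubelet address like 10.5.0.1:10250 to 10.5.0.1:9273 (telegraf's port)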
+
+  - job_name: "ntop"
+    metrics_path: /lua/local/lanscape/main.lua
+    static_configs:
+      - targets:
+          - 10.5.0.1:3000
+
+  - job_name: "net-routes"
+    static_configs:
+      - targets:
+          - 10.2.0.3:10001
+
+  - job_name: "ping"
+    scrape_interval: 2m
+    metrics_path: /probe
+    params:
+      module: [icmp]
+    static_configs:
+      - targets:
+          # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://10.2.0.37/general/status.html?pageid=1
+          - 10.2.0.37
+          # frontbed, for monitoring
+          - 10.5.0.17
+
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: prober
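+      # net effect: scrape http://prober/probe?module=icmp&target=10.2.0.37
+      # with the instance label set to the original target address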
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/scrape_ssl.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,21 @@
+scrape_configs:
+  - job_name: "prober"
+    scrape_interval: 24h
+    metrics_path: /probe
+    params:
+      module: [https]
+    static_configs:
+      - targets:
+          # sync with /my/doc/ssl/letsencrypt/run.py
+          - fantasyfamegame.com
+          - bigast.com
+          - bigasterisk.com
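+    # same prober indirection as the "ping" job in scrape_config.yaml:
+    # __address__ becomes the ?target= param and the scrape goes to the prober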
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: prober
--- a/deploy.yaml	Sun Jun 12 17:06:10 2022 -0700
+++ b/deploy.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -18,6 +18,8 @@
         - name: data
           persistentVolumeClaim:
             claimName: opt-victoriametrics
+        - name: config
+          configMap: {name: victoriametrics-config}
       serviceAccountName: victoriametrics
       containers:
         - name: victoriametrics
@@ -27,7 +29,7 @@
             - -storageDataPath=/data
             - -memory.allowedBytes=512MB
             - -http.pathPrefix=/m/
-            - -promscrape.config=/data/scrape_config.yaml
+            - -promscrape.config=/config/scrape_config.yaml
             - -loggerTimezone=America/Los_Angeles
             - -retentionPeriod=10y
             - -sortLabels
@@ -35,6 +37,8 @@
             - containerPort: 8428
           volumeMounts:
             - { name: data, mountPath: /data }
+            - { name: config, mountPath: /config }
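+            # scrape_ssl.yaml lands next to scrape_config.yaml here, so the relative scrape_config_files path resolves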
           # resources:
           #   limits:
           #     memory: 0.5Gi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tasks.py	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,26 @@
+import urllib.request
+from invoke import task
+
+CONFIG_FILES = ['scrape_config.yaml', 'scrape_ssl.yaml', 'rules_expected_hosts.yaml',
+                'rules_expected_nodes.yaml', 'rules_k8s.yaml', 'rules_main.yaml']
+
+
+def updateConfigmapCmd(paths):
+    # all config files share the one victoriametrics-config configmap; send them
+    # in a single apply, since per-file applies would each drop the other keys
+    fileArgs = ' '.join(f'--from-file={p}=config/{p}' for p in paths)
+    return f'kubectl create configmap victoriametrics-config {fileArgs} -o yaml --dry-run=client | kubectl apply -f -'
+
+
+def reload(ctx, svc):
+    host = ctx.run(f'khost {svc}').stdout.strip()
+    path = {'victoriametrics': '/m', 'vmalert': '/vmalert'}[svc]
+    print(' -> status',
+          urllib.request.urlopen(f'http://{host}{path}/-/reload').status)
+
+
+@task  # run as: inv sync-config
+def sync_config(ctx):
+    ctx.run(updateConfigmapCmd(CONFIG_FILES))
+    reload(ctx, 'victoriametrics')
+    reload(ctx, 'vmalert')
--- a/vmalert.yaml	Sun Jun 12 17:06:10 2022 -0700
+++ b/vmalert.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -18,12 +18,14 @@
         - name: data
           persistentVolumeClaim:
             claimName: opt-victoriametrics
+        - name: config
+          configMap: {name: victoriametrics-config}
       serviceAccountName: victoriametrics
       containers:
         - name: vmalert
           image: victoriametrics/vmalert:v1.77.2
           args:
-            - -rule=/data/rules/*.yaml
+            - -rule=/config/rules_*.yaml
             - -datasource.url=http://victoriametrics.default.svc.cluster.local./m/
             - -notifier.url=http://alertmanager.default.svc.cluster.local.
             - -remoteWrite.url=http://victoriametrics.default.svc.cluster.local./m/
@@ -36,6 +38,8 @@
             - containerPort: 8880
           volumeMounts:
             - { name: data, mountPath: /data }
+            - { name: config, mountPath: /config }
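+            # configmap items mount flat under /config (no rules/ subdir), hence the rules_*.yaml glob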
           resources:
             limits:
               memory: 0.5Gi