changeset 9:17db5e8e7a2f
big rules and scrape config updates
author    drewp@bigasterisk.com
date      Sun, 04 Dec 2022 02:08:08 -0800
parents   e393b24f0e01
children  2023a6ce7bc0
files     config/rules_k8s.yaml config/rules_main.yaml config/scrape_main.yaml rules/rules_k8s.yaml rules/rules_main.yaml
diffstat  5 files changed, 223 insertions(+), 208 deletions(-)
--- a/config/rules_k8s.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-groups:
-  - name: k8s
-    rules:
-      # from https://awesome-prometheus-alerts.grep.to/rules.html
-      - alert: PrometheusTargetMissing
-        expr: up == 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Prometheus target missing (instance {{ $labels.instance }})
-          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: KubernetesMemoryPressure
-        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: KubernetesDiskPressure
-        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: KubernetesOutOfDisk
-        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes out of disk (instance {{ $labels.instance }})
-          description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: KubernetesJobFailed
-        expr: kube_job_status_failed > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Job failed (instance {{ $labels.instance }})
-          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
-      - alert: KubernetesPodCrashLooping
-        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
-          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: KubernetesClientCertificateExpiresNextWeek
-        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
-          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: container_waiting
-        expr: sum by (container)(kube_pod_container_status_waiting!=0)
-        for: 2m
--- a/config/rules_main.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-# docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-# "Whenever the alert expression results in one or more vector
-# elements at a given point in time, the alert counts as active for
-# these elements' label sets."
-
-# also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-#
-# any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
-
-groups:
-
-  - name: webcam
-    rules:
-      # waiting for twinscam revival
-      # - alert: twinscam_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
-
-      # - alert: cam_garagehall_not_reporting
-      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
-      #   for: 2m
-      #   labels:
-      #     severity: losingData
-      #   annotations:
-      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
-
-      - alert: cam_pipeline_stopped
-        expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
-        for: 10m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
-
-      - alert: cam_not_advancing
-        expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
-        for: 10m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "cam output bytes is advancing too slowly. {{ $labels }}"
-
-      - alert: webcam_indexer_stalled
-        expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
-        for: 10m
-        labels:
-          severity: webcamUsersAffected
-        annotations:
-          summary: "webcam indexer update loop is stalled"
-
-  - name: Outages
-    rules:
-      - alert: powereagleStalled
-        expr: rate(house_power_w[100m]) == 0
-        for: 0m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data stalled"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      - alert: powereagleAbsent
-        expr: absent_over_time(house_power_w[5m])
-        for: 2m
-        labels:
-          severity: losingData
-        annotations:
-          summary: "power eagle data missing"
-          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
-
-      - alert: wifi_scrape_errors
-        expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "errors getting wifi users list"
-
-      - alert: absent_mitmproxy
-        expr: absent(process_resident_memory_bytes{job="mitmproxy"})
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
-
-      # also belongs on frontbed but nothing is submitting container_last_seen on there
-      - alert: absent_zigbee_dash
-        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
-
-      - alert: net_routes_sync
-        expr: min(sync_is_up{job="net-routes"}) != 1
-        for: 30m
-        labels:
-          severity: houseUsersAffected
-        annotations:
-          summary: "mitmproxy not syncing. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/net-routes"
-
-
-  - name: alerts
-    rules:
-      - { alert: housePower, expr: "house_power_w > 3000", for: 20m, labels: { severity: waste }, annotations: { summary: "house power usage over 3KW" } }
-      - alert: ssl_certs_expiring_soon
-        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
-        labels:
-          severity: futureUsersAffected
-        annotations:
-          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n LABELS = {{ $labels }}"
--- a/config/scrape_main.yaml	Sun Dec 04 02:07:14 2022 -0800
+++ b/config/scrape_main.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -1,25 +1,21 @@
+# see https://relabeler.promlabs.com/
+
 global:
   scrape_interval: 1m
   scrape_timeout: 10s
 
-scrape_config_files:
-  - scrape_ssl.yaml
+# scrape_config_files:
+#   - build/scrape_ssl.yaml
   # These can even be urls: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#loading-scrape-configs-from-multiple-files
 
 scrape_configs:
   # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
-  - job_name: "victoriametrics"
-    metrics_path: /m/metrics
-    static_configs:
-      - targets:
-          - victoriametrics.default.svc.cluster.local
-
-  - job_name: "vmalert"
-    metrics_path: /vmalert/metrics
-    static_configs:
-      - targets:
-          - vmalert.default.svc.cluster.local
+  # - job_name: "vmalert"
+  #   metrics_path: /vmalert/metrics
+  #   static_configs:
+  #     - targets:
+  #         - vmalert.default.svc.cluster.local
 
   - job_name: "kubernetes-apiservers"
     scheme: https
@@ -64,35 +60,44 @@
       # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
       #   annotations: { prometheus.io/scrape: "false" }
       - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
-        regex: false
-        action: drop
-
-      - source_labels: [__meta_kubernetes_service_name]
-        regex: kubernetes
+        regex: "false"
        action: drop
 
      - source_labels: [__meta_kubernetes_namespace]
        regex: default
        action: keep
 
-      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_pod_container_port_number]
-        regex: "mitmproxy;1008[01]"
-        action: drop
-
+      # promote these to display
      - source_labels: [__meta_kubernetes_service_name]
        target_label: job
 
      - source_labels: [__meta_kubernetes_pod_node_name]
        target_label: node
 
-      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
-        action: drop
-        regex: jsregistry;4873
+      # for convenience in this config
+      - source_labels: [__meta_kubernetes_pod_container_port_number]
+        target_label: __port_number
+
+      # period tweaks
+      - if: '{job="power-eagle"}'
+        action: replace
+        target_label: __scrape_interval__
+        # from powerEagle/private_config.periodSec
+        replacement: 8s
 
-      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
-        action: drop
-        regex: mosquitto-ext;1883
+      # path tweaks
+      - if: '{job="victoriametrics",__port_number="8428"}'
+        action: replace
+        target_label: "__metrics_path__"
+        replacement: "/m/metrics"
 
+      # discovery is matching extra ports that don't serve metrics- remove these targets
+      - {if: '{job="kubernetes"}', action: drop}
+      - {if: '{job="mongodb", __port_number="27017"}', action: drop}
+      - {if: '{job="mosquitto-ext", __port_number="1883"}', action: drop}
+      - {if: '{job="filesync-syncthing",__port_number="8384"}', action: drop}
+      - {if: '{job="jsregistry", __port_number="4873"}', action: drop}
+
  # # seems like this would match more stuff, but all I get is coredns
  # - job_name: 'old_coredns'
  #   kubernetes_sd_configs: [{role: pod}]
@@ -116,7 +121,12 @@
   - job_name: "net-routes"
     static_configs:
       - targets:
-          - 10.2.0.3:10001
+          - pipe:9999
+
+  - job_name: "net-traffic"
+    static_configs:
+      - targets:
+          - pipe:8080
 
   - job_name: "ping"
     scrape_interval: 2m
@@ -126,14 +136,14 @@
     static_configs:
       - targets:
          # printer, since it falls out of ntop with no traffic at all.
          # Or, we could poll ink status at http://10.2.0.37/general/status.html?pageid=1
-          - 10.2.0.37
+          - printer014032ED
          # frontbed, for monitoring
          - 10.5.0.17
+          # asher bulb, not sure why it sleeps so long
+          - bulb1
     relabel_configs:
-      - source_labels: [__address__]
-        target_label: __param_target
-      - source_labels: [__param_target]
-        target_label: instance
+      - {source_labels: [__address__], target_label: __param_target}
+      - {source_labels: [__param_target], target_label: instance}
      - target_label: __address__
        replacement: prober
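The core pattern in the new scrape config is the temporary __port_number label: the discovered container port is copied into it once, and later rules match on it with the if: relabeling condition (a VictoriaMetrics/vmagent extension; https://relabeler.promlabs.com/, linked at the top of the file, is handy for testing such rules). Labels whose names start with __ are dropped after relabeling, so the helper label never reaches stored series. A minimal standalone sketch of the pattern, not part of the committed config (the job name here is hypothetical):

scrape_configs:
  - job_name: "kubernetes-pods-example"   # hypothetical job, not in this repo
    kubernetes_sd_configs: [{role: pod}]
    relabel_configs:
      # promote the k8s service name to the job label, as the real config does
      - source_labels: [__meta_kubernetes_service_name]
        target_label: job
      # stash the discovered port where later rules can test it
      - source_labels: [__meta_kubernetes_pod_container_port_number]
        target_label: __port_number
      # drop discovered targets whose port is known not to serve metrics
      - {if: '{job="mongodb", __port_number="27017"}', action: drop}
      # override the global 1m interval for one fast-reporting job
      - if: '{job="power-eagle"}'
        action: replace
        target_label: __scrape_interval__
        replacement: 8s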
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rules/rules_k8s.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -0,0 +1,64 @@
+groups:
+  - name: k8s
+    rules:
+      # from https://awesome-prometheus-alerts.grep.to/rules.html
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes out of disk (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed (instance {{ $labels.instance }})
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: container_waiting
+        expr: sum by (container)(kube_pod_container_status_waiting!=0)
+        for: 2m
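The changeset itself does not show how the relocated rules/ files get evaluated; presumably vmalert picks them up. A hedged sketch of that wiring, where the mount path, image, and service URLs are assumptions rather than anything in this repo (the /m prefix mirrors the __metrics_path__ tweak in the scrape config above):

# illustrative vmalert container fragment -- not part of this changeset
containers:
  - name: vmalert
    image: victoriametrics/vmalert                                    # assumed image
    args:
      - -rule=/config/rules/rules_k8s.yaml                            # the relocated rule files
      - -rule=/config/rules/rules_main.yaml
      - -datasource.url=http://victoriametrics.default.svc.cluster.local/m   # where alert exprs are evaluated (assumed URL)
      - -notifier.url=http://alertmanager.default.svc.cluster.local          # where firing alerts are sent (assumed URL)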
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rules/rules_main.yaml	Sun Dec 04 02:08:08 2022 -0800
@@ -0,0 +1,115 @@
+groups:
+  # docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+  # "Whenever the alert expression results in one or more vector
+  # elements at a given point in time, the alert counts as active for
+  # these elements' label sets."
+
+  # also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+  #
+  # any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
+
+  # - name: webcam
+  #   rules:
+  # waiting for twinscam revival
+  # - alert: twinscam_not_reporting
+  #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
+  #   for: 2m
+  #   labels:
+  #     severity: losingData
+  #   annotations:
+  #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
+
+  # - alert: cam_garagehall_not_reporting
+  #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
+  #   for: 2m
+  #   labels:
+  #     severity: losingData
+  #   annotations:
+  #     # summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
+
+  # - alert: cam_pipeline_stopped
+  #   expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
+  #   for: 10m
+  #   labels:
+  #     severity: losingData
+  #   annotations:
+  #     summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
+
+  # - alert: cam_not_advancing
+  #   expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
+  #   for: 10m
+  #   labels:
+  #     severity: losingData
+  #   annotations:
+  #     summary: "cam output bytes is advancing too slowly. {{ $labels }}"
+
+  # - alert: webcam_indexer_stalled
+  #   expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
+  #   for: 10m
+  #   labels:
+  #     severity: webcamUsersAffected
+  #   annotations:
+  #     summary: "webcam indexer update loop is stalled"
+
+  - name: Outages
+    rules:
+      - alert: powereagleStalled
+        expr: rate(house_power_w[100m]) == 0
+        for: 0m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data stalled"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: powereagleAbsent
+        expr: absent_over_time(house_power_w[5m])
+        for: 2m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data missing"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      # - alert: wifi_scrape_errors
+      #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "errors getting wifi users list"
+
+      # - alert: absent_mitmproxy
+      #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
+      #   labels:
+      #     severity: houseUsersAffected
+      #   annotations:
+      #     summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
+
+      - alert: absent_zigbee_dash
+        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
+
+      - alert: net_routes_sync
+        expr: rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70
+        for: 10m
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "net_routes is not getting regular updates"
+
+
+  - name: alerts
+    rules:
+      - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
+      - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+      - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}}
+      - alert: ssl_certs_expiring_soon
+        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
+        labels:
+          severity: warning
+        annotations:
+          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n LABELS = {{ $labels }}"
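Two of the new thresholds are easier to read with the arithmetic spelled out (comments only; the expressions are the ones committed above):

# net_routes_sync:  rate(starlette_request_duration_seconds_count{...path="/routes"}[5m]) < 1/70
#   1/70 ≈ 0.014 requests/sec, i.e. the alert fires (after for: 10m) when /routes
#   is being hit less than about once every 70 seconds.
#
# disk3:  round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500
#   flags roughly 500 MB or more of growth on a /stor6 zfs path over a one-week window.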