changeset 4:1eb6e6a2b9b6
version control configs finally; use configmaps to present them to VM
author   | drewp@bigasterisk.com
date     | Sun, 12 Jun 2022 17:08:31 -0700
parents  | 6056f2e2aba5
children | 862e79fbbf14
files    | config/rules_k8s.yaml config/rules_main.yaml config/scrape_config.yaml config/scrape_ssl.yaml deploy.yaml tasks.py vmalert.yaml
diffstat | 7 files changed, 378 insertions(+), 2 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/rules_k8s.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,64 @@
+groups:
+  - name: k8s
+    rules:
+      # from https://awesome-prometheus-alerts.grep.to/rules.html
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesMemoryPressure
+        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesDiskPressure
+        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesOutOfDisk
+        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes out of disk (instance {{ $labels.instance }})
+          description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesJobFailed
+        expr: kube_job_status_failed > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed (instance {{ $labels.instance }})
+          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: container_waiting
+        expr: sum by (container)(kube_pod_container_status_waiting!=0)
+        for: 2m

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/rules_main.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,110 @@
+# docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+# "Whenever the alert expression results in one or more vector
+# elements at a given point in time, the alert counts as active for
+# these elements' label sets."
+
+# also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+#
+# any presence of starlette_request_duration_seconds_created{app_name="starlette",method="GET",path="/",status_code="200"} 1.6460176156784086e+09 means someone forgot to set app name
+
+groups:
+
+  - name: webcam
+    rules:
+      # waiting for twinscam revival
+      # - alert: twinscam_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-twinscam"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-twinscam is not reporting metrics {{ $labels }}"
+
+      # - alert: cam_garagehall_not_reporting
+      #   expr: absent(cam_pipeline_state{job="webcam-record-garagehall"})
+      #   for: 2m
+      #   labels:
+      #     severity: losingData
+      #   annotations:
+      #     summary: "webcam-record-garagehall is not reporting metrics {{ $labels }}"
+
+      - alert: cam_pipeline_stopped
+        expr: sum without (instance) (cam_pipeline_state{cam_pipeline_state="playing"}) < 1
+        for: 10m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "webcam-record gst pipeline is not state=playing {{ $labels }}"
+
+      - alert: cam_not_advancing
+        expr: rate(cam_stream_bytes{element="splitmux"}[3m]) < 0.2
+        for: 10m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "cam output bytes is advancing too slowly. {{ $labels }}"
+
+      - alert: webcam_indexer_stalled
+        expr: rate(webcam_indexer_update_count{job="webcam-indexer"}[5m]) < .01
+        for: 10m
+        labels:
+          severity: webcamUsersAffected
+        annotations:
+          summary: "webcam indexer update loop is stalled"
+
+  - name: Outages
+    rules:
+      - alert: powereagleStalled
+        expr: rate(house_power_w[100m]) == 0
+        for: 0m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data stalled"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: powereagleAbsent
+        expr: absent_over_time(house_power_w[5m])
+        for: 2m
+        labels:
+          severity: losingData
+        annotations:
+          summary: "power eagle data missing"
+          description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
+
+      - alert: wifi_scrape_errors
+        expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "errors getting wifi users list"
+
+      - alert: absent_mitmproxy
+        expr: absent(process_resident_memory_bytes{job="mitmproxy"})
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "mitmproxy metrics not responding. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/mitmproxy (metrics actually come from webfilter.py plugin)"
+
+      # also belongs on frontbed but nothing is submitting container_last_seen on there
+      - alert: absent_zigbee_dash
+        expr: absent(container_last_seen{container="zigbee2mqtt-dash"})
+
+      - alert: net_routes_sync
+        expr: min(sync_is_up{job="net-routes"}) != 1
+        for: 30m
+        labels:
+          severity: houseUsersAffected
+        annotations:
+          summary: "mitmproxy not syncing. See https://bigasterisk.com/grafana/d/ix3hMAdMk/webfilter?orgId=1&from=now-12h&to=now and https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/net-routes"
+
+
+  - name: alerts
+    rules:
+      - { alert: housePower, expr: "house_power_w > 3000", for: 20m, labels: { severity: waste }, annotations: { summary: "house power usage over 3KW" } }
+      - alert: ssl_certs_expiring_soon
+        expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
+        labels:
+          severity: futureUsersAffected
+        annotations:
+          summary: "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}\n LABELS = {{ $labels }}"

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/scrape_config.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,151 @@
+global:
+  scrape_interval: 1m
+  scrape_timeout: 10s
+
+scrape_config_files:
+  - scrape_ssl.yaml
+# These can even be urls: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#loading-scrape-configs-from-multiple-files
+
+scrape_configs:
+  # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
+
+  - job_name: "victoriametrics"
+    metrics_path: /m/metrics
+    static_configs:
+      - targets:
+          - victoriametrics.default.svc.cluster.local
+
+  - job_name: "vmalert"
+    metrics_path: /vmalert/metrics
+    static_configs:
+      - targets:
+          - vmalert.default.svc.cluster.local
+
+  - job_name: "kubernetes-apiservers"
+    scheme: https
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: endpoints }]
+
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+        action: keep
+        regex: default;kubernetes;https
+
+  - job_name: "kubernetes-nodes"
+    scheme: https
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: node }]
+
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+
+  # see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
+  # for metric definitions
+  - job_name: "kubernetes-cadvisor"
+    scheme: https
+    metrics_path: /metrics/cadvisor
+    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    kubernetes_sd_configs: [{ role: node }]
+
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+
+  - job_name: "kube-state-metrics"
+    static_configs:
+      - targets:
+          - kube-state-metrics.kube-system.svc.cluster.local:8080
+          - kube-state-metrics.kube-system.svc.cluster.local:8081
+
+  - job_name: "k8services"
+    kubernetes_sd_configs: [{ role: endpoints }]
+    relabel_configs:
+      # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
+      #   annotations: { prometheus.io/scrape: "false" }
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        regex: false
+        action: drop
+
+      - source_labels: [__meta_kubernetes_service_name]
+        regex: kubernetes
+        action: drop
+
+      - source_labels: [__meta_kubernetes_namespace]
+        regex: default
+        action: keep
+
+      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_pod_container_port_number]
+        regex: "mitmproxy;1008[01]"
+        action: drop
+
+      - source_labels: [__meta_kubernetes_service_name]
+        target_label: job
+
+      - source_labels: [__meta_kubernetes_pod_node_name]
+        target_label: node
+
+      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
+        action: drop
+        regex: jsregistry;4873
+
+      - source_labels: [__meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_number]
+        action: drop
+        regex: mosquitto-ext;1883
+
+  # # seems like this would match more stuff, but all I get is coredns
+  # - job_name: 'old_coredns'
+  #   kubernetes_sd_configs: [{role: pod}]
+  #   relabel_configs:
+  #     - source_labels: [__meta_kubernetes_pod_container_port_name]
+  #       regex: metrics
+  #       action: keep
+  #     - source_labels: [__meta_kubernetes_pod_container_name]
+  #       target_label: job
+
+  - job_name: "telegraf"
+    scheme: http
+    kubernetes_sd_configs: [{ role: node }]
+    relabel_configs:
+      - source_labels: [__address__]
+        regex: "(.*):(\\d+)"
+        target_label: __address__
+        replacement: "${1}:9273"
+        action: replace
+
+  - job_name: "ntop"
+    metrics_path: /lua/local/lanscape/main.lua
+    static_configs:
+      - targets:
+          - 10.5.0.1:3000
+
+  - job_name: "net-routes"
+    static_configs:
+      - targets:
+          - 10.2.0.3:10001
+
+  - job_name: "ping"
+    scrape_interval: 2m
+    metrics_path: /probe
+    params:
+      module: [icmp]
+    static_configs:
+      - targets:
+          # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://10.2.0.37/general/status.html?pageid=1
+          - 10.2.0.37
+          # frontbed, for monitoring
+          - 10.5.0.17
+
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: prober

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config/scrape_ssl.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,19 @@
+scrape_configs:
+  - job_name: "prober"
+    scrape_interval: 24h
+    metrics_path: /probe
+    params:
+      module: [https]
+    static_configs:
+      - targets:
+          # sync with /my/doc/ssl/letsencrypt/run.py
+          - fantasyfamegame.com
+          - bigast.com
+          - bigasterisk.com
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: prober

--- a/deploy.yaml	Sun Jun 12 17:06:10 2022 -0700
+++ b/deploy.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -18,6 +18,8 @@
         - name: data
           persistentVolumeClaim:
             claimName: opt-victoriametrics
+        - name: config
+          configMap: {name: victoriametrics-config}
       serviceAccountName: victoriametrics
       containers:
         - name: victoriametrics
@@ -27,7 +29,7 @@
             - -storageDataPath=/data
             - -memory.allowedBytes=512MB
             - -http.pathPrefix=/m/
-            - -promscrape.config=/data/scrape_config.yaml
+            - -promscrape.config=/config/scrape_config.yaml
             - -loggerTimezone=America/Los_Angeles
             - -retentionPeriod=10y
             - -sortLabels
@@ -35,6 +37,7 @@
             - containerPort: 8428
           volumeMounts:
            - { name: data, mountPath: /data }
+           - { name: config, mountPath: /config }
          # resources:
          #   limits:
          #     memory: 0.5Gi

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tasks.py	Sun Jun 12 17:08:31 2022 -0700
@@ -0,0 +1,26 @@
+import urllib.request
+from invoke import task
+
+
+def updateConfigmapCmd(path):
+    return f'kubectl create configmap victoriametrics-config --from-file {path}=config/{path} -o yaml --dry-run=client | kubectl apply -f -'
+
+
+def reload(ctx, svc):
+    host = ctx.run(f'khost {svc}').stdout
+    path = {'victoriametrics': '/m', 'vmalert': '/vmalert'}[svc]
+    print(' -> status',
+          urllib.request.urlopen(f'http://{host}{path}/-/reload').status)
+
+
+@task
+def sync_config(ctx):
+    ctx.run(updateConfigmapCmd("scrape_config.yaml"))
+    ctx.run(updateConfigmapCmd("scrape_ssl.yaml"))
+    reload(ctx, 'victoriametrics')
+
+    ctx.run(updateConfigmapCmd("rules_expected_hosts.yaml"))
+    ctx.run(updateConfigmapCmd("rules_expected_nodes.yaml"))
+    ctx.run(updateConfigmapCmd("rules_k8s.yaml"))
+    ctx.run(updateConfigmapCmd("rules_main.yaml"))
+    reload(ctx, 'vmalert')

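Usage sketch (not part of this changeset): with Invoke installed, the task above is normally run from the repo root as `invoke sync-config` (Invoke exposes underscored task names with dashes by default; `invoke sync_config` typically resolves to the same task). A minimal Python driver for the same thing, assuming tasks.py is importable from the current directory and that kubectl and khost are on PATH, might look like:

    # hypothetical driver, not in this commit: run the sync_config task without the invoke CLI
    from invoke import Context

    import tasks

    tasks.sync_config(Context())  # Task objects are callable with a Context
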
--- a/vmalert.yaml	Sun Jun 12 17:06:10 2022 -0700
+++ b/vmalert.yaml	Sun Jun 12 17:08:31 2022 -0700
@@ -18,12 +18,14 @@
         - name: data
           persistentVolumeClaim:
             claimName: opt-victoriametrics
+        - name: config
+          configMap: {name: victoriametrics-config}
       serviceAccountName: victoriametrics
       containers:
         - name: vmalert
           image: victoriametrics/vmalert:v1.77.2
           args:
-            - -rule=/data/rules/*.yaml
+            - -rule=/config/rules/*.yaml
             - -datasource.url=http://victoriametrics.default.svc.cluster.local./m/
             - -notifier.url=http://alertmanager.default.svc.cluster.local.
             - -remoteWrite.url=http://victoriametrics.default.svc.cluster.local./m/
@@ -36,6 +38,7 @@
             - containerPort: 8880
           volumeMounts:
            - { name: data, mountPath: /data }
+           - { name: config, mountPath: /config }
          resources:
            limits:
              memory: 0.5Gi
