changeset 66:429bfd62e6ba
clean out old config; move remaining bits into next/
author      drewp@bigasterisk.com
date        Fri, 03 May 2024 11:19:50 -0700
parents     fada8d64c4d3
children    adde35eb4773
files       alert_rules.py config/scrape_main.yaml deploy_alertmanager.yaml deploy_vmalert.yaml deploy_vmetrics.yaml ingress.yaml k8s_ops.py next/deploy_alertmanager.yaml next/deploy_vmalert.yaml next/ingress_alertmanager.yaml next/tasks.py next/volumes_alert.yaml push-config.sh roles.yaml skaffold.yaml tasks.py volumes.yaml
diffstat    17 files changed, 190 insertions(+), 989 deletions(-)
--- a/alert_rules.py Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,433 +0,0 @@
-"""
-pdm run invoke push-config
-
-docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-"Whenever the alert expression results in one or more vector
-elements at a given point in time, the alert counts as active for
-these elements' label sets."
-also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-
-"""
-
-import json
-
-
-def pomRules():
-    return [
-        {
-            "alert": "frequent_upstream_connect_failures",
-            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
-        },
-        {
-            "alert": "high_logging_pomerium",
-            "for": "3h",
-            "labels": {
-                "severity": "waste"
-            },
-            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
-            "annotations": {
-                "summary": "high log output rate"
-            },
-        },
-    ]
-
-
-def k8sRules():
-    # from https://awesome-prometheus-alerts.grep.to/rules.html
-    return [
-        {
-            "alert": "metricsTargetMissing",
-            "expr": 'up{job!~"cm-acme-.*"} == 0',
-            'for': '10m',
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "metrics target missing (instance {{ $labels.instance }})",
-                "description": "A metrics target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesMemoryPressure",
-            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesDiskPressure",
-            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesOutOfDisk",
-            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesJobFailed",
-            "expr": "kube_job_status_failed > 0",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
-                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesPodCrashLooping",
-            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
-            "for": "2m",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
-                "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesClientCertificateExpiresNextWeek",
-            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
-                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "container_waiting",
-            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
-            "annotations": {
-                "description": '',
-                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
-            },
-            "for": "2m",
-        },
-    ]
-
-
-def allRules(ctx):
-    return {
-        "groups": [
-            {
-                "name": "k8s",
-                "interval": "1m",
-                "rules": k8sRules(),
-            },
-            {
-                "name": "pomerium_proxy",
-                "interval": "1m",
-                "rules": pomRules(),
-            },
-            {
-                "name":
-                    "Outages",
-                "interval":
-                    "1m",
-                "rules": [
-                    {
-                        "alert": "powereagleStalled",
-                        "expr": "rate(house_power_w[100m]) == 0",
-                        "for": "0m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data stalled",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "powereagleAbsent",
-                        "expr": "absent_over_time(house_power_w[5m])",
-                        "for": "2m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data missing",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "absent_zigbee",
-                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
-                    },
-                    {
-                        "alert": "net_routes_sync",
-                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
-                        "for": "10m",
-                        "labels": {
-                            "severity": "houseUsersAffected"
-                        },
-                        "annotations": {
-                            "summary": "net_routes is not getting regular updates"
-                        },
-                    },
-                ],
-            },
-            {
-                "name": "disk_errs",
-                "interval": "2d",
-                "rules": [{
-                    "alert": "zpool_device_error_increase",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
-                }, {
-                    "alert": "zpool_device_error_count",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'zpool_device_error_count > 0',
-                }],
-            },
-            {
-                "name": "lighting",
-                "interval": "5m",
-                "rules": [{
-                    "alert": "light_bridge_no_mqtt",
-                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
-                }],
-            },
-            {
-                "name":
-                    "front_door",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "front_door_reader_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_down",
-                        'expr': 'up{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_reader_no_mqtt",
-                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_down",
-                        'expr': 'up{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_no_mqtt",
-                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                ],
-            },
-            {
-                "name":
-                    "net_routes",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "no_house_ip_service",
-                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
-                    },
-                    {
-                        "alert": "no_net_routes_running",
-                        "expr": 'absent(python_info{job="net-routes"})'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_200",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_403",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
-                    },
-                    {
-                        'alert': 'net_route_input_eval_cal_loop_is_down',
-                        'expr': 'eval_cal_up!=1'
-                    },
-                    {
-                        'alert': 'net_route_input_mongo_loop_is_down',
-                        'expr': 'mongo_to_net_routes_up!=1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
-                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_current_events_loop_is_down',
-                        'expr': 'current_events_up != 1'
-                    },
-                ],
-            },
-            {
-                "name": "http",
-                "interval": "1h",
-                'rules': [
-                    {
-                        'alert': 'old_https_certs',
-                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
-                    },
-                    {
-                        'alert': 'high_500_response_rate',
-                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
-                    },
-                ],
-            },
-            {
-                "name": "ping",
-                "interval": "1m",
-                "rules": [{
-                    "alert": "ping_failed",
-                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
-                }]
-            },
-            {
-                "name":
-                    "alerts",
-                "rules": [
-                    {
-                        "alert": "kube_node_status_bad_condition",
-                        "for": "2h",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
-                    },
-                    {
-                        "alert": "housePower",
-                        "for": "1h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": "house_power_w > 4000",
-                        "annotations": {
-                            "summary": "house power usage over 4KW"
-                        },
-                    },
-                    {
-                        "alert": "host_root_fs_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
-                    },
-                    {
-                        "alert": "zpool_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
-                    },
-                    {
-                        "alert": "disk_week_incr",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
-                        "annotations": {
-                            "summary": "high mb/week on zfs dir"
-                        },
-                    },
-                    {
-                        "alert": "high_logging",
-                        "for": "3h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
-                        "annotations": {
-                            "summary": "high log output rate"
-                        },
-                    },
-                    {
-                        "alert": "stale_process",
-                        "for": "1d",
-                        "labels": {
-                            "severity": "dataRisk"
-                        },
-                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
-                        "annotations": {
-                            "summary": "process time is old"
-                        },
-                    },
-                    {
-                        "alert": "starlette",
-                        "for": "1m",
-                        "labels": {
-                            "severity": "fix"
-                        },
-                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
-                        "annotations": {
-                            "summary": "set starlette app name"
-                        },
-                    },
-                    {
-                        "alert": "ssl_certs_expiring_soon",
-                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "annotations": {
-                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
-                        },
-                    },
-                ],
-            },
-        ] + hostsExpectedOnline(ctx)['groups']
-    }
-
-
-def _runJson(ctx, cmd):
-    return json.loads(ctx.run(cmd, hide="stdout").stdout)
-
-
-def hostsExpectedOnline(ctx):
-    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
--- a/config/scrape_main.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-  # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
-
-  - job_name: "k8services"
-    kubernetes_sd_configs: [{ role: endpoints }]
-    relabel_configs:
-      # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
-      #   annotations: { prometheus.io/scrape: "false" }
-      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
-        regex: "false"
-        action: drop
-
-      # - source_labels: [__meta_kubernetes_namespace]
-      #   regex: default
-      #   action: keep
-
-      # promote these to display
-      - source_labels: [__meta_kubernetes_service_name]
-        target_label: job
-
-      - source_labels: [__meta_kubernetes_pod_node_name]
-        target_label: node
-
-      # for convenience in this config
-      - source_labels: [__meta_kubernetes_pod_container_port_number]
-        target_label: __port_number
-
-      # path tweaks
-      - if: '{job="victoriametrics",__port_number="8428"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/m/metrics"
-      - if: '{job="victorialogs",__port_number="9428"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/logs/metrics"
-      - if: '{job="video-files",__port_number="8004"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/video/api/metrics"
-
-      # discovery is matching extra ports that don't serve metrics- remove these targets
-      - {if: '{job="cert-manager-webhook"}', action: drop}
-      - {if: '{job="cert-manager", __port_number="9403"}', action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="21027"}', action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="22000"}', action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="8384"}', action: drop}
-      - {if: '{job="jsregistry", __port_number="4873"}', action: drop}
-      - {if: '{job="kube-dns", __port_number="53"}', action: drop}
-      - {if: '{job="kubernetes"}', action: drop}
-      - {if: '{job="mongodb", __port_number="27017"}', action: drop}
-      - {if: '{job="mosquitto-ext", __port_number="1883"}', action: drop}
-      - {if: '{job="net-route-input", __port_number="80"}', action: drop}
-      - {if: '{job="photoprism", __port_number="2342"}', action: drop}
-      - {if: '{job="pomerium-metrics", __port_number="8080"}', action: drop}
-      - {if: '{job="pomerium-metrics", __port_number="8443"}', action: drop}
-      - {if: '{job="pomerium-proxy", __port_number="8080"}', action: drop}
-      - {if: '{job="pomerium-proxy", __port_number="8443"}', action: drop}
-      - {if: '{job="video-files", __port_number="8003"}', action: drop}
-      - {if: '{job=~"cm-acme-.*"}', action: drop}
-      - {if: '{job="nvidiagpu-node-feature-discovery-master", __port_number="8080"}', action: drop}
-
-      # already have this with a job="pomerium-proxy"
-      - {if: '{job="pomerium-metrics"}', action: drop}
-
-
-      # Assume all 8001/8002 port discoveries are redundant with an nginx proxy
-      - {if: '{__port_number="8001"}', action: drop}
-      - {if: '{__port_number="8002"}', action: drop}
-
-      # Needs https. Used by `kubectl top`
-      - {if: '{job="metrics-server", __port_number="443"}', action: drop}
-      # Something doesn't work with the scrape, and I don't see why I should care:
-      - {if: '{job="metrics-server" }', action: drop}
-
--- a/deploy_alertmanager.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: alertmanager
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: alertmanager
-  template:
-    metadata:
-      labels:
-        app: alertmanager
-    spec:
-      volumes:
-        - name: opt-alertmanager
-          persistentVolumeClaim:
-            claimName: opt-alertmanager
-      serviceAccountName: victoriametrics
-      containers:
-        - name: alertmanager
-          image: docker.io/prom/alertmanager:v0.25.0
-          args:
-            - --config.file=/alertmanager/alertmanager.yml
-            - --web.external-url=https://bigasterisk.com/alertmanager/
-            - --web.route-prefix=/
-            - --log.level=info
-          ports:
-            - containerPort: 9093
-          volumeMounts:
-            - name: opt-alertmanager
-              mountPath: /alertmanager
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: "kubernetes.io/hostname"
-                    operator: In
-                    values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager
-spec:
-  ports:
-    - port: 80
-      targetPort: 9093
-  selector:
-    app: alertmanager
--- a/deploy_vmalert.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vmalert
-spec:
-  replicas: 1
-  strategy: { type: Recreate }
-  selector:
-    matchLabels:
-      app: vmalert
-  template:
-    metadata:
-      labels:
-        app: vmalert
-      annotations:
-        prometheus.io/scrape: "true"
-    spec:
-      volumes:
-        - name: config
-          configMap: { name: victoriametrics-config }
-      serviceAccountName: victoriametrics
-      containers:
-        - name: vmalert
-          image: docker.io/victoriametrics/vmalert:v1.91.2
-          args:
-            - -configCheckInterval=5s
-            - -datasource.url=http://victoriametrics/m/
-            - -datasource.queryStep=5m
-            - -evaluationInterval=1m
-            - -external.url=https://bigasterisk.com/vmalert
-            - -loggerLevel=INFO
-            - -loggerTimezone=America/Los_Angeles
-            - -memory.allowedBytes=512MB
-            - -notifier.url=http://alertmanager
-            - -remoteRead.url=http://victoriametrics/m/
-            - -remoteWrite.url=http://victoriametrics/m/
-            - -rule=/local/rules
-          ports:
-            - containerPort: 8880
-          volumeMounts:
-            - { name: config, mountPath: /local }
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vmalert
-spec:
-  ports:
-    - port: 80
-      targetPort: 8880
-  selector:
-    app: vmalert
--- a/deploy_vmetrics.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: victoriametrics
-spec:
-  replicas: 1
-  strategy: { type: Recreate }
-  selector:
-    matchLabels:
-      app: victoriametrics
-  template:
-    metadata:
-      labels:
-        app: victoriametrics
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/path: "/m/metrics"
-        prometheus.io/port: "80"
-    spec:
-      volumes:
-        - name: data
-          persistentVolumeClaim:
-            claimName: opt-victoriametrics
-        - name: config
-          configMap: {name: victoriametrics-config}
-      serviceAccountName: victoriametrics
-      containers:
-        - name: victoriametrics
-          # https://hub.docker.com/r/victoriametrics/victoria-metrics/tags also check vmalert.yaml
-          image: docker.io/victoriametrics/victoria-metrics:v1.91.2
-          args:
-            - -http.pathPrefix=/m/
-            - -loggerTimezone=America/Los_Angeles
-            - -memory.allowedBytes=512MB
-            - -promscrape.config=/local/config/scrape_main
-            - -promscrape.configCheckInterval=5s
-            - -retentionPeriod=10y
-            - -sortLabels
-            - -storageDataPath=/data
-          ports:
-            - containerPort: 8428
-          volumeMounts:
-            - { name: data, mountPath: /data }
-            - { name: config, mountPath: "/local/config" }
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: "kubernetes.io/hostname"
-                    operator: In
-                    values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: victoriametrics
-spec:
-  ports:
-    - port: 80
-      targetPort: 8428
-  selector:
-    app: victoriametrics
--- a/ingress.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: victoriametrics
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or:
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    ingress.pomerium.io/prefix_rewrite: "/m/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /m/
-            backend: { service: { name: victoriametrics, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: vmalert
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or:
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /vmalert/
-            backend: { service: { name: vmalert, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: alertmanager
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or:
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    ingress.pomerium.io/prefix_rewrite: "/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /alertmanager/
-            backend: { service: { name: alertmanager, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
\ No newline at end of file
--- a/k8s_ops.py Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-import json
-import time
-
-from kubernetes import client
-
-
-def refreshPodCmaps(pod_name, namespace="default"):
-    """
-    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while
-    until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations.
-    """
-    api_instance = client.CoreV1Api()
-
-    pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
-    if pod.metadata.annotations is None:
-        pod.metadata.annotations = {}
-    pod.metadata.annotations["force-configmap-update"] = str(time.time())
-    api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
-
-
-def firstPodName(selector):
-    api_instance = client.CoreV1Api()
-    pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
-    return pod_list.items[0].metadata.name
-
-
-def hup(ctx, deployment, process_name):
-    ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
-
-
-def replaceCmap(name, dataObj):
-    api_instance = client.CoreV1Api()
-
-    data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
-
-    try:
-
-        existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
-        existing_config_map.data.update(data)
-        api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
-    except client.rest.ApiException as e:
-        if e.status == 404:
-            config_map = client.V1ConfigMap()
-            config_map.metadata = client.V1ObjectMeta(name=name)
-            config_map.data = data
-            api_response = api_instance.create_namespaced_config_map('default', config_map)
-        else:
-            raise
-
-    print(f"{name} resource_version is now {api_response.metadata.resource_version}")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/next/deploy_alertmanager.yaml Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+  template:
+    metadata:
+      labels:
+        app: alertmanager
+    spec:
+      volumes:
+        - name: opt-alertmanager
+          persistentVolumeClaim:
+            claimName: opt-alertmanager
+      serviceAccountName: victoriametrics
+      containers:
+        - name: alertmanager
+          image: docker.io/prom/alertmanager:v0.27.0
+          args:
+            - --config.file=/alertmanager/alertmanager.yml
+            - --web.external-url=https://bigasterisk.com/alertmanager/
+            - --web.route-prefix=/
+            - --log.level=info
+          ports:
+            - containerPort: 9093
+          volumeMounts:
+            - name: opt-alertmanager
+              mountPath: /alertmanager
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: "kubernetes.io/hostname"
+                    operator: In
+                    values: ["ditto"]
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+spec:
+  ports:
+    - port: 80
+      targetPort: 9093
+  selector:
+    app: alertmanager
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/next/deploy_vmalert.yaml Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmalert
+spec:
+  replicas: 1
+  strategy: { type: Recreate }
+  selector:
+    matchLabels:
+      app: vmalert
+  template:
+    metadata:
+      labels:
+        app: vmalert
+      annotations:
+        prometheus.io/scrape: "true"
+    spec:
+      volumes:
+        - name: config
+          configMap: { name: victoriametrics-config }
+      serviceAccountName: victoriametrics
+      containers:
+        - name: vmalert
+          image: docker.io/victoriametrics/vmalert:v1.91.2
+          args:
+            - -configCheckInterval=5s
+            - -datasource.url=http://victoriametrics/m/
+            - -datasource.queryStep=5m
+            - -evaluationInterval=1m
+            - -external.url=https://bigasterisk.com/vmalert
+            - -loggerLevel=INFO
+            - -loggerTimezone=America/Los_Angeles
+            - -memory.allowedBytes=512MB
+            - -notifier.url=http://alertmanager
+            - -remoteRead.url=http://victoriametrics/m/
+            - -remoteWrite.url=http://victoriametrics/m/
+            - -rule=/local/rules
+          ports:
+            - containerPort: 8880
+          volumeMounts:
+            - { name: config, mountPath: /local }
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vmalert
+spec:
+  ports:
+    - port: 80
+      targetPort: 8880
+  selector:
+    app: vmalert
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/next/ingress_alertmanager.yaml Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,55 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: vmalert
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or:
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /vmalert/
+            backend: { service: { name: vmalert, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: alertmanager
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or:
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    ingress.pomerium.io/prefix_rewrite: "/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /alertmanager/
+            backend: { service: { name: alertmanager, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
\ No newline at end of file
--- a/next/tasks.py Thu May 02 23:15:37 2024 -0700
+++ b/next/tasks.py Fri May 03 11:19:50 2024 -0700
@@ -15,7 +15,7 @@
 
 
 @task
-def push_config_2024(ctx):
+def push_config(ctx):
     # plan:
     # every discovered service may:
     #   - be described here as a forever retention - ignore the discovery
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/next/volumes_alert.yaml Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: opt-alertmanager
+  labels:
+    type: local
+spec:
+  storageClassName: manual
+  hostPath:
+    path: "/opt/alertmanager"
+  capacity:
+    storage: 50Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  claimRef:
+    namespace: default
+    name: opt-alertmanager
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: opt-alertmanager
+spec:
+  storageClassName: ""
+  volumeName: "opt-alertmanager"
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
\ No newline at end of file
--- a/push-config.sh Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-#!/bin/sh
-pdm run invoke push-config
--- a/roles.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: victoriametrics
-rules:
-- apiGroups: [""]
-  resources:
-    - nodes
-    - nodes/metrics
-    - nodes/proxy
-    - services
-    - endpoints
-    - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups:
-    - extensions
-  resources:
-    - ingresses
-  verbs: ["get", "list", "watch"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: victoriametrics
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: victoriametrics
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: victoriametrics
-subjects:
-- kind: ServiceAccount
-  name: victoriametrics
-  namespace: default
-# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
-# - kind: ServiceAccount
-#   name: default
-#   namespace: default
\ No newline at end of file
--- a/skaffold.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-apiVersion: skaffold/v3
-kind: Config
-metadata:
-  name: victoriametrics
-manifests:
-  rawYaml:
-    - roles.yaml
-    - volumes.yaml
-    - ingress.yaml
-    - deploy_vmetrics.yaml
-    - deploy_vmalert.yaml
-    - deploy_alertmanager.yaml
-deploy:
-  kubectl: {}
--- a/tasks.py Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-from pathlib import Path
-import yaml
-from invoke import task
-from kubernetes import config
-
-import alert_rules
-
-from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
-
-config.load_kube_config()
-
-
-@task
-def push_config(ctx):
-    rulesObj = alert_rules.allRules(ctx)
-
-    replaceCmap("victoriametrics-config", {
-        "scrape_main": scrapeConfig("config/scrape_main.yaml"),
-        "scrape_recent": scrapeConfig("config/scrape_recent.yaml"),
-        "scrape_forever": scrapeConfig("config/scrape_forever.yaml"),
-        "rules": rulesObj,
-    })
-
-    # these don't give errors on rules format! they just quietly keep the old
-    # rules! use `skaffold run` to get errs.
-    #
-    # or run
-    # validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
-    # validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
-
-    refreshPodCmaps(firstPodName("app=victoriametrics"))
-    refreshPodCmaps(firstPodName("app=vmalert"))
-
-    # If the VM reloader isn't fast enough, we could do this too:
-    # hup(ctx, 'deploy/victoriametrics', 'victoria-metrics-prod')
-
-
-@task
-def push_config_2024(ctx):
-    # plan:
-    # every discovered service may:
-    #   - be described here as a forever retention - ignore the discovery
-    #   - be blocked here as a no-metrics service - ignore the discovery
-    #   - be scraped as 'recent', with possible overrides of port/path
-    # all per-node metrics shall be 'recent' (oops, not smartctl!)
-    map: dict[str, object] = {
-        'rules': alert_rules.allRules(ctx),
-    }
-    top=Path('config/build/scrape_jobs')
-    for p in top.glob('**/*.yaml'):
-        map[str(p.relative_to(top))] = scrapeConfig(p)
-    replaceCmap("next-victoriametrics-config", map)
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
-
-
-def scrapeConfig(fn):
-    return yaml.load(open(fn), yaml.FullLoader)
-
--- a/volumes.yaml Thu May 02 23:15:37 2024 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: opt-victoriametrics
-  labels:
-    type: local
-spec:
-  storageClassName: manual
-  hostPath:
-    path: "/opt/victoriametrics"
-  capacity:
-    storage: 50Gi
-  accessModes:
-    - ReadWriteMany
-  persistentVolumeReclaimPolicy: Retain
-  claimRef:
-    namespace: default
-    name: opt-victoriametrics
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: opt-victoriametrics
-spec:
-  storageClassName: ""
-  volumeName: "opt-victoriametrics"
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 50Gi
----
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: opt-alertmanager
-  labels:
-    type: local
-spec:
-  storageClassName: manual
-  hostPath:
-    path: "/opt/alertmanager"
-  capacity:
-    storage: 50Gi
-  accessModes:
-    - ReadWriteOnce
-  persistentVolumeReclaimPolicy: Retain
-  claimRef:
-    namespace: default
-    name: opt-alertmanager
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: opt-alertmanager
-spec:
-  storageClassName: ""
-  volumeName: "opt-alertmanager"
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
\ No newline at end of file