changeset 66:429bfd62e6ba

clean out old config; move remaining bits into next/
author drewp@bigasterisk.com
date Fri, 03 May 2024 11:19:50 -0700
parents fada8d64c4d3
children adde35eb4773
files alert_rules.py config/scrape_main.yaml deploy_alertmanager.yaml deploy_vmalert.yaml deploy_vmetrics.yaml ingress.yaml k8s_ops.py next/deploy_alertmanager.yaml next/deploy_vmalert.yaml next/ingress_alertmanager.yaml next/tasks.py next/volumes_alert.yaml push-config.sh roles.yaml skaffold.yaml tasks.py volumes.yaml
diffstat 17 files changed, 190 insertions(+), 989 deletions(-)
--- a/alert_rules.py	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,433 +0,0 @@
-"""
-pdm run invoke push-config
-
-docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-"Whenever the alert expression results in one or more vector
-elements at a given point in time, the alert counts as active for
-these elements' label sets."
-also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-
-"""
-
-import json
-
-
-def pomRules():
-    return [
-        {
-            "alert": "frequent_upstream_connect_failures",
-            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
-        },
-        {
-            "alert": "high_logging_pomerium",
-            "for": "3h",
-            "labels": {
-                "severity": "waste"
-            },
-            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
-            "annotations": {
-                "summary": "high log output rate"
-            },
-        },
-    ]
-
-
-def k8sRules():
-    # from https://awesome-prometheus-alerts.grep.to/rules.html
-    return [
-        {
-            "alert": "metricsTargetMissing",
-            "expr": 'up{job!~"cm-acme-.*"} == 0',
-            'for': '10m',
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "metrics target missing (instance {{ $labels.instance }})",
-                "description": "A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesMemoryPressure",
-            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesDiskPressure",
-            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesOutOfDisk",
-            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesJobFailed",
-            "expr": "kube_job_status_failed > 0",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
-                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesPodCrashLooping",
-            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
-            "for": "2m",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
-                "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesClientCertificateExpiresNextWeek",
-            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
-                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "container_waiting",
-            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
-            "annotations": {
-                "description": '',
-                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
-            },
-            "for": "2m",
-        },
-    ]
-
-
-def allRules(ctx):
-    return {
-        "groups": [
-            {
-                "name": "k8s",
-                "interval": "1m",
-                "rules": k8sRules(),
-            },
-            {
-                "name": "pomerium_proxy",
-                "interval": "1m",
-                "rules": pomRules(),
-            },
-            {
-                "name":
-                    "Outages",
-                "interval":
-                    "1m",
-                "rules": [
-                    {
-                        "alert": "powereagleStalled",
-                        "expr": "rate(house_power_w[100m]) == 0",
-                        "for": "0m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data stalled",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "powereagleAbsent",
-                        "expr": "absent_over_time(house_power_w[5m])",
-                        "for": "2m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data missing",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "absent_zigbee",
-                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
-                    },
-                    {
-                        "alert": "net_routes_sync",
-                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
-                        "for": "10m",
-                        "labels": {
-                            "severity": "houseUsersAffected"
-                        },
-                        "annotations": {
-                            "summary": "net_routes is not getting regular updates"
-                        },
-                    },
-                ],
-            },
-            {
-                "name": "disk_errs",
-                "interval": "2d",
-                "rules": [{
-                    "alert": "zpool_device_error_increase",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
-                }, {
-                    "alert": "zpool_device_error_count",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'zpool_device_error_count > 0',
-                }],
-            },
-            {
-                "name": "lighting",
-                "interval": "5m",
-                "rules": [{
-                    "alert": "light_bridge_no_mqtt",
-                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
-                }],
-            },
-            {
-                "name":
-                    "front_door",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "front_door_reader_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_down",
-                        'expr': 'up{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_reader_no_mqtt",
-                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_down",
-                        'expr': 'up{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_no_mqtt",
-                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                ],
-            },
-            {
-                "name":
-                    "net_routes",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "no_house_ip_service",
-                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
-                    },
-                    {
-                        "alert": "no_net_routes_running",
-                        "expr": 'absent(python_info{job="net-routes"})'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_200",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_403",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
-                    },
-                    {
-                        'alert': 'net_route_input_eval_cal_loop_is_down',
-                        'expr': 'eval_cal_up!=1'
-                    },
-                    {
-                        'alert': 'net_route_input_mongo_loop_is_down',
-                        'expr': 'mongo_to_net_routes_up!=1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
-                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_current_events_loop_is_down',
-                        'expr': 'current_events_up != 1'
-                    },
-                ],
-            },
-            {
-                "name": "http",
-                "interval": "1h",
-                'rules': [
-                    {
-                        'alert': 'old_https_certs',
-                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
-                    },
-                    {
-                        'alert': 'high_500_response_rate',
-                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
-                    },
-                ],
-            },
-            {
-                "name": "ping",
-                "interval": "1m",
-                "rules": [{
-                    "alert": "ping_failed",
-                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
-                }]
-            },
-            {
-                "name":
-                    "alerts",
-                "rules": [
-                    {
-                        "alert": "kube_node_status_bad_condition",
-                        "for": "2h",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
-                    },
-                    {
-                        "alert": "housePower",
-                        "for": "1h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": "house_power_w > 4000",
-                        "annotations": {
-                            "summary": "house power usage over 4KW"
-                        },
-                    },
-                    {
-                        "alert": "host_root_fs_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
-                    },
-                    {
-                        "alert": "zpool_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
-                    },
-                    {
-                        "alert": "disk_week_incr",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
-                        "annotations": {
-                            "summary": "high mb/week on zfs dir"
-                        },
-                    },
-                    {
-                        "alert": "high_logging",
-                        "for": "3h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
-                        "annotations": {
-                            "summary": "high log output rate"
-                        },
-                    },
-                    {
-                        "alert": "stale_process",
-                        "for": "1d",
-                        "labels": {
-                            "severity": "dataRisk"
-                        },
-                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
-                        "annotations": {
-                            "summary": "process time is old"
-                        },
-                    },
-                    {
-                        "alert": "starlette",
-                        "for": "1m",
-                        "labels": {
-                            "severity": "fix"
-                        },
-                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
-                        "annotations": {
-                            "summary": "set starlette app name"
-                        },
-                    },
-                    {
-                        "alert": "ssl_certs_expiring_soon",
-                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "annotations": {
-                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
-                        },
-                    },
-                ],
-            },
-        ] + hostsExpectedOnline(ctx)['groups']
-    }
-
-
-def _runJson(ctx, cmd):
-    return json.loads(ctx.run(cmd, hide="stdout").stdout)
-
-
-def hostsExpectedOnline(ctx):
-    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
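
The module above builds vmalert rule groups as plain Python dicts; the (also deleted) tasks.py pushed them through replaceCmap(), which JSON-encodes each value, and vmalert can still load that because JSON is valid YAML. A minimal sketch, not from the repo, of rendering one rule above in the YAML shape vmalert reads via -rule=/local/rules:

    import yaml

    # one rule from k8sRules(), serialized the way a standalone rules file would look
    groups = {
        "groups": [{
            "name": "k8s",
            "interval": "1m",
            "rules": [{
                "alert": "metricsTargetMissing",
                "expr": 'up{job!~"cm-acme-.*"} == 0',
                "for": "10m",
                "labels": {"severity": "critical"},
            }],
        }]
    }
    print(yaml.safe_dump(groups, sort_keys=False))
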
--- a/config/scrape_main.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-  # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
-
-  - job_name: "k8services"
-    kubernetes_sd_configs: [{ role: endpoints }]
-    relabel_configs:
-      # To omit a service, add this at pod-level (Deployment.spec.template.metadata.annotations):
-      #   annotations: { prometheus.io/scrape: "false" }
-      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
-        regex: "false"
-        action: drop
-
-      # - source_labels: [__meta_kubernetes_namespace]
-      #   regex: default
-      #   action: keep
-
-      # promote these to display
-      - source_labels: [__meta_kubernetes_service_name]
-        target_label: job
-
-      - source_labels: [__meta_kubernetes_pod_node_name]
-        target_label: node
-
-      # for convenience in this config
-      - source_labels: [__meta_kubernetes_pod_container_port_number]
-        target_label: __port_number
-
-      # path tweaks
-      - if: '{job="victoriametrics",__port_number="8428"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/m/metrics"
-      - if: '{job="victorialogs",__port_number="9428"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/logs/metrics"
-      - if: '{job="video-files",__port_number="8004"}'
-        action: replace
-        target_label: "__metrics_path__"
-        replacement: "/video/api/metrics"
-
-      # discovery is matching extra ports that don't serve metrics- remove these targets
-      - {if: '{job="cert-manager-webhook"}',                     action: drop}
-      - {if: '{job="cert-manager",      __port_number="9403"}',  action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="21027"}', action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="22000"}', action: drop}
-      - {if: '{job="filesync-syncthing",__port_number="8384"}',  action: drop}
-      - {if: '{job="jsregistry",        __port_number="4873"}',  action: drop}
-      - {if: '{job="kube-dns",          __port_number="53"}',    action: drop}
-      - {if: '{job="kubernetes"}',                               action: drop}
-      - {if: '{job="mongodb",           __port_number="27017"}', action: drop}
-      - {if: '{job="mosquitto-ext",     __port_number="1883"}',  action: drop}
-      - {if: '{job="net-route-input",   __port_number="80"}',    action: drop}
-      - {if: '{job="photoprism",        __port_number="2342"}',  action: drop}
-      - {if: '{job="pomerium-metrics",  __port_number="8080"}',  action: drop}
-      - {if: '{job="pomerium-metrics",  __port_number="8443"}',  action: drop}
-      - {if: '{job="pomerium-proxy",    __port_number="8080"}',  action: drop}
-      - {if: '{job="pomerium-proxy",    __port_number="8443"}',  action: drop}
-      - {if: '{job="video-files",       __port_number="8003"}',  action: drop}
-      - {if: '{job=~"cm-acme-.*"}',                              action: drop}
-      - {if: '{job="nvidiagpu-node-feature-discovery-master", __port_number="8080"}',  action: drop}
-      
-      # already have this with a job="pomerium-proxy"
-      - {if: '{job="pomerium-metrics"}',                         action: drop}
-
-
-
-      # Assume all 8001/8002 port discoveries are redundant with an nginx proxy
-      - {if: '{__port_number="8001"}', action: drop}
-      - {if: '{__port_number="8002"}', action: drop}
-      
-      # Needs https. Used by `kubectl top`
-      - {if: '{job="metrics-server",    __port_number="443"}', action: drop}
-      # Something doesn't work with the scrape, and I don't see why I should care:
-      - {if: '{job="metrics-server"                        }', action: drop}
-
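
Illustrative only: the relabel rules above boil down to a predicate over each discovered target's labels. The real evaluation happens inside VictoriaMetrics' scraper; a toy Python model of the drop logic, using a hypothetical subset of the (job, port) pairs:

    DROPPED = {
        ("kube-dns", "53"),
        ("jsregistry", "4873"),
        ("mongodb", "27017"),
    }

    def keep_target(labels: dict) -> bool:
        # pod opted out via the prometheus.io/scrape: "false" annotation
        if labels.get("__meta_kubernetes_pod_annotation_prometheus_io_scrape") == "false":
            return False
        # discovery matched a port that is known not to serve metrics
        if (labels.get("job"), labels.get("__port_number")) in DROPPED:
            return False
        return True

    assert not keep_target({"job": "kube-dns", "__port_number": "53"})
    assert keep_target({"job": "victoriametrics", "__port_number": "8428"})
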
--- a/deploy_alertmanager.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: alertmanager
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: alertmanager
-  template:
-    metadata:
-      labels:
-        app: alertmanager
-    spec:
-      volumes:
-        - name: opt-alertmanager
-          persistentVolumeClaim:
-            claimName: opt-alertmanager
-      serviceAccountName: victoriametrics
-      containers:
-        - name: alertmanager
-          image: docker.io/prom/alertmanager:v0.25.0
-          args:
-            - --config.file=/alertmanager/alertmanager.yml
-            - --web.external-url=https://bigasterisk.com/alertmanager/
-            - --web.route-prefix=/
-            - --log.level=info
-          ports:
-          - containerPort: 9093
-          volumeMounts:
-          - name: opt-alertmanager
-            mountPath: /alertmanager
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: "kubernetes.io/hostname"
-                operator: In
-                values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager
-spec:
-  ports:
-  - port: 80
-    targetPort: 9093
-  selector:
-    app: alertmanager
--- a/deploy_vmalert.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vmalert
-spec:
-  replicas: 1
-  strategy: { type: Recreate }
-  selector:
-    matchLabels:
-      app: vmalert
-  template:
-    metadata:
-      labels:
-        app: vmalert
-      annotations:
-        prometheus.io/scrape: "true"
-    spec:
-      volumes:
-        - name: config
-          configMap: { name: victoriametrics-config }
-      serviceAccountName: victoriametrics
-      containers:
-        - name: vmalert
-          image: docker.io/victoriametrics/vmalert:v1.91.2
-          args:
-            - -configCheckInterval=5s
-            - -datasource.url=http://victoriametrics/m/
-            - -datasource.queryStep=5m
-            - -evaluationInterval=1m
-            - -external.url=https://bigasterisk.com/vmalert
-            - -loggerLevel=INFO
-            - -loggerTimezone=America/Los_Angeles
-            - -memory.allowedBytes=512MB
-            - -notifier.url=http://alertmanager
-            - -remoteRead.url=http://victoriametrics/m/
-            - -remoteWrite.url=http://victoriametrics/m/
-            - -rule=/local/rules
-          ports:
-            - containerPort: 8880
-          volumeMounts:
-            - { name: config, mountPath: /local }
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vmalert
-spec:
-  ports:
-    - port: 80
-      targetPort: 8880
-  selector:
-    app: vmalert
--- a/deploy_vmetrics.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: victoriametrics
-spec:
-  replicas: 1
-  strategy: { type: Recreate }
-  selector:
-    matchLabels:
-      app: victoriametrics
-  template:
-    metadata:
-      labels:
-        app: victoriametrics
-      annotations: 
-        prometheus.io/scrape: "true"
-        prometheus.io/path: "/m/metrics"
-        prometheus.io/port: "80"
-    spec:
-      volumes:
-        - name: data
-          persistentVolumeClaim:
-            claimName: opt-victoriametrics
-        - name: config
-          configMap: {name: victoriametrics-config}
-      serviceAccountName: victoriametrics
-      containers:
-        - name: victoriametrics
-          # https://hub.docker.com/r/victoriametrics/victoria-metrics/tags also check vmalert.yaml
-          image: docker.io/victoriametrics/victoria-metrics:v1.91.2
-          args:
-            - -http.pathPrefix=/m/
-            - -loggerTimezone=America/Los_Angeles
-            - -memory.allowedBytes=512MB
-            - -promscrape.config=/local/config/scrape_main
-            - -promscrape.configCheckInterval=5s
-            - -retentionPeriod=10y
-            - -sortLabels
-            - -storageDataPath=/data
-          ports:
-            - containerPort: 8428
-          volumeMounts:
-            - { name: data, mountPath: /data }
-            - { name: config, mountPath: "/local/config" }
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: "kubernetes.io/hostname"
-                    operator: In
-                    values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: victoriametrics
-spec:
-  ports:
-    - port: 80
-      targetPort: 8428
-  selector:
-    app: victoriametrics
--- a/ingress.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: victoriametrics
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or: 
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    ingress.pomerium.io/prefix_rewrite: "/m/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /m/
-            backend: { service: { name: victoriametrics, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: vmalert
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or: 
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /vmalert/
-            backend: { service: { name: vmalert, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: alertmanager
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or: 
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    ingress.pomerium.io/prefix_rewrite: "/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /alertmanager/
-            backend: { service: { name: alertmanager, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
\ No newline at end of file
--- a/k8s_ops.py	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-import json
-import time
-
-from kubernetes import client
-
-
-def refreshPodCmaps(pod_name, namespace="default"):
-    """
-    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while
-    until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations.
-    """
-    api_instance = client.CoreV1Api()
-
-    pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
-    if pod.metadata.annotations is None:
-        pod.metadata.annotations = {}
-    pod.metadata.annotations["force-configmap-update"] = str(time.time())
-    api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
-
-
-def firstPodName(selector):
-    api_instance = client.CoreV1Api()
-    pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
-    return pod_list.items[0].metadata.name
-
-
-def hup(ctx, deployment, process_name):
-    ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
-
-
-def replaceCmap(name, dataObj):
-    api_instance = client.CoreV1Api()
-
-    data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
-
-    try:
-
-        existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
-        existing_config_map.data.update(data)
-        api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
-    except client.rest.ApiException as e:
-        if e.status == 404:
-            config_map = client.V1ConfigMap()
-            config_map.metadata = client.V1ObjectMeta(name=name)
-            config_map.data = data
-            api_response = api_instance.create_namespaced_config_map('default', config_map)
-        else:
-            raise
-
-    print(f"{name} resource_version is now {api_response.metadata.resource_version}")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/deploy_alertmanager.yaml	Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+  template:
+    metadata:
+      labels:
+        app: alertmanager
+    spec:
+      volumes:
+        - name: opt-alertmanager
+          persistentVolumeClaim:
+            claimName: opt-alertmanager
+      serviceAccountName: victoriametrics
+      containers:
+        - name: alertmanager
+          image: docker.io/prom/alertmanager:v0.27.0
+          args:
+            - --config.file=/alertmanager/alertmanager.yml
+            - --web.external-url=https://bigasterisk.com/alertmanager/
+            - --web.route-prefix=/
+            - --log.level=info
+          ports:
+          - containerPort: 9093
+          volumeMounts:
+          - name: opt-alertmanager
+            mountPath: /alertmanager
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "kubernetes.io/hostname"
+                operator: In
+                values: ["ditto"]
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+spec:
+  ports:
+  - port: 80
+    targetPort: 9093
+  selector:
+    app: alertmanager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/deploy_vmalert.yaml	Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmalert
+spec:
+  replicas: 1
+  strategy: { type: Recreate }
+  selector:
+    matchLabels:
+      app: vmalert
+  template:
+    metadata:
+      labels:
+        app: vmalert
+      annotations:
+        prometheus.io/scrape: "true"
+    spec:
+      volumes:
+        - name: config
+          configMap: { name: victoriametrics-config }
+      serviceAccountName: victoriametrics
+      containers:
+        - name: vmalert
+          image: docker.io/victoriametrics/vmalert:v1.91.2
+          args:
+            - -configCheckInterval=5s
+            - -datasource.url=http://victoriametrics/m/
+            - -datasource.queryStep=5m
+            - -evaluationInterval=1m
+            - -external.url=https://bigasterisk.com/vmalert
+            - -loggerLevel=INFO
+            - -loggerTimezone=America/Los_Angeles
+            - -memory.allowedBytes=512MB
+            - -notifier.url=http://alertmanager
+            - -remoteRead.url=http://victoriametrics/m/
+            - -remoteWrite.url=http://victoriametrics/m/
+            - -rule=/local/rules
+          ports:
+            - containerPort: 8880
+          volumeMounts:
+            - { name: config, mountPath: /local }
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vmalert
+spec:
+  ports:
+    - port: 80
+      targetPort: 8880
+  selector:
+    app: vmalert
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/ingress_alertmanager.yaml	Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,55 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: vmalert
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or: 
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /vmalert/
+            backend: { service: { name: vmalert, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: alertmanager
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or: 
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    ingress.pomerium.io/prefix_rewrite: "/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /alertmanager/
+            backend: { service: { name: alertmanager, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
\ No newline at end of file
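
The two ingresses above handle prefixes differently: alertmanager runs with --web.route-prefix=/, so Pomerium rewrites /alertmanager/ to /, while vmalert keeps its prefix (its rewrite annotation stays commented out) and relies on -external.url. A toy model, not Pomerium's actual code, of that path handling:

    def upstream_path(path: str, ingress_prefix: str, rewrite) -> str:
        # rewrite=None models the commented-out annotation on the vmalert ingress
        if rewrite is None:
            return path
        return rewrite + path[len(ingress_prefix):]

    assert upstream_path("/alertmanager/api/v2/status", "/alertmanager/", "/") == "/api/v2/status"
    assert upstream_path("/vmalert/groups", "/vmalert/", None) == "/vmalert/groups"
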
--- a/next/tasks.py	Thu May 02 23:15:37 2024 -0700
+++ b/next/tasks.py	Fri May 03 11:19:50 2024 -0700
@@ -15,7 +15,7 @@
 
 
 @task
-def push_config_2024(ctx):
+def push_config(ctx):
     # plan:
     #   every discovered service may:
     #      - be described here as a forever retention - ignore the discovery
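
With the old top-level tasks.py deleted below, this rename makes push-config in next/ the one remaining entry point, so `pdm run invoke push-config` works there again. A sketch of driving it programmatically (invoke @task objects are callable with a Context):

    from invoke import Context
    from tasks import push_config  # i.e. next/tasks.py

    push_config(Context())
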
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/volumes_alert.yaml	Fri May 03 11:19:50 2024 -0700
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: opt-alertmanager
+  labels:
+    type: local
+spec:
+  storageClassName: manual
+  hostPath:
+    path: "/opt/alertmanager"
+  capacity:
+    storage: 50Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  claimRef:
+    namespace: default
+    name: opt-alertmanager
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: opt-alertmanager
+spec:
+  storageClassName: ""
+  volumeName: "opt-alertmanager"
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
\ No newline at end of file
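
The PV above pre-binds to its claim via claimRef, and the PVC pins volumeName with an empty storageClassName to keep any default provisioner from satisfying it. A quick check that the pair actually bound, a sketch reusing the kubernetes client already used in this repo:

    from kubernetes import client, config

    config.load_kube_config()
    pvc = client.CoreV1Api().read_namespaced_persistent_volume_claim(
        "opt-alertmanager", "default")
    assert pvc.status.phase == "Bound", pvc.status.phase
    assert pvc.spec.volume_name == "opt-alertmanager"
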
--- a/push-config.sh	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-#!/bin/sh
-pdm run invoke push-config
--- a/roles.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: victoriametrics
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - nodes/metrics
-  - nodes/proxy
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups:
-  - extensions
-  resources:
-  - ingresses
-  verbs: ["get", "list", "watch"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: victoriametrics
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: victoriametrics
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: victoriametrics
-subjects:
-- kind: ServiceAccount
-  name: victoriametrics
-  namespace: default
-# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
-# - kind: ServiceAccount
-#   name: default
-#   namespace: default
\ No newline at end of file
--- a/skaffold.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-apiVersion: skaffold/v3
-kind: Config
-metadata:
-  name: victoriametrics
-manifests:
-  rawYaml:
-    - roles.yaml
-    - volumes.yaml
-    - ingress.yaml
-    - deploy_vmetrics.yaml
-    - deploy_vmalert.yaml
-    - deploy_alertmanager.yaml
-deploy:
-  kubectl: {}
--- a/tasks.py	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-from pathlib import Path
-import yaml
-from invoke import task
-from kubernetes import config
-
-import alert_rules
-
-from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
-
-config.load_kube_config()
-
-
-@task
-def push_config(ctx):
-    rulesObj = alert_rules.allRules(ctx)
-
-    replaceCmap("victoriametrics-config", {
-        "scrape_main": scrapeConfig("config/scrape_main.yaml"),
-        "scrape_recent": scrapeConfig("config/scrape_recent.yaml"),
-        "scrape_forever": scrapeConfig("config/scrape_forever.yaml"),
-        "rules": rulesObj,
-    })
-
-    # these don't give errors on rules format! they just quietly keep the old
-    # rules! use `skaffold run` to get errs.
-    #
-    # or run
-    #  validateTemplates   = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
-    #  validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
-
-    refreshPodCmaps(firstPodName("app=victoriametrics"))
-    refreshPodCmaps(firstPodName("app=vmalert"))
-
-    # If the VM reloader isn't fast enough, we could do this too:
-    # hup(ctx, 'deploy/victoriametrics', 'victoria-metrics-prod')
-
-
-@task
-def push_config_2024(ctx):
-    # plan:
-    #   every discovered service may:
-    #      - be described here as a forever retention - ignore the discovery
-    #      - be blocked here as a no-metrics service - ignore the discovery
-    #      - be scraped as 'recent', with possible overrides of port/path
-    #   all per-node metrics shall be 'recent' (oops, not smartctl!)
-    map: dict[str, object] = {
-        'rules': alert_rules.allRules(ctx),
-    }
-    top=Path('config/build/scrape_jobs')
-    for p in top.glob('**/*.yaml'):
-        map[str(p.relative_to(top))] = scrapeConfig(p)
-    replaceCmap("next-victoriametrics-config", map)
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
-  
-
-def scrapeConfig(fn):
-    return yaml.load(open(fn), yaml.FullLoader)
-
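
In push_config_2024 above, each YAML file under config/build/scrape_jobs/ becomes one ConfigMap entry keyed by its path relative to that directory. A simplified sketch (the real task parses each file via scrapeConfig and re-serializes through replaceCmap); note that ConfigMap keys must match [-._a-zA-Z0-9]+, so this scheme only holds while the glob yields flat filenames:

    from pathlib import Path

    top = Path("config/build/scrape_jobs")
    cmap_data = {str(p.relative_to(top)): p.read_text() for p in top.glob("**/*.yaml")}
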
--- a/volumes.yaml	Thu May 02 23:15:37 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: opt-victoriametrics
-  labels:
-    type: local
-spec:
-  storageClassName: manual
-  hostPath:
-    path: "/opt/victoriametrics"
-  capacity:
-    storage: 50Gi
-  accessModes:
-    - ReadWriteMany
-  persistentVolumeReclaimPolicy: Retain
-  claimRef:
-    namespace: default
-    name: opt-victoriametrics
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: opt-victoriametrics
-spec:
-  storageClassName: ""
-  volumeName: "opt-victoriametrics"
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 50Gi
----
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: opt-alertmanager
-  labels:
-    type: local
-spec:
-  storageClassName: manual
-  hostPath:
-    path: "/opt/alertmanager"
-  capacity:
-    storage: 50Gi
-  accessModes:
-    - ReadWriteOnce
-  persistentVolumeReclaimPolicy: Retain
-  claimRef:
-    namespace: default
-    name: opt-alertmanager
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: opt-alertmanager
-spec:
-  storageClassName: ""
-  volumeName: "opt-alertmanager"
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
\ No newline at end of file