changeset 67:adde35eb4773

collapse ./next to ./
author drewp@bigasterisk.com
date Fri, 03 May 2024 11:21:08 -0700
parents 429bfd62e6ba
children c5e98d891638
files alert_rules.py create_k8s.py create_scrape_configs.py deploy_alertmanager.yaml deploy_vmalert.yaml index.css index.js index_page.py ingress_alertmanager.yaml k8s_ops.py next/alert_rules.py next/create_k8s.py next/create_scrape_configs.py next/deploy_alertmanager.yaml next/deploy_vmalert.yaml next/index.css next/index.js next/index_page.py next/ingress_alertmanager.yaml next/k8s_ops.py next/output.py next/roles.yaml next/scrape_job.py next/skaffold.yaml next/tasks.py next/volumes_alert.yaml output.py roles.yaml scrape_job.py skaffold.yaml tasks.py volumes_alert.yaml
diffstat 32 files changed, 1493 insertions(+), 1493 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alert_rules.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,433 @@
+"""
+pdm run invoke push-config
+
+docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+"Whenever the alert expression results in one or more vector
+elements at a given point in time, the alert counts as active for
+these elements' label sets."
+also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+
+"""
+
+import json
+
+
+def pomRules():
+    return [
+        {
+            "alert": "frequent_upstream_connect_failures",
+            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
+        },
+        {
+            "alert": "high_logging_pomerium",
+            "for": "3h",
+            "labels": {
+                "severity": "waste"
+            },
+            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
+            "annotations": {
+                "summary": "high log output rate"
+            },
+        },
+    ]
+
+
+def k8sRules():
+    # from https://awesome-prometheus-alerts.grep.to/rules.html
+    return [
+        {
+            "alert": "metricsTargetMissing",
+            "expr": 'up{job!~"cm-acme-.*"} == 0',
+            'for': '10m',
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "metrics target missing (instance {{ $labels.instance }})",
+                "description": "A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesMemoryPressure",
+            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesDiskPressure",
+            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesOutOfDisk",
+            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesJobFailed",
+            "expr": "kube_job_status_failed > 0",
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
+                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesPodCrashLooping",
+            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
+            "for": "2m",
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
+                "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesClientCertificateExpiresNextWeek",
+            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
+                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "container_waiting",
+            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
+            "annotations": {
+                "description": '',
+                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
+            },
+            "for": "2m",
+        },
+    ]
+
+
+def allRules(ctx):
+    return {
+        "groups": [
+            {
+                "name": "k8s",
+                "interval": "1m",
+                "rules": k8sRules(),
+            },
+            {
+                "name": "pomerium_proxy",
+                "interval": "1m",
+                "rules": pomRules(),
+            },
+            {
+                "name":
+                    "Outages",
+                "interval":
+                    "1m",
+                "rules": [
+                    {
+                        "alert": "powereagleStalled",
+                        "expr": "rate(house_power_w[100m]) == 0",
+                        "for": "0m",
+                        "labels": {
+                            "severity": "losingData"
+                        },
+                        "annotations": {
+                            "summary": "power eagle data stalled",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "powereagleAbsent",
+                        "expr": "absent_over_time(house_power_w[5m])",
+                        "for": "2m",
+                        "labels": {
+                            "severity": "losingData"
+                        },
+                        "annotations": {
+                            "summary": "power eagle data missing",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "absent_zigbee",
+                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
+                    },
+                    {
+                        "alert": "net_routes_sync",
+                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
+                        "for": "10m",
+                        "labels": {
+                            "severity": "houseUsersAffected"
+                        },
+                        "annotations": {
+                            "summary": "net_routes is not getting regular updates"
+                        },
+                    },
+                ],
+            },
+            {
+                "name": "disk_errs",
+                "interval": "2d",
+                "rules": [{
+                    "alert": "zpool_device_error_increase",
+                    "labels": {
+                        "severity": "warning"
+                    },
+                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
+                }, {
+                    "alert": "zpool_device_error_count",
+                    "labels": {
+                        "severity": "warning"
+                    },
+                    "expr": 'zpool_device_error_count > 0',
+                }],
+            },
+            {
+                "name": "lighting",
+                "interval": "5m",
+                "rules": [{
+                    "alert": "light_bridge_no_mqtt",
+                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
+                }],
+            },
+            {
+                "name":
+                    "front_door",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "front_door_reader_esp32_no_mqtt",
+                        'expr': 'hw_connected{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_reader_svc_down",
+                        'expr': 'up{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_reader_svc_reader_no_mqtt",
+                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_svc_down",
+                        'expr': 'up{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_svc_no_mqtt",
+                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_esp32_no_mqtt",
+                        'expr': 'hw_connected{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                ],
+            },
+            {
+                "name":
+                    "net_routes",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "no_house_ip_service",
+                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
+                    },
+                    {
+                        "alert": "no_net_routes_running",
+                        "expr": 'absent(python_info{job="net-routes"})'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_200",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_403",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
+                    },
+                    {
+                        'alert': 'net_route_input_eval_cal_loop_is_down',
+                        'expr': 'eval_cal_up!=1'
+                    },
+                    {
+                        'alert': 'net_route_input_mongo_loop_is_down',
+                        'expr': 'mongo_to_net_routes_up!=1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
+                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_current_events_loop_is_down',
+                        'expr': 'current_events_up != 1'
+                    },
+                ],
+            },
+            {
+                "name": "http",
+                "interval": "1h",
+                'rules': [
+                    {
+                        'alert': 'old_https_certs',
+                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
+                    },
+                    {
+                        'alert': 'high_500_response_rate',
+                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
+                    },
+                ],
+            },
+            {
+                "name": "ping",
+                "interval": "1m",
+                "rules": [{
+                    "alert": "ping_failed",
+                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
+                }]
+            },
+            {
+                "name":
+                    "alerts",
+                "rules": [
+                    {
+                        "alert": "kube_node_status_bad_condition",
+                        "for": "2h",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
+                    },
+                    {
+                        "alert": "housePower",
+                        "for": "1h",
+                        "labels": {
+                            "severity": "waste"
+                        },
+                        "expr": "house_power_w > 4000",
+                        "annotations": {
+                            "summary": "house power usage over 4KW"
+                        },
+                    },
+                    {
+                        "alert": "host_root_fs_space_low",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
+                    },
+                    {
+                        "alert": "zpool_space_low",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
+                    },
+                    {
+                        "alert": "disk_week_incr",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
+                        "annotations": {
+                            "summary": "high mb/week on zfs dir"
+                        },
+                    },
+                    {
+                        "alert": "high_logging",
+                        "for": "3h",
+                        "labels": {
+                            "severity": "waste"
+                        },
+                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
+                        "annotations": {
+                            "summary": "high log output rate"
+                        },
+                    },
+                    {
+                        "alert": "stale_process",
+                        "for": "1d",
+                        "labels": {
+                            "severity": "dataRisk"
+                        },
+                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
+                        "annotations": {
+                            "summary": "process time is old"
+                        },
+                    },
+                    {
+                        "alert": "starlette",
+                        "for": "1m",
+                        "labels": {
+                            "severity": "fix"
+                        },
+                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
+                        "annotations": {
+                            "summary": "set starlette app name"
+                        },
+                    },
+                    {
+                        "alert": "ssl_certs_expiring_soon",
+                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "annotations": {
+                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
+                        },
+                    },
+                ],
+            },
+        ] + hostsExpectedOnline(ctx)['groups']
+    }
+
+
+def _runJson(ctx, cmd):
+    return json.loads(ctx.run(cmd, hide="stdout").stdout)
+
+
+def hostsExpectedOnline(ctx):
+    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/create_k8s.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,254 @@
+from pathlib import Path
+from index_page import makeIndexHtml
+from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
+
+
+def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
+    (build / f'{agentFileName}_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": agentName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmagent",
+                            "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
+                                tzArg,
+                                f"-promscrape.config=/local/config/{scrapeMapKey}",
+                                "-promscrape.configCheckInterval=5s",
+                                "-sortLabels",
+                                f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
+                                "-remoteWrite.showURL",
+                            ],
+                            "ports": [{ "containerPort": agentPort }],
+                            "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
+    (build / f'{insertFileName}_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": insertName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vminsert",
+                            "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
+                                tzArg,
+                                f"-storageNode={storageName}",
+                            ],
+                            "ports": [{ "containerPort": insertPort }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
+    (build / f'{storageFileName}_2deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": storageName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmstorage",
+                            "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
+                                tzArg,
+                                f"-retentionPeriod={retention}",
+                                f"-storageDataPath=/data/{pipelineName}",
+                            ],
+                            "ports": [
+                                { "containerPort": 8482, "name": "http" },
+                                { "containerPort": storageInsertPort, "name": "vminsert" },
+                                { "containerPort": storageSelectPort, "name": "vmselect" },
+                            ],
+                            "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
+                        }],
+                        "affinity": affinityToNode(localPvHost)
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
+    name = f"{objPrefix}-vmselect"
+    (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+            "spec": {
+                "replicas": 1,
+                "strategy": { "type": "Recreate" },
+                "selector": { "matchLabels": { "app": name } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": name },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={webRoot}/vmselect/",
+                                tzArg,
+                            ] + [f"-storageNode={n}" for n in storageSvcs],
+                            "ports": [{ "containerPort": selectPort }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
+    agentName = f"{objPrefix}-{pipelineName}-vmagent"
+    insertName = f"{objPrefix}-{pipelineName}-vminsert"
+    storageName = f"{objPrefix}-{pipelineName}-vmstorage"
+
+    agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
+    insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
+    storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
+
+    localPvHost = "ditto"
+    insertPort = 8480
+    agentPort = 8429
+    storageInsertPort = 8400
+    storageSelectPort = 8401
+    volName = f"{objPrefix}-data-{pipelineName}"
+    request = "50Gi"
+    pipelineWebRoot = f'{webRoot}/{pipelineName}'
+
+    createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
+    createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
+    createPv(storageFileName, volName, request)
+    createPvc(storageFileName, volName, request)
+    createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
+
+    createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
+    createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
+    createSvc(storageFileName, storageName, [
+        {"port": 80, "targetPort": "http", "name": "http"},
+        {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
+        {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
+        ]) # yapf: disable
+
+    return storageName
+
+
+def createIndex(objPrefix, webRoot, html):
+    name = f'{objPrefix}-index'
+    httpServeRoot = '/opt/html'
+
+    (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
+        "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
+        "data": {
+            "index.html": html,
+            "index.js": Path("index.js").read_text(),
+            "index.css": Path("index.css").read_text(),
+        }
+    })) # yapf: disable
+
+    (build / f'{objPrefix}-3index_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+            "spec": {
+                "replicas": 1,
+                "selector": { "matchLabels": { "app": name } },
+                "template": {
+                    "metadata": { "labels": { "app": name } },
+                    "spec": {
+                        "containers": [{
+                            "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f'--root={httpServeRoot}',
+                                '--directory-listing=true',
+                                '--experimental-metrics=true',
+                            ],
+                            "ports": [{ "containerPort": 80 }],
+                            "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
+                        }],
+                        "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
+                    }
+                }
+            }
+        })) # yapf: disable
+    createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
+
+
+def main():
+    tzArg = "-loggerTimezone=America/Los_Angeles"
+    objPrefix = "next-victoriametrics"  # prefix on all k8s object names
+    webRoot = "/m/next"
+    vmVersion = "v1.100.1"
+    webHost = 'bigasterisk.com'
+    pipelines = [
+        ('forever', '100y'),
+        ('recent', '90y'),
+    ]
+    storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
+
+    selectPort = 8481
+    createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
+    createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
+
+    ingressPaths = [
+        { "pathType": "Prefix", "path": f"{webRoot}/",          "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
+        { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
+    ]  # yapf: disable
+    for p, _ in pipelines:
+        ingressPaths.extend([
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/",   "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent",   "port": { "number": 80 } } } },
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/",  "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert",  "port": { "number": 80 } } } },
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
+        ]) # yapf: disable
+
+    policy = """\
+allow:
+    or: 
+        - { email: { is: "drewpca@gmail.com" }}
+        - { email: { is: "kelsimp@gmail.com" }}
+    """
+    createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
+    createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
+
+
+main()
+
+# in vmui, set server url to
+# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
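+# (with webRoot = "/m/next" above, that is
+# https://bigasterisk.com/m/next/vmselect/select/0/prometheus, the same URL
+# index_page.py bakes into its init() call)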
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/create_scrape_configs.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,170 @@
+from pathlib import Path
+
+from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName
+import private
+
+# previously this used `kubernetes_sd_configs: [{ role: node }]`
+all_hosts = [
+    'dash',
+    'ditto',
+    # 'ws-printer',
+    #todo:
+]
+
+smartctl_hosts = [
+    # ideally, all nodes with disks, but many turn off and on
+    'dash',
+    'ditto',
+]
+
+ping_hosts = [
+    # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
+    'printer',
+    # wireguard connection test
+    'prime5',
+    # after pyinfra or reboot, seems to lose wg0 address
+    'garage5',
+]
+
+
+forever_jobs = [
+    jobConfig(name='maildir-count',        targets=['prime:2500']),
+    jobConfig(name='mongodb',              targets=['mongodb:9216']),
+    jobConfig(name='net-traffic',          targets=['pipe:8080']),
+    jobConfig(name='ping',                 targets=ping_hosts,              scrape_interval='2m', ping_job=True),
+    jobConfig(name='power-eagle',          targets=['power-eagle:80'],      scrape_interval='8s'),  # from powerEagle/private_config.periodSec
+    jobConfig(name='powermeter-exporter',  targets=['powermeter-exporter'], scrape_interval='10s'),
+    jobConfig(name='smartctl',             targets=[f'{h}:9633' for h in smartctl_hosts]),
+    jobConfig(name='wifi',                 targets=['wifi:80']),
+    jobConfig(name='zfs-exporter',         targets=['ditto:9634']),
+    jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
+    jobConfig(name='zpool-exporter',       targets=['ditto:9986']),
+    jobConfig(name='octoprint',            targets=['octoprint'],
+              metrics_path='/plugin/prometheus_exporter/metrics',
+              params={'apikey' : [private.octoprint_apikey]},
+              ),
+]  # yapf: disable
+
+recent_jobs = [
+    jobConfig(name="dnsmasq-log", targets=['pipe:9991']),
+    jobConfig(name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
+    jobConfig(name="net-routes",  targets=['pipe:9999']),
+    jobConfig(name="net-traffic", targets=['pipe:8080']),
+    jobConfig(name="pomerium",    targets=['pomerium-metrics.pomerium:9090']),
+    jobConfig(name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
+    jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'),
+
+    jobConfig(name="next-victoriametrics-forever-vmagent",   metrics_path='/m/next/forever/vmagent/metrics',  targets=FromName),
+    jobConfig(name="next-victoriametrics-forever-vminsert",  metrics_path='/m/next/forever/vminsert/metrics', targets=FromName),
+    jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vmagent",    metrics_path='/m/next/recent/vmagent/metrics',   targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vminsert",   metrics_path='/m/next/recent/vminsert/metrics',  targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vmstorage",  metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName),
+    jobConfig(name="next-victoriametrics-vmselect",          metrics_path='/m/next/vmselect/metrics',         targets=FromName),
+    jobConfig(name="next-victoriametrics-index",                                                              targets=FromName),
+
+    # todo:
+    #  - video-files
+    #  - cert-manager
+    #  - syncthing(s)
+    #  - nvidia runner
+    #  - longhorn
+    #  - kube-system.metrics-server
+    jobConfig(
+        name="racc",
+        scrape_interval='30s',
+        targets=[
+            # - dash:5150
+            # - dot:5150
+            # - squib:5150
+            # - ashermac:5150
+        ],
+    ),
+]  # yapf: disable
+
+
+deploy_doesnt_serve_metrics = [
+    'apprise',
+    'bitwarden',
+    'digi-files',
+    'digi-pose-predict',
+    'digi-tts-mimic',
+    'digi-web',
+    'dovecot',
+    'ectoscope',
+    'front-door-display',
+    'hass',
+    'homepage',
+    'itch150',
+    'jsregistry',
+    'kallithea',
+    'kube-web-view',
+    'magma',
+    'megasecond',
+    'minecraft-build-world',
+    'minecraft-lake-world',
+    'minecraft-smp-world',
+    'mongodb',
+    'mqtt1',
+    'mqtt2',
+    'nodered',
+    'photoprism',
+    'plik',
+    'projects',
+    'registry-ui',
+    'registry',
+    'speakerphone',
+    'victorialogs-ui',
+    'video-files',
+    'video',
+    'zigbee2mqtt',
+    'zwave2mqtt',
+]
+
+existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs]
+recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs))
+
+recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | {
+    'kubernetes_sd_configs': [{
+        'role': 'endpoints'
+    }],
+    'relabel_configs': [{
+        'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'],
+        'action': 'keep',
+        'regex': 'default;kubernetes;https'
+    }],
+})
+
+recent_jobs.append(
+    jobConfig(name="kubernetes-nodes", https=True, targets=[]) | {
+        "kubernetes_sd_configs": [{
+            "role": "node"
+        }],
+        "relabel_configs": [{
+            "action": "labeldrop",
+            "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+        }, {
+            "action": "labelmap",
+            "regex": "__meta_kubernetes_node_label_(.+)"
+        }, {
+            "action": "labeldrop",
+            "regex": "kubernetes_io_hostname"
+        }],
+    })
+
+# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
+# for metric definitions
+
+recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | {
+    "kubernetes_sd_configs": [{
+        "role": "node"
+    }],
+    "relabel_configs": [{
+        "action": "labeldrop",
+        "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+    }],
+})
+
+outDir = Path('build/scrape_config')
+writeJobConfigs(outDir, forever_jobs, 'forever')
+writeJobConfigs(outDir, recent_jobs, 'recent')
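+
+# writeJobConfigs (from scrape_job.py, not shown here) presumably writes one
+# scrape config per pipeline under build/scrape_config/, e.g. scrape_forever.yaml
+# and scrape_recent.yaml, matching the scrape_{pipeline}.yaml ConfigMap keys that
+# create_k8s.py points each vmagent's -promscrape.config at.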
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deploy_alertmanager.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+  template:
+    metadata:
+      labels:
+        app: alertmanager
+    spec:
+      volumes:
+        - name: opt-alertmanager
+          persistentVolumeClaim:
+            claimName: opt-alertmanager
+      serviceAccountName: victoriametrics
+      containers:
+        - name: alertmanager
+          image: docker.io/prom/alertmanager:v0.27.0
+          args:
+            - --config.file=/alertmanager/alertmanager.yml
+            - --web.external-url=https://bigasterisk.com/alertmanager/
+            - --web.route-prefix=/
+            - --log.level=info
+          ports:
+          - containerPort: 9093
+          volumeMounts:
+          - name: opt-alertmanager
+            mountPath: /alertmanager
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "kubernetes.io/hostname"
+                operator: In
+                values: ["ditto"]
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+spec:
+  ports:
+  - port: 80
+    targetPort: 9093
+  selector:
+    app: alertmanager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deploy_vmalert.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmalert
+spec:
+  replicas: 1
+  strategy: { type: Recreate }
+  selector:
+    matchLabels:
+      app: vmalert
+  template:
+    metadata:
+      labels:
+        app: vmalert
+      annotations:
+        prometheus.io/scrape: "true"
+    spec:
+      volumes:
+        - name: config
+          configMap: { name: victoriametrics-config }
+      serviceAccountName: victoriametrics
+      containers:
+        - name: vmalert
+          image: docker.io/victoriametrics/vmalert:v1.91.2
+          args:
+            - -configCheckInterval=5s
+            - -datasource.url=http://victoriametrics/m/
+            - -datasource.queryStep=5m
+            - -evaluationInterval=1m
+            - -external.url=https://bigasterisk.com/vmalert
+            - -loggerLevel=INFO
+            - -loggerTimezone=America/Los_Angeles
+            - -memory.allowedBytes=512MB
+            - -notifier.url=http://alertmanager
+            - -remoteRead.url=http://victoriametrics/m/
+            - -remoteWrite.url=http://victoriametrics/m/
+            - -rule=/local/rules
+          ports:
+            - containerPort: 8880
+          volumeMounts:
+            - { name: config, mountPath: /local }
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vmalert
+spec:
+  ports:
+    - port: 80
+      targetPort: 8880
+  selector:
+    app: vmalert
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/index.css	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,8 @@
+section {
+  margin-left: 2em;
+}
+
+h1,
+h2 {
+  border-top: 1px solid lightgray;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/index.js	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,4 @@
+function init(serverUrl) {
+    // this defaults to something incorrect, so we try to fix it here before you open vmui
+    localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl }));
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/index_page.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,75 @@
+def makeIndexHtml(objPrefix, webRoot, webHost):
+    return f"""<!DOCTYPE html>
+    <html>
+        <head>
+          <title>{objPrefix}</title>
+          <link rel="stylesheet" href="index.css">
+        </head>
+        <body>
+            <h1>{objPrefix}</h1>
+            <section>
+              <h2>Retentions</h2>
+              <section>
+                <h3><code>recent</code></h3>
+                <table>
+                  <tr>
+                    <td><a href="recent/vmagent/">vmagent</a></td>
+                    <td><a href="recent/vmagent/metrics">metrics</a></td>
+                    <td><a href="recent/vmagent/targets">targets</a></td>
+                  </tr>
+                  <tr>
+                    <td><a href="recent/vminsert/">vminsert</a></td>
+                    <td><a href="recent/vminsert/metrics">metrics</a></td>
+                  </tr>
+                  <tr>
+                    <td><a href="recent/vmstorage/">vmstorage</a></td>
+                    <td><a href="recent/vmstorage/metrics">metrics</a></td>
+                  </tr>
+                </table>
+              </section>
+            
+              <section>
+                <h3><code>forever</code></h3>
+                <table>
+                  <tr>
+                    <td><a href="forever/vmagent/">vmagent</a></td>
+                    <td><a href="forever/vmagent/metrics">metrics</a></td>
+                    <td><a href="forever/vmagent/targets">targets</a></td>
+                  </tr>
+                  <tr>
+                    <td><a href="forever/vminsert/">vminsert</a></td>
+                    <td><a href="forever/vminsert/metrics">metrics</a></td>
+                  </tr>
+                  <tr>
+                    <td><a href="forever/vmstorage/">vmstorage</a></td>
+                    <td><a href="forever/vmstorage/metrics">metrics</a></td>
+                  </tr>
+                </table>
+              </section>
+            </section>
+
+            <section>
+              <h2>vmselect</h2>
+              <table>
+                <tr>
+                  <td><a href="vmselect/">vmselect</a></td>
+                  <td><a href="vmselect/metrics">metrics</a></td>
+                </tr>
+              </table>
+            </section>
+
+            <section>
+              <h2>vmui</h2>
+              <table>
+                <tr>
+                  <td><a href="vmselect/0/vmui/vmui">vmui</a></td>
+                </tr>
+              </table>
+            </section>  
+              
+            <script src="index.js"></script>
+            <script> 
+              init("https://{webHost}{webRoot}/vmselect/select/0/prometheus");
+            </script>
+        </body>
+    </html>"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ingress_alertmanager.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,55 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: vmalert
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or: 
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /vmalert/
+            backend: { service: { name: vmalert, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: alertmanager
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
+    ingress.pomerium.io/pass_identity_headers: "true"
+    ingress.pomerium.io/preserve_host_header: "true"
+    ingress.pomerium.io/policy: |
+      allow:
+        or: 
+          - { email: { is: "drewpca@gmail.com" }}
+          - { email: { is: "kelsimp@gmail.com" }}
+    ingress.pomerium.io/prefix_rewrite: "/"
+spec:
+  ingressClassName: pomerium
+  rules:
+    - host: "bigasterisk.com"
+      http:
+        paths:
+          - pathType: Prefix
+            path: /alertmanager/
+            backend: { service: { name: alertmanager, port: { number: 80 } } }
+  tls:
+    - hosts: [bigasterisk.com]
+      secretName: bigasterisk.com-tls
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/k8s_ops.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,50 @@
+import json
+import time
+
+from kubernetes import client
+
+
+def refreshPodCmaps(pod_name, namespace="default"):
+    """
+    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/, it can take a while
+    before k8s updates the CM volume that a pod sees. The workaround is to edit the pod annotations.
+    """
+    api_instance = client.CoreV1Api()
+
+    pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
+    if pod.metadata.annotations is None:
+        pod.metadata.annotations = {}
+    pod.metadata.annotations["force-configmap-update"] = str(time.time())
+    api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
+
+
+def firstPodName(selector):
+    api_instance = client.CoreV1Api()
+    pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
+    return pod_list.items[0].metadata.name
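+
+
+# Typical use, as a sketch (assumes kubernetes.config.load_kube_config() has
+# already been called by the caller, presumably tasks.py):
+#
+#   pod = firstPodName("app=next-victoriametrics-forever-vmagent")
+#   refreshPodCmaps(pod)  # nudge the kubelet to re-sync the mounted ConfigMap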
+
+
+def hup(ctx, deployment, process_name):
+    ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
+
+
+def replaceCmap(name, dataObj):
+    api_instance = client.CoreV1Api()
+
+    data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
+
+    try:
+
+        existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
+        existing_config_map.data.update(data)
+        api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
+    except client.rest.ApiException as e:
+        if e.status == 404:
+            config_map = client.V1ConfigMap()
+            config_map.metadata = client.V1ObjectMeta(name=name)
+            config_map.data = data
+            api_response = api_instance.create_namespaced_config_map('default', config_map)
+        else:
+            raise
+
+    print(f"{name} resource_version is now {api_response.metadata.resource_version}")
--- a/next/alert_rules.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,433 +0,0 @@
-"""
-pdm run invoke push-config
-
-docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-"Whenever the alert expression results in one or more vector
-elements at a given point in time, the alert counts as active for
-these elements' label sets."
-also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
-
-"""
-
-import json
-
-
-def pomRules():
-    return [
-        {
-            "alert": "frequent_upstream_connect_failures",
-            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
-        },
-        {
-            "alert": "high_logging_pomerium",
-            "for": "3h",
-            "labels": {
-                "severity": "waste"
-            },
-            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
-            "annotations": {
-                "summary": "high log output rate"
-            },
-        },
-    ]
-
-
-def k8sRules():
-    # from https://awesome-prometheus-alerts.grep.to/rules.html
-    return [
-        {
-            "alert": "metricsTargetMissing",
-            "expr": 'up{job!~"cm-acme-.*"} == 0',
-            'for': '10m',
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "metrics target missing (instance {{ $labels.instance }})",
-                "description": "A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesMemoryPressure",
-            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesDiskPressure",
-            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesOutOfDisk",
-            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
-            "for": "2m",
-            "labels": {
-                "severity": "critical"
-            },
-            "annotations": {
-                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
-                "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesJobFailed",
-            "expr": "kube_job_status_failed > 0",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
-                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesPodCrashLooping",
-            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
-            "for": "2m",
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
-                "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "KubernetesClientCertificateExpiresNextWeek",
-            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
-            "labels": {
-                "severity": "warning"
-            },
-            "annotations": {
-                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
-                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
-            },
-        },
-        {
-            "alert": "container_waiting",
-            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
-            "annotations": {
-                "description": '',
-                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
-            },
-            "for": "2m",
-        },
-    ]
-
-
-def allRules(ctx):
-    return {
-        "groups": [
-            {
-                "name": "k8s",
-                "interval": "1m",
-                "rules": k8sRules(),
-            },
-            {
-                "name": "pomerium_proxy",
-                "interval": "1m",
-                "rules": pomRules(),
-            },
-            {
-                "name":
-                    "Outages",
-                "interval":
-                    "1m",
-                "rules": [
-                    {
-                        "alert": "powereagleStalled",
-                        "expr": "rate(house_power_w[100m]) == 0",
-                        "for": "0m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data stalled",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "powereagleAbsent",
-                        "expr": "absent_over_time(house_power_w[5m])",
-                        "for": "2m",
-                        "labels": {
-                            "severity": "losingData"
-                        },
-                        "annotations": {
-                            "summary": "power eagle data missing",
-                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
-                        },
-                    },
-                    {
-                        "alert": "absent_zigbee",
-                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
-                    },
-                    {
-                        "alert": "net_routes_sync",
-                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
-                        "for": "10m",
-                        "labels": {
-                            "severity": "houseUsersAffected"
-                        },
-                        "annotations": {
-                            "summary": "net_routes is not getting regular updates"
-                        },
-                    },
-                ],
-            },
-            {
-                "name": "disk_errs",
-                "interval": "2d",
-                "rules": [{
-                    "alert": "zpool_device_error_increase",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
-                }, {
-                    "alert": "zpool_device_error_count",
-                    "labels": {
-                        "severity": "warning"
-                    },
-                    "expr": 'zpool_device_error_count > 0',
-                }],
-            },
-            {
-                "name": "lighting",
-                "interval": "5m",
-                "rules": [{
-                    "alert": "light_bridge_no_mqtt",
-                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
-                }],
-            },
-            {
-                "name":
-                    "front_door",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "front_door_reader_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_down",
-                        'expr': 'up{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_reader_svc_reader_no_mqtt",
-                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_down",
-                        'expr': 'up{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_svc_no_mqtt",
-                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                    {
-                        "alert": "front_door_lock_esp32_no_mqtt",
-                        'expr': 'hw_connected{job="front-door-lock"} < 1',
-                        "annotations": {
-                            "summary": "see https://bigasterisk.com/front-door-lock/"
-                        },
-                    },
-                ],
-            },
-            {
-                "name":
-                    "net_routes",
-                "interval":
-                    "5m",
-                "rules": [
-                    {
-                        "alert": "no_house_ip_service",
-                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
-                    },
-                    {
-                        "alert": "no_net_routes_running",
-                        "expr": 'absent(python_info{job="net-routes"})'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_200",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
-                    },
-                    {
-                        "alert": "allowed_check_never_returned_403",
-                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
-                    },
-                    {
-                        'alert': 'net_route_input_eval_cal_loop_is_down',
-                        'expr': 'eval_cal_up!=1'
-                    },
-                    {
-                        'alert': 'net_route_input_mongo_loop_is_down',
-                        'expr': 'mongo_to_net_routes_up!=1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
-                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
-                    },
-                    {
-                        'alert': 'gcalendarwatch_current_events_loop_is_down',
-                        'expr': 'current_events_up != 1'
-                    },
-                ],
-            },
-            {
-                "name": "http",
-                "interval": "1h",
-                'rules': [
-                    {
-                        'alert': 'old_https_certs',
-                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
-                    },
-                    {
-                        'alert': 'high_500_response_rate',
-                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
-                    },
-                ],
-            },
-            {
-                "name": "ping",
-                "interval": "1m",
-                "rules": [{
-                    "alert": "ping_failed",
-                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
-                }]
-            },
-            {
-                "name":
-                    "alerts",
-                "rules": [
-                    {
-                        "alert": "kube_node_status_bad_condition",
-                        "for": "2h",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
-                    },
-                    {
-                        "alert": "housePower",
-                        "for": "1h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": "house_power_w > 4000",
-                        "annotations": {
-                            "summary": "house power usage over 4KW"
-                        },
-                    },
-                    {
-                        "alert": "host_root_fs_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
-                    },
-                    {
-                        "alert": "zpool_space_low",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
-                    },
-                    {
-                        "alert": "disk_week_incr",
-                        "for": "20m",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
-                        "annotations": {
-                            "summary": "high mb/week on zfs dir"
-                        },
-                    },
-                    {
-                        "alert": "high_logging",
-                        "for": "3h",
-                        "labels": {
-                            "severity": "waste"
-                        },
-                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
-                        "annotations": {
-                            "summary": "high log output rate"
-                        },
-                    },
-                    {
-                        "alert": "stale_process",
-                        "for": "1d",
-                        "labels": {
-                            "severity": "dataRisk"
-                        },
-                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
-                        "annotations": {
-                            "summary": "process time is old"
-                        },
-                    },
-                    {
-                        "alert": "starlette",
-                        "for": "1m",
-                        "labels": {
-                            "severity": "fix"
-                        },
-                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
-                        "annotations": {
-                            "summary": "set starlette app name"
-                        },
-                    },
-                    {
-                        "alert": "ssl_certs_expiring_soon",
-                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
-                        "labels": {
-                            "severity": "warning"
-                        },
-                        "annotations": {
-                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
-                        },
-                    },
-                ],
-            },
-        ] + hostsExpectedOnline(ctx)['groups']
-    }
-
-
-def _runJson(ctx, cmd):
-    return json.loads(ctx.run(cmd, hide="stdout").stdout)
-
-
-def hostsExpectedOnline(ctx):
-    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
--- a/next/create_k8s.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-from pathlib import Path
-from index_page import makeIndexHtml
-from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
-
-
-def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
-    (build / f'{agentFileName}_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": agentName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmagent",
-                            "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
-                                tzArg,
-                                f"-promscrape.config=/local/config/{scrapeMapKey}",
-                                "-promscrape.configCheckInterval=5s",
-                                "-sortLabels",
-                                f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
-                                "-remoteWrite.showURL",
-                            ],
-                            "ports": [{ "containerPort": agentPort }],
-                            "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
-    (build / f'{insertFileName}_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": insertName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vminsert",
-                            "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
-                                tzArg,
-                                f"-storageNode={storageName}",
-                            ],
-                            "ports": [{ "containerPort": insertPort }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
-    (build / f'{storageFileName}_2deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": storageName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmstorage",
-                            "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
-                                tzArg,
-                                f"-retentionPeriod={retention}",
-                                f"-storageDataPath=/data/{pipelineName}",
-                            ],
-                            "ports": [
-                                { "containerPort": 8482, "name": "http" },
-                                { "containerPort": storageInsertPort, "name": "vminsert" },
-                                { "containerPort": storageSelectPort, "name": "vmselect" },
-                            ],
-                            "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
-                        }],
-                        "affinity": affinityToNode(localPvHost)
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
-    name = f"{objPrefix}-vmselect"
-    (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
-            "spec": {
-                "replicas": 1,
-                "strategy": { "type": "Recreate" },
-                "selector": { "matchLabels": { "app": name } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": name },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={webRoot}/vmselect/",
-                                tzArg,
-                            ] + [f"-storageNode={n}" for n in storageSvcs],
-                            "ports": [{ "containerPort": selectPort }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
-    agentName = f"{objPrefix}-{pipelineName}-vmagent"
-    insertName = f"{objPrefix}-{pipelineName}-vminsert"
-    storageName = f"{objPrefix}-{pipelineName}-vmstorage"
-
-    agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
-    insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
-    storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
-
-    localPvHost = "ditto"
-    insertPort = 8480
-    agentPort = 8429
-    storageInsertPort = 8400
-    storageSelectPort = 8401
-    volName = f"{objPrefix}-data-{pipelineName}"
-    request = "50Gi"
-    pipelineWebRoot = f'{webRoot}/{pipelineName}'
-
-    createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
-    createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
-    createPv(storageFileName, volName, request)
-    createPvc(storageFileName, volName, request)
-    createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
-
-    createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
-    createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
-    createSvc(storageFileName,storageName, [
-        {"port": 80, "targetPort": "http", "name": "http"},
-        {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
-        {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
-        ]) # yapf: disable
-
-    return storageName
-
-
-def createIndex(objPrefix, webRoot, html):
-    name = f'{objPrefix}-index'
-    httpServeRoot = '/opt/html'
-
-    (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
-        "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
-        "data": {
-            "index.html": html,
-            "index.js": Path("index.js").read_text(),
-            "index.css": Path("index.css").read_text(),
-        }
-    })) # yapf: disable
-
-    (build / f'{objPrefix}-3index_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
-            "spec": {
-                "replicas": 1,
-                "selector": { "matchLabels": { "app": name } },
-                "template": {
-                    "metadata": { "labels": { "app": name } },
-                    "spec": {
-                        "containers": [{
-                            "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f'--root={httpServeRoot}',
-                                '--directory-listing=true',
-                                '--experimental-metrics=true',
-                            ],
-                            "ports": [{ "containerPort": 80 }],
-                            "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
-                        }],
-                        "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
-                    }
-                }
-            }
-        })) # yapf: disable
-    createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
-
-
-def main():
-    tzArg = "-loggerTimezone=America/Los_Angeles"
-    objPrefix = "next-victoriametrics"  # prefix on all k8s object names
-    webRoot = "/m/next"
-    vmVersion = "v1.100.1"
-    webHost = 'bigasterisk.com'
-    pipelines = [
-        ('forever', '100y'),
-        ('recent', '90y'),
-    ]
-    storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
-
-    selectPort = 8481
-    createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
-    createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
-
-    ingressPaths = [
-        { "pathType": "Prefix", "path": f"{webRoot}/",          "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
-        { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
-    ]  # yapf: disable
-    for p, _ in pipelines:
-        ingressPaths.extend([
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/",   "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent",   "port": { "number": 80 } } } },
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/",  "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert",  "port": { "number": 80 } } } },
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
-        ]) # yapf: disable
-
-    policy = """\
-allow:
-    or: 
-        - { email: { is: "drewpca@gmail.com" }}
-        - { email: { is: "kelsimp@gmail.com" }}
-    """
-    createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
-    createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
-
-
-main()
-
-# in vmui, set server url to
-# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
--- a/next/create_scrape_configs.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-from pathlib import Path
-
-from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName
-import private
-
-# previously this used `kubernetes_sd_configs: [{ role: node }]`
-all_hosts = [
-    'dash',
-    'ditto',
-    # 'ws-printer',
-    #todo:
-]
-
-smartctl_hosts = [
-    # ideally, all nodes with disks, but many turn off and on
-    'dash',
-    'ditto',
-]
-
-ping_hosts = [
-    # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
-    'printer',
-    # wireguard connection test
-    'prime5',
-    # after pyinfra or reboot, seems to lose wg0 address
-    'garage5',
-]
-
-
-forever_jobs = [
-    jobConfig(name='maildir-count',        targets=['prime:2500']),
-    jobConfig(name='mongodb',              targets=['mongodb:9216']),
-    jobConfig(name='net-traffic',          targets=['pipe:8080']),
-    jobConfig(name='ping',                 targets=ping_hosts,              scrape_interval='2m', ping_job=True),
-    jobConfig(name='power-eagle',          targets=['power-eagle:80'],      scrape_interval='8s'),  # from powerEagle/private_config.periodSec
-    jobConfig(name='powermeter-exporter',  targets=['powermeter-exporter'], scrape_interval='10s'),
-    jobConfig(name='smartctl',             targets=[f'{h}:9633' for h in smartctl_hosts]),
-    jobConfig(name='wifi',                 targets=['wifi:80']),
-    jobConfig(name='zfs-exporter',         targets=['ditto:9634']),
-    jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
-    jobConfig(name='zpool-exporter',       targets=['ditto:9986']),
-    jobConfig(name='octoprint',            targets=['octoprint'],
-              metrics_path='/plugin/prometheus_exporter/metrics',
-              params={'apikey' : [private.octoprint_apikey]},
-              ),
-]  # yapf: disable
-
-recent_jobs = [
-    jobConfig(name="dnsmasq-log", targets=['pipe:9991']),
-    jobConfig(name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
-    jobConfig(name="net-routes",  targets=['pipe:9999']),
-    jobConfig(name="net-traffic", targets=['pipe:8080']),
-    jobConfig(name="pomerium",    targets=['pomerium-metrics.pomerium:9090']),
-    jobConfig(name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
-    jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'),
-
-    jobConfig(name="next-victoriametrics-forever-vmagent",   metrics_path='/m/next/forever/vmagent/metrics',  targets=FromName),
-    jobConfig(name="next-victoriametrics-forever-vminsert",  metrics_path='/m/next/forever/vminsert/metrics', targets=FromName),
-    jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName),
-    jobConfig(name="next-victoriametrics-recent-vmagent",    metrics_path='/m/next/recent/vmagent/metrics',   targets=FromName),
-    jobConfig(name="next-victoriametrics-recent-vminsert",   metrics_path='/m/next/recent/vminsert/metrics',  targets=FromName),
-    jobConfig(name="next-victoriametrics-recent-vmstorage",  metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName),
-    jobConfig(name="next-victoriametrics-vmselect",          metrics_path='/m/next/vmselect/metrics',         targets=FromName),
-    jobConfig(name="next-victoriametrics-index",                                                              targets=FromName),
-
-    # todo:
-    #  - video-files
-    #  - cert-manager
-    #  - syncthing(s)
-    #  - nvidia runner
-    #  - longhorn
-    #  - kube-system.metrics-server
-    jobConfig(
-        name="racc",
-        scrape_interval='30s',
-        targets=[
-            # - dash:5150
-            # - dot:5150
-            # - squib:5150
-            # - ashermac:5150
-        ],
-    ),
-]  # yapf: disable
-
-
-deploy_doesnt_serve_metrics = [
-    'apprise',
-    'bitwarden',
-    'digi-files',
-    'digi-pose-predict',
-    'digi-tts-mimic',
-    'digi-web',
-    'dovecot',
-    'ectoscope',
-    'front-door-display',
-    'hass',
-    'homepage',
-    'itch150',
-    'jsregistry',
-    'kallithea',
-    'kube-web-view',
-    'magma',
-    'megasecond',
-    'minecraft-build-world',
-    'minecraft-lake-world',
-    'minecraft-smp-world',
-    'mongodb',
-    'mqtt1',
-    'mqtt2',
-    'nodered',
-    'photoprism',
-    'plik',
-    'projects',
-    'registry-ui',
-    'registry',
-    'speakerphone',
-    'victorialogs-ui',
-    'video-files',
-    'video',
-    'zigbee2mqtt',
-    'zwave2mqtt',
-]
-
-existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs]
-recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs))
-
-recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | {
-    'kubernetes_sd_configs': [{
-        'role': 'endpoints'
-    }],
-    'relabel_configs': [{
-        'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'],
-        'action': 'keep',
-        'regex': 'default;kubernetes;https'
-    }],
-})
-
-recent_jobs.append(
-    jobConfig(name="kubernetes-nodes", https=True, targets=[]) | {
-        "kubernetes_sd_configs": [{
-            "role": "node"
-        }],
-        "relabel_configs": [{
-            "action": "labeldrop",
-            "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
-        }, {
-            "action": "labelmap",
-            "regex": "__meta_kubernetes_node_label_(.+)"
-        }, {
-            "action": "labeldrop",
-            "regex": "kubernetes_io_hostname"
-        }],
-    })
-
-# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
-# for metric definitions
-
-recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | {
-    "kubernetes_sd_configs": [{
-        "role": "node"
-    }],
-    "relabel_configs": [{
-        "action": "labeldrop",
-        "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
-    }],
-})
-
-outDir = Path('build/scrape_config')
-writeJobConfigs(outDir, forever_jobs, 'forever')
-writeJobConfigs(outDir, recent_jobs, 'recent')
--- a/next/deploy_alertmanager.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: alertmanager
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: alertmanager
-  template:
-    metadata:
-      labels:
-        app: alertmanager
-    spec:
-      volumes:
-        - name: opt-alertmanager
-          persistentVolumeClaim:
-            claimName: opt-alertmanager
-      serviceAccountName: victoriametrics
-      containers:
-        - name: alertmanager
-          image: docker.io/prom/alertmanager:v0.27.0
-          args:
-            - --config.file=/alertmanager/alertmanager.yml
-            - --web.external-url=https://bigasterisk.com/alertmanager/
-            - --web.route-prefix=/
-            - --log.level=info
-          ports:
-          - containerPort: 9093
-          volumeMounts:
-          - name: opt-alertmanager
-            mountPath: /alertmanager
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: "kubernetes.io/hostname"
-                operator: In
-                values: ["ditto"]
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager
-spec:
-  ports:
-  - port: 80
-    targetPort: 9093
-  selector:
-    app: alertmanager
--- a/next/deploy_vmalert.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vmalert
-spec:
-  replicas: 1
-  strategy: { type: Recreate }
-  selector:
-    matchLabels:
-      app: vmalert
-  template:
-    metadata:
-      labels:
-        app: vmalert
-      annotations:
-        prometheus.io/scrape: "true"
-    spec:
-      volumes:
-        - name: config
-          configMap: { name: victoriametrics-config }
-      serviceAccountName: victoriametrics
-      containers:
-        - name: vmalert
-          image: docker.io/victoriametrics/vmalert:v1.91.2
-          args:
-            - -configCheckInterval=5s
-            - -datasource.url=http://victoriametrics/m/
-            - -datasource.queryStep=5m
-            - -evaluationInterval=1m
-            - -external.url=https://bigasterisk.com/vmalert
-            - -loggerLevel=INFO
-            - -loggerTimezone=America/Los_Angeles
-            - -memory.allowedBytes=512MB
-            - -notifier.url=http://alertmanager
-            - -remoteRead.url=http://victoriametrics/m/
-            - -remoteWrite.url=http://victoriametrics/m/
-            - -rule=/local/rules
-          ports:
-            - containerPort: 8880
-          volumeMounts:
-            - { name: config, mountPath: /local }
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vmalert
-spec:
-  ports:
-    - port: 80
-      targetPort: 8880
-  selector:
-    app: vmalert
--- a/next/index.css	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-section {
-  margin-left: 2em;
-}
-
-h1,
-h2 {
-  border-top: 1px solid lightgray;
-}
--- a/next/index.js	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
-function init(serverUrl) {
-    // this defaults to something incorrect, so we fix it hopefully before you go to vmui
-    localStorage.setItem('SERVER_URL', JSON.stringify({ value: serverUrl }));
-}
\ No newline at end of file
--- a/next/index_page.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-def makeIndexHtml(objPrefix, webRoot, webHost):
-    return f"""<!DOCTYPE html>
-    <html>
-        <head>
-          <title>{objPrefix}</title>
-          <link rel="stylesheet" href="index.css">
-        </head>
-        <body>
-            <h1>{objPrefix}</h1>
-            <section>
-              <h2>Retentions</h2>
-              <section>
-                <h3><code>recent</code></h3>
-                <table>
-                  <tr>
-                    <td><a href="recent/vmagent/">vmagent</a></td>
-                    <td><a href="recent/vmagent/metrics">metrics</a></td>
-                    <td><a href="recent/vmagent/targets">targets</a></td>
-                  </tr>
-                  <tr>
-                    <td><a href="recent/vminsert/">vminsert</a></td>
-                    <td><a href="recent/vminsert/metrics">metrics</a></td>
-                  </tr>
-                  <tr>
-                    <td><a href="recent/vmstorage/">vmstorage</a></td>
-                    <td><a href="recent/vmstorage/metrics">metrics</a></td>
-                  </tr>
-                </table>
-              </section>
-            
-              <section>
-                <h3><code>forever</code></h3>
-                <table>
-                  <tr>
-                    <td><a href="forever/vmagent/">vmagent</a></td>
-                    <td><a href="forever/vmagent/metrics">metrics</a></td>
-                    <td><a href="forever/vmagent/targets">targets</a></td>
-                  </tr>
-                  <tr>
-                    <td><a href="forever/vminsert/">vminsert</a></td>
-                    <td><a href="forever/vminsert/metrics">metrics</a></td>
-                  </tr>
-                  <tr>
-                    <td><a href="forever/vmstorage/">vmstorage</a></td>
-                    <td><a href="forever/vmstorage/metrics">metrics</a></td>
-                  </tr>
-                </table>
-              </section>
-            </section>
-
-            <section>
-              <h2>vmselect</h2>
-              <table>
-                <tr>
-                  <td><a href="vmselect/">vmselect</a></td>
-                  <td><a href="vmselect/metrics">metrics</a></td>
-                </tr>
-              </table>
-            </section>
-
-            <section>
-              <h2>vmui</h2>
-              <table>
-                <tr>
-                  <td><a href="vmselect/0/vmui/vmui">vmui</a></td>
-                </tr>
-              </table>
-            </section>  
-              
-            <script src="index.js"></script>
-            <script> 
-              init("https://{webHost}{webRoot}/vmselect/select/0/prometheus");
-            </script>
-        </body>
-    </html>"""
--- a/next/ingress_alertmanager.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: vmalert
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or: 
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    # ingress.pomerium.io/prefix_rewrite: "/vmalert/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /vmalert/
-            backend: { service: { name: vmalert, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: alertmanager
-  annotations:
-    cert-manager.io/cluster-issuer: letsencrypt-prod
-    ingress.pomerium.io/allow_public_unauthenticated_access: "false"
-    ingress.pomerium.io/pass_identity_headers: "true"
-    ingress.pomerium.io/preserve_host_header: "true"
-    ingress.pomerium.io/policy: |
-      allow:
-        or: 
-          - { email: { is: "drewpca@gmail.com" }}
-          - { email: { is: "kelsimp@gmail.com" }}
-    ingress.pomerium.io/prefix_rewrite: "/"
-spec:
-  ingressClassName: pomerium
-  rules:
-    - host: "bigasterisk.com"
-      http:
-        paths:
-          - pathType: Prefix
-            path: /alertmanager/
-            backend: { service: { name: alertmanager, port: { number: 80 } } }
-  tls:
-    - hosts: [bigasterisk.com]
-      secretName: bigasterisk.com-tls
\ No newline at end of file
--- a/next/k8s_ops.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-import json
-import time
-
-from kubernetes import client
-
-
-def refreshPodCmaps(pod_name, namespace="default"):
-    """
-    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while
-    until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations.
-    """
-    api_instance = client.CoreV1Api()
-
-    pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
-    if pod.metadata.annotations is None:
-        pod.metadata.annotations = {}
-    pod.metadata.annotations["force-configmap-update"] = str(time.time())
-    api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
-
-
-def firstPodName(selector):
-    api_instance = client.CoreV1Api()
-    pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
-    return pod_list.items[0].metadata.name
-
-
-def hup(ctx, deployment, process_name):
-    ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
-
-
-def replaceCmap(name, dataObj):
-    api_instance = client.CoreV1Api()
-
-    data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
-
-    try:
-
-        existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
-        existing_config_map.data.update(data)
-        api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
-    except client.rest.ApiException as e:
-        if e.status == 404:
-            config_map = client.V1ConfigMap()
-            config_map.metadata = client.V1ObjectMeta(name=name)
-            config_map.data = data
-            api_response = api_instance.create_namespaced_config_map('default', config_map)
-        else:
-            raise
-
-    print(f"{name} resource_version is now {api_response.metadata.resource_version}")
--- a/next/output.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,122 +0,0 @@
-import json
-from pathlib import Path
-
-build = Path('build/k8s_config')
-build.mkdir(parents=True, exist_ok=True)
-
-
-def toJson(d):
-    return json.dumps(d, sort_keys=True, indent=2)
-
-
-def createSvc(filename, name, ports):
-    (build / f'{filename}_svc.yaml').write_text(toJson({
-        "apiVersion": "v1",
-        "kind": "Service",
-        "metadata": {
-            "name": name
-        },
-        "spec": {
-            "ports": ports,
-            "selector": {
-                "app": name
-            }
-        },
-    }))
-
-
-def createIngress(filename, objName, policy, ingressPaths, host):
-
-    (build / filename).write_text(
-        toJson({
-            "apiVersion": "networking.k8s.io/v1",
-            "kind": "Ingress",
-            "metadata": {
-                "name": objName,
-                "annotations": {
-                    "cert-manager.io/cluster-issuer": "letsencrypt-prod",
-                    "ingress.pomerium.io/allow_public_unauthenticated_access": "false",
-                    "ingress.pomerium.io/pass_identity_headers": "true",
-                    "ingress.pomerium.io/preserve_host_header": "true",
-                    "ingress.pomerium.io/policy": policy,
-                }
-            },
-            "spec": {
-                "ingressClassName": "pomerium",
-                "rules": [{
-                    "host": host,
-                    "http": {
-                        "paths": ingressPaths
-                    }
-                },],
-                "tls": [{
-                    "hosts": [host],
-                    "secretName": f"{host}-tls"
-                }]
-            }
-        }))
-
-
-def createPv(storageFileName, volName, request):
-    (build / f'{storageFileName}_0pv.yaml').write_text(
-        toJson({
-            "apiVersion": "v1",
-            "kind": "PersistentVolume",
-            "metadata": {
-                "name": volName,
-                "labels": {
-                    "type": "local"
-                }
-            },
-            "spec": {
-                "storageClassName": "manual",
-                "hostPath": {
-                    "path": f"/opt/{volName}"
-                },
-                "capacity": {
-                    "storage": request
-                },
-                "accessModes": ["ReadWriteMany"],
-                "persistentVolumeReclaimPolicy": "Retain",
-                "claimRef": {
-                    "namespace": "default",
-                    "name": volName
-                }
-            }
-        }))
-
-
-def createPvc(storageFileName, volName, request):
-    (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({
-        "apiVersion": "v1",
-        "kind": "PersistentVolumeClaim",
-        "metadata": {
-            "name": volName,
-        },
-        "spec": {
-            "storageClassName": "",
-            "volumeName": volName,
-            "accessModes": ["ReadWriteMany"],
-            "resources": {
-                "requests": {
-                    "storage": request
-                }
-            }
-        },
-    }))
-
-
-def affinityToNode(node):
-    return {
-        "nodeAffinity": {
-            "requiredDuringSchedulingIgnoredDuringExecution": {
-                "nodeSelectorTerms": [{
-                    "matchExpressions": [{
-                        "key": "kubernetes.io/hostname",
-                        "operator": "In",
-                        "values": [node],
-                    }],
-                }],
-            },
-        }
-    }
--- a/next/roles.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: victoriametrics
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - nodes/metrics
-  - nodes/proxy
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups:
-  - extensions
-  resources:
-  - ingresses
-  verbs: ["get", "list", "watch"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: victoriametrics
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: victoriametrics
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: victoriametrics
-subjects:
-- kind: ServiceAccount
-  name: victoriametrics
-  namespace: default
-# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
-# - kind: ServiceAccount
-#   name: default
-#   namespace: default
\ No newline at end of file
--- a/next/scrape_job.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-import json
-from pathlib import Path
-import subprocess
-
-class FromName:
-    pass
-
-def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False):
-    """one scrape job config"""
-    ret = {
-        "job_name": name,
-        "relabel_configs": [
-            {
-                "target_label": "namespace",
-                "replacement": "default"
-            },
-            {
-                "source_labels": ["__meta_kubernetes_pod_node_name"],
-                "target_label": "node"
-            },
-        ]
-    }
-    
-    if targets is FromName:
-        targets = [name]
-
-    if targets:
-        ret["static_configs"] = [{
-            "targets": targets,
-        }]
-
-    if metrics_path:
-        ret.setdefault('relabel_configs', []).append({
-            "action": "replace",
-            "target_label": "__metrics_path__",
-            "replacement": metrics_path,
-        })
-
-    if scrape_interval:
-        ret['scrape_interval'] = scrape_interval
-
-    if params:
-        ret['params'] = params
-
-    if ping_job:
-        ret['metrics_path'] = '/probe'
-        ret['params'] = {'module': ['icmp']}
-        ret["relabel_configs"] = [
-            {
-                "source_labels": ["__address__"],
-                "target_label": "__param_target"
-            },
-            {
-                "source_labels": ["__param_target"],
-                "target_label": "instance"
-            },
-            {
-                "target_label": "__address__",
-                "replacement": "prober"
-            },
-        ]
-
-    if https:
-        ret['scheme'] = 'https'
-        ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"}
-        ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token"
-
-    return ret
-
-
-def current_deployments():
-    deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
-    for deploy in deploys['items']:
-        name = deploy['metadata']['name']
-        yield name
-
-
-def scrape_deployments(skip_names):
-    ret = []
-    for name in current_deployments():
-        if name in skip_names:
-            continue
-        targets = [name]
-        ret.append(jobConfig(name=name, targets=targets))
-    return ret
-
-
-def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
-    outDir.mkdir(exist_ok=True, parents=True)
-    filenames_written = []
-    for job in jobConfs:
-        filename = f'job_{job["job_name"]}.yaml'
-        (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
-        filenames_written.append(filename)
-
-    (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
-        "global": {
-            "scrape_interval": "1m",
-            "scrape_timeout": "10s"
-        },
-        "scrape_config_files": sorted(filenames_written),
-    }, indent=2))
--- a/next/skaffold.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-apiVersion: skaffold/v3
-kind: Config
-metadata:
-  name: victoriametrics
-manifests:
-  rawYaml:
-    - roles.yaml
-    - build/k8s_config/*.yaml
-deploy:
-  kubectl: {}
--- a/next/tasks.py	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-from pathlib import Path
-
-import yaml
-from invoke import task
-from kubernetes import config
-
-import alert_rules
-from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
-
-config.load_kube_config()
-
-
-def scrapeConfig(fn):
-    return yaml.load(open(fn), yaml.FullLoader)
-
-
-@task
-def push_config(ctx):
-    # plan:
-    #   every discovered service may:
-    #      - be described here as a forever retention - ignore the discovery
-    #      - be blocked here as a no-metrics service - ignore the discovery
-    #      - be scraped as 'recent', with possible overrides of port/path
-    #   all per-node metrics shall be 'recent' (oops, not smartctl!)
-    map: dict[str, object] = {
-        'rules': alert_rules.allRules(ctx),
-    }
-    top = Path('build/scrape_config')
-    for p in top.glob('*.yaml'):
-        map[str(p.relative_to(top))] = scrapeConfig(p)
-    replaceCmap("next-victoriametrics-config", map)
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
-    refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
--- a/next/volumes_alert.yaml	Fri May 03 11:19:50 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: opt-alertmanager
-  labels:
-    type: local
-spec:
-  storageClassName: manual
-  hostPath:
-    path: "/opt/alertmanager"
-  capacity:
-    storage: 50Gi
-  accessModes:
-    - ReadWriteOnce
-  persistentVolumeReclaimPolicy: Retain
-  claimRef:
-    namespace: default
-    name: opt-alertmanager
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: opt-alertmanager
-spec:
-  storageClassName: ""
-  volumeName: "opt-alertmanager"
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/output.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+
+build = Path('build/k8s_config')
+build.mkdir(parents=True, exist_ok=True)
+
+
+def toJson(d):
+    return json.dumps(d, sort_keys=True, indent=2)
+
+
+def createSvc(filename, name, ports):
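+    """Write a Service manifest selecting app=<name> with the given ports."""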
+    (build / f'{filename}_svc.yaml').write_text(toJson({
+        "apiVersion": "v1",
+        "kind": "Service",
+        "metadata": {
+            "name": name
+        },
+        "spec": {
+            "ports": ports,
+            "selector": {
+                "app": name
+            }
+        },
+    }))
+
+
+def createIngress(filename, objName, policy, ingressPaths, host):
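+    """Write a pomerium Ingress manifest; `policy` goes verbatim into the ingress.pomerium.io/policy annotation."""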
+
+    (build / filename).write_text(
+        toJson({
+            "apiVersion": "networking.k8s.io/v1",
+            "kind": "Ingress",
+            "metadata": {
+                "name": objName,
+                "annotations": {
+                    "cert-manager.io/cluster-issuer": "letsencrypt-prod",
+                    "ingress.pomerium.io/allow_public_unauthenticated_access": "false",
+                    "ingress.pomerium.io/pass_identity_headers": "true",
+                    "ingress.pomerium.io/preserve_host_header": "true",
+                    "ingress.pomerium.io/policy": policy,
+                }
+            },
+            "spec": {
+                "ingressClassName": "pomerium",
+                "rules": [{
+                    "host": host,
+                    "http": {
+                        "paths": ingressPaths
+                    }
+                },],
+                "tls": [{
+                    "hosts": [host],
+                    "secretName": f"{host}-tls"
+                }]
+            }
+        }))
+
+
+def createPv(storageFileName, volName, request):
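+    """Write a local hostPath PersistentVolume, pre-bound via claimRef to the PVC of the same name."""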
+    (build / f'{storageFileName}_0pv.yaml').write_text(
+        toJson({
+            "apiVersion": "v1",
+            "kind": "PersistentVolume",
+            "metadata": {
+                "name": volName,
+                "labels": {
+                    "type": "local"
+                }
+            },
+            "spec": {
+                "storageClassName": "manual",
+                "hostPath": {
+                    "path": f"/opt/{volName}"
+                },
+                "capacity": {
+                    "storage": request
+                },
+                "accessModes": ["ReadWriteMany"],
+                "persistentVolumeReclaimPolicy": "Retain",
+                "claimRef": {
+                    "namespace": "default",
+                    "name": volName
+                }
+            }
+        }))
+
+
+def createPvc(storageFileName, volName, request):
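+    """Write the matching PersistentVolumeClaim; volumeName plus an empty storageClassName binds it to the pre-created PV."""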
+    (build / f'{storageFileName}_1pvc.yaml').write_text(toJson({
+        "apiVersion": "v1",
+        "kind": "PersistentVolumeClaim",
+        "metadata": {
+            "name": volName,
+        },
+        "spec": {
+            "storageClassName": "",
+            "volumeName": volName,
+            "accessModes": ["ReadWriteMany"],
+            "resources": {
+                "requests": {
+                    "storage": request
+                }
+            }
+        },
+    }))
+
+
+def affinityToNode(node):
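+    """Node affinity that pins a pod to the named host, so it lands next to its hostPath volume."""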
+    return {
+        "nodeAffinity": {
+            "requiredDuringSchedulingIgnoredDuringExecution": {
+                "nodeSelectorTerms": [{
+                    "matchExpressions": [{
+                        "key": "kubernetes.io/hostname",
+                        "operator": "In",
+                        "values": [node],
+                    }],
+                }],
+            },
+        }
+    }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roles.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,43 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: victoriametrics
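+# read-only access for vmagent's kubernetes service discovery (node/endpoints roles) and for scraping node/cadvisor metrics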
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - nodes/metrics
+  - nodes/proxy
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups:
+  - extensions
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: victoriametrics
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: victoriametrics
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: victoriametrics
+subjects:
+- kind: ServiceAccount
+  name: victoriametrics
+  namespace: default
+# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
+# - kind: ServiceAccount
+#   name: default
+#   namespace: default
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scrape_job.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,102 @@
+import json
+from pathlib import Path
+import subprocess
+
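+# Sentinel: pass targets=FromName to scrape the service named after the job itself (jobConfig substitutes [name]).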
+class FromName:
+    pass
+
+def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False):
+    """one scrape job config"""
+    ret = {
+        "job_name": name,
+        "relabel_configs": [
+            {
+                "target_label": "namespace",
+                "replacement": "default"
+            },
+            {
+                "source_labels": ["__meta_kubernetes_pod_node_name"],
+                "target_label": "node"
+            },
+        ]
+    }
+    
+    if targets is FromName:
+        targets = [name]
+
+    if targets:
+        ret["static_configs"] = [{
+            "targets": targets,
+        }]
+
+    if metrics_path:
+        ret.setdefault('relabel_configs', []).append({
+            "action": "replace",
+            "target_label": "__metrics_path__",
+            "replacement": metrics_path,
+        })
+
+    if scrape_interval:
+        ret['scrape_interval'] = scrape_interval
+
+    if params:
+        ret['params'] = params
+
+    if ping_job:
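+        # Blackbox-style ICMP probe: the listed host becomes the ?target= param and the
+        # 'instance' label, while the scrape request itself goes to the 'prober' service.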
+        ret['metrics_path'] = '/probe'
+        ret['params'] = {'module': ['icmp']}
+        ret["relabel_configs"] = [
+            {
+                "source_labels": ["__address__"],
+                "target_label": "__param_target"
+            },
+            {
+                "source_labels": ["__param_target"],
+                "target_label": "instance"
+            },
+            {
+                "target_label": "__address__",
+                "replacement": "prober"
+            },
+        ]
+
+    if https:
+        ret['scheme'] = 'https'
+        ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"}
+        ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+
+    return ret
+
+
+def current_deployments():
+    deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
+    for deploy in deploys['items']:
+        name = deploy['metadata']['name']
+        yield name
+
+
+def scrape_deployments(skip_names):
+    ret = []
+    for name in current_deployments():
+        if name in skip_names:
+            continue
+        targets = [name]
+        ret.append(jobConfig(name=name, targets=targets))
+    return ret
+
+
+def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
+    outDir.mkdir(exist_ok=True, parents=True)
+    filenames_written = []
+    for job in jobConfs:
+        filename = f'job_{job["job_name"]}.yaml'
+        (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
+        filenames_written.append(filename)
+
+    (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
+        "global": {
+            "scrape_interval": "1m",
+            "scrape_timeout": "10s"
+        },
+        "scrape_config_files": sorted(filenames_written),
+    }, indent=2))
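
A usage sketch for the helpers above (illustrative only; the real call sites live in create_scrape_configs.py, and the job names, targets, skip list, and output directory here are placeholders):

from pathlib import Path

from scrape_job import FromName, jobConfig, scrape_deployments, writeJobConfigs

# Build a couple of jobs plus one per current Deployment, then write the
# per-job files and the scrape_<retention>.yaml index that lists them.
jobs = [
    jobConfig(name="telegraf", targets=FromName, scrape_interval="30s"),
    jobConfig(name="ping", targets=["10.2.0.1"], ping_job=True),
] + scrape_deployments(skip_names={"some-uninstrumented-deploy"})

writeJobConfigs(Path("build/scrape_config"), jobs, retention="recent")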
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/skaffold.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,10 @@
+apiVersion: skaffold/v3
+kind: Config
+metadata:
+  name: victoriametrics
+manifests:
+  rawYaml:
+    - roles.yaml
+    - build/k8s_config/*.yaml
+deploy:
+  kubectl: {}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tasks.py	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import yaml
+from invoke import task
+from kubernetes import config
+
+import alert_rules
+from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
+
+config.load_kube_config()
+
+
+def scrapeConfig(fn):
+    return yaml.load(open(fn), yaml.FullLoader)
+
+
+@task
+def push_config(ctx):
+    # plan:
+    #   every discovered service may:
+    #      - be described here as a forever retention - ignore the discovery
+    #      - be blocked here as a no-metrics service - ignore the discovery
+    #      - be scraped as 'recent', with possible overrides of port/path
+    #   all per-node metrics shall be 'recent' (oops, not smartctl!)
+    map: dict[str, object] = {
+        'rules': alert_rules.allRules(ctx),
+    }
+    top = Path('build/scrape_config')
+    for p in top.glob('*.yaml'):
+        map[str(p.relative_to(top))] = scrapeConfig(p)
+    replaceCmap("next-victoriametrics-config", map)
+    refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
+    refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
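
One detail worth noting: the ConfigMap keys pushed by push_config are simply the file names relative to build/scrape_config, derived with the same relative_to call the task uses, plus the "rules" key for the alert rules. A tiny sketch of that key derivation (the file names here are hypothetical examples):

from pathlib import Path

top = Path("build/scrape_config")
for p in [top / "scrape_recent.yaml", top / "job_telegraf.yaml"]:
    # These strings become keys in the next-victoriametrics-config ConfigMap.
    print(str(p.relative_to(top)))  # -> "scrape_recent.yaml", "job_telegraf.yaml"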
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/volumes_alert.yaml	Fri May 03 11:21:08 2024 -0700
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: opt-alertmanager
+  labels:
+    type: local
+spec:
+  storageClassName: manual
+  hostPath:
+    path: "/opt/alertmanager"
+  capacity:
+    storage: 50Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  claimRef:
+    namespace: default
+    name: opt-alertmanager
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: opt-alertmanager
+spec:
+  storageClassName: ""
+  volumeName: "opt-alertmanager"
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
\ No newline at end of file
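
The empty storageClassName plus the matching volumeName/claimRef pair is the usual pre-binding pattern: it ties this claim to the hostPath volume above rather than letting a provisioner create a new one. A small check, sketched in the style of the kubectl calls elsewhere in this changeset (not part of it):

import json
import subprocess

# Confirm the claim bound to the pre-created hostPath volume.
pvc = json.loads(subprocess.check_output(
    ["kubectl", "get", "pvc", "opt-alertmanager", "-o=json"]))
print(pvc["spec"]["volumeName"], pvc["status"]["phase"])  # expect: opt-alertmanager Bound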