changeset 62:8134cd480817

make next/ a complete standalone setup dir; no deps on ./
author drewp@bigasterisk.com
date Thu, 02 May 2024 20:33:29 -0700
parents fb0519859645
children 84a4c4cca4a5
files config/create_scrape_configs.py config/scrape_job.py next/alert_rules.py next/create_all.py next/create_k8s.py next/create_scrape_configs.py next/k8s_ops.py next/output.py next/roles.yaml next/scrape_job.py next/skaffold.yaml next/tasks.py tasks.py
diffstat 13 files changed, 1010 insertions(+), 448 deletions(-)
--- a/config/create_scrape_configs.py	Thu May 02 18:35:46 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-from pathlib import Path
-
-from scrape_job import jobConfig, scrape_deployments, writeJobConfigs
-import private
-
-
-
-
-# previously this used `kubernetes_sd_configs: [{ role: node }]`
-all_hosts = [
-    'dash',
-    'ditto',
-    # 'ws-printer',
-    #todo:
-]
-
-smartctl_hosts = [
-    # ideally, all nodes with disks, but many turn off and on
-    'dash',
-    'ditto',
-]
-
-ping_hosts = [
-    # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
-    'printer',
-    # wireguard connection test
-    'prime5',
-    # after pyinfra or reboot, seems to lose wg0 address
-    'garage5',
-]
-
-deploy_doesnt_serve_metrics = [
-    'apprise',
-    'bitwarden',
-    'digi-files',
-    'digi-pose-predict',
-    'digi-tts-mimic',
-    'dovecot',
-    'front-door-display',
-    'hass',
-    'homepage',
-    'itch150',
-    'kallithea',
-    'kube-web-view',
-    'magma',
-    'megasecond',
-    'minecraft-build-world',
-    'minecraft-lake-world',
-    'minecraft-smp-world',
-    'mongodb',
-    'mqtt1',
-    'mqtt2',
-    'nodered',
-    'photoprism',
-    'plik',
-    'projects',
-    'registry',
-    'registry-ui',
-    'speakerphone',
-    'video',
-    'video-files',
-    'zigbee2mqtt',
-]
-
-forever_jobs = [
-    jobConfig(name='maildir-count',        targets=['prime:2500']),
-    jobConfig(name='mongodb',              targets=['mongodb:9216']),
-    jobConfig(name='net-traffic',          targets=['pipe:8080']),
-    jobConfig(name='ping',                 targets=ping_hosts,              scrape_interval='2m', ping_job=True),
-    jobConfig(name='power-eagle',          targets=['power-eagle:80'],      scrape_interval='8s'),  # from powerEagle/private_config.periodSec
-    jobConfig(name='powermeter-exporter',  targets=['powermeter-exporter'], scrape_interval='10s'),
-    jobConfig(name='smartctl',             targets=[f'{h}:9633' for h in smartctl_hosts]),
-    jobConfig(name='wifi',                 targets=['wifi:80']),
-    jobConfig(name='zfs-exporter',         targets=['ditto:9634']),
-    jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
-    jobConfig(name='zpool-exporter',       targets=['ditto:9986']),
-    jobConfig(name='octoprint',            targets=['octoprint'],
-              metrics_path='/plugin/prometheus_exporter/metrics',
-              params={'apikey' : [private.octoprint_apikey]},
-              ),
-]  # yapf: disable
-
-recent_jobs = [
-    jobConfig( name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
-    jobConfig( name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
-    jobConfig( name="net-routes",  targets=['pipe:9999']),
-    jobConfig( name="net-traffic", targets=['pipe:8080']),
-    jobConfig( name="dnsmasq-log", targets=['pipe:9991']),
-    jobConfig(
-        name="racc",
-        scrape_interval='30s',
-        targets=[
-            # - dash:5150
-            # - dot:5150
-            # - squib:5150
-            # - ashermac:5150
-        ],
-    ),
-]  # yapf: disable
-recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs))
-
-writeJobConfigs(Path('build/scrape_jobs'), forever_jobs, 'forever')
-writeJobConfigs(Path('build/scrape_jobs'), recent_jobs, 'recent')
--- a/config/scrape_job.py	Thu May 02 18:35:46 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,88 +0,0 @@
-import json
-from pathlib import Path
-import subprocess
-
-
-def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None):
-    """one scrape job config"""
-    ret = {
-        "job_name": name,
-        "static_configs": [{
-            "targets": targets,
-        }],
-        "relabel_configs": [
-            {
-                "target_label": "namespace",
-                "replacement": "default"
-            },
-            {
-                "source_labels": ["__meta_kubernetes_pod_node_name"],
-                "target_label": "node"
-            },
-        ]
-    }
-
-    if metrics_path:
-        ret['metrics_path'] = metrics_path
-
-    if scrape_interval:
-        ret['scrape_interval'] = scrape_interval
-
-    if params:
-        ret['params'] = params
-
-    if ping_job:
-        ret['metrics_path'] = '/probe'
-        ret['params'] = {'module': ['icmp']}
-        ret["relabel_configs"] = [
-            {
-                "source_labels": ["__address__"],
-                "target_label": "__param_target"
-            },
-            {
-                "source_labels": ["__param_target"],
-                "target_label": "instance"
-            },
-            {
-                "target_label": "__address__",
-                "replacement": "prober"
-            },
-        ]
-
-    return ret
-
-
-def current_deployments():
-    deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
-    for deploy in deploys['items']:
-        name = deploy['metadata']['name']
-        yield name
-
-
-def scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs):
-    ret = []
-    for name in current_deployments():
-        if name in deploy_doesnt_serve_metrics:
-            continue
-        if name in [j['job_name'] for j in forever_jobs]:
-            continue
-        targets = [name]
-        ret.append(jobConfig(name=name, targets=targets))
-    return ret
-
-
-def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
-    (outDir / retention).mkdir(exist_ok=True, parents=True)
-    filenames_written = []
-    for job in jobConfs:
-        filename = f'job_{job["job_name"]}.yaml'
-        (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
-        filenames_written.append(filename)
-
-    (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
-        "global": {
-            "scrape_interval": "1m",
-            "scrape_timeout": "10s"
-        },
-        "scrape_config_files": filenames_written,
-    }, indent=2))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/alert_rules.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,433 @@
+"""
+pdm run invoke push-config
+
+docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
+"Whenever the alert expression results in one or more vector
+elements at a given point in time, the alert counts as active for
+these elements' label sets."
+also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics
+
+"""
+
+import json
+
+
+def pomRules():
+    return [
+        {
+            "alert": "frequent_upstream_connect_failures",
+            "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0"
+        },
+        {
+            "alert": "high_logging_pomerium",
+            "for": "3h",
+            "labels": {
+                "severity": "waste"
+            },
+            "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k',
+            "annotations": {
+                "summary": "high log output rate"
+            },
+        },
+    ]
+
+
+def k8sRules():
+    # from https://awesome-prometheus-alerts.grep.to/rules.html
+    return [
+        {
+            "alert": "metricsTargetMissing",
+            "expr": 'up{job!~"cm-acme-.*"} == 0',
+            'for': '10m',
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "metrics target missing (instance {{ $labels.instance }})",
+                "description": "A metrics target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesMemoryPressure",
+            "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesDiskPressure",
+            "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesOutOfDisk",
+            "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1',
+            "for": "2m",
+            "labels": {
+                "severity": "critical"
+            },
+            "annotations": {
+                "summary": "Kubernetes out of disk (instance {{ $labels.instance }})",
+                "description": "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesJobFailed",
+            "expr": "kube_job_status_failed > 0",
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes Job failed (instance {{ $labels.instance }})",
+                "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesPodCrashLooping",
+            "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3",
+            "for": "2m",
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})",
+                "description": "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "KubernetesClientCertificateExpiresNextWeek",
+            "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60',
+            "labels": {
+                "severity": "warning"
+            },
+            "annotations": {
+                "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})",
+                "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}",
+            },
+        },
+        {
+            "alert": "container_waiting",
+            "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)",
+            "annotations": {
+                "description": '',
+                "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}",
+            },
+            "for": "2m",
+        },
+    ]
+
+
+def allRules(ctx):
+    return {
+        "groups": [
+            {
+                "name": "k8s",
+                "interval": "1m",
+                "rules": k8sRules(),
+            },
+            {
+                "name": "pomerium_proxy",
+                "interval": "1m",
+                "rules": pomRules(),
+            },
+            {
+                "name":
+                    "Outages",
+                "interval":
+                    "1m",
+                "rules": [
+                    {
+                        "alert": "powereagleStalled",
+                        "expr": "rate(house_power_w[100m]) == 0",
+                        "for": "0m",
+                        "labels": {
+                            "severity": "losingData"
+                        },
+                        "annotations": {
+                            "summary": "power eagle data stalled",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "powereagleAbsent",
+                        "expr": "absent_over_time(house_power_w[5m])",
+                        "for": "2m",
+                        "labels": {
+                            "severity": "losingData"
+                        },
+                        "annotations": {
+                            "summary": "power eagle data missing",
+                            "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs",
+                        },
+                    },
+                    {
+                        "alert": "absent_zigbee",
+                        "expr": 'absent(container_last_seen{container="zigbee2mqtt"})',
+                    },
+                    {
+                        "alert": "net_routes_sync",
+                        "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70',
+                        "for": "10m",
+                        "labels": {
+                            "severity": "houseUsersAffected"
+                        },
+                        "annotations": {
+                            "summary": "net_routes is not getting regular updates"
+                        },
+                    },
+                ],
+            },
+            {
+                "name": "disk_errs",
+                "interval": "2d",
+                "rules": [{
+                    "alert": "zpool_device_error_increase",
+                    "labels": {
+                        "severity": "warning"
+                    },
+                    "expr": 'increase(zpool_device_error_count[3d]) > 0',
+                }, {
+                    "alert": "zpool_device_error_count",
+                    "labels": {
+                        "severity": "warning"
+                    },
+                    "expr": 'zpool_device_error_count > 0',
+                }],
+            },
+            {
+                "name": "lighting",
+                "interval": "5m",
+                "rules": [{
+                    "alert": "light_bridge_no_mqtt",
+                    "expr": 'mqtt_connected{job="light-bridge"} != 1',
+                }],
+            },
+            {
+                "name":
+                    "front_door",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "front_door_reader_esp32_no_mqtt",
+                        'expr': 'hw_connected{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_reader_svc_down",
+                        'expr': 'up{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_reader_svc_reader_no_mqtt",
+                        'expr': 'mqtt_connected{job="fingerprint"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_svc_down",
+                        'expr': 'up{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_svc_no_mqtt",
+                        'expr': 'mqtt_connected{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                    {
+                        "alert": "front_door_lock_esp32_no_mqtt",
+                        'expr': 'hw_connected{job="front-door-lock"} < 1',
+                        "annotations": {
+                            "summary": "see https://bigasterisk.com/front-door-lock/"
+                        },
+                    },
+                ],
+            },
+            {
+                "name":
+                    "net_routes",
+                "interval":
+                    "5m",
+                "rules": [
+                    {
+                        "alert": "no_house_ip_service",
+                        "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})'
+                    },
+                    {
+                        "alert": "no_net_routes_running",
+                        "expr": 'absent(python_info{job="net-routes"})'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_200",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1'
+                    },
+                    {
+                        "alert": "allowed_check_never_returned_403",
+                        'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1'
+                    },
+                    {
+                        'alert': 'net_route_input_eval_cal_loop_is_down',
+                        'expr': 'eval_cal_up!=1'
+                    },
+                    {
+                        'alert': 'net_route_input_mongo_loop_is_down',
+                        'expr': 'mongo_to_net_routes_up!=1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests',
+                        'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1'
+                    },
+                    {
+                        'alert': 'gcalendarwatch_current_events_loop_is_down',
+                        'expr': 'current_events_up != 1'
+                    },
+                ],
+            },
+            {
+                "name": "http",
+                "interval": "1h",
+                'rules': [
+                    {
+                        'alert': 'old_https_certs',
+                        'expr': 'min by (source) (x509_cert_enddate - now())/86400 < 15',
+                    },
+                    {
+                        'alert': 'high_500_response_rate',
+                        'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02',
+                    },
+                ],
+            },
+            {
+                "name": "ping",
+                "interval": "1m",
+                "rules": [{
+                    "alert": "ping_failed",
+                    "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1',
+                }]
+            },
+            {
+                "name":
+                    "alerts",
+                "rules": [
+                    {
+                        "alert": "kube_node_status_bad_condition",
+                        "for": "2h",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0',
+                    },
+                    {
+                        "alert": "housePower",
+                        "for": "1h",
+                        "labels": {
+                            "severity": "waste"
+                        },
+                        "expr": "house_power_w > 4000",
+                        "annotations": {
+                            "summary": "house power usage over 4KW"
+                        },
+                    },
+                    {
+                        "alert": "host_root_fs_space_low",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'disk_free{host!="garage",path="/"} < 20G',
+                    },
+                    {
+                        "alert": "zpool_space_low",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G',
+                    },
+                    {
+                        "alert": "disk_week_incr",
+                        "for": "20m",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000',
+                        "annotations": {
+                            "summary": "high mb/week on zfs dir"
+                        },
+                    },
+                    {
+                        "alert": "high_logging",
+                        "for": "3h",
+                        "labels": {
+                            "severity": "waste"
+                        },
+                        "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k',
+                        "annotations": {
+                            "summary": "high log output rate"
+                        },
+                    },
+                    {
+                        "alert": "stale_process",
+                        "for": "1d",
+                        "labels": {
+                            "severity": "dataRisk"
+                        },
+                        "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14",
+                        "annotations": {
+                            "summary": "process time is old"
+                        },
+                    },
+                    {
+                        "alert": "starlette",
+                        "for": "1m",
+                        "labels": {
+                            "severity": "fix"
+                        },
+                        "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}',
+                        "annotations": {
+                            "summary": "set starlette app name"
+                        },
+                    },
+                    {
+                        "alert": "ssl_certs_expiring_soon",
+                        "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10",
+                        "labels": {
+                            "severity": "warning"
+                        },
+                        "annotations": {
+                            "summary": "cert expiring soon. See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}"
+                        },
+                    },
+                ],
+            },
+        ] + hostsExpectedOnline(ctx)['groups']
+    }
+
+
+def _runJson(ctx, cmd):
+    return json.loads(ctx.run(cmd, hide="stdout").stdout)
+
+
+def hostsExpectedOnline(ctx):
+    return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py")
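For context on the last two functions: allRules() splices in whatever rule groups the external lanscape script prints, via hostsExpectedOnline(ctx). That script is not part of this changeset, so the sketch below only illustrates the shape _runJson() expects on its stdout; the group and rule contents here are hypothetical:

    # hypothetical sketch of hosts_expected_online.py's output; the real script
    # lives in /my/serv/lanscape and is not shown in this changeset
    import json

    print(json.dumps({
        "groups": [{
            "name": "hosts_expected_online",
            "interval": "1m",
            "rules": [{
                "alert": "host_offline",
                "expr": 'max_over_time(up{job="telegraf",instance="dash:9273"}[5m]) < 1',
                "labels": {"severity": "warning"},
            }],
        }],
    }))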
--- a/next/create_all.py	Thu May 02 18:35:46 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,255 +0,0 @@
-from pathlib import Path
-from index_page import makeIndexHtml
-from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
-
-
-def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
-    (build / f'{agentFileName}_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": agentName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmagent",
-                            "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
-                                tzArg,
-                                f"-promscrape.config=/local/config/{scrapeMapKey}",
-                                "-promscrape.configCheckInterval=5s",
-                                "-sortLabels",
-                                f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
-                                "-remoteWrite.showURL",
-                            ],
-                            "ports": [{ "containerPort": agentPort }],
-                            "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
-    (build / f'{insertFileName}_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": insertName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vminsert",
-                            "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
-                                tzArg,
-                                f"-storageNode={storageName}",
-                            ],
-                            "ports": [{ "containerPort": insertPort }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
-    (build / f'{storageFileName}_2deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
-            "spec": {
-                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": storageName },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmstorage",
-                            "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
-                            "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
-                                tzArg,
-                                f"-retentionPeriod={retention}",
-                                f"-storageDataPath=/data/{pipelineName}",
-                            ],
-                            "ports": [
-                                { "containerPort": 8482, "name": "http" },
-                                { "containerPort": storageInsertPort, "name": "vminsert" },
-                                { "containerPort": storageSelectPort, "name": "vmselect" },
-                            ],
-                            "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
-                        }],
-                        "affinity": affinityToNode(localPvHost)
-                    }
-                }
-            }
-        })) # yapf: disable
-
-
-def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
-    name = f"{objPrefix}-vmselect"
-    (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
-            "spec": {
-                "replicas": 1,
-                "strategy": { "type": "Recreate" },
-                "selector": { "matchLabels": { "app": name } },
-                "template": {
-                    "metadata": {
-                        "labels": { "app": name },
-                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
-                    },
-                    "spec": {
-                        "serviceAccountName": "victoriametrics",
-                        "containers": [{
-                            "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f"-http.pathPrefix={webRoot}/vmselect/",
-                                tzArg,
-                            ] + [f"-storageNode={n}" for n in storageSvcs],
-                            "ports": [{ "containerPort": selectPort }]
-                        }]
-                    }
-                }
-            }
-        })) # yapf: disable
-
-def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
-    agentName = f"{objPrefix}-{pipelineName}-vmagent"
-    insertName = f"{objPrefix}-{pipelineName}-vminsert"
-    storageName = f"{objPrefix}-{pipelineName}-vmstorage"
-
-    agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
-    insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
-    storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
-
-    localPvHost = "ditto"
-    insertPort = 8480
-    agentPort = 8429
-    storageInsertPort = 8400
-    storageSelectPort = 8401
-    volName = f"{objPrefix}-data-{pipelineName}"
-    request = "50Gi"
-    pipelineWebRoot = f'{webRoot}/{pipelineName}'
-
-    createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
-    createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
-    createPv(storageFileName, volName, request)
-    createPvc(storageFileName, volName, request)
-    createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
-
-    createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
-    createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
-    createSvc(storageFileName,storageName, [
-        {"port": 80, "targetPort": "http", "name": "http"},
-        {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
-        {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
-        ]) # yapf: disable
-
-    return storageName
-
-
-def createIndex(objPrefix, webRoot, html):
-    name = f'{objPrefix}-index'
-    httpServeRoot = '/opt/html'
-
-    (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
-        "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
-        "data": {
-            "index.html": html,
-            "index.js": Path("index.js").read_text(),
-            "index.css": Path("index.css").read_text(),
-        }
-    })) # yapf: disable
-    
-    (build / f'{objPrefix}-3index_deploy.yaml').write_text(
-        toJson({
-            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
-            "spec": {
-                "replicas": 1,
-                "selector": { "matchLabels": { "app": name } },
-                "template": {
-                    "metadata": { "labels": { "app": name } },
-                    "spec": {
-                        "containers": [{
-                            "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
-                            "args": [
-                                f'--root={httpServeRoot}',
-                                '--directory-listing=true',
-                                '--experimental-metrics=true',
-                            ],
-                            "ports": [{ "containerPort": 80 }],
-                            "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
-                        }],
-                        "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
-                    }
-                }
-            }
-        })) # yapf: disable
-    createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
-
-
-def main():
-    tzArg = "-loggerTimezone=America/Los_Angeles"
-    objPrefix = "next-victoriametrics"  # prefix on all k8s object names
-    webRoot = "/m/next"
-    vmVersion = "v1.100.1"
-    webHost = 'bigasterisk.com'
-    pipelines = [
-        ('forever', '100y'),
-        ('recent', '90y'),
-    ]
-    storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
-
-    selectPort = 8481
-    createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
-    createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
-
-    ingressPaths = [
-        { "pathType": "Prefix", "path": f"{webRoot}/",          "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
-        { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
-    ]  # yapf: disable
-    for p, _ in pipelines:
-        ingressPaths.extend([
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/",   "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent",   "port": { "number": 80 } } } },
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/",  "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert",  "port": { "number": 80 } } } },
-            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
-        ]) # yapf: disable
-
-    policy = """\
-allow:
-    or: 
-        - { email: { is: "drewpca@gmail.com" }}
-        - { email: { is: "kelsimp@gmail.com" }}
-    """
-    createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
-    createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
-    # this should also emit a static html page and web server deploy that serves at webRoot and has a map of everything
-
-
-main()
-
-# in vmui, set server url to
-# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/create_k8s.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,254 @@
+from pathlib import Path
+from index_page import makeIndexHtml
+from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc
+
+
+def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix):
+    (build / f'{agentFileName}_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": agentName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }],
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmagent",
+                            "image": f"docker.io/victoriametrics/vmagent:{vmVersion}",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vmagent/",
+                                tzArg,
+                                f"-promscrape.config=/local/config/{scrapeMapKey}",
+                                "-promscrape.configCheckInterval=5s",
+                                "-sortLabels",
+                                f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write",
+                                "-remoteWrite.showURL",
+                            ],
+                            "ports": [{ "containerPort": agentPort }],
+                            "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort):
+    (build / f'{insertFileName}_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": insertName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vminsert",
+                            "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vminsert/",
+                                tzArg,
+                                f"-storageNode={storageName}",
+                            ],
+                            "ports": [{ "containerPort": insertPort }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort):
+    (build / f'{storageFileName}_2deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName },
+            "spec": {
+                "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": storageName },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }],
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmstorage",
+                            "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster",
+                            "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={pipelineWebRoot}/vmstorage/",
+                                tzArg,
+                                f"-retentionPeriod={retention}",
+                                f"-storageDataPath=/data/{pipelineName}",
+                            ],
+                            "ports": [
+                                { "containerPort": 8482, "name": "http" },
+                                { "containerPort": storageInsertPort, "name": "vminsert" },
+                                { "containerPort": storageSelectPort, "name": "vmselect" },
+                            ],
+                            "volumeMounts": [{ "name": "data", "mountPath": "/data" }]
+                        }],
+                        "affinity": affinityToNode(localPvHost)
+                    }
+                }
+            }
+        })) # yapf: disable
+
+
+def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort):
+    name = f"{objPrefix}-vmselect"
+    (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+            "spec": {
+                "replicas": 1,
+                "strategy": { "type": "Recreate" },
+                "selector": { "matchLabels": { "app": name } },
+                "template": {
+                    "metadata": {
+                        "labels": { "app": name },
+                        "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" }
+                    },
+                    "spec": {
+                        "serviceAccountName": "victoriametrics",
+                        "containers": [{
+                            "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f"-http.pathPrefix={webRoot}/vmselect/",
+                                tzArg,
+                            ] + [f"-storageNode={n}" for n in storageSvcs],
+                            "ports": [{ "containerPort": selectPort }]
+                        }]
+                    }
+                }
+            }
+        })) # yapf: disable
+
+def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention):
+    agentName = f"{objPrefix}-{pipelineName}-vmagent"
+    insertName = f"{objPrefix}-{pipelineName}-vminsert"
+    storageName = f"{objPrefix}-{pipelineName}-vmstorage"
+
+    agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent"
+    insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert"
+    storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage"
+
+    localPvHost = "ditto"
+    insertPort = 8480
+    agentPort = 8429
+    storageInsertPort = 8400
+    storageSelectPort = 8401
+    volName = f"{objPrefix}-data-{pipelineName}"
+    request = "50Gi"
+    pipelineWebRoot = f'{webRoot}/{pipelineName}'
+
+    createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix)
+    createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort)
+    createPv(storageFileName, volName, request)
+    createPvc(storageFileName, volName, request)
+    createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort)
+
+    createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}])
+    createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}])
+    createSvc(storageFileName,storageName, [
+        {"port": 80, "targetPort": "http", "name": "http"},
+        {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"},
+        {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"},
+        ]) # yapf: disable
+
+    return storageName
+
+
+def createIndex(objPrefix, webRoot, html):
+    name = f'{objPrefix}-index'
+    httpServeRoot = '/opt/html'
+
+    (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({
+        "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name },
+        "data": {
+            "index.html": html,
+            "index.js": Path("index.js").read_text(),
+            "index.css": Path("index.css").read_text(),
+        }
+    })) # yapf: disable
+
+    (build / f'{objPrefix}-3index_deploy.yaml').write_text(
+        toJson({
+            "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name },
+            "spec": {
+                "replicas": 1,
+                "selector": { "matchLabels": { "app": name } },
+                "template": {
+                    "metadata": { "labels": { "app": name } },
+                    "spec": {
+                        "containers": [{
+                            "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent",
+                            "args": [
+                                f'--root={httpServeRoot}',
+                                '--directory-listing=true',
+                                '--experimental-metrics=true',
+                            ],
+                            "ports": [{ "containerPort": 80 }],
+                            "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }]
+                        }],
+                        "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }]
+                    }
+                }
+            }
+        })) # yapf: disable
+    createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}])
+
+
+def main():
+    tzArg = "-loggerTimezone=America/Los_Angeles"
+    objPrefix = "next-victoriametrics"  # prefix on all k8s object names
+    webRoot = "/m/next"
+    vmVersion = "v1.100.1"
+    webHost = 'bigasterisk.com'
+    pipelines = [
+        ('forever', '100y'),
+        ('recent', '90y'),
+    ]
+    storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines]
+
+    selectPort = 8481
+    createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort)
+    createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}])
+
+    ingressPaths = [
+        { "pathType": "Prefix", "path": f"{webRoot}/",          "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } },
+        { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } },
+    ]  # yapf: disable
+    for p, _ in pipelines:
+        ingressPaths.extend([
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/",   "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent",   "port": { "number": 80 } } } },
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/",  "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert",  "port": { "number": 80 } } } },
+            { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } },
+        ]) # yapf: disable
+
+    policy = """\
+allow:
+    or: 
+        - { email: { is: "drewpca@gmail.com" }}
+        - { email: { is: "kelsimp@gmail.com" }}
+    """
+    createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost)
+    createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost))
+
+
+main()
+
+# in vmui, set server url to
+# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
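createPv, createPvc, createSvc, and createIngress all come from next/output.py, which this changeset only touches for the build path. Based on the call sites above, createSvc presumably pairs each Deployment with a Service selecting the same app label; a minimal sketch under that assumption (the _svc.yaml filename suffix is a guess):

    # sketch of what next/output.py's createSvc likely does, mirroring the
    # *_deploy.yaml writers above; uses output.py's `build` and `toJson`
    def createSvc(fileBase, name, ports):
        (build / f'{fileBase}_svc.yaml').write_text(
            toJson({
                "apiVersion": "v1", "kind": "Service", "metadata": { "name": name },
                "spec": {
                    "selector": { "app": name },  # matches the Deployment pod label
                    "ports": ports,  # e.g. [{ "port": 80, "targetPort": agentPort }]
                }
            }))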
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/create_scrape_configs.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,104 @@
+from pathlib import Path
+
+from scrape_job import jobConfig, scrape_deployments, writeJobConfigs
+import private
+
+
+
+
+# previously this used `kubernetes_sd_configs: [{ role: node }]`
+all_hosts = [
+    'dash',
+    'ditto',
+    # 'ws-printer',
+    #todo:
+]
+
+smartctl_hosts = [
+    # ideally, all nodes with disks, but many turn off and on
+    'dash',
+    'ditto',
+]
+
+ping_hosts = [
+    # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1
+    'printer',
+    # wireguard connection test
+    'prime5',
+    # after pyinfra or reboot, seems to lose wg0 address
+    'garage5',
+]
+
+deploy_doesnt_serve_metrics = [
+    'apprise',
+    'bitwarden',
+    'digi-files',
+    'digi-pose-predict',
+    'digi-tts-mimic',
+    'dovecot',
+    'front-door-display',
+    'hass',
+    'homepage',
+    'itch150',
+    'kallithea',
+    'kube-web-view',
+    'magma',
+    'megasecond',
+    'minecraft-build-world',
+    'minecraft-lake-world',
+    'minecraft-smp-world',
+    'mongodb',
+    'mqtt1',
+    'mqtt2',
+    'nodered',
+    'photoprism',
+    'plik',
+    'projects',
+    'registry',
+    'registry-ui',
+    'speakerphone',
+    'video',
+    'video-files',
+    'zigbee2mqtt',
+]
+
+forever_jobs = [
+    jobConfig(name='maildir-count',        targets=['prime:2500']),
+    jobConfig(name='mongodb',              targets=['mongodb:9216']),
+    jobConfig(name='net-traffic',          targets=['pipe:8080']),
+    jobConfig(name='ping',                 targets=ping_hosts,              scrape_interval='2m', ping_job=True),
+    jobConfig(name='power-eagle',          targets=['power-eagle:80'],      scrape_interval='8s'),  # from powerEagle/private_config.periodSec
+    jobConfig(name='powermeter-exporter',  targets=['powermeter-exporter'], scrape_interval='10s'),
+    jobConfig(name='smartctl',             targets=[f'{h}:9633' for h in smartctl_hosts]),
+    jobConfig(name='wifi',                 targets=['wifi:80']),
+    jobConfig(name='zfs-exporter',         targets=['ditto:9634']),
+    jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']),
+    jobConfig(name='zpool-exporter',       targets=['ditto:9986']),
+    jobConfig(name='octoprint',            targets=['octoprint'],
+              metrics_path='/plugin/prometheus_exporter/metrics',
+              params={'apikey' : [private.octoprint_apikey]},
+              ),
+]  # yapf: disable
+
+recent_jobs = [
+    jobConfig( name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
+    jobConfig( name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
+    jobConfig( name="net-routes",  targets=['pipe:9999']),
+    jobConfig( name="net-traffic", targets=['pipe:8080']),
+    jobConfig( name="dnsmasq-log", targets=['pipe:9991']),
+    jobConfig(
+        name="racc",
+        scrape_interval='30s',
+        targets=[
+            # - dash:5150
+            # - dot:5150
+            # - squib:5150
+            # - ashermac:5150
+        ],
+    ),
+]  # yapf: disable
+recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs))
+
+top = Path('build/scrape_config')
+writeJobConfigs(top, forever_jobs, 'forever')
+writeJobConfigs(top, recent_jobs, 'recent')
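For a concrete picture of what this writes: jobConfig() (defined in next/scrape_job.py below) turns the zpool-exporter entry into build/scrape_config/job_zpool-exporter.yaml via json.dumps(..., indent=2, sort_keys=True), and since JSON is a subset of YAML, vmagent reads it unchanged:

    [
      {
        "job_name": "zpool-exporter",
        "relabel_configs": [
          {
            "replacement": "default",
            "target_label": "namespace"
          },
          {
            "source_labels": [
              "__meta_kubernetes_pod_node_name"
            ],
            "target_label": "node"
          }
        ],
        "static_configs": [
          {
            "targets": [
              "ditto:9986"
            ]
          }
        ]
      }
    ]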
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/k8s_ops.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,50 @@
+import json
+import time
+
+from kubernetes import client
+
+
+def refreshPodCmaps(pod_name, namespace="default"):
+    """
+    Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ it can take a while
+    for k8s to update the CM volume that a pod sees. The workaround is to edit the pod annotations.
+    """
+    api_instance = client.CoreV1Api()
+
+    pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace)
+    if pod.metadata.annotations is None:
+        pod.metadata.annotations = {}
+    pod.metadata.annotations["force-configmap-update"] = str(time.time())
+    api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod)
+
+
+def firstPodName(selector):
+    api_instance = client.CoreV1Api()
+    pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector)
+    return pod_list.items[0].metadata.name
+
+
+def hup(ctx, deployment, process_name):
+    ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}")
+
+
+def replaceCmap(name, dataObj):
+    api_instance = client.CoreV1Api()
+
+    data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items())
+
+    try:
+        existing_config_map = api_instance.read_namespaced_config_map(name, 'default')
+        existing_config_map.data.update(data)
+        api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map)
+    except client.rest.ApiException as e:
+        if e.status == 404:
+            config_map = client.V1ConfigMap()
+            config_map.metadata = client.V1ObjectMeta(name=name)
+            config_map.data = data
+            api_response = api_instance.create_namespaced_config_map('default', config_map)
+        else:
+            raise
+
+    print(f"{name} resource_version is now {api_response.metadata.resource_version}")
--- a/next/output.py	Thu May 02 18:35:46 2024 -0700
+++ b/next/output.py	Thu May 02 20:33:29 2024 -0700
@@ -1,7 +1,8 @@
 import json
 from pathlib import Path
 
-build = Path('build')
+build = Path('build/k8s_config')
+build.mkdir(parents=True, exist_ok=True)
 
 
 def toJson(d):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/roles.yaml	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,43 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: victoriametrics
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - nodes/metrics
+  - nodes/proxy
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups:
+  - extensions
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: victoriametrics
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: victoriametrics
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: victoriametrics
+subjects:
+- kind: ServiceAccount
+  name: victoriametrics
+  namespace: default
+# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account
+# - kind: ServiceAccount
+#   name: default
+#   namespace: default
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/scrape_job.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,88 @@
+import json
+from pathlib import Path
+import subprocess
+
+
+def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None):
+    """one scrape job config"""
+    ret = {
+        "job_name": name,
+        "static_configs": [{
+            "targets": targets,
+        }],
+        "relabel_configs": [
+            {
+                "target_label": "namespace",
+                "replacement": "default"
+            },
+            {
+                "source_labels": ["__meta_kubernetes_pod_node_name"],
+                "target_label": "node"
+            },
+        ]
+    }
+
+    if metrics_path:
+        ret['metrics_path'] = metrics_path
+
+    if scrape_interval:
+        ret['scrape_interval'] = scrape_interval
+
+    if params:
+        ret['params'] = params
+
+    if ping_job:
+        ret['metrics_path'] = '/probe'
+        ret['params'] = {'module': ['icmp']}
+        ret["relabel_configs"] = [
+            {
+                "source_labels": ["__address__"],
+                "target_label": "__param_target"
+            },
+            {
+                "source_labels": ["__param_target"],
+                "target_label": "instance"
+            },
+            {
+                "target_label": "__address__",
+                "replacement": "prober"
+            },
+        ]
+
+    return ret
+
+
+def current_deployments():
+    deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json']))
+    for deploy in deploys['items']:
+        name = deploy['metadata']['name']
+        yield name
+
+
+def scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs):
+    ret = []
+    for name in current_deployments():
+        if name in deploy_doesnt_serve_metrics:
+            continue
+        if name in [j['job_name'] for j in forever_jobs]:
+            continue
+        targets = [name]
+        ret.append(jobConfig(name=name, targets=targets))
+    return ret
+
+
+def writeJobConfigs(outDir: Path, jobConfs: list, retention: str):
+    outDir.mkdir(exist_ok=True, parents=True)
+    filenames_written = []
+    for job in jobConfs:
+        filename = f'job_{job["job_name"]}.yaml'
+        (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True))
+        filenames_written.append(filename)
+
+    (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({
+        "global": {
+            "scrape_interval": "1m",
+            "scrape_timeout": "10s"
+        },
+        "scrape_config_files": filenames_written,
+    }, indent=2))
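Alongside the per-job files, writeJobConfigs() emits one index per retention class. With the forever_jobs defined in next/create_scrape_configs.py, build/scrape_config/scrape_forever.yaml comes out as:

    {
      "global": {
        "scrape_interval": "1m",
        "scrape_timeout": "10s"
      },
      "scrape_config_files": [
        "job_maildir-count.yaml",
        "job_mongodb.yaml",
        "job_net-traffic.yaml",
        "job_ping.yaml",
        "job_power-eagle.yaml",
        "job_powermeter-exporter.yaml",
        "job_smartctl.yaml",
        "job_wifi.yaml",
        "job_zfs-exporter.yaml",
        "job_zigbee2mqtt-exporter.yaml",
        "job_zpool-exporter.yaml",
        "job_octoprint.yaml"
      ]
    }

The filenames are relative; the assumption here is that vmagent resolves scrape_config_files against the directory of the -promscrape.config file, i.e. /local/config in the vmagent deploy.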
--- a/next/skaffold.yaml	Thu May 02 18:35:46 2024 -0700
+++ b/next/skaffold.yaml	Thu May 02 20:33:29 2024 -0700
@@ -4,6 +4,7 @@
   name: victoriametrics
 manifests:
   rawYaml:
-    - build/*.yaml
+    - roles.yaml
+    - build/k8s_config/*.yaml
 deploy:
   kubectl: {}
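This splits the manifests into two sources: roles.yaml is applied straight from the source tree, while everything generated lands in build/k8s_config/. Assuming the generated files are current (create_k8s.py executes main() when run), the stack then deploys with the usual:

    skaffold run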
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/next/tasks.py	Thu May 02 20:33:29 2024 -0700
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import yaml
+from invoke import task
+from kubernetes import config
+
+import alert_rules
+from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
+
+config.load_kube_config()
+
+
+def scrapeConfig(fn):
+    return yaml.load(open(fn), yaml.FullLoader)
+
+
+@task
+def push_config_2024(ctx):
+    # plan:
+    #   every discovered service may:
+    #      - be described here as a forever retention - ignore the discovery
+    #      - be blocked here as a no-metrics service - ignore the discovery
+    #      - be scraped as 'recent', with possible overrides of port/path
+    #   all per-node metrics shall be 'recent' (oops, not smartctl!)
+    map: dict[str, object] = {
+        'rules': alert_rules.allRules(ctx),
+    }
+    top = Path('build/scrape_config')
+    for p in top.glob('**/*.yaml'):
+        map[str(p.relative_to(top))] = scrapeConfig(p)
+    replaceCmap("next-victoriametrics-config", map)
+    refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))
+    refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent"))
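invoke exposes underscored task names with dashes by default, so this runs as:

    pdm run invoke push-config-2024

Note that the glob keys become flat filenames in the ConfigMap (str(p.relative_to(top))), which matches the flat layout writeJobConfigs now produces; nested paths would not be legal ConfigMap keys.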
--- a/tasks.py	Thu May 02 18:35:46 2024 -0700
+++ b/tasks.py	Thu May 02 20:33:29 2024 -0700
@@ -4,6 +4,7 @@
 from kubernetes import config
 
 import alert_rules
+
 from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap
 
 config.load_kube_config()