# HG changeset patch # User drewp@bigasterisk.com # Date 1714707209 25200 # Node ID 8134cd480817df598165b5b18212d38bda52a509 # Parent fb0519859645b08007a302bbaa4e24fee4854735 make next/ a complete standalone setup dir- no deps on ./ diff -r fb0519859645 -r 8134cd480817 config/create_scrape_configs.py --- a/config/create_scrape_configs.py Thu May 02 18:35:46 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -from pathlib import Path - -from scrape_job import jobConfig, scrape_deployments, writeJobConfigs -import private - - - - -# previously this used `kubernetes_sd_configs: [{ role: node }]` -all_hosts = [ - 'dash', - 'ditto', - # 'ws-printer', - #todo: -] - -smartctl_hosts = [ - # ideally, all nodes with disks, but many turn off and on - 'dash', - 'ditto', -] - -ping_hosts = [ - # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://printer/general/status.html?pageid=1 - 'printer', - # wireguard connection test - 'prime5', - # after pyinfra or reboot, seems to lose wg0 address - 'garage5', -] - -deploy_doesnt_serve_metrics = [ - 'apprise', - 'bitwarden', - 'digi-files', - 'digi-pose-predict', - 'digi-tts-mimic', - 'dovecot', - 'front-door-display', - 'hass', - 'homepage', - 'itch150', - 'kallithea', - 'kube-web-view', - 'magma', - 'megasecond', - 'minecraft-build-world', - 'minecraft-lake-world', - 'minecraft-smp-world', - 'mongodb', - 'mqtt1', - 'mqtt2', - 'nodered', - 'photoprism', - 'plik', - 'projects', - 'registry', - 'registry-ui', - 'speakerphone', - 'video', - 'video-files', - 'zigbee2mqtt', -] - -forever_jobs = [ - jobConfig(name='maildir-count', targets=['prime:2500']), - jobConfig(name='mongodb', targets=['mongodb:9216']), - jobConfig(name='net-traffic', targets=['pipe:8080']), - jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True), - jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec - jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'), - jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]), - jobConfig(name='wifi', targets=['wifi:80']), - jobConfig(name='zfs-exporter', targets=['ditto:9634']), - jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']), - jobConfig(name='zpool-exporter', targets=['ditto:9986']), - jobConfig(name='octoprint', targets=['octoprint'], - metrics_path='/plugin/prometheus_exporter/metrics', - params={'apikey' : [private.octoprint_apikey]}, - ), -] # yapf: disable - -recent_jobs = [ - jobConfig( name="telegraf", targets=[f'{h}:9273' for h in all_hosts]), - jobConfig( name="filebeat", targets=[f'{h}:5067' for h in all_hosts]), - jobConfig( name="net-routes", targets=['pipe:9999']), - jobConfig( name="net-traffic", targets=['pipe:8080']), - jobConfig( name="dnsmasq-log", targets=['pipe:9991']), - jobConfig( - name="racc", - scrape_interval='30s', - targets=[ - # - dash:5150 - # - dot:5150 - # - squib:5150 - # - ashermac:5150 - ], - ), -] # yapf: disable -recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs)) - -writeJobConfigs(Path('build/scrape_jobs'), forever_jobs, 'forever') -writeJobConfigs(Path('build/scrape_jobs'), recent_jobs, 'recent') diff -r fb0519859645 -r 8134cd480817 config/scrape_job.py --- a/config/scrape_job.py Thu May 02 18:35:46 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -import json -from pathlib import Path -import subprocess - - 
-def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None): - """one scrape job config""" - ret = { - "job_name": name, - "static_configs": [{ - "targets": targets, - }], - "relabel_configs": [ - { - "target_label": "namespace", - "replacement": "default" - }, - { - "source_labels": ["__meta_kubernetes_pod_node_name"], - "target_label": "node" - }, - ] - } - - if metrics_path: - ret['metrics_path'] = metrics_path - - if scrape_interval: - ret['scrape_interval'] = scrape_interval - - if params: - ret['params'] = params - - if ping_job: - ret['metrics_path'] = '/probe' - ret['params'] = {'module': ['icmp']} - ret["relabel_configs"] = [ - { - "source_labels": ["__address__"], - "target_label": "__param_target" - }, - { - "source_labels": ["__param_target"], - "target_label": "instance" - }, - { - "target_label": "__address__", - "replacement": "prober" - }, - ] - - return ret - - -def current_deployments(): - deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json'])) - for deploy in deploys['items']: - name = deploy['metadata']['name'] - yield name - - -def scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs): - ret = [] - for name in current_deployments(): - if name in deploy_doesnt_serve_metrics: - continue - if name in [j['job_name'] for j in forever_jobs]: - continue - targets = [name] - ret.append(jobConfig(name=name, targets=targets)) - return ret - - -def writeJobConfigs(outDir: Path, jobConfs: list, retention: str): - (outDir / retention).mkdir(exist_ok=True, parents=True) - filenames_written = [] - for job in jobConfs: - filename = f'job_{job["job_name"]}.yaml' - (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True)) - filenames_written.append(filename) - - (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({ - "global": { - "scrape_interval": "1m", - "scrape_timeout": "10s" - }, - "scrape_config_files": filenames_written, - }, indent=2)) diff -r fb0519859645 -r 8134cd480817 next/alert_rules.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/alert_rules.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,433 @@ +""" +pdm run invoke push-config + +docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +"Whenever the alert expression results in one or more vector +elements at a given point in time, the alert counts as active for +these elements' label sets." +also https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/#Missing-metrics + +""" + +import json + + +def pomRules(): + return [ + { + "alert": "frequent_upstream_connect_failures", + "expr": "max_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_upstream_cx_connect_fail))[1h]) > 0" + }, + { + "alert": "high_logging_pomerium", + "for": "3h", + "labels": { + "severity": "waste" + }, + "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container="pomerium"}[3h])) > 8k', + "annotations": { + "summary": "high log output rate" + }, + }, + ] + + +def k8sRules(): + # from https://awesome-prometheus-alerts.grep.to/rules.html + return [ + { + "alert": "metricsTargetMissing", + "expr": 'up{job!~"cm-acme-.*"} == 0', + 'for': '10m', + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "metrics target missing (instance {{ $labels.instance }})", + "description": "A metrics target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesMemoryPressure", + "expr": 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes memory pressure (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesDiskPressure", + "expr": 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes disk pressure (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesOutOfDisk", + "expr": 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1', + "for": "2m", + "labels": { + "severity": "critical" + }, + "annotations": { + "summary": "Kubernetes out of disk (instance {{ $labels.instance }})", + "description": "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesJobFailed", + "expr": "kube_job_status_failed > 0", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes Job failed (instance {{ $labels.instance }})", + "description": "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesPodCrashLooping", + "expr": "increase(kube_pod_container_status_restarts_total[1m]) > 3", + "for": "2m", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes pod crash looping (instance {{ $labels.instance }})", + "description": "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}", + }, + }, + { + "alert": "KubernetesClientCertificateExpiresNextWeek", + "expr": 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60', + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "Kubernetes client certificate expires next week (instance {{ $labels.instance }})", + "description": "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}", + }, + }, + { + "alert": "container_waiting", + "expr": "sum by (namespace, pod, container)(kube_pod_container_status_waiting!=0)", + "annotations": { + "description": '', + "dashboard": "https://bigasterisk.com/k/clusters/local/namespaces/{{ $labels.namespace }}/pods/{{ $labels.pod }}", + }, + "for": "2m", + }, + ] + + +def allRules(ctx): + return { + "groups": [ + { + "name": "k8s", + "interval": "1m", + "rules": k8sRules(), + }, + { + "name": "pomerium_proxy", + "interval": "1m", + "rules": pomRules(), + }, + { + "name": + "Outages", + "interval": + "1m", + "rules": [ + { + "alert": "powereagleStalled", + "expr": "rate(house_power_w[100m]) == 0", + "for": "0m", + "labels": { + "severity": "losingData" + }, + "annotations": { + "summary": "power eagle data stalled", + "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", + }, + }, + { + "alert": "powereagleAbsent", + "expr": "absent_over_time(house_power_w[5m])", + "for": "2m", + "labels": { + "severity": "losingData" + }, + "annotations": { + 
"summary": "power eagle data missing", + "description": "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs", + }, + }, + { + "alert": "absent_zigbee", + "expr": 'absent(container_last_seen{container="zigbee2mqtt"})', + }, + { + "alert": "net_routes_sync", + "expr": 'rate(starlette_request_duration_seconds_count{app_name="net_routes",path="/routes"}[5m]) < 1/70', + "for": "10m", + "labels": { + "severity": "houseUsersAffected" + }, + "annotations": { + "summary": "net_routes is not getting regular updates" + }, + }, + ], + }, + { + "name": "disk_errs", + "interval": "2d", + "rules": [{ + "alert": "zpool_device_error_increase", + "labels": { + "severity": "warning" + }, + "expr": 'increase(zpool_device_error_count[3d]) > 0', + }, { + "alert": "zpool_device_error_count", + "labels": { + "severity": "warning" + }, + "expr": 'zpool_device_error_count > 0', + }], + }, + { + "name": "lighting", + "interval": "5m", + "rules": [{ + "alert": "light_bridge_no_mqtt", + "expr": 'mqtt_connected{job="light-bridge"} != 1', + }], + }, + { + "name": + "front_door", + "interval": + "5m", + "rules": [ + { + "alert": "front_door_reader_esp32_no_mqtt", + 'expr': 'hw_connected{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_reader_svc_down", + 'expr': 'up{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_reader_svc_reader_no_mqtt", + 'expr': 'mqtt_connected{job="fingerprint"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_lock_svc_down", + 'expr': 'up{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_lock_svc_no_mqtt", + 'expr': 'mqtt_connected{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + { + "alert": "front_door_lock_esp32_no_mqtt", + 'expr': 'hw_connected{job="front-door-lock"} < 1', + "annotations": { + "summary": "see https://bigasterisk.com/front-door-lock/" + }, + }, + ], + }, + { + "name": + "net_routes", + "interval": + "5m", + "rules": [ + { + "alert": "no_house_ip_service", + "expr": 'absent(kube_service_spec_external_ip{service="net-route-input-allowed",external_ip="10.2.0.133"})' + }, + { + "alert": "no_net_routes_running", + "expr": 'absent(python_info{job="net-routes"})' + }, + { + "alert": "allowed_check_never_returned_200", + 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="200"} < 1' + }, + { + "alert": "allowed_check_never_returned_403", + 'expr': 'starlette_requests_total{app_name="net_route_input",method="GET",path="/api/allowed",status_code="403"} < 1' + }, + { + 'alert': 'net_route_input_eval_cal_loop_is_down', + 'expr': 'eval_cal_up!=1' + }, + { + 'alert': 'net_route_input_mongo_loop_is_down', + 'expr': 'mongo_to_net_routes_up!=1' + }, + { + 'alert': 'gcalendarwatch_hasnt_succeeded_on_any_currentEvents_requests', + 'expr': 'starlette_requests_total{app_name="gcalendarwatch",method="GET",path="/graph/currentEvents",status_code="200"} < 1' + }, + { + 'alert': 'gcalendarwatch_current_events_loop_is_down', + 'expr': 'current_events_up != 1' + }, + ], + }, + { + "name": "http", + "interval": "1h", + 'rules': [ + { + 'alert': 'old_https_certs', + 'expr': 'min by (source) 
(x509_cert_enddate - now())/86400 < 15', + }, + { + 'alert': 'high_500_response_rate', + 'expr': 'avg_over_time(rate(sum by (envoy_cluster_name) (envoy_cluster_internal_upstream_rq_xx{envoy_response_code_class="5"})[20m])) > 0.02', + }, + ], + }, + { + "name": "ping", + "interval": "1m", + "rules": [{ + "alert": "ping_failed", + "expr": 'max_over_time(probe_success{job="ping"}[1m]) < 1', + }] + }, + { + "name": + "alerts", + "rules": [ + { + "alert": "kube_node_status_bad_condition", + "for": "2h", + "labels": { + "severity": "warning" + }, + "expr": 'kube_node_status_condition{condition=~".*Pressure",status="true"} > 0', + }, + { + "alert": "housePower", + "for": "1h", + "labels": { + "severity": "waste" + }, + "expr": "house_power_w > 4000", + "annotations": { + "summary": "house power usage over 4KW" + }, + }, + { + "alert": "host_root_fs_space_low", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'disk_free{host!="garage",path="/"} < 20G', + }, + { + "alert": "zpool_space_low", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'last_over_time(zfs_pool_free_bytes{pool="stor7"}[1h]) < 100G', + }, + { + "alert": "disk_week_incr", + "for": "20m", + "labels": { + "severity": "warning" + }, + "expr": 'round(increase(disk_used{path=~"/my/.*"}[1d])/1M) > 5000', + "annotations": { + "summary": "high mb/week on zfs dir" + }, + }, + { + "alert": "high_logging", + "for": "3h", + "labels": { + "severity": "waste" + }, + "expr": 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes{container!="pomerium"}[3h])) > 4k', + "annotations": { + "summary": "high log output rate" + }, + }, + { + "alert": "stale_process", + "for": "1d", + "labels": { + "severity": "dataRisk" + }, + "expr": "round((time() - filestat_modification_time/1e9) / 86400) > 14", + "annotations": { + "summary": "process time is old" + }, + }, + { + "alert": "starlette", + "for": "1m", + "labels": { + "severity": "fix" + }, + "expr": 'starlette_request_duration_seconds_created{app_name="starlette"}', + "annotations": { + "summary": "set starlette app name" + }, + }, + { + "alert": "ssl_certs_expiring_soon", + "expr": "min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10", + "labels": { + "severity": "warning" + }, + "annotations": { + "summary": "cert expiring soon. 
See https://bigasterisk.com/grafana/d/z1YtDa3Gz/certs?orgId=1\nVALUE = {{ $value }}" + }, + }, + ], + }, + ] + hostsExpectedOnline(ctx)['groups'] + } + + +def _runJson(ctx, cmd): + return json.loads(ctx.run(cmd, hide="stdout").stdout) + + +def hostsExpectedOnline(ctx): + return _runJson(ctx, "cd /my/serv/lanscape; pdm run python hosts_expected_online.py") diff -r fb0519859645 -r 8134cd480817 next/create_all.py --- a/next/create_all.py Thu May 02 18:35:46 2024 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,255 +0,0 @@ -from pathlib import Path -from index_page import makeIndexHtml -from output import affinityToNode, build, createIngress, createPv, createPvc, toJson, createSvc - - -def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix): - (build / f'{agentFileName}_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } }, - "template": { - "metadata": { - "labels": { "app": agentName }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }], - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmagent", - "image": f"docker.io/victoriametrics/vmagent:{vmVersion}", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vmagent/", - tzArg, - f"-promscrape.config=/local/config/{scrapeMapKey}", - "-promscrape.configCheckInterval=5s", - "-sortLabels", - f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write", - "-remoteWrite.showURL", - ], - "ports": [{ "containerPort": agentPort }], - "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }] - }] - } - } - } - })) # yapf: disable - - -def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort): - (build / f'{insertFileName}_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } }, - "template": { - "metadata": { - "labels": { "app": insertName }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vminsert", - "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vminsert/", - tzArg, - f"-storageNode={storageName}", - ], - "ports": [{ "containerPort": insertPort }] - }] - } - } - } - })) # yapf: disable - - -def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort): - (build / f'{storageFileName}_2deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName }, - "spec": { - "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } }, - "template": { - "metadata": { - "labels": { "app": storageName }, 
- "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }], - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmstorage", - "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster", - "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={pipelineWebRoot}/vmstorage/", - tzArg, - f"-retentionPeriod={retention}", - f"-storageDataPath=/data/{pipelineName}", - ], - "ports": [ - { "containerPort": 8482, "name": "http" }, - { "containerPort": storageInsertPort, "name": "vminsert" }, - { "containerPort": storageSelectPort, "name": "vmselect" }, - ], - "volumeMounts": [{ "name": "data", "mountPath": "/data" }] - }], - "affinity": affinityToNode(localPvHost) - } - } - } - })) # yapf: disable - - -def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort): - name = f"{objPrefix}-vmselect" - (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, - "spec": { - "replicas": 1, - "strategy": { "type": "Recreate" }, - "selector": { "matchLabels": { "app": name } }, - "template": { - "metadata": { - "labels": { "app": name }, - "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } - }, - "spec": { - "serviceAccountName": "victoriametrics", - "containers": [{ - "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent", - "args": [ - f"-http.pathPrefix={webRoot}/vmselect/", - tzArg, - ] + [f"-storageNode={n}" for n in storageSvcs], - "ports": [{ "containerPort": selectPort }] - }] - } - } - } - })) # yapf: disable - -def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention): - agentName = f"{objPrefix}-{pipelineName}-vmagent" - insertName = f"{objPrefix}-{pipelineName}-vminsert" - storageName = f"{objPrefix}-{pipelineName}-vmstorage" - - agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent" - insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert" - storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage" - - localPvHost = "ditto" - insertPort = 8480 - agentPort = 8429 - storageInsertPort = 8400 - storageSelectPort = 8401 - volName = f"{objPrefix}-data-{pipelineName}" - request = "50Gi" - pipelineWebRoot = f'{webRoot}/{pipelineName}' - - createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix) - createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort) - createPv(storageFileName, volName, request) - createPvc(storageFileName, volName, request) - createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort) - - createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}]) - createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}]) - createSvc(storageFileName,storageName, [ - {"port": 80, "targetPort": "http", "name": "http"}, - {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"}, - {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"}, - ]) # yapf: disable - - return storageName - - -def 
createIndex(objPrefix, webRoot, html): - name = f'{objPrefix}-index' - httpServeRoot = '/opt/html' - - (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({ - "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name }, - "data": { - "index.html": html, - "index.js": Path("index.js").read_text(), - "index.css": Path("index.css").read_text(), - } - })) # yapf: disable - - (build / f'{objPrefix}-3index_deploy.yaml').write_text( - toJson({ - "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, - "spec": { - "replicas": 1, - "selector": { "matchLabels": { "app": name } }, - "template": { - "metadata": { "labels": { "app": name } }, - "spec": { - "containers": [{ - "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent", - "args": [ - f'--root={httpServeRoot}', - '--directory-listing=true', - '--experimental-metrics=true', - ], - "ports": [{ "containerPort": 80 }], - "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }] - }], - "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }] - } - } - } - })) # yapf: disable - createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}]) - - -def main(): - tzArg = "-loggerTimezone=America/Los_Angeles" - objPrefix = "next-victoriametrics" # prefix on all k8s object names - webRoot = "/m/next" - vmVersion = "v1.100.1" - webHost = 'bigasterisk.com' - pipelines = [ - ('forever', '100y'), - ('recent', '90y'), - ] - storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines] - - selectPort = 8481 - createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort) - createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}]) - - ingressPaths = [ - { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } }, - ] # yapf: disable - for p, _ in pipelines: - ingressPaths.extend([ - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } }, - { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } }, - ]) # yapf: disable - - policy = """\ -allow: - or: - - { email: { is: "drewpca@gmail.com" }} - - { email: { is: "kelsimp@gmail.com" }} - """ - createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost) - createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost)) - # this should also emit a static html page and web server deploy that serves at webRoot and has a map of everything - - -main() - -# in vmui, set server url to -# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus diff -r fb0519859645 -r 8134cd480817 next/create_k8s.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/create_k8s.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,254 @@ +from pathlib import Path +from index_page import makeIndexHtml +from output import affinityToNode, build, 
createIngress, createPv, createPvc, toJson, createSvc + + +def createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix): + (build / f'{agentFileName}_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": agentName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": agentName } }, + "template": { + "metadata": { + "labels": { "app": agentName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": f"{pipelineWebRoot}/vmagent/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "volumes": [{ "name": "config", "configMap": { "name": f"{objPrefix}-config" } }], + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmagent", + "image": f"docker.io/victoriametrics/vmagent:{vmVersion}", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vmagent/", + tzArg, + f"-promscrape.config=/local/config/{scrapeMapKey}", + "-promscrape.configCheckInterval=5s", + "-sortLabels", + f"-remoteWrite.url=http://{insertName}{pipelineWebRoot}/vminsert/insert/0/prometheus/api/v1/write", + "-remoteWrite.showURL", + ], + "ports": [{ "containerPort": agentPort }], + "volumeMounts": [{ "name": "config", "mountPath": "/local/config" }] + }] + } + } + } + })) # yapf: disable + + +def createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort): + (build / f'{insertFileName}_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": insertName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": insertName } }, + "template": { + "metadata": { + "labels": { "app": insertName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vminsert", + "image": f"docker.io/victoriametrics/vminsert:{vmVersion}-cluster", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vminsert/", + tzArg, + f"-storageNode={storageName}", + ], + "ports": [{ "containerPort": insertPort }] + }] + } + } + } + })) # yapf: disable + + +def createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort): + (build / f'{storageFileName}_2deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": storageName }, + "spec": { + "replicas": 1, "strategy": { "type": "Recreate" }, "selector": { "matchLabels": { "app": storageName } }, + "template": { + "metadata": { + "labels": { "app": storageName }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/vmstorage/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "volumes": [{ "name": "data", "persistentVolumeClaim": { "claimName": volName } }], + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmstorage", + "image": f"docker.io/victoriametrics/vmstorage:{vmVersion}-cluster", + "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={pipelineWebRoot}/vmstorage/", + tzArg, + f"-retentionPeriod={retention}", + f"-storageDataPath=/data/{pipelineName}", + ], + "ports": [ + { "containerPort": 8482, "name": "http" }, + { 
"containerPort": storageInsertPort, "name": "vminsert" }, + { "containerPort": storageSelectPort, "name": "vmselect" }, + ], + "volumeMounts": [{ "name": "data", "mountPath": "/data" }] + }], + "affinity": affinityToNode(localPvHost) + } + } + } + })) # yapf: disable + + +def createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort): + name = f"{objPrefix}-vmselect" + (build / f'{objPrefix}-1vmselect_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, + "spec": { + "replicas": 1, + "strategy": { "type": "Recreate" }, + "selector": { "matchLabels": { "app": name } }, + "template": { + "metadata": { + "labels": { "app": name }, + "annotations": { "prometheus.io/scrape": "true", "prometheus.io/path": "/m/metrics", "prometheus.io/port": "80" } + }, + "spec": { + "serviceAccountName": "victoriametrics", + "containers": [{ + "name": "vmselect", "image": f"docker.io/victoriametrics/vmselect:{vmVersion}-cluster", "imagePullPolicy": "IfNotPresent", + "args": [ + f"-http.pathPrefix={webRoot}/vmselect/", + tzArg, + ] + [f"-storageNode={n}" for n in storageSvcs], + "ports": [{ "containerPort": selectPort }] + }] + } + } + } + })) # yapf: disable + +def createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, pipelineName, scrapeMapKey, retention): + agentName = f"{objPrefix}-{pipelineName}-vmagent" + insertName = f"{objPrefix}-{pipelineName}-vminsert" + storageName = f"{objPrefix}-{pipelineName}-vmstorage" + + agentFileName = f"{objPrefix}-0{pipelineName}-0vmagent" + insertFileName = f"{objPrefix}-0{pipelineName}-1vminsert" + storageFileName = f"{objPrefix}-0{pipelineName}-2vmstorage" + + localPvHost = "ditto" + insertPort = 8480 + agentPort = 8429 + storageInsertPort = 8400 + storageSelectPort = 8401 + volName = f"{objPrefix}-data-{pipelineName}" + request = "50Gi" + pipelineWebRoot = f'{webRoot}/{pipelineName}' + + createAgentDeploy(tzArg, vmVersion, pipelineWebRoot, agentFileName, agentName, agentPort, scrapeMapKey, insertName, objPrefix) + createInsertDeploy(tzArg, vmVersion, pipelineWebRoot, insertName, storageName, insertFileName, insertPort) + createPv(storageFileName, volName, request) + createPvc(storageFileName, volName, request) + createStorageDeploy(tzArg, vmVersion, pipelineWebRoot, pipelineName, retention, storageName, storageFileName, localPvHost, volName, storageInsertPort, storageSelectPort) + + createSvc(agentFileName, agentName, [{"port": 80, "targetPort": agentPort}]) + createSvc(insertFileName, insertName, [{"port": 80, "targetPort": insertPort}]) + createSvc(storageFileName,storageName, [ + {"port": 80, "targetPort": "http", "name": "http"}, + {"port": storageInsertPort, "targetPort": "vminsert", "name": "vminsert"}, + {"port": storageSelectPort, "targetPort": "vmselect", "name": "vmselect"}, + ]) # yapf: disable + + return storageName + + +def createIndex(objPrefix, webRoot, html): + name = f'{objPrefix}-index' + httpServeRoot = '/opt/html' + + (build / f'{objPrefix}-3index_cmap.yaml').write_text(toJson({ + "apiVersion": "v1", "kind": "ConfigMap", "metadata": { "name": name }, + "data": { + "index.html": html, + "index.js": Path("index.js").read_text(), + "index.css": Path("index.css").read_text(), + } + })) # yapf: disable + + (build / f'{objPrefix}-3index_deploy.yaml').write_text( + toJson({ + "apiVersion": "apps/v1", "kind": "Deployment", "metadata": { "name": name }, + "spec": { + "replicas": 1, + "selector": { "matchLabels": { "app": name } }, + "template": { + "metadata": { 
"labels": { "app": name } }, + "spec": { + "containers": [{ + "name": "webserver", "image": "docker.io/joseluisq/static-web-server", "imagePullPolicy": "IfNotPresent", + "args": [ + f'--root={httpServeRoot}', + '--directory-listing=true', + '--experimental-metrics=true', + ], + "ports": [{ "containerPort": 80 }], + "volumeMounts": [{ "name": "html", "mountPath": f"{httpServeRoot}{webRoot}" }] + }], + "volumes": [{ "name": "html", "configMap": { "name": name, "defaultMode": 444 } }] + } + } + } + })) # yapf: disable + createSvc(f'{objPrefix}-3index', f'{objPrefix}-index', [{'port': 80, 'targetPort': 80}]) + + +def main(): + tzArg = "-loggerTimezone=America/Los_Angeles" + objPrefix = "next-victoriametrics" # prefix on all k8s object names + webRoot = "/m/next" + vmVersion = "v1.100.1" + webHost = 'bigasterisk.com' + pipelines = [ + ('forever', '100y'), + ('recent', '90y'), + ] + storageSvcs = [createIngestPipeline(tzArg, vmVersion, webRoot, objPrefix, p, f'scrape_{p}.yaml', ret) for p, ret in pipelines] + + selectPort = 8481 + createVmselectDeploy(tzArg, vmVersion, webRoot, objPrefix, storageSvcs, selectPort) + createSvc(f'{objPrefix}-1vmselect', f"{objPrefix}-vmselect", [{"port": 80, "targetPort": selectPort}]) + + ingressPaths = [ + { "pathType": "Prefix", "path": f"{webRoot}/", "backend": { "service": { "name": f"{objPrefix}-index", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/vmselect/", "backend": { "service": { "name": f"{objPrefix}-vmselect", "port": { "number": 80 } } } }, + ] # yapf: disable + for p, _ in pipelines: + ingressPaths.extend([ + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmagent/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmagent", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vminsert/", "backend": { "service": { "name": f"{objPrefix}-{p}-vminsert", "port": { "number": 80 } } } }, + { "pathType": "Prefix", "path": f"{webRoot}/{p}/vmstorage/", "backend": { "service": { "name": f"{objPrefix}-{p}-vmstorage", "port": { "number": 80 } } } }, + ]) # yapf: disable + + policy = """\ +allow: + or: + - { email: { is: "drewpca@gmail.com" }} + - { email: { is: "kelsimp@gmail.com" }} + """ + createIngress(f'{objPrefix}-2ingress.yaml', objPrefix, policy, ingressPaths, webHost) + createIndex(objPrefix, webRoot, makeIndexHtml(objPrefix, webRoot, webHost)) + + +main() + +# in vmui, set server url to +# https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus diff -r fb0519859645 -r 8134cd480817 next/create_scrape_configs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/create_scrape_configs.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,104 @@ +from pathlib import Path + +from scrape_job import jobConfig, scrape_deployments, writeJobConfigs +import private + + + + +# previously this used `kubernetes_sd_configs: [{ role: node }]` +all_hosts = [ + 'dash', + 'ditto', + # 'ws-printer', + #todo: +] + +smartctl_hosts = [ + # ideally, all nodes with disks, but many turn off and on + 'dash', + 'ditto', +] + +ping_hosts = [ + # printer, since it falls out of ntop with no traffic at all. 
Or, we could poll ink status at http://printer/general/status.html?pageid=1 + 'printer', + # wireguard connection test + 'prime5', + # after pyinfra or reboot, seems to lose wg0 address + 'garage5', +] + +deploy_doesnt_serve_metrics = [ + 'apprise', + 'bitwarden', + 'digi-files', + 'digi-pose-predict', + 'digi-tts-mimic', + 'dovecot', + 'front-door-display', + 'hass', + 'homepage', + 'itch150', + 'kallithea', + 'kube-web-view', + 'magma', + 'megasecond', + 'minecraft-build-world', + 'minecraft-lake-world', + 'minecraft-smp-world', + 'mongodb', + 'mqtt1', + 'mqtt2', + 'nodered', + 'photoprism', + 'plik', + 'projects', + 'registry', + 'registry-ui', + 'speakerphone', + 'video', + 'video-files', + 'zigbee2mqtt', +] + +forever_jobs = [ + jobConfig(name='maildir-count', targets=['prime:2500']), + jobConfig(name='mongodb', targets=['mongodb:9216']), + jobConfig(name='net-traffic', targets=['pipe:8080']), + jobConfig(name='ping', targets=ping_hosts, scrape_interval='2m', ping_job=True), + jobConfig(name='power-eagle', targets=['power-eagle:80'], scrape_interval='8s'), # from powerEagle/private_config.periodSec + jobConfig(name='powermeter-exporter', targets=['powermeter-exporter'], scrape_interval='10s'), + jobConfig(name='smartctl', targets=[f'{h}:9633' for h in smartctl_hosts]), + jobConfig(name='wifi', targets=['wifi:80']), + jobConfig(name='zfs-exporter', targets=['ditto:9634']), + jobConfig(name='zigbee2mqtt-exporter', targets=['zigbee2mqtt-exporter:80']), + jobConfig(name='zpool-exporter', targets=['ditto:9986']), + jobConfig(name='octoprint', targets=['octoprint'], + metrics_path='/plugin/prometheus_exporter/metrics', + params={'apikey' : [private.octoprint_apikey]}, + ), +] # yapf: disable + +recent_jobs = [ + jobConfig( name="telegraf", targets=[f'{h}:9273' for h in all_hosts]), + jobConfig( name="filebeat", targets=[f'{h}:5067' for h in all_hosts]), + jobConfig( name="net-routes", targets=['pipe:9999']), + jobConfig( name="net-traffic", targets=['pipe:8080']), + jobConfig( name="dnsmasq-log", targets=['pipe:9991']), + jobConfig( + name="racc", + scrape_interval='30s', + targets=[ + # - dash:5150 + # - dot:5150 + # - squib:5150 + # - ashermac:5150 + ], + ), +] # yapf: disable +recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs)) + +top = Path('build/scrape_config') +writeJobConfigs(top, forever_jobs, 'forever') +writeJobConfigs(top, recent_jobs, 'recent') diff -r fb0519859645 -r 8134cd480817 next/k8s_ops.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/k8s_ops.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,50 @@ +import json +import time + +from kubernetes import client + + +def refreshPodCmaps(pod_name, namespace="default"): + """ + Per https://ahmet.im/blog/kubernetes-secret-volumes-delay/ there could be a while + until k8s updates the CM volume that a pod sees. Workaround is to edit the pod annotations. 
+ """ + api_instance = client.CoreV1Api() + + pod = api_instance.read_namespaced_pod(name=pod_name, namespace=namespace) + if pod.metadata.annotations is None: + pod.metadata.annotations = {} + pod.metadata.annotations["force-configmap-update"] = str(time.time()) + api_instance.replace_namespaced_pod(name=pod_name, namespace=namespace, body=pod) + + +def firstPodName(selector): + api_instance = client.CoreV1Api() + pod_list = api_instance.list_namespaced_pod(namespace="default", label_selector=selector) + return pod_list.items[0].metadata.name + + +def hup(ctx, deployment, process_name): + ctx.run(f"kubectl exec {deployment} -- pkill -HUP {process_name}") + + +def replaceCmap(name, dataObj): + api_instance = client.CoreV1Api() + + data = dict((fn, json.dumps(obj)) for fn, obj in dataObj.items()) + + try: + + existing_config_map = api_instance.read_namespaced_config_map(name, 'default') + existing_config_map.data.update(data) + api_response = api_instance.replace_namespaced_config_map(name, "default", existing_config_map) + except client.rest.ApiException as e: + if e.status == 404: + config_map = client.V1ConfigMap() + config_map.metadata = client.V1ObjectMeta(name=name) + config_map.data = data + api_response = api_instance.create_namespaced_config_map('default', config_map) + else: + raise + + print(f"{name} resource_version is now {api_response.metadata.resource_version}") diff -r fb0519859645 -r 8134cd480817 next/output.py --- a/next/output.py Thu May 02 18:35:46 2024 -0700 +++ b/next/output.py Thu May 02 20:33:29 2024 -0700 @@ -1,7 +1,8 @@ import json from pathlib import Path -build = Path('build') +build = Path('build/k8s_config') +build.mkdir(parents=True, exist_ok=True) def toJson(d): diff -r fb0519859645 -r 8134cd480817 next/roles.yaml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/roles.yaml Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,43 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: victoriametrics +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: victoriametrics +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: victoriametrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: victoriametrics +subjects: +- kind: ServiceAccount + name: victoriametrics + namespace: default +# # couldn't get prometheus to read the per-node telegraf metrics as 'prometheus' account +# - kind: ServiceAccount +# name: default +# namespace: default \ No newline at end of file diff -r fb0519859645 -r 8134cd480817 next/scrape_job.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/scrape_job.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,88 @@ +import json +from pathlib import Path +import subprocess + + +def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None): + """one scrape job config""" + ret = { + "job_name": name, + "static_configs": [{ + "targets": targets, + }], + "relabel_configs": [ + { + "target_label": "namespace", + "replacement": "default" + }, + { + "source_labels": ["__meta_kubernetes_pod_node_name"], + "target_label": "node" + }, + ] + } + + if metrics_path: + ret['metrics_path'] = metrics_path + + if scrape_interval: + 
ret['scrape_interval'] = scrape_interval + + if params: + ret['params'] = params + + if ping_job: + ret['metrics_path'] = '/probe' + ret['params'] = {'module': ['icmp']} + ret["relabel_configs"] = [ + { + "source_labels": ["__address__"], + "target_label": "__param_target" + }, + { + "source_labels": ["__param_target"], + "target_label": "instance" + }, + { + "target_label": "__address__", + "replacement": "prober" + }, + ] + + return ret + + +def current_deployments(): + deploys = json.loads(subprocess.check_output(['kubectl', 'get', 'deploy', '-o=json'])) + for deploy in deploys['items']: + name = deploy['metadata']['name'] + yield name + + +def scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs): + ret = [] + for name in current_deployments(): + if name in deploy_doesnt_serve_metrics: + continue + if name in [j['job_name'] for j in forever_jobs]: + continue + targets = [name] + ret.append(jobConfig(name=name, targets=targets)) + return ret + + +def writeJobConfigs(outDir: Path, jobConfs: list, retention: str): + outDir.mkdir(exist_ok=True, parents=True) + filenames_written = [] + for job in jobConfs: + filename = f'job_{job["job_name"]}.yaml' + (outDir / filename).write_text(json.dumps([job], indent=2, sort_keys=True)) + filenames_written.append(filename) + + (outDir / f'scrape_{retention}.yaml').write_text(json.dumps({ + "global": { + "scrape_interval": "1m", + "scrape_timeout": "10s" + }, + "scrape_config_files": filenames_written, + }, indent=2)) diff -r fb0519859645 -r 8134cd480817 next/skaffold.yaml --- a/next/skaffold.yaml Thu May 02 18:35:46 2024 -0700 +++ b/next/skaffold.yaml Thu May 02 20:33:29 2024 -0700 @@ -4,6 +4,7 @@ name: victoriametrics manifests: rawYaml: - - build/*.yaml + - roles.yaml + - build/k8s_config/*.yaml deploy: kubectl: {} diff -r fb0519859645 -r 8134cd480817 next/tasks.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next/tasks.py Thu May 02 20:33:29 2024 -0700 @@ -0,0 +1,33 @@ +from pathlib import Path + +import yaml +from invoke import task +from kubernetes import config + +import alert_rules +from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap + +config.load_kube_config() + + +def scrapeConfig(fn): + return yaml.load(open(fn), yaml.FullLoader) + + +@task +def push_config_2024(ctx): + # plan: + # every discovered service may: + # - be described here as a forever retention - ignore the discovery + # - be blocked here as a no-metrics service - ignore the discovery + # - be scraped as 'recent', with possible overrides of port/path + # all per-node metrics shall be 'recent' (oops, not smartctl!) + map: dict[str, object] = { + 'rules': alert_rules.allRules(ctx), + } + top = Path('build/scrape_config') + for p in top.glob('**/*.yaml'): + map[str(p.relative_to(top))] = scrapeConfig(p) + replaceCmap("next-victoriametrics-config", map) + refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent")) + refreshPodCmaps(firstPodName("app=next-victoriametrics-recent-vmagent")) diff -r fb0519859645 -r 8134cd480817 tasks.py --- a/tasks.py Thu May 02 18:35:46 2024 -0700 +++ b/tasks.py Thu May 02 20:33:29 2024 -0700 @@ -4,6 +4,7 @@ from kubernetes import config import alert_rules + from k8s_ops import firstPodName, refreshPodCmaps, replaceCmap config.load_kube_config()
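Once the config has been pushed and the vmagent pods refreshed, the vmselect URL from the vmui note above can be exercised directly: vmselect serves the standard Prometheus HTTP query API under that same path prefix. A minimal smoke-test sketch, assuming the /m/next web root from main() and that the pomerium ingress admits the request (in practice it may need an authenticated session cookie); the query and output handling here are illustrative only, not part of the setup:

import requests

# Base URL taken from the vmui note: https://bigasterisk.com{webRoot}/vmselect/select/0/prometheus
BASE = "https://bigasterisk.com/m/next/vmselect/select/0/prometheus"

# Ask for the `up` series to confirm both the forever and recent pipelines are ingesting.
resp = requests.get(f"{BASE}/api/v1/query", params={"query": "up"}, timeout=10)
resp.raise_for_status()
body = resp.json()
print(body["status"], len(body["data"]["result"]), "series returned")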