changeset 64:def1aa2bfa3f

more targets polish. reorg code into next/
author drewp@bigasterisk.com
date Thu, 02 May 2024 23:06:31 -0700
parents 84a4c4cca4a5
children fada8d64c4d3
files .hgignore config/scrape_main.yaml config/scrape_recent.yaml next/create_scrape_configs.py next/scrape_job.py next/tasks.py
diffstat 6 files changed, 133 insertions(+), 222 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Thu May 02 23:05:14 2024 -0700
+++ b/.hgignore	Thu May 02 23:06:31 2024 -0700
@@ -3,5 +3,7 @@
 .pdm-python
 .vscode
 .venv
-^export
-^build
+^migration/export
+^next/build
+^next/private.py
+
--- a/config/scrape_main.yaml	Thu May 02 23:05:14 2024 -0700
+++ b/config/scrape_main.yaml	Thu May 02 23:06:31 2024 -0700
@@ -1,55 +1,5 @@
-# see https://relabeler.promlabs.com/
-
-global:
-  scrape_interval: 1m
-  scrape_timeout: 10s
-
-# scrape_config_files:
-#   - build/scrape_ssl.yaml
-# These can even be urls: https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md#loading-scrape-configs-from-multiple-files
-
-scrape_configs:
   # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
 
-  - job_name: "kubernetes-apiservers"
-    scheme: https
-    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
-    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    kubernetes_sd_configs: [{ role: endpoints }]
-
-    relabel_configs:
-      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
-        action: keep
-        regex: default;kubernetes;https
-
-  - job_name: "kubernetes-nodes"
-    scheme: https
-    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
-    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    kubernetes_sd_configs: [{ role: node }]
-
-    relabel_configs:
-      - action: labeldrop
-        regex: "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
-      - action: labelmap
-        regex: __meta_kubernetes_node_label_(.+)
-
-  # see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
-  # for metric definitions
-  - job_name: "kubernetes-cadvisor"
-    scheme: https
-    metrics_path: /metrics/cadvisor
-    tls_config: { ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt }
-    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    kubernetes_sd_configs: [{ role: node }]
-
-    relabel_configs:
-      - action: labeldrop
-        regex: "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
-
   - job_name: "k8services"
     kubernetes_sd_configs: [{ role: endpoints }]
     relabel_configs:
@@ -74,13 +24,6 @@
       - source_labels: [__meta_kubernetes_pod_container_port_number]
         target_label: __port_number
 
-      # period tweaks
-      - if: '{job="power-eagle"}'
-        action: replace
-        target_label: __scrape_interval__
-        # from powerEagle/private_config.periodSec
-        replacement: 8s
-
       # path tweaks
       - if: '{job="victoriametrics",__port_number="8428"}'
         action: replace
@@ -130,91 +73,3 @@
       # Something doesn't work with the scrape, and I don't see why I should care:
       - {if: '{job="metrics-server"                        }', action: drop}
 
-
-  - job_name: "telegraf"
-    scheme: http
-    kubernetes_sd_configs: [{ role: node }]
-    relabel_configs:
-      - source_labels: [__address__]
-        regex: "(.*):(\\d+)"
-        target_label: __address__
-        replacement: "${1}:9273"
-        action: replace
-
-  - job_name: "smartctl"
-    scrape_interval: 1h
-    scheme: http
-    kubernetes_sd_configs: [{ role: node }]
-    relabel_configs:
-      - source_labels: [__address__]
-        regex: "(.*):(\\d+)"
-        target_label: __address__
-        replacement: "${1}:9633"
-        action: replace
-
-  - job_name: "filebeat"
-    scrape_interval: 1m
-    scheme: http
-    kubernetes_sd_configs: [{ role: node }]
-    relabel_configs:
-      - source_labels: [__address__]
-        regex: "(.*):(\\d+)"
-        target_label: __address__
-        replacement: "${1}:5067"
-        action: replace
-
-  - job_name: "net-routes"
-    static_configs:
-      - targets:
-          - pipe:9999
-
-  - job_name: "net-traffic"
-    static_configs:
-      - targets:
-          - pipe:8080
-
-  - job_name: "dnsmasq-log"
-    static_configs:
-      - targets:
-          - pipe:9991
-
-  - job_name: "maildir-count"
-    static_configs:
-      - targets:
-          - prime:2500
-
-  - job_name: "zfs"
-    scrape_interval: 1h
-    static_configs:
-      - targets:
-          # running in k8s, but as a daemonset so it's not in SD above
-          - ditto:9634
-          - ditto:9986
-
-  - job_name: "ping"
-    scrape_interval: 2m
-    metrics_path: /probe
-    params:
-      module: [icmp]
-    static_configs:
-      - targets:
-          # printer, since it falls out of ntop with no traffic at all. Or, we could poll ink status at http://10.2.0.37/general/status.html?pageid=1
-          - printer
-          # wireguard connection test
-          - prime5
-          # after pyinfra or reboot, seems to lose wg0 address
-          - garage5
-    relabel_configs:
-      - {source_labels: [__address__],    target_label: __param_target}
-      - {source_labels: [__param_target], target_label: instance}
-      - target_label: __address__
-        replacement: prober
-
-  - job_name: "racc"
-    scrape_interval: 30s
-    static_configs:
-      - targets: []
-        # - dash:5150
-        # - dot:5150
-        # - plus:5150
-        # - Kelsis-iMac:5150
--- a/config/scrape_recent.yaml	Thu May 02 23:05:14 2024 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-# see https://relabeler.promlabs.com/
-
-global:
-  scrape_interval: 1m
-  scrape_timeout: 10s
-
-
-scrape_configs:
-  # some based on https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
-
-  - job_name: "telegraf"
-    scheme: http
-    kubernetes_sd_configs: [{ role: node }]
-    relabel_configs:
-      - source_labels: [__address__]
-        regex: "(.*):(\\d+)"
-        target_label: __address__
-        replacement: "${1}:9273"
-        action: replace
--- a/next/create_scrape_configs.py	Thu May 02 23:05:14 2024 -0700
+++ b/next/create_scrape_configs.py	Thu May 02 23:06:31 2024 -0700
@@ -1,11 +1,8 @@
 from pathlib import Path
 
-from scrape_job import jobConfig, scrape_deployments, writeJobConfigs
+from scrape_job import jobConfig, scrape_deployments, writeJobConfigs, FromName
 import private
 
-
-
-
 # previously this used `kubernetes_sd_configs: [{ role: node }]`
 all_hosts = [
     'dash',
@@ -29,38 +26,6 @@
     'garage5',
 ]
 
-deploy_doesnt_serve_metrics = [
-    'apprise',
-    'bitwarden',
-    'digi-files',
-    'digi-pose-predict',
-    'digi-tts-mimic',
-    'dovecot',
-    'front-door-display',
-    'hass',
-    'homepage',
-    'itch150',
-    'kallithea',
-    'kube-web-view',
-    'magma',
-    'megasecond',
-    'minecraft-build-world',
-    'minecraft-lake-world',
-    'minecraft-smp-world',
-    'mongodb',
-    'mqtt1',
-    'mqtt2',
-    'nodered',
-    'photoprism',
-    'plik',
-    'projects',
-    'registry',
-    'registry-ui',
-    'speakerphone',
-    'video',
-    'video-files',
-    'zigbee2mqtt',
-]
 
 forever_jobs = [
     jobConfig(name='maildir-count',        targets=['prime:2500']),
@@ -81,11 +46,23 @@
 ]  # yapf: disable
 
 recent_jobs = [
-    jobConfig( name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
-    jobConfig( name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
-    jobConfig( name="net-routes",  targets=['pipe:9999']),
-    jobConfig( name="net-traffic", targets=['pipe:8080']),
-    jobConfig( name="dnsmasq-log", targets=['pipe:9991']),
+    jobConfig(name="dnsmasq-log", targets=['pipe:9991']),
+    jobConfig(name="filebeat",    targets=[f'{h}:5067' for h in all_hosts]),
+    jobConfig(name="net-routes",  targets=['pipe:9999']),
+    jobConfig(name="net-traffic", targets=['pipe:8080']),
+    jobConfig(name="pomerium",    targets=['pomerium-metrics.pomerium:9090']),
+    jobConfig(name="telegraf",    targets=[f'{h}:9273' for h in all_hosts]),
+    jobConfig(name="victorialogs",targets=['victorialogs'], metrics_path='/logs/metrics'),
+
+    jobConfig(name="next-victoriametrics-forever-vmagent",   metrics_path='/m/next/forever/vmagent/metrics',  targets=FromName),
+    jobConfig(name="next-victoriametrics-forever-vminsert",  metrics_path='/m/next/forever/vminsert/metrics', targets=FromName),
+    jobConfig(name="next-victoriametrics-forever-vmstorage", metrics_path='/m/next/forever/vmstorage/metrics',targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vmagent",    metrics_path='/m/next/recent/vmagent/metrics',   targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vminsert",   metrics_path='/m/next/recent/vminsert/metrics',  targets=FromName),
+    jobConfig(name="next-victoriametrics-recent-vmstorage",  metrics_path='/m/next/recent/vmstorage/metrics', targets=FromName),
+    jobConfig(name="next-victoriametrics-vmselect",          metrics_path='/m/next/vmselect/metrics',         targets=FromName),
+    jobConfig(name="next-victoriametrics-index",                                                              targets=FromName),
+
     jobConfig(
         name="racc",
         scrape_interval='30s',
@@ -97,8 +74,90 @@
         ],
     ),
 ]  # yapf: disable
-recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs))
+
+
+deploy_doesnt_serve_metrics = [
+    'apprise',
+    'bitwarden',
+    'digi-files',
+    'digi-pose-predict',
+    'digi-tts-mimic',
+    'digi-web',
+    'dovecot',
+    'ectoscope',
+    'front-door-display',
+    'hass',
+    'homepage',
+    'itch150',
+    'jsregistry',
+    'kallithea',
+    'kube-web-view',
+    'magma',
+    'megasecond',
+    'minecraft-build-world',
+    'minecraft-lake-world',
+    'minecraft-smp-world',
+    'mongodb',
+    'mqtt1',
+    'mqtt2',
+    'nodered',
+    'photoprism',
+    'plik',
+    'projects',
+    'registry-ui',
+    'registry',
+    'speakerphone',
+    'victorialogs-ui',
+    'video-files',
+    'video',
+    'zigbee2mqtt',
+    'zwave2mqtt',
+]
+
+existing_jobs = [j['job_name'] for j in forever_jobs + recent_jobs]
+recent_jobs.extend(scrape_deployments(deploy_doesnt_serve_metrics + existing_jobs))
 
-top = Path('build/scrape_config')
-writeJobConfigs(top, forever_jobs, 'forever')
-writeJobConfigs(top, recent_jobs, 'recent')
+recent_jobs.append(jobConfig(name='kubernetes-apiservers', https=True, targets=[]) | {
+    'kubernetes_sd_configs': [{
+        'role': 'endpoints'
+    }],
+    'relabel_configs': [{
+        'source_labels': ['__meta_kubernetes_namespace', '__meta_kubernetes_service_name', '__meta_kubernetes_endpoint_port_name'],
+        'action': 'keep',
+        'regex': 'default;kubernetes;https'
+    }],
+})
+
+recent_jobs.append(
+    jobConfig(name="kubernetes-nodes", https=True, targets=[]) | {
+        "kubernetes_sd_configs": [{
+            "role": "node"
+        }],
+        "relabel_configs": [{
+            "action": "labeldrop",
+            "regex": "__meta_kubernetes_node_label_(feature_node|nvidia_com_|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+        }, {
+            "action": "labelmap",
+            "regex": "__meta_kubernetes_node_label_(.+)"
+        }, {
+            "action": "labeldrop",
+            "regex": "kubernetes_io_hostname"
+        }],
+    })
+
+# see https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
+# for metric definitions
+
+recent_jobs.append(jobConfig(name="kubernetes-cadvisor", https=True, metrics_path="/metrics/cadvisor", targets=[]) | {
+    "kubernetes_sd_configs": [{
+        "role": "node"
+    }],
+    "relabel_configs": [{
+        "action": "labeldrop",
+        "regex": "(feature_node|nvidia_com_gpu|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|beta_kubernetes_io_os|node_kubernetes_io_instance_type|kubernetes_io_os).*"
+    }],
+})
+
+outDir = Path('build/scrape_config')
+writeJobConfigs(outDir, forever_jobs, 'forever')
+writeJobConfigs(outDir, recent_jobs, 'recent')
--- a/next/scrape_job.py	Thu May 02 23:05:14 2024 -0700
+++ b/next/scrape_job.py	Thu May 02 23:06:31 2024 -0700
@@ -2,14 +2,13 @@
 from pathlib import Path
 import subprocess
 
+class FromName:
+    pass
 
-def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None):
+def jobConfig(name, targets, scrape_interval=None, ping_job=False, metrics_path=None, params=None, https=False):
     """one scrape job config"""
     ret = {
         "job_name": name,
-        "static_configs": [{
-            "targets": targets,
-        }],
         "relabel_configs": [
             {
                 "target_label": "namespace",
@@ -21,9 +20,21 @@
             },
         ]
     }
+    
+    if targets is FromName:
+        targets = [name]
+
+    if targets:
+        ret["static_configs"] = [{
+            "targets": targets,
+        }]
 
     if metrics_path:
-        ret['metrics_path'] = metrics_path
+        ret.setdefault('relabel_configs', []).append({
+            "action": "replace",
+            "target_label": "__metrics_path__",
+            "replacement": metrics_path,
+        })
 
     if scrape_interval:
         ret['scrape_interval'] = scrape_interval
@@ -49,6 +60,11 @@
             },
         ]
 
+    if https:
+        ret['scheme'] = 'https'
+        ret["tls_config"] = {"ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"}
+        ret["bearer_token_file"] = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+
     return ret
 
 
@@ -59,12 +75,10 @@
         yield name
 
 
-def scrape_deployments(deploy_doesnt_serve_metrics, forever_jobs):
+def scrape_deployments(skip_names):
     ret = []
     for name in current_deployments():
-        if name in deploy_doesnt_serve_metrics:
-            continue
-        if name in [j['job_name'] for j in forever_jobs]:
+        if name in skip_names:
             continue
         targets = [name]
         ret.append(jobConfig(name=name, targets=targets))
@@ -84,5 +98,5 @@
             "scrape_interval": "1m",
             "scrape_timeout": "10s"
         },
-        "scrape_config_files": filenames_written,
+        "scrape_config_files": sorted(filenames_written),
     }, indent=2))
--- a/next/tasks.py	Thu May 02 23:05:14 2024 -0700
+++ b/next/tasks.py	Thu May 02 23:06:31 2024 -0700
@@ -26,7 +26,7 @@
         'rules': alert_rules.allRules(ctx),
     }
     top = Path('build/scrape_config')
-    for p in top.glob('**/*.yaml'):
+    for p in top.glob('*.yaml'):
         map[str(p.relative_to(top))] = scrapeConfig(p)
     replaceCmap("next-victoriametrics-config", map)
     refreshPodCmaps(firstPodName("app=next-victoriametrics-forever-vmagent"))