changeset 12:b6720e379d5b

config updates
author drewp@bigasterisk.com
date Tue, 14 Mar 2023 20:04:06 -0700
parents 2eab3e6b89f2
children 89a351ec7abf
files config/scrape_main.yaml rules/rules_main.yaml tasks.py
diffstat 3 files changed, 111 insertions(+), 28 deletions(-)
--- a/config/scrape_main.yaml	Mon Jan 16 01:05:31 2023 -0800
+++ b/config/scrape_main.yaml	Tue Mar 14 20:04:06 2023 -0700
@@ -97,6 +97,8 @@
       - {if: '{job="mosquitto-ext",     __port_number="1883"}', action: drop}
       - {if: '{job="filesync-syncthing",__port_number="8384"}', action: drop}
       - {if: '{job="jsregistry",        __port_number="4873"}', action: drop}
+      - {if: '{job="photoprism",        __port_number="2342"}', action: drop}
+      - {if: '{job="net-route-input",   __port_number="80"}', action: drop}
   
   # # seems like this would match more stuff, but all I get is coredns
   # - job_name: 'old_coredns'
@@ -157,3 +159,11 @@
       - {source_labels: [__param_target], target_label: instance}
       - target_label: __address__
         replacement: prober
+  - job_name: "racc"
+    scrape_interval: 30s
+    static_configs:
+      - targets:
+         - dash:5150
+         - dot:5150
+         - plus:5150
+         - Kelsis-iMac:5150
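
One way to sanity-check the new "racc" job before syncing the configmap is to load the file and inspect it. A minimal sketch, not part of the changeset, assuming PyYAML is available (as tasks.py below assumes) and that scrape_main.yaml uses the standard scrape_configs top-level key:

# Sketch only: confirm the new "racc" job and its static targets parse as intended.
import yaml  # assumes PyYAML is importable, as in tasks.py below

with open('config/scrape_main.yaml') as f:
    cfg = yaml.safe_load(f)

racc = next(j for j in cfg['scrape_configs'] if j['job_name'] == 'racc')
assert racc['scrape_interval'] == '30s'
assert {'dash:5150', 'dot:5150', 'plus:5150', 'Kelsis-iMac:5150'} <= set(
    racc['static_configs'][0]['targets'])
print('racc targets:', racc['static_configs'][0]['targets'])
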
--- a/rules/rules_main.yaml	Mon Jan 16 01:05:31 2023 -0800
+++ b/rules/rules_main.yaml	Tue Mar 14 20:04:06 2023 -0700
@@ -71,13 +71,6 @@
           summary: "power eagle data missing"
           description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
 
-      # - alert: wifi_scrape_errors
-      #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
-      #   labels:
-      #     severity: houseUsersAffected
-      #   annotations:
-      #     summary: "errors getting wifi users list"
-
       # - alert: absent_mitmproxy
       #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
       #   labels:
@@ -99,14 +92,15 @@
       
   - name: alerts
     rules:
-      - {alert: housePower,   for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
-      - {alert: disk1,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk2,        for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk3,        for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500',           annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
-      - {alert: oom,          for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
-      - {alert: high_logging, for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
-      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
-      - {alert: starlette,    for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                       annotations: {summary: "set starlette app name {{ $labels }}"}}
+      - {alert: housePower,     for: 24h, labels: {severity: waste},   expr: "house_power_w > 4000",                                                                annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
+      - {alert: disk1,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G',                                                 annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2,          for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G',                                                  annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3,          for: 20m, labels: {severity: warning}, expr: '1 > 2',                                                                               annotations: {summary: "unused"}}
+      - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000',          annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+        # - {alert: oom,        for: 1m,  labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100',                                         annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging,   for: 20m, labels: {severity: waste},   expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k',     annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process,  for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14',                       annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette,      for: 1m,  labels: {severity: fix},     expr: 'starlette_request_duration_seconds_created{app_name="starlette"}',                    annotations: {summary: "set starlette app name {{ $labels }}"}}
       - alert: ssl_certs_expiring_soon
         expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
         labels:
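
The retuned rows above keep the flow-style mapping layout: disk3 is effectively disabled with a constant-false expr, disk_week_incr replaces the weekly-increase check with a daily window, and oom is commented out. A minimal pre-push structural check, not part of the changeset, assuming PyYAML and the vmalert groups/rules layout shown above:

# Sketch only: make sure every rule in rules_main.yaml still has an expr and a name
# before it gets pushed via the configmap sync in tasks.py below.
import yaml  # assumed available, as in tasks.py

with open('rules/rules_main.yaml') as f:
    doc = yaml.safe_load(f)

for group in doc['groups']:
    for rule in group['rules']:
        assert 'expr' in rule, f"a rule in group {group['name']} has no expr"
        assert 'alert' in rule or 'record' in rule, \
            f"a rule in group {group['name']} needs an alert or record name"
print('rules_main.yaml: all rules have a name and an expr')
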
--- a/tasks.py	Mon Jan 16 01:05:31 2023 -0800
+++ b/tasks.py	Tue Mar 14 20:04:06 2023 -0700
@@ -8,34 +8,113 @@
 
 from invoke import task
 
+sys.path.append('/usr/lib/python3/dist-packages/')
+import yaml
 
-def updateConfigmapCmd(path):
-    return f'kubectl create configmap victoriametrics-config --from-file {path}=config/{path} -o yaml --dry-run=client | kubectl apply -f -'
+_tfs = []
+
+
+def saveTmp(text):
+    tf = tempfile.NamedTemporaryFile(mode='wt')
+    _tfs.append(tf)
+    tf.write(text)
+    tf.flush()
+    return Path(tf.name)
+
+
+def writeConfigmap(ctx, files: Dict[str, Path]):
+    arg = ','.join(f'{k}={v}' for k, v in files.items())
+    ctx.run(
+        f'kubectl create configmap victoriametrics-config --from-file {arg} -o yaml --dry-run=client | kubectl apply -f -'
+    )
 
 
 def reload(ctx, svc):
     host = ctx.run(f'khost {svc}').stdout
-    path = {'victoriametrics': '/m', 'vmalert': '/vmalert'}[svc]
-    print(' -> status',
-          urllib.request.urlopen(f'http://{host}{path}/-/reload').status)
+    path = {'victoriametrics': '/m/', 'vmalert': '/'}[svc]
+    reload_url = f'http://{host}{path}-/reload'
+    print(f'reload with POST {reload_url}')
+    for workaround in [1]:
+        print(' -> status',
+              urllib.request.urlopen(reload_url, data=b'unused').status)
+        time.sleep(0)
+
+
+def httpsCertProber():
+    domains = []
+    for line in open(
+            '/my/doc/ssl/letsencrypt/run.py'):  # moved to cert-manager
+        if line.startswith('update_certs('):
+            domains.append(line.split("'")[1])
+    relabel = {
+        'relabel_configs': [{
+            'source_labels': ['__address__'],
+            'target_label': '__param_target'
+        }, {
+            'source_labels': ['__param_target'],
+            'target_label': 'instance'
+        }, {
+            'target_label': '__address__',
+            'replacement': 'prober'
+        }]
+    }
+    return yaml.dump(  # Note that an included file must skip the scrape_configs toplevel key and just include the list.
+        [{
+            'job_name': 'prober',
+            'scrape_interval': '24h',
+            'metrics_path': '/probe',
+            'params': {
+                'module': ['https']
+            },
+            'static_configs': [{
+                'targets': domains
+            }],
+        } | relabel])
+
+
 def hostsExpectedOnline(ctx):
     return ctx.run(
         'cd /my/serv/lanscape; pdm run python hosts_expected_online.py').stdout
 
 
+def expectedK8sNodes(ctx):
+    getNode = json.loads(ctx.run("kubectl get node -o json").stdout)
+    hosts = [item['metadata']['name'] for item in getNode['items']]
+    return yaml.dump({
+        'groups': [{
+            'name':
+            'k8s_expected_nodes',
+            'rules': [{
+                'alert':
+                'kube_node_log_size_report_' + h,
+                'expr':
+                'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
+                % h,
+                'for':
+                '1h',
+                'annotations': {
+                    'summary': f"no recent k8s log size report from host {h}"
+                }
+            } for h in hosts]
+        }]
+    })
 
 
 @task
 def sync_config(ctx):
-    ctx.run(updateConfigmapCmd("scrape_config.yaml"))
-    ctx.run(updateConfigmapCmd("scrape_ssl.yaml"))
-    reload(ctx, 'victoriametrics')
+    config = Path('config')
+    for workaround in [1]:
+        writeConfigmap(
+            ctx, {
+#                'scrape_ssl.yaml': saveTmp(httpsCertProber()),
+                'rules_expected_nodes.yaml': saveTmp(expectedK8sNodes(ctx)),
+                'rules_expected_hosts.yaml': saveTmp(hostsExpectedOnline(ctx)),
+            })
+        reload(ctx, 'victoriametrics')
 
-    ctx.run(updateConfigmapCmd("rules_expected_hosts.yaml"))
-    ctx.run(updateConfigmapCmd("rules_expected_nodes.yaml"))
-    ctx.run(updateConfigmapCmd("rules_k8s.yaml"))
-    ctx.run(updateConfigmapCmd("rules_main.yaml"))
-    reload(ctx, 'vmalert')
+        # this reload doesn't get the new config - not sure if it's a vmalert bug or a k8s configmap propagation problem
+        # reload(ctx, 'vmalert')
+        ctx.run('kubectl rollout restart deploy/vmalert')
 
 @task
 def build_config(ctx):