changeset 12:b6720e379d5b
config updates
| author   | drewp@bigasterisk.com |
|----------|-----------------------|
| date     | Tue, 14 Mar 2023 20:04:06 -0700 |
| parents  | 2eab3e6b89f2 |
| children | 89a351ec7abf |
| files    | config/scrape_main.yaml rules/rules_main.yaml tasks.py |
| diffstat | 3 files changed, 111 insertions(+), 28 deletions(-) |
--- a/config/scrape_main.yaml	Mon Jan 16 01:05:31 2023 -0800
+++ b/config/scrape_main.yaml	Tue Mar 14 20:04:06 2023 -0700
@@ -97,6 +97,8 @@
       - {if: '{job="mosquitto-ext", __port_number="1883"}', action: drop}
       - {if: '{job="filesync-syncthing",__port_number="8384"}', action: drop}
       - {if: '{job="jsregistry", __port_number="4873"}', action: drop}
+      - {if: '{job="photoprism", __port_number="2342"}', action: drop}
+      - {if: '{job="net-route-input", __port_number="80"}', action: drop}
 
 # # seems like this would match more stuff, but all I get is coredns
 # - job_name: 'old_coredns'
@@ -157,3 +159,11 @@
       - {source_labels: [__param_target], target_label: instance}
       - target_label: __address__
         replacement: prober
+  - job_name: "racc"
+    scrape_interval: 30s
+    static_configs:
+      - targets:
+          - dash:5150
+          - dot:5150
+          - plus:5150
+          - Kelsis-iMac:5150
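The two new `drop` rules and the `racc` job above are plain relabel/static-config additions. As a rough illustration of how an `{if: ..., action: drop}` rule thins out discovered targets, here is a minimal Python sketch; the label sets are illustrative and the equality matching is a simplification, not VictoriaMetrics' actual relabeling code:

```python
# Hypothetical stand-in for the scraper's relabeling: a target is dropped when
# every label selector in some drop rule matches its label set exactly.
drop_rules = [
    {"job": "photoprism", "__port_number": "2342"},
    {"job": "net-route-input", "__port_number": "80"},
]

def keep_target(labels: dict) -> bool:
    return not any(
        all(labels.get(k) == v for k, v in rule.items()) for rule in drop_rules)

targets = [
    {"job": "photoprism", "__port_number": "2342", "instance": "photoprism:2342"},
    {"job": "photoprism", "__port_number": "2343", "instance": "photoprism:2343"},
]
print([t["instance"] for t in targets if keep_target(t)])  # ['photoprism:2343']
```

Real `if` selectors are full series selectors and can be more expressive than the exact-match comparison shown here.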
--- a/rules/rules_main.yaml	Mon Jan 16 01:05:31 2023 -0800
+++ b/rules/rules_main.yaml	Tue Mar 14 20:04:06 2023 -0700
@@ -71,13 +71,6 @@
       summary: "power eagle data missing"
       description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
 
-  # - alert: wifi_scrape_errors
-  #   expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
-  #   labels:
-  #     severity: houseUsersAffected
-  #   annotations:
-  #     summary: "errors getting wifi users list"
-
   # - alert: absent_mitmproxy
   #   expr: absent(process_resident_memory_bytes{job="mitmproxy"})
   #   labels:
@@ -99,14 +92,15 @@
   - name: alerts
     rules:
-      - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
-      - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}}
-      - {alert: disk3, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
-      - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}}
-      - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
-      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}}
-      - {alert: starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}}
+      - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
+      - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}}
+      - {alert: disk3, for: 20m, labels: {severity: warning}, expr: '1 > 2', annotations: {summary: "unused"}}
+      - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+      # - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}}
+      - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k', annotations: {summary: "high log output rate {{ $labels }}"}}
+      - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}}
+      - {alert: starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}}
 
       - alert: ssl_certs_expiring_soon
         expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
         labels:
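Several one-line rules changed in this hunk (disk3 neutralized to `1 > 2`, disk_week_incr added, oom commented out), so a quick parse-and-list pass can catch YAML mistakes before the file is pushed into the configmap. A minimal sketch, assuming the standard vmalert `groups:`/`rules:` layout that the context lines suggest; it is not part of this changeset:

```python
import yaml  # PyYAML

with open('rules/rules_main.yaml') as f:
    doc = yaml.safe_load(f)

# List every alert with its expression; flag rules missing required keys.
for group in doc.get('groups', []):
    for rule in group.get('rules', []):
        missing = [key for key in ('alert', 'expr') if key not in rule]
        if missing:
            print(f"{group['name']}: rule missing {missing}: {rule}")
        else:
            print(f"{group['name']}: {rule['alert']} (for={rule.get('for', '0s')})")
            print(f"    expr: {rule['expr']}")
```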
--- a/tasks.py	Mon Jan 16 01:05:31 2023 -0800
+++ b/tasks.py	Tue Mar 14 20:04:06 2023 -0700
@@ -8,34 +8,113 @@
 
 from invoke import task
 
+sys.path.append('/usr/lib/python3/dist-packages/')
+import yaml
 
-def updateConfigmapCmd(path):
-    return f'kubectl create configmap victoriametrics-config --from-file {path}=config/{path} -o yaml --dry-run=client | kubectl apply -f -'
+_tfs = []
+
+
+def saveTmp(text):
+    tf = tempfile.NamedTemporaryFile(mode='wt')
+    _tfs.append(tf)
+    tf.write(text)
+    tf.flush()
+    return Path(tf.name)
+
+
+def writeConfigmap(ctx, files: Dict[str, Path]):
+    arg = ','.join(f'{k}={v}' for k, v in files.items())
+    ctx.run(
+        f'kubectl create configmap victoriametrics-config --from-file {arg} -o yaml --dry-run=client | kubectl apply -f -'
+    )
 
 
 def reload(ctx, svc):
     host = ctx.run(f'khost {svc}').stdout
-    path = {'victoriametrics': '/m', 'vmalert': '/vmalert'}[svc]
-    print(' -> status',
-          urllib.request.urlopen(f'http://{host}{path}/-/reload').status)
+    path = {'victoriametrics': '/m/', 'vmalert': '/'}[svc]
+    reload_url = f'http://{host}{path}-/reload'
+    print(f'reload with POST {reload_url}')
+    for workaround in [1]:
+        print(' -> status',
+              urllib.request.urlopen(reload_url, data=b'unused').status)
+        time.sleep(0)
+
+
+def httpsCertProber():
+    domains = []
+    for line in open(
+            '/my/doc/ssl/letsencrypt/run.py'):  # moved to cert-manager
+        if line.startswith('update_certs('):
+            domains.append(line.split("'")[1])
+    relabel = {
+        'relabel_configs': [{
+            'source_labels': ['__address__'],
+            'target_label': '__param_target'
+        }, {
+            'source_labels': ['__param_target'],
+            'target_label': 'instance'
+        }, {
+            'target_label': '__address__',
+            'replacement': 'prober'
+        }]
+    }
+    return yaml.dump(  # Note that an included file must skip the scrape_configs toplevel key and just include the list.
+        [{
+            'job_name': 'prober',
+            'scrape_interval': '24h',
+            'metrics_path': '/probe',
+            'params': {
+                'module': ['https']
+            },
+            'static_configs': [{
+                'targets': domains
+            }],
+        } | relabel])
+
+
 def hostsExpectedOnline(ctx):
     return ctx.run(
         'cd /my/serv/lanscape; pdm run python hosts_expected_online.py').stdout
 
 
+def expectedK8sNodes(ctx):
+    getNode = json.loads(ctx.run("kubectl get node -o json").stdout)
+    hosts = [item['metadata']['name'] for item in getNode['items']]
+    return yaml.dump({
+        'groups': [{
+            'name':
+            'k8s_expected_nodes',
+            'rules': [{
+                'alert':
+                'kube_node_log_size_report_' + h,
+                'expr':
+                'absent(kubelet_container_log_filesystem_used_bytes{instance="%s"})'
+                % h,
+                'for':
+                '1h',
+                'annotations': {
+                    'summary': f"no recent k8s log size report from host {h}"
+                }
+            } for h in hosts]
+        }]
+    })
+
+
 @task
 def sync_config(ctx):
-    ctx.run(updateConfigmapCmd("scrape_config.yaml"))
-    ctx.run(updateConfigmapCmd("scrape_ssl.yaml"))
-    reload(ctx, 'victoriametrics')
+    config = Path('config')
+    for workaround in [1]:
+        writeConfigmap(
+            ctx, {
+#                'scrape_ssl.yaml': saveTmp(httpsCertProber()),
+                'rules_expected_nodes.yaml': saveTmp(expectedK8sNodes(ctx)),
+                'rules_expected_hosts.yaml': saveTmp(hostsExpectedOnline(ctx)),
+            })
+        reload(ctx, 'victoriametrics')
 
-    ctx.run(updateConfigmapCmd("rules_expected_hosts.yaml"))
-    ctx.run(updateConfigmapCmd("rules_expected_nodes.yaml"))
-    ctx.run(updateConfigmapCmd("rules_k8s.yaml"))
-    ctx.run(updateConfigmapCmd("rules_main.yaml"))
-    reload(ctx, 'vmalert')
+    # this reload doesn't get the new config- not sure if it's vmalert bug or k8s cm propagation problem
+    # reload(ctx, 'vmalert')
+    ctx.run('kubectl rollout restart deploy/vmalert')
 
 
 @task
 def build_config(ctx):
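The rewritten sync_config renders rule files with `expectedK8sNodes()`/`hostsExpectedOnline()`, packs them into the `victoriametrics-config` configmap via `writeConfigmap()`, and then restarts the vmalert deployment because the HTTP reload was not picking up the new configmap. The core pattern is the client-side render piped into `kubectl apply`. A standalone sketch of that pattern follows; `upsert_configmap` and `rules_demo.yaml` are hypothetical names, not code from this repo:

```python
import subprocess
import tempfile
from pathlib import Path
from typing import Dict


def upsert_configmap(name: str, files: Dict[str, Path]) -> None:
    # Render the configmap locally (--dry-run=client -o yaml) and pipe it into
    # `kubectl apply`, so re-running updates the same configmap in place.
    arg = ','.join(f'{key}={path}' for key, path in files.items())
    subprocess.run(
        f'kubectl create configmap {name} --from-file {arg} '
        f'-o yaml --dry-run=client | kubectl apply -f -',
        shell=True, check=True)


if __name__ == '__main__':
    tmp = Path(tempfile.gettempdir()) / 'rules_demo.yaml'
    tmp.write_text('groups: []\n')  # placeholder rules content
    upsert_configmap('victoriametrics-config', {'rules_demo.yaml': tmp})
```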