Mercurial > code > home > repos > victoriametrics
diff rules/rules_main.yaml @ 12:b6720e379d5b
config updates
author | drewp@bigasterisk.com |
---|---|
date | Tue, 14 Mar 2023 20:04:06 -0700 |
parents | 17db5e8e7a2f |
children | 89a351ec7abf |
line wrap: on
line diff
```diff
--- a/rules/rules_main.yaml Mon Jan 16 01:05:31 2023 -0800
+++ b/rules/rules_main.yaml Tue Mar 14 20:04:06 2023 -0700
@@ -71,13 +71,6 @@
 summary: "power eagle data missing"
 description: "logs at https://bigasterisk.com/k/clusters/local/namespaces/default/deployments/power-eagle/logs"
 
- # - alert: wifi_scrape_errors
- # expr: rate(poll_errors_total{job="wifi"}[2m]) > .1
- # labels:
- # severity: houseUsersAffected
- # annotations:
- # summary: "errors getting wifi users list"
-
 # - alert: absent_mitmproxy
 # expr: absent(process_resident_memory_bytes{job="mitmproxy"})
 # labels:
@@ -99,14 +92,15 @@
 
 - name: alerts
 rules:
- - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
- - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}}
- - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}}
- - {alert: disk3, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1w]) / 1M) > 500', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
- - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}}
- - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[30m])) > 30000', annotations: {summary: "high log output rate {{ $labels }}"}}
- - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}}
- - {alert: starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}}
+ - {alert: housePower, for: 24h, labels: {severity: waste}, expr: "house_power_w > 4000", annotations: {summary: "house power usage over 3KW {{ $labels }}"}}
+ - {alert: disk1, for: 20m, labels: {severity: warning}, expr: 'disk_free{path=~"/(d[1-9])?"} < 20G', annotations: {summary: "low disk_free {{ $labels }}"}}
+ - {alert: disk2, for: 20m, labels: {severity: warning}, expr: 'disk_free{path="/stor6/my"} < 100G', annotations: {summary: "low disk_free {{ $labels }}"}}
+ - {alert: disk3, for: 20m, labels: {severity: warning}, expr: '1 > 2', annotations: {summary: "unused"}}
+ - {alert: disk_week_incr, for: 20m, labels: {severity: warning}, expr: 'round(increase(disk_used{fstype="zfs",path=~"^/stor6.*"}[1d]) / 1M) > 5000', annotations: {summary: "high mb/week on zfs dir {{ $labels }}"}}
+ # - {alert: oom, for: 1m, labels: {severity: warning}, expr: 'predict_linear(mem_free[5m], 5m) / 1M < 100', annotations: {summary: "host about to run OOM {{ $labels }}"}}
+ - {alert: high_logging, for: 20m, labels: {severity: waste}, expr: 'sum by (container) (rate(kubelet_container_log_filesystem_used_bytes[3h])) > 4k', annotations: {summary: "high log output rate {{ $labels }}"}}
+ - {alert: stale_process, for: 1d, labels: {severity: dataRisk}, expr: 'round((time() - filestat_modification_time/1e9) / 86400) > 14', annotations: {summary: "process time is old {{ $labels }}"}}
+ - {alert: starlette, for: 1m, labels: {severity: fix}, expr: 'starlette_request_duration_seconds_created{app_name="starlette"}', annotations: {summary: "set starlette app name {{ $labels }}"}}
 - alert: ssl_certs_expiring_soon
 expr: min((min_over_time(probe_ssl_earliest_cert_expiry[1d])-time())/86400) < 10
 labels:
```