# Alerting rules, evaluated by vmalert against VictoriaMetrics on control.
# Everything is declared here in git. vmalert remote-writes alert state back to
# VM, so firing alerts surface as the `ALERTS{alertstate="firing"}` series and
# can be viewed in Grafana. No notifier is wired yet: notifier.blackhole makes
# that explicit (vmalert evaluates rules but sends nowhere). To deliver alerts
# later, drop blackhole and set settings."notifier.url" to an Alertmanager.
#
# Notes on the string literals below:
#   * Expressions that contain double quotes (label matchers) are written as
#     indented '' ... '' strings. Nix does not treat a backslash as an escape
#     inside those, so `\\.` reaches the rule expression unchanged as `\\.`.
#   * `{{ $labels.x }}` is passed through verbatim; Nix only interpolates `${`.
{ ... }:
let
  # Single VictoriaMetrics endpoint, used both for reading metrics and for
  # writing alert state back.
  vmUrl = "http://127.0.0.1:8428";
in
{
  services.vmalert.instances.cnx = {
    enable = true;

    settings = {
      "datasource.url" = vmUrl;
      "remoteWrite.url" = vmUrl; # persists ALERTS / ALERTS_FOR_STATE back to VM
      "notifier.blackhole" = true;
      "httpListenAddr" = "127.0.0.1:8880"; # vmalert UI/API, loopback only (like VM)
    };

    rules.groups = [
      {
        name = "dns";
        rules = [
          {
            # Fires when the highest and lowest knot_zone_serial seen for a
            # zone differ for the whole `for` window.
            alert = "DNSSecondaryOutOfSync";
            expr = "max by (zone) (knot_zone_serial) - min by (zone) (knot_zone_serial) > 0";
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Zone {{ $labels.zone }} SOA serial differs between nameservers";
            annotations.description = "The secondary is out of sync with the primary for {{ $labels.zone }}. `knotc zone-retransfer {{ $labels.zone }}` on ns2 forces a fresh pull.";
          }
          {
            # 3600 is the "within 1h" threshold quoted in the summary.
            alert = "ZoneExpiryLow";
            expr = "knot_zone_status_expiration < 3600";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "Zone {{ $labels.zone }} on {{ $labels.instance }} is within 1h of expiry";
            annotations.description = "Transfers to the secondary appear to be failing; the zone stops being served when the SOA expire timer hits zero.";
          }
        ];
      }

      {
        name = "host";
        rules = [
          {
            alert = "ScrapeTargetDown";
            expr = "up == 0";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "{{ $labels.job }} exporter on {{ $labels.instance }} is down";
            annotations.description = "VictoriaMetrics cannot scrape this target; its metrics are missing.";
          }
          {
            # Used percentage of "/" computed as 100 * (1 - avail / size).
            alert = "RootFilesystemFull";
            expr = ''100 * (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) > 90'';
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Root filesystem on {{ $labels.instance }} is over 90% full";
            # Added so that every rule in this file carries a description.
            annotations.description = "Usage of / on {{ $labels.instance }} has stayed above 90% for 15 minutes. Free space or grow the filesystem before writes start failing.";
          }
        ];
      }

      {
        name = "backup";
        rules = [
          {
            # Matches any borgbackup-job-*.service unit reported in state "failed".
            alert = "BackupJobFailed";
            expr = ''node_systemd_unit_state{name=~"borgbackup-job-.+\\.service",state="failed"} == 1'';
            for = "5m";
            labels.severity = "warning";
            annotations.summary = "Backup job {{ $labels.name }} on {{ $labels.instance }} failed";
            annotations.description = "The borgbackup run did not complete. Check `systemctl status {{ $labels.name }}` and `journalctl -u {{ $labels.name }}` on the client; `borgbackup-create` re-runs it.";
          }
          {
            # 93600 s = 26 h: seconds since the borgbackup-job-*.timer last
            # triggered.
            alert = "BackupStale";
            expr = ''time() - node_systemd_timer_last_trigger_seconds{name=~"borgbackup-job-.+\\.timer"} > 93600'';
            for = "30m";
            labels.severity = "warning";
            annotations.summary = "No successful backup on {{ $labels.instance }} for over 26h";
            annotations.description = "The daily backup timer {{ $labels.name }} has not fired within its expected window; the most recent archive is stale. A value far above 26h (or no data) means backups have stopped entirely.";
          }
        ];
      }

      {
        # Outside-in DNS probes (blackbox on control). The `for` rides out a
        # single dropped UDP packet; only a sustained failure fires.
        name = "dns_probe";
        rules = [
          {
            alert = "DNSResolutionProbeFailed";
            expr = ''probe_success{query="SOA"} == 0'';
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "{{ $labels.zone }} is not resolving from {{ $labels.instance }}";
            annotations.description = "The blackbox SOA probe to this public nameserver address is failing; from the outside the zone looks unavailable there, which the Knot stats would not show.";
          }
          {
            alert = "DNSSECProbeFailed";
            expr = ''probe_success{query="DNSKEY"} == 0'';
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "{{ $labels.zone }} DNSKEY missing from {{ $labels.instance }}";
            annotations.description = "The DNSKEY probe to this public nameserver address is failing: the zone's signing keys are not being served, so validating resolvers will treat answers as bogus.";
          }
        ];
      }
    ];
  };
}