Add vmalert alerting rules for DNS and host health

vmalert on control evaluates rules (declared in git) against VictoriaMetrics and
remote-writes alert state back, so firing alerts show as the ALERTS series in
Grafana. Covers SOA divergence between ns1/ns2, secondary zone expiry, scrape
target down, and root disk full. No notifier yet (notifier.blackhole). Also adds
TODO.md roadmap.
This commit is contained in:
Berwn
2026-06-17 14:49:32 +07:00
parent a7d4c0e567
commit 4c7c74836d
3 changed files with 105 additions and 0 deletions
+64
View File
@@ -0,0 +1,64 @@
# Alerting rules, evaluated by vmalert against VictoriaMetrics on control.
# Everything is declared here in git. vmalert remote-writes alert state back to
# VM, so firing alerts surface as the `ALERTS{alertstate="firing"}` series and
# can be viewed in Grafana. No notifier is wired yet: notifier.blackhole makes
# that explicit (vmalert evaluates rules but sends nowhere). To deliver alerts
# later, drop blackhole and set settings."notifier.url" to an Alertmanager.
{ ... }:
let
  # VictoriaMetrics endpoint on loopback. The same URL is used for querying
  # (datasource), for persisting alert state (remote write) and for restoring
  # it after a restart (remote read).
  vmUrl = "http://127.0.0.1:8428";
in
{
  services.vmalert.instances.cnx = {
    enable = true;
    settings = {
      "datasource.url" = vmUrl;
      "remoteWrite.url" = vmUrl; # persists ALERTS / ALERTS_FOR_STATE back to VM
      # Restore alert state from ALERTS_FOR_STATE on startup. Without this the
      # state written above is never read back, so every vmalert restart
      # resets the `for` timers of pending and firing alerts.
      "remoteRead.url" = vmUrl;
      "notifier.blackhole" = true;
      "httpListenAddr" = "127.0.0.1:8880"; # vmalert UI/API, loopback only (like VM)
    };
    rules.groups = [
      {
        name = "dns";
        rules = [
          {
            # Spread between the highest and lowest serial reported per zone.
            # Any non-zero spread means the scraped nameservers disagree; it
            # has to persist for 15m before the alert fires.
            alert = "DNSSecondaryOutOfSync";
            expr = "max by (zone) (knot_zone_serial) - min by (zone) (knot_zone_serial) > 0";
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Zone {{ $labels.zone }} SOA serial differs between nameservers";
            annotations.description = "The secondary is out of sync with the primary for {{ $labels.zone }}. `knotc zone-retransfer {{ $labels.zone }}` on ns2 forces a fresh pull.";
          }
          {
            # Threshold 3600 corresponds to the "within 1h" wording of the
            # summary (metric presumably in seconds -- confirm against the
            # exporter).
            alert = "ZoneExpiryLow";
            expr = "knot_zone_status_expiration < 3600";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "Zone {{ $labels.zone }} on {{ $labels.instance }} is within 1h of expiry";
            annotations.description = "Transfers to the secondary appear to be failing; the zone stops being served when the SOA expire timer hits zero.";
          }
        ];
      }
      {
        name = "host";
        rules = [
          {
            # `up` is 0 for a target whose last scrape failed.
            alert = "ScrapeTargetDown";
            expr = "up == 0";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "{{ $labels.job }} exporter on {{ $labels.instance }} is down";
            annotations.description = "VictoriaMetrics cannot scrape this target; its metrics are missing.";
          }
          {
            # Used percentage of "/" = 100 * (1 - avail / size). The ''...''
            # string form is used because the selector contains double quotes.
            alert = "RootFilesystemFull";
            expr = ''100 * (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) > 90'';
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Root filesystem on {{ $labels.instance }} is over 90% full";
            annotations.description = "Used space on / has stayed above 90% for 15 minutes. Free up space (for example old Nix generations via `nix-collect-garbage`) before writes start failing.";
          }
        ];
      }
    ];
  };
}