Add vmalert alerting rules for DNS and host health
vmalert on control evaluates rules (declared in git) against VictoriaMetrics and remote-writes alert state back, so firing alerts show as the ALERTS series in Grafana. Covers SOA divergence between ns1/ns2, secondary zone expiry, scrape target down, and root disk full. No notifier yet (notifier.blackhole). Also adds TODO.md roadmap.
This commit is contained in:
@@ -0,0 +1,40 @@
|
|||||||
|
# Infra roadmap
|
||||||
|
|
||||||
|
Prioritized backlog for the cnx-network clan. See `docs/` for how the current
|
||||||
|
pieces work.
|
||||||
|
|
||||||
|
## 1. Alerting (done — pending deploy)
|
||||||
|
|
||||||
|
Rules evaluated by vmalert against VictoriaMetrics on control, declared in
|
||||||
|
`modules/monitoring/alerts.nix`:
|
||||||
|
|
||||||
|
- [x] SOA serial divergence between ns1 and ns2 (secondary out of sync)
|
||||||
|
- [x] Zone-expiry countdown on the secondary approaching zero (transfers failing)
|
||||||
|
- [x] Any scrape target down (`up == 0`)
|
||||||
|
- [x] Root filesystem nearly full
|
||||||
|
|
||||||
|
Delivery stays minimal for now (`notifier.blackhole`): vmalert remote-writes
|
||||||
|
alert state back to VM, so firing alerts show up as the `ALERTS` series in
|
||||||
|
Grafana. Wiring a real notifier (Matrix) is a later step — drop `blackhole` and
|
||||||
|
set `settings."notifier.url"` to an Alertmanager.
|
||||||
|
|
||||||
|
## 2. Backups of critical state
|
||||||
|
|
||||||
|
- [ ] DNSSEC key material on ns1 (KSK/ZSK in Knot's KASP store) — losing it forces
|
||||||
|
an emergency DS rollover at the registrar
|
||||||
|
- [ ] VictoriaMetrics TSDB on control (optional, retention is 180d)
|
||||||
|
|
||||||
|
## 3. Blackbox DNS probing
|
||||||
|
|
||||||
|
- [ ] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries
|
||||||
|
against ns1/ns2 — catches outside-in resolution failures the Knot stats miss
|
||||||
|
|
||||||
|
## 4. Third secondary off Hetzner (resilience)
|
||||||
|
|
||||||
|
- [ ] A secondary nameserver on a different provider/network so a single-provider
|
||||||
|
outage doesn't take all authoritative DNS down (architectural — new machine)
|
||||||
|
|
||||||
|
## 5. Centralized logs
|
||||||
|
|
||||||
|
- [ ] VictoriaLogs on control to grep journald across all three hosts, pairing
|
||||||
|
with the existing VictoriaMetrics setup
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
../../modules/static-ipv6.nix
|
../../modules/static-ipv6.nix
|
||||||
../../modules/monitoring/exporters.nix
|
../../modules/monitoring/exporters.nix
|
||||||
../../modules/monitoring/server.nix
|
../../modules/monitoring/server.nix
|
||||||
|
../../modules/monitoring/alerts.nix
|
||||||
../../modules/docs.nix
|
../../modules/docs.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,64 @@
|
|||||||
|
# Alerting rules, evaluated by vmalert against VictoriaMetrics on control.
|
||||||
|
# Everything is declared here in git. vmalert remote-writes alert state back to
|
||||||
|
# VM, so firing alerts surface as the `ALERTS{alertstate="firing"}` series and
|
||||||
|
# can be viewed in Grafana. No notifier is wired yet: notifier.blackhole makes
|
||||||
|
# that explicit (vmalert evaluates rules but sends nowhere). To deliver alerts
|
||||||
|
# later, drop blackhole and set settings."notifier.url" to an Alertmanager.
|
||||||
|
{ ... }:
|
||||||
|
let
|
||||||
|
vmUrl = "http://127.0.0.1:8428";
|
||||||
|
in
|
||||||
|
{
|
||||||
|
services.vmalert.instances.cnx = {
|
||||||
|
enable = true;
|
||||||
|
# Command-line flags handed to this vmalert instance.
settings = {
  # vmalert UI/API, bound to loopback only (same as VictoriaMetrics).
  httpListenAddr = "127.0.0.1:8880";

  # Rules are evaluated against, and alert state is written back to, the same
  # local VictoriaMetrics endpoint; the write-back is what persists the
  # ALERTS / ALERTS_FOR_STATE series.
  "datasource.url" = vmUrl;
  "remoteWrite.url" = vmUrl;

  # No notifier is configured: rules are still evaluated, but nothing is sent.
  "notifier.blackhole" = true;
};
|
||||||
|
rules.groups = [
|
||||||
|
{
|
||||||
|
name = "dns";
|
||||||
|
rules = [
|
||||||
|
{
  # Per zone, compare the highest and lowest reported SOA serial; any
  # difference that persists for 15 minutes raises a warning.
  alert = "DNSSecondaryOutOfSync";
  expr = "max by (zone) (knot_zone_serial) - min by (zone) (knot_zone_serial) > 0";
  for = "15m";
  labels = {
    severity = "warning";
  };
  annotations = {
    summary = "Zone {{ $labels.zone }} SOA serial differs between nameservers";
    description = "The secondary is out of sync with the primary for {{ $labels.zone }}. `knotc zone-retransfer {{ $labels.zone }}` on ns2 forces a fresh pull.";
  };
}
|
||||||
|
{
  # Critical once a zone's expiration countdown has stayed below 3600
  # (one hour, per the summary text) for 5 minutes.
  alert = "ZoneExpiryLow";
  expr = "knot_zone_status_expiration < 3600";
  for = "5m";
  labels = {
    severity = "critical";
  };
  annotations = {
    summary = "Zone {{ $labels.zone }} on {{ $labels.instance }} is within 1h of expiry";
    description = "Transfers to the secondary appear to be failing; the zone stops being served when the SOA expire timer hits zero.";
  };
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
name = "host";
|
||||||
|
rules = [
|
||||||
|
{
  # Critical when any scrape target has reported `up == 0` for 5 minutes.
  alert = "ScrapeTargetDown";
  expr = "up == 0";
  for = "5m";
  labels = {
    severity = "critical";
  };
  annotations = {
    summary = "{{ $labels.job }} exporter on {{ $labels.instance }} is down";
    description = "VictoriaMetrics cannot scrape this target; its metrics are missing.";
  };
}
|
||||||
|
{
  # Warn when the root filesystem ("/", excluding tmpfs) has been more than
  # 90% used for 15 minutes. Usage is computed as 100 * (1 - avail / size).
  alert = "RootFilesystemFull";
  expr = ''100 * (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) > 90'';
  for = "15m";
  labels.severity = "warning";
  annotations.summary = "Root filesystem on {{ $labels.instance }} is over 90% full";
  # Every other rule in this file carries a description; this one reports the
  # measured usage ($value is the result of expr) so the alert is actionable.
  annotations.description = ''Root filesystem usage on {{ $labels.instance }} is at {{ printf "%.1f" $value }}%. Free up space before writes start failing.'';
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user