diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..df62644 --- /dev/null +++ b/TODO.md @@ -0,0 +1,40 @@ +# Infra roadmap + +Prioritized backlog for the cnx-network clan. See `docs/` for how the current +pieces work. + +## 1. Alerting (done — pending deploy) + +Rules evaluated by vmalert against VictoriaMetrics on control, declared in +`modules/monitoring/alerts.nix`: + +- [x] SOA serial divergence between ns1 and ns2 (secondary out of sync) +- [x] Zone-expiry countdown on the secondary approaching zero (transfers failing) +- [x] Any scrape target down (`up == 0`) +- [x] Root filesystem nearly full + +Delivery stays minimal for now (`notifier.blackhole`): vmalert remote-writes +alert state back to VM, so firing alerts show up as the `ALERTS` series in +Grafana. Wiring a real notifier (Matrix) is a later step — drop `blackhole` and +set `settings."notifier.url"` to an Alertmanager. + +## 2. Backups of critical state + +- [ ] DNSSEC key material on ns1 (KSK/ZSK in Knot's KASP store) — losing it forces + an emergency DS rollover at the registrar +- [ ] VictoriaMetrics TSDB on control (optional, retention is 180d) + +## 3. Blackbox DNS probing + +- [ ] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries + against ns1/ns2 — catches outside-in resolution failures the Knot stats miss + +## 4. Third secondary off Hetzner (resilience) + +- [ ] A secondary nameserver on a different provider/network so a single-provider + outage doesn't take all authoritative DNS down (architectural — new machine) + +## 5. 
Centralized logs
+
+- [ ] VictoriaLogs on control to grep journald across all three hosts, pairing
+  with the existing VictoriaMetrics setup
diff --git a/machines/control/configuration.nix b/machines/control/configuration.nix
index 6775bd4..5dddbf0 100644
--- a/machines/control/configuration.nix
+++ b/machines/control/configuration.nix
@@ -4,6 +4,7 @@
     ../../modules/static-ipv6.nix
     ../../modules/monitoring/exporters.nix
     ../../modules/monitoring/server.nix
+    ../../modules/monitoring/alerts.nix
     ../../modules/docs.nix
   ];
 
diff --git a/modules/monitoring/alerts.nix b/modules/monitoring/alerts.nix
new file mode 100644
index 0000000..f71e4b6
--- /dev/null
+++ b/modules/monitoring/alerts.nix
@@ -0,0 +1,76 @@
+# Alerting rules, evaluated by vmalert against VictoriaMetrics on control.
+# Everything is declared here in git. vmalert remote-writes alert state back to
+# VM, so firing alerts surface as the `ALERTS{alertstate="firing"}` series and
+# can be viewed in Grafana. No notifier is wired yet: notifier.blackhole makes
+# that explicit (vmalert evaluates rules but sends nowhere). To deliver alerts
+# later, drop blackhole and set settings."notifier.url" to an Alertmanager.
+{ ... }:
+let
+  # One endpoint is used for querying, for writing alert state, and for
+  # reading that state back after a restart.
+  vmUrl = "http://127.0.0.1:8428";
+in
+{
+  services.vmalert.instances.cnx = {
+    enable = true;
+    settings = {
+      "datasource.url" = vmUrl;
+      "remoteWrite.url" = vmUrl; # persists ALERTS / ALERTS_FOR_STATE back to VM
+      # Without remoteRead.url the persisted state is write-only: vmalert
+      # restores alert state only via this flag, so every restart (e.g. a
+      # deploy) would otherwise reset pending `for` timers.
+      "remoteRead.url" = vmUrl;
+      "notifier.blackhole" = true;
+      "httpListenAddr" = "127.0.0.1:8880"; # vmalert UI/API, loopback only (like VM)
+    };
+    rules.groups = [
+      {
+        name = "dns";
+        rules = [
+          {
+            # Per `zone` label: highest serial minus lowest serial. A non-zero
+            # gap that persists for 15m fires the alert.
+            alert = "DNSSecondaryOutOfSync";
+            expr = "max by (zone) (knot_zone_serial) - min by (zone) (knot_zone_serial) > 0";
+            for = "15m";
+            labels.severity = "warning";
+            annotations.summary = "Zone {{ $labels.zone }} SOA serial differs between nameservers";
+            annotations.description = "The secondary is out of sync with the primary for {{ $labels.zone }}. `knotc zone-retransfer {{ $labels.zone }}` on ns2 forces a fresh pull.";
+          }
+          {
+            # NOTE(review): 3600 is presumably seconds (the summary says 1h);
+            # confirm against the unit the Knot exporter reports.
+            alert = "ZoneExpiryLow";
+            expr = "knot_zone_status_expiration < 3600";
+            for = "5m";
+            labels.severity = "critical";
+            annotations.summary = "Zone {{ $labels.zone }} on {{ $labels.instance }} is within 1h of expiry";
+            annotations.description = "Transfers to the secondary appear to be failing; the zone stops being served when the SOA expire timer hits zero.";
+          }
+        ];
+      }
+      {
+        name = "host";
+        rules = [
+          {
+            alert = "ScrapeTargetDown";
+            expr = "up == 0";
+            for = "5m";
+            labels.severity = "critical";
+            annotations.summary = "{{ $labels.job }} exporter on {{ $labels.instance }} is down";
+            annotations.description = "VictoriaMetrics cannot scrape this target; its metrics are missing.";
+          }
+          {
+            # Used percentage of "/" = 100 * (1 - avail / size).
+            alert = "RootFilesystemFull";
+            expr = ''100 * (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) > 90'';
+            for = "15m";
+            labels.severity = "warning";
+            annotations.summary = "Root filesystem on {{ $labels.instance }} is over 90% full";
+            annotations.description = "Less than 10% of / is still available on {{ $labels.instance }}; free up space before writes start failing.";
+          }
+        ];
+      }
+    ];
+  };
+}