diff --git a/TODO.md b/TODO.md index efbc235..a5bbfff 100644 --- a/TODO.md +++ b/TODO.md @@ -37,10 +37,19 @@ deploy ns1 and control. regenerable over time and control is the backup server, so this needs a second client→server pair (e.g. control→ns2) rather than the same topology -## 3. Blackbox DNS probing +## 3. Blackbox DNS probing (done — pending deploy) -- [ ] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries +`blackbox_exporter` on control (loopback `:9115`), probing each nameserver's +public v4+v6 address for every zone: an SOA query (zone served?) and a DNSKEY +query (still signed?). Blackbox has no DO-bit option, so signing is checked by +asking for DNSKEY directly and asserting the RRset is present. Probe defs live +in `modules/monitoring/blackbox-probes.nix`, shared by the exporter +(`blackbox.nix`) and the VM scrape jobs (`server.nix`). Verified live against +ns1/ns2: SOA + DNSKEY succeed on both servers over v4 and v6. + +- [x] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries against ns1/ns2 — catches outside-in resolution failures the Knot stats miss +- [ ] still to pair (next): a `probe_success == 0` alert and a Grafana panel ## 4. Third secondary off Hetzner (resilience) diff --git a/docs/src/monitoring.md b/docs/src/monitoring.md index 4ded43d..e93c296 100644 --- a/docs/src/monitoring.md +++ b/docs/src/monitoring.md @@ -9,6 +9,11 @@ Metrics and dashboards live on `control`, reachable only over the ZeroTier mesh. (`modules/monitoring/exporters.nix`). - **knot-exporter** (`:9433`) on `ns1`/`ns2` only — reads Knot's control socket, fed by the `mod-stats` module (query/response counters per zone). +- **blackbox_exporter** (`127.0.0.1:9115`) on `control` only — outside-in DNS + probes. For every zone it queries each nameserver's **public** address (v4 and + v6) for SOA (is the zone served?) and DNSKEY (is it still signed?). This is the + resolver's-eye view that the Knot stats can't see. 
Probe definitions are shared + between the exporter and the scrape jobs in `modules/monitoring/blackbox-probes.nix`. ## Storage & scraping diff --git a/machines/control/configuration.nix b/machines/control/configuration.nix index 5dddbf0..b1915a7 100644 --- a/machines/control/configuration.nix +++ b/machines/control/configuration.nix @@ -4,6 +4,7 @@ ../../modules/static-ipv6.nix ../../modules/monitoring/exporters.nix ../../modules/monitoring/server.nix + ../../modules/monitoring/blackbox.nix ../../modules/monitoring/alerts.nix ../../modules/docs.nix ]; diff --git a/modules/monitoring/blackbox-probes.nix b/modules/monitoring/blackbox-probes.nix new file mode 100644 index 0000000..e07ad56 --- /dev/null +++ b/modules/monitoring/blackbox-probes.nix @@ -0,0 +1,108 @@ +# Blackbox DNS probe definitions, shared between the exporter module +# (modules/monitoring/blackbox.nix, which renders these into the blackbox +# config) and the scraper (modules/monitoring/server.nix, which turns them into +# VictoriaMetrics scrape jobs). Kept in one place so the module list and the +# scrape jobs can never drift apart. +# +# These query the nameservers' PUBLIC addresses, i.e. the path a real internet +# resolver takes, not the mesh — the whole point is to catch outside-in +# resolution failures the Knot stats can't see. For each zone we run two probes +# per endpoint: an SOA query (is the zone being served at all?) and a DNSKEY +# query (is it still DNSSEC-signed?). Blackbox has no DO-bit option, so we ask +# for DNSKEY directly — an authoritative signed zone returns it without EDNS0, +# and its absence means signing has broken. +{ lib }: +let + domains = import ../dns/domains.nix; + + blackboxAddr = "127.0.0.1:9115"; + + # Public endpoints of the authoritative nameservers. The v4 addresses also + # appear in the `internet` instance in clan.nix; the v6 ones in each ns + # machine's cnx.staticIPv6. IPv6 literals are bracketed for host:port. 
+ endpoints = [ + { + instance = "ns1 v4"; + target = "46.224.170.206:53"; + } + { + instance = "ns1 v6"; + target = "[2a01:4f8:c014:b5c5::1]:53"; + } + { + instance = "ns2 v4"; + target = "157.180.70.82:53"; + } + { + instance = "ns2 v6"; + target = "[2a01:4f9:c014:6d87::1]:53"; + } + ]; + + queries = [ + { + name = "soa"; + type = "SOA"; + } + { + name = "dnskey"; + type = "DNSKEY"; + } + ]; + + sanitize = lib.replaceStrings [ "." ] [ "_" ]; + moduleName = zone: q: "dns_${q.name}_${sanitize zone}"; + + modules = lib.listToAttrs ( + lib.concatMap ( + zone: + map ( + q: + lib.nameValuePair (moduleName zone q) { + prober = "dns"; + timeout = "5s"; + dns = { + query_name = "${zone}."; + query_type = q.type; + valid_rcodes = [ "NOERROR" ]; + # Fail if the answer section is empty or if any answer RR is not of the + # queried type: a NOERROR with an empty answer (a missing DNSKEY) still fails. + validate_answer_rrs.fail_if_not_matches_regexp = [ "\\s${q.type}\\s" ]; + }; + } + ) queries + ) domains + ); + + scrapeConfigs = lib.concatMap ( + zone: + map (q: { + job_name = "blackbox_${moduleName zone q}"; + metrics_path = "/probe"; + params.module = [ (moduleName zone q) ]; + static_configs = map (e: { + targets = [ e.target ]; + labels = { + instance = e.instance; + zone = zone; + query = q.type; + }; + }) endpoints; + # Hand the real DNS server to blackbox as ?target=, then point the scrape + # at the exporter itself. 
+ relabel_configs = [ + { + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + target_label = "__address__"; + replacement = blackboxAddr; + } + ]; + }) queries + ) domains; +in +{ + inherit modules scrapeConfigs blackboxAddr; +} diff --git a/modules/monitoring/blackbox.nix b/modules/monitoring/blackbox.nix new file mode 100644 index 0000000..2ee3e81 --- /dev/null +++ b/modules/monitoring/blackbox.nix @@ -0,0 +1,24 @@ +# Blackbox exporter on control: outside-in DNS probes against the public +# nameserver addresses (see blackbox-probes.nix for what and why). Bound to +# loopback — only VictoriaMetrics on the same host scrapes its /probe endpoint, +# and the scrape jobs that drive it live in server.nix. The probes leave control +# over the public internet to reach ns1/ns2, which is the path we want to test. +{ + lib, + pkgs, + ... +}: +let + probes = import ./blackbox-probes.nix { inherit lib; }; +in +{ + services.prometheus.exporters.blackbox = { + enable = true; + listenAddress = "127.0.0.1"; + port = 9115; + # JSON is valid YAML; enableConfigCheck runs the exporter's own --config.check + # against this file at build time, so a malformed prober is caught here. + configFile = pkgs.writeText "blackbox.yml" (builtins.toJSON { inherit (probes) modules; }); + enableConfigCheck = true; + }; +} diff --git a/modules/monitoring/server.nix b/modules/monitoring/server.nix index b829d33..c49d759 100644 --- a/modules/monitoring/server.nix +++ b/modules/monitoring/server.nix @@ -10,6 +10,7 @@ }: let mesh = import ../mesh-hosts.nix { inherit config lib; }; + probes = import ./blackbox-probes.nix { inherit lib; }; vmPort = 8428; grafanaPort = 3000; controlV6 = mesh.hosts.control; @@ -52,7 +53,10 @@ in (target "ns2" (v6 mesh.hosts.ns2) 9433) ]; } - ]; + ] + # Outside-in DNS probes via the blackbox exporter (blackbox.nix). The job + # list is generated from the same probe definitions the exporter uses. + ++ probes.scrapeConfigs; }; };