From c7b0f206c8c12cc9745d21653707e2c17c9551d2 Mon Sep 17 00:00:00 2001 From: Berwn Date: Wed, 17 Jun 2026 15:42:13 +0700 Subject: [PATCH] Alert on and chart blackbox DNS probe failures DNSResolutionProbeFailed and DNSSECProbeFailed fire when an SOA or DNSKEY probe to a public nameserver address stays down for 5m. The CNX DNS dashboard gains a "DNS probes (outside-in)" row: per-zone/server status table, probe success, and probe latency. --- TODO.md | 3 +- modules/monitoring/alerts.nix | 23 ++++++ modules/monitoring/dashboards/dns.json | 104 ++++++++++++++++++++++++- 3 files changed, 128 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index a5bbfff..52b6117 100644 --- a/TODO.md +++ b/TODO.md @@ -49,7 +49,8 @@ ns1/ns2: SOA + DNSKEY succeed on both servers over v4 and v6. - [x] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries against ns1/ns2 — catches outside-in resolution failures the Knot stats miss -- [ ] still to pair (next): a `probe_success == 0` alert and a Grafana panel +- [x] paired with alerts (`DNSResolutionProbeFailed` / `DNSSECProbeFailed` in + `alerts.nix`) and a "DNS probes (outside-in)" row on the CNX DNS dashboard ## 4. Third secondary off Hetzner (resilience) diff --git a/modules/monitoring/alerts.nix b/modules/monitoring/alerts.nix index 311873f..cc419a7 100644 --- a/modules/monitoring/alerts.nix +++ b/modules/monitoring/alerts.nix @@ -80,6 +80,29 @@ in } ]; } + { + # Outside-in DNS probes (blackbox on control). The `for` rides out a + # single dropped UDP packet; only a sustained failure fires. 
+ name = "dns_probe"; + rules = [ + { + alert = "DNSResolutionProbeFailed"; + expr = ''probe_success{query="SOA"} == 0''; + for = "5m"; + labels.severity = "critical"; + annotations.summary = "{{ $labels.zone }} is not resolving from {{ $labels.instance }}"; + annotations.description = "The blackbox SOA probe to this public nameserver address is failing; from the outside the zone looks unavailable there, which the Knot stats would not show."; + } + { + alert = "DNSSECProbeFailed"; + expr = ''probe_success{query="DNSKEY"} == 0''; + for = "5m"; + labels.severity = "critical"; + annotations.summary = "{{ $labels.zone }} DNSKEY missing from {{ $labels.instance }}"; + annotations.description = "The DNSKEY probe to this public nameserver address is failing: the zone's signing keys are not being served, so validating resolvers will treat answers as bogus."; + } + ]; + } ]; }; } diff --git a/modules/monitoring/dashboards/dns.json b/modules/monitoring/dashboards/dns.json index eda2d9c..e58595d 100644 --- a/modules/monitoring/dashboards/dns.json +++ b/modules/monitoring/dashboards/dns.json @@ -4,7 +4,7 @@ "tags": ["dns", "knot", "cnx"], "timezone": "browser", "schemaVersion": 39, - "version": 2, + "version": 3, "refresh": "30s", "time": { "from": "now-6h", "to": "now" }, "templating": { "list": [] }, @@ -207,6 +207,108 @@ "legendFormat": "{{instance}}" } ] + }, + { + "type": "row", + "title": "DNS probes (outside-in)", + "id": 20, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 } + }, + { + "type": "table", + "title": "Probe status (per zone / server)", + "description": "blackbox_exporter on control queries each nameserver's public address (v4 + v6) for every zone: an SOA query (zone served) and a DNSKEY query (still signed). 
UP = the resolver's-eye view is healthy.", + "id": 21, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 }, + "options": { "showHeader": true }, + "fieldConfig": { + "defaults": { + "custom": { "align": "auto", "cellOptions": { "type": "color-background" } }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "index": 0 }, + "1": { "text": "UP", "index": 1 } + } + } + ] + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "zone" }, + "properties": [{ "id": "custom.cellOptions", "value": { "type": "auto" } }] + }, + { + "matcher": { "id": "byName", "options": "query" }, + "properties": [{ "id": "custom.cellOptions", "value": { "type": "auto" } }] + }, + { + "matcher": { "id": "byName", "options": "instance" }, + "properties": [{ "id": "custom.cellOptions", "value": { "type": "auto" } }] + } + ] + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "probe_success{query=~\"SOA|DNSKEY\"}", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true }, + "renameByName": { "Value": "status" } + } + } + ] + }, + { + "type": "timeseries", + "title": "Probe success (1 = ok)", + "description": "0 means the probe failed: the zone is not being served or not signed from that public address. 
Sustained failures fire DNSResolutionProbeFailed / DNSSECProbeFailed.", + "id": 22, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 43 }, + "fieldConfig": { "defaults": { "unit": "short", "min": 0, "max": 1 }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "probe_success{query=~\"SOA|DNSKEY\"}", + "legendFormat": "{{zone}} {{query}} @ {{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "DNS probe latency", + "description": "Total round-trip time of each blackbox DNS probe. A climbing trend points at a slow or overloaded nameserver before it starts failing outright.", + "id": 23, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 51 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "probe_duration_seconds{query=~\"SOA|DNSKEY\"}", + "legendFormat": "{{zone}} {{query}} @ {{instance}}" + } + ] } ] }