Alert on and chart blackbox DNS probe failures

DNSResolutionProbeFailed fires when the SOA probe to a public nameserver
address has failed continuously for 5m; DNSSECProbeFailed does the same for
the DNSKEY probe. The CNX DNS dashboard gains a "DNS probes (outside-in)"
row: a per-zone/server status table, probe success, and probe latency.
This commit is contained in:
Berwn
2026-06-17 15:42:13 +07:00
parent 54f607d063
commit c7b0f206c8
3 changed files with 128 additions and 2 deletions
+2 -1
View File
@@ -49,7 +49,8 @@ ns1/ns2: SOA + DNSKEY succeed on both servers over v4 and v6.
- [x] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries - [x] `blackbox_exporter` on control doing real DNS + DNSSEC-validation queries
against ns1/ns2 — catches outside-in resolution failures the Knot stats miss against ns1/ns2 — catches outside-in resolution failures the Knot stats miss
- [ ] still to pair (next): a `probe_success == 0` alert and a Grafana panel - [x] paired with alerts (`DNSResolutionProbeFailed` / `DNSSECProbeFailed` in
`alerts.nix`) and a "DNS probes (outside-in)" row on the CNX DNS dashboard
## 4. Third secondary off Hetzner (resilience) ## 4. Third secondary off Hetzner (resilience)
+23
View File
@@ -80,6 +80,29 @@ in
} }
]; ];
} }
{
  # Rule group for the outside-in DNS probes (blackbox on control).
  # Every rule holds for its `for` window before firing, so one dropped
  # UDP packet stays quiet and only a sustained failure raises an alert.
  name = "dns_probe";
  rules = [
    {
      # SOA probe failing: the zone is not being answered from this address.
      alert = "DNSResolutionProbeFailed";
      expr = ''probe_success{query="SOA"} == 0'';
      for = "5m";
      labels = {
        severity = "critical";
      };
      annotations = {
        summary = "{{ $labels.zone }} is not resolving from {{ $labels.instance }}";
        description = "The blackbox SOA probe to this public nameserver address is failing; from the outside the zone looks unavailable there, which the Knot stats would not show.";
      };
    }
    {
      # DNSKEY probe failing: the zone's keys are not being returned.
      alert = "DNSSECProbeFailed";
      expr = ''probe_success{query="DNSKEY"} == 0'';
      for = "5m";
      labels = {
        severity = "critical";
      };
      annotations = {
        summary = "{{ $labels.zone }} DNSKEY missing from {{ $labels.instance }}";
        description = "The DNSKEY probe to this public nameserver address is failing: the zone's signing keys are not being served, so validating resolvers will treat answers as bogus.";
      };
    }
  ];
}
]; ];
}; };
} }
+103 -1
View File
@@ -4,7 +4,7 @@
"tags": ["dns", "knot", "cnx"], "tags": ["dns", "knot", "cnx"],
"timezone": "browser", "timezone": "browser",
"schemaVersion": 39, "schemaVersion": 39,
"version": 2, "version": 3,
"refresh": "30s", "refresh": "30s",
"time": { "from": "now-6h", "to": "now" }, "time": { "from": "now-6h", "to": "now" },
"templating": { "list": [] }, "templating": { "list": [] },
@@ -207,6 +207,108 @@
"legendFormat": "{{instance}}" "legendFormat": "{{instance}}"
} }
] ]
},
{
  "id": 20,
  "type": "row",
  "title": "DNS probes (outside-in)",
  "gridPos": {
    "x": 0,
    "y": 42,
    "w": 24,
    "h": 1
  }
},
{
  "type": "table",
  "title": "Probe status (per zone / server)",
  "description": "blackbox_exporter on control queries each nameserver's public address (v4 + v6) for every zone: an SOA query (zone served) and a DNSKEY query (still signed). UP = the resolver's-eye view is healthy.",
  "id": 21,
  "datasource": { "type": "prometheus", "uid": "victoriametrics" },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 },
  "options": { "showHeader": true },
  "fieldConfig": {
    "defaults": {
      "custom": { "align": "auto", "cellOptions": { "type": "auto" } },
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": null },
          { "color": "green", "value": 1 }
        ]
      },
      "mappings": [
        {
          "type": "value",
          "options": {
            "0": { "text": "DOWN", "index": 0 },
            "1": { "text": "UP", "index": 1 }
          }
        }
      ]
    },
    "overrides": [
      {
        "matcher": { "id": "byType", "options": "number" },
        "properties": [{ "id": "custom.cellOptions", "value": { "type": "color-background" } }]
      }
    ]
  },
  "targets": [
    {
      "refId": "A",
      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
      "expr": "probe_success",
      "format": "table",
      "instant": true
    }
  ],
  "transformations": [
    {
      "id": "organize",
      "options": {
        "excludeByName": { "Time": true, "__name__": true, "job": true },
        "renameByName": { "Value": "status" }
      }
    }
  ]
},
{
  "id": 22,
  "type": "timeseries",
  "title": "Probe success (1 = ok)",
  "description": "0 means the probe failed: the zone is not being served or not signed from that public address. Sustained failures fire DNSResolutionProbeFailed / DNSSECProbeFailed.",
  "gridPos": {
    "x": 12,
    "y": 43,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "victoriametrics",
    "type": "prometheus"
  },
  "fieldConfig": {
    "overrides": [],
    "defaults": {
      "min": 0,
      "max": 1,
      "unit": "short"
    }
  },
  "targets": [
    {
      "refId": "A",
      "expr": "probe_success",
      "legendFormat": "{{zone}} {{query}} @ {{instance}}",
      "datasource": {
        "uid": "victoriametrics",
        "type": "prometheus"
      }
    }
  ]
},
{
"type": "timeseries",
"title": "DNS probe latency",
"description": "Total round-trip time of each blackbox DNS probe. A climbing trend points at a slow or overloaded nameserver before it starts failing outright.",
"id": 23,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 51 },
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "probe_duration_seconds",
"legendFormat": "{{zone}} {{query}} @ {{instance}}"
}
]
} }
] ]
} }