c7b0f206c8
DNSResolutionProbeFailed and DNSSECProbeFailed fire when an SOA or DNSKEY probe to a public nameserver address stays down for 5m. The CNX DNS dashboard gains a "DNS probes (outside-in)" row: per-zone/server status table, probe success, and probe latency.
315 lines
10 KiB
JSON
315 lines
10 KiB
JSON
{
|
|
"uid": "cnx-dns",
|
|
"title": "CNX DNS",
|
|
"tags": ["dns", "knot", "cnx"],
|
|
"timezone": "browser",
|
|
"schemaVersion": 39,
|
|
"version": 3,
|
|
"refresh": "30s",
|
|
"time": { "from": "now-6h", "to": "now" },
|
|
"templating": { "list": [] },
|
|
"annotations": { "list": [] },
|
|
"panels": [
|
|
{
|
|
"type": "row",
|
|
"title": "Alerts",
|
|
"id": 11,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
|
},
|
|
{
|
|
"type": "table",
|
|
"title": "Active alerts",
|
|
"description": "Firing vmalert alerts (the ALERTS series vmalert writes back to VictoriaMetrics). An empty table means all clear.",
|
|
"id": 12,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 7, "w": 24, "x": 0, "y": 1 },
|
|
"options": { "showHeader": true },
|
|
"fieldConfig": {
|
|
"defaults": { "custom": { "align": "auto" } },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "ALERTS{alertstate=\"firing\"}",
|
|
"format": "table",
|
|
"instant": true
|
|
}
|
|
],
|
|
"transformations": [
|
|
{
|
|
"id": "organize",
|
|
"options": {
|
|
"excludeByName": {
|
|
"Time": true,
|
|
"Value": true,
|
|
"__name__": true,
|
|
"alertstate": true
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "row",
|
|
"title": "DNS / Zones",
|
|
"id": 1,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }
|
|
},
|
|
{
|
|
"type": "table",
|
|
"title": "Zone SOA serial (per nameserver)",
|
|
"description": "ns1 and ns2 should report the same serial per zone. A divergence here is the secondary-out-of-sync condition.",
|
|
"id": 2,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
"options": { "showHeader": true },
|
|
"fieldConfig": {
|
|
"defaults": { "custom": { "align": "auto" } },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "knot_zone_serial",
|
|
"format": "table",
|
|
"instant": true,
|
|
"legendFormat": "{{zone}} @ {{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Seconds until zone expiry",
|
|
"description": "On secondaries this counts down between successful transfers; a steady decline toward zero means transfers are failing.",
|
|
"id": 3,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "knot_zone_status_expiration",
|
|
"legendFormat": "{{zone}} @ {{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Query rate by nameserver",
|
|
"id": 4,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
|
|
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "sum by (instance) (rate(knot_stats_request_protocol_total[5m]))",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Response codes",
|
|
"id": 5,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
|
|
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "sum by (type) (rate(knot_stats_response_code_total[5m]))",
|
|
"legendFormat": "{{type}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "row",
|
|
"title": "Hosts",
|
|
"id": 6,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "CPU busy %",
|
|
"id": 7,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Memory used %",
|
|
"id": 8,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Root filesystem used %",
|
|
"id": 9,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "100 * (1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Load average (1m)",
|
|
"id": 10,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 },
|
|
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "node_load1",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "row",
|
|
"title": "DNS probes (outside-in)",
|
|
"id": 20,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 }
|
|
},
|
|
{
  "type": "table",
  "title": "Probe status (per zone / server)",
  "description": "blackbox_exporter on control queries each nameserver's public address (v4 + v6) for every zone: an SOA query (zone served) and a DNSKEY query (still signed). UP = the resolver's-eye view is healthy.",
  "id": 21,
  "datasource": { "type": "prometheus", "uid": "victoriametrics" },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 },
  "options": { "showHeader": true },
  "fieldConfig": {
    "defaults": {
      "custom": { "align": "auto" },
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": null },
          { "color": "green", "value": 1 }
        ]
      },
      "mappings": [
        {
          "type": "value",
          "options": {
            "0": { "text": "DOWN", "index": 0 },
            "1": { "text": "UP", "index": 1 }
          }
        }
      ]
    },
    "overrides": [
      {
        "matcher": { "id": "byName", "options": "status" },
        "properties": [{ "id": "custom.cellOptions", "value": { "type": "color-background" } }]
      }
    ]
  },
  "targets": [
    {
      "refId": "A",
      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
      "expr": "probe_success",
      "format": "table",
      "instant": true
    }
  ],
  "transformations": [
    {
      "id": "organize",
      "options": {
        "excludeByName": { "Time": true, "__name__": true, "job": true },
        "renameByName": { "Value": "status" }
      }
    }
  ]
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Probe success (1 = ok)",
|
|
"description": "0 means the probe failed: the zone is not being served or not signed from that public address. Sustained failures fire DNSResolutionProbeFailed / DNSSECProbeFailed.",
|
|
"id": 22,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 43 },
|
|
"fieldConfig": { "defaults": { "unit": "short", "min": 0, "max": 1 }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "probe_success",
|
|
"legendFormat": "{{zone}} {{query}} @ {{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "DNS probe latency",
|
|
"description": "Total round-trip time of each blackbox DNS probe. A climbing trend points at a slow or overloaded nameserver before it starts failing outright.",
|
|
"id": 23,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 51 },
|
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "probe_duration_seconds",
|
|
"legendFormat": "{{zone}} {{query}} @ {{instance}}"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|