Add VictoriaMetrics + Grafana DNS monitoring over the mesh

control runs VictoriaMetrics (loopback) and Grafana; every machine exports
node metrics and the nameservers export Knot stats (mod-stats + knot-exporter).
Scraping and the Grafana UI ride the ZeroTier mesh only, scoped by nftables to
the mesh /88; the public side stays closed by the Hetzner cloud firewall. The
provisioned DNS dashboard includes a per-zone SOA serial table to catch
primary/secondary drift. ZeroTier ULAs are centralised in mesh-hosts.nix.
This commit is contained in:
Berwn
2026-06-17 10:17:27 +07:00
parent 63446173bc
commit 33ac7e106b
8 changed files with 402 additions and 2 deletions
+2
View File
@@ -2,6 +2,8 @@
imports = [
../../modules/hetzner-firewall.nix
../../modules/static-ipv6.nix
../../modules/monitoring/exporters.nix
../../modules/monitoring/server.nix
];
clan.core.sops.defaultGroups = [ "admins" ];
+1
View File
@@ -6,6 +6,7 @@ in
imports = [
../../modules/dns/authoritative.nix
../../modules/static-ipv6.nix
../../modules/monitoring/exporters.nix
];
clan.core.sops.defaultGroups = [ "admins" ];
+1
View File
@@ -6,6 +6,7 @@ in
imports = [
../../modules/dns/authoritative.nix
../../modules/static-ipv6.nix
../../modules/monitoring/exporters.nix
];
clan.core.sops.defaultGroups = [ "admins" ];
+3 -2
View File
@@ -1,8 +1,9 @@
{ config, pkgs, ... }:
let
# ZeroTier addresses — zone transfers run over the mesh, not the public net.
ns1zt = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
ns2zt = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
mesh = import ../mesh-hosts.nix;
ns1zt = mesh.hosts.ns1;
ns2zt = mesh.hosts.ns2;
in
{
# Shared TSIG key, generated once and copied to every machine that imports
+14
View File
@@ -0,0 +1,14 @@
# ZeroTier (clan mesh) addresses — the private IPv6 overlay every machine shares.
# DNS zone transfers and metrics scraping ride this mesh, never the public net.
{
  # Mesh (ZeroTier) IPv6 address of each machine, keyed by machine name.
  hosts.control = "fd06:1bad:ece2:92ad:ba99:9306:1bad:ece2";
  hosts.ns1 = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
  hosts.ns2 = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";

  # The network-wide RFC 4193 prefix: "fd", then the 8-byte ZeroTier network id,
  # then the 0x9993 marker, which adds up to 88 bits. Every mesh peer (servers
  # and admin laptops alike) falls inside it, so it is the value used to scope
  # mesh-only firewall rules.
  subnet = "fd06:1bad:ece2:92ad:ba99:9300::/88";
}
+171
View File
@@ -0,0 +1,171 @@
{
"uid": "cnx-dns",
"title": "CNX DNS",
"tags": ["dns", "knot", "cnx"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"templating": { "list": [] },
"annotations": { "list": [] },
"panels": [
{
"type": "row",
"title": "DNS / Zones",
"id": 1,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
},
{
"type": "table",
"title": "Zone SOA serial (per nameserver)",
"description": "ns1 and ns2 should report the same serial per zone. A divergence here is the secondary-out-of-sync condition.",
"id": 2,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
"options": { "showHeader": true },
"fieldConfig": {
"defaults": { "custom": { "align": "auto" } },
"overrides": []
},
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "knot_zone_serial",
"format": "table",
"instant": true,
"legendFormat": "{{zone}} @ {{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Seconds until zone expiry",
"description": "On secondaries this counts down between successful transfers; a steady decline toward zero means transfers are failing.",
"id": 3,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "knot_zone_status_expiration",
"legendFormat": "{{zone}} @ {{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Query rate by nameserver",
"id": 4,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "sum by (instance) (rate(knot_stats_request_protocol_total[5m]))",
"legendFormat": "{{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Response codes",
"id": 5,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "sum by (type) (rate(knot_stats_response_code_total[5m]))",
"legendFormat": "{{type}}"
}
]
},
{
"type": "row",
"title": "Hosts",
"id": 6,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
},
{
"type": "timeseries",
"title": "CPU busy %",
"id": 7,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
"fieldConfig": {
"defaults": { "unit": "percent", "min": 0, "max": 100 },
"overrides": []
},
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Memory used %",
"id": 8,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
"fieldConfig": {
"defaults": { "unit": "percent", "min": 0, "max": 100 },
"overrides": []
},
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
"legendFormat": "{{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Root filesystem used %",
"id": 9,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
"fieldConfig": {
"defaults": { "unit": "percent", "min": 0, "max": 100 },
"overrides": []
},
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "100 * (1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})",
"legendFormat": "{{instance}}"
}
]
},
{
"type": "timeseries",
"title": "Load average (1m)",
"id": 10,
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"targets": [
{
"refId": "A",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"expr": "node_load1",
"legendFormat": "{{instance}}"
}
]
}
]
}
+93
View File
@@ -0,0 +1,93 @@
# Metric exporters, imported by every machine. Host metrics everywhere; Knot DNS
# metrics on the nameservers. Everything is reachable only over the ZeroTier mesh
# (see the firewall rule at the bottom); the public side is already closed by the
# Hetzner cloud firewall.
{
config,
lib,
pkgs,
...
}:
let
  # Shared table of ZeroTier mesh addresses and the mesh prefix.
  mesh = import ../mesh-hosts.nix;

  # Whether this machine has the Knot DNS service enabled.
  knotEnabled = config.services.knot.enable;

  # TCP ports to open towards the mesh: 9100 (node_exporter) on every host,
  # plus 9433 (knot-exporter) only where Knot runs.
  ports =
    if knotEnabled then
      [
        9100
        9433
      ]
    else
      [ 9100 ];
in
{
# Switch the firewall to the nftables backend: extraInputRules (the mesh-scoped
# accept below) needs it. The allowedTCPPorts/allowedUDPPorts entries used
# elsewhere (53, 9993) translate unchanged.
networking.nftables.enable = true;
# node_exporter: host metrics (CPU, memory, disk, network) plus systemd unit
# state through the additional "systemd" collector.
services.prometheus.exporters.node.enable = true;
services.prometheus.exporters.node.port = 9100;
services.prometheus.exporters.node.enabledCollectors = [ "systemd" ];
# An empty address means "listen on every interface", including the v6 mesh.
# Binding to the ZeroTier ULA is avoided on purpose: the node module renders
# --web.listen-address without IPv6 brackets, and a single-ULA bind would also
# race ZeroTier bring-up at boot. The firewall rule restricts who can connect.
services.prometheus.exporters.node.listenAddress = "";
# knot-exporter: Prometheus exporter for Knot DNS, defined only on machines
# where Knot is enabled. It talks to Knot through the control socket, hence it
# runs as the knot user; the counters it exports are produced by mod-stats.
systemd.services.knot-exporter = lib.mkIf knotEnabled (
  let
    exporterBin = "${pkgs.prometheus-knot-exporter}/bin/knot-exporter";
    exporterPort = 9433;
    exporterFlags = [
      # Listen on all addresses; access is limited by the firewall.
      "--web-listen-addr ::"
      "--web-listen-port ${toString exporterPort}"
      "--knot-library-path ${pkgs.knot-dns.out}/lib/libknot.so"
      "--knot-socket-path /run/knot/knot.sock"
    ];
  in
  {
    description = "Prometheus exporter for Knot DNS";
    wantedBy = [ "multi-user.target" ];
    # Start after (and pull in) Knot, whose control socket the exporter reads.
    wants = [ "knot.service" ];
    after = [ "knot.service" ];
    serviceConfig = {
      ExecStart = lib.concatStringsSep " " ([ exporterBin ] ++ exporterFlags);

      # Identity and restart policy.
      User = "knot";
      Group = "knot";
      Restart = "on-failure";
      RestartSec = 5;

      # Sandboxing.
      NoNewPrivileges = true;
      PrivateTmp = true;
      ProtectHome = true;
      ProtectSystem = "strict";
      ProtectControlGroups = true;
      ProtectKernelTunables = true;
      # IP for the metrics listener, AF_UNIX for the Knot control socket.
      RestrictAddressFamilies = [
        "AF_INET"
        "AF_INET6"
        "AF_UNIX"
      ];
    };
  }
);
# Knot statistics module (mod-stats): per-query/zone counters that Knot exposes
# over its control socket, where knot-exporter picks them up. The module
# instance is attached to the default template as a global module so that it
# covers every zone. These settings merge with the zone/acl/policy settings
# defined elsewhere.
services.knot.settings = lib.mkIf knotEnabled (
  let
    # Identifier of the mod-stats instance, referenced from the template.
    statsId = "default";
  in
  {
    mod-stats = [
      {
        id = statsId;
        request-protocol = true;
        server-operation = true;
        response-code = true;
        query-type = true;
        reply-nodata = true;
      }
    ];
    template = [
      {
        id = "default";
        global-module = [ "mod-stats/${statsId}" ];
      }
    ];
  }
);
# Accept connections to the exporter ports only when the source address lies
# inside the ZeroTier mesh prefix.
networking.firewall.extraInputRules =
  let
    # Comma-separated port list for the nftables anonymous set, e.g. "9100, 9433".
    portSet = builtins.concatStringsSep ", " (map toString ports);
  in
  ''
    ip6 saddr ${mesh.subnet} tcp dport { ${portSet} } accept
  '';
}
+117
View File
@@ -0,0 +1,117 @@
# Monitoring server, imported by control only: VictoriaMetrics (TSDB + scraper)
# and Grafana. VictoriaMetrics binds loopback (only Grafana, on the same host,
# reads it). Grafana is reachable over the ZeroTier mesh, scoped by the firewall
# rule at the bottom; the Hetzner cloud firewall keeps it off the public net.
{
config,
lib,
pkgs,
...
}:
let
  # Shared table of ZeroTier mesh addresses and the mesh prefix.
  mesh = import ../mesh-hosts.nix;

  # Listening ports: VictoriaMetrics HTTP API and the Grafana UI.
  vmPort = 8428;
  grafanaPort = 3000;

  # This machine's own mesh address; used to build Grafana's root_url.
  controlV6 = mesh.hosts.control;

  # Wrap an IPv6 literal in brackets. Mesh addresses must be bracketed before
  # a ":port" suffix is appended (Prometheus-style targets, URLs).
  v6 = addr: "[" + addr + "]";

  # Build one static_configs entry: a single "addr:port" scrape target whose
  # `instance` label is the friendly machine name instead of the raw address.
  target = name: addr: port: {
    labels = {
      instance = name;
    };
    targets = [ (addr + ":" + toString port) ];
  };

  # Path of the generated Grafana admin password (clan vars secret).
  adminPasswordFile = config.clan.core.vars.generators.grafana-admin.files."password".path;
in
{
# VictoriaMetrics: time-series store and scraper. It listens on loopback only
# and scrapes the exporters listed in the embedded Prometheus-style config.
services.victoriametrics =
  let
    # The nameservers, scraped over the mesh by name.
    nameservers = [
      "ns1"
      "ns2"
    ];
    # Scrape target for a mesh host: bracketed mesh address plus the given port.
    meshTarget = port: name: target name (v6 mesh.hosts.${name}) port;
  in
  {
    enable = true;
    listenAddress = "127.0.0.1:${toString vmPort}";
    retentionPeriod = "180d";
    prometheusConfig = {
      global.scrape_interval = "30s";
      scrape_configs = [
        {
          # Host metrics (node_exporter, port 9100) from every machine.
          job_name = "node";
          static_configs =
            # control scrapes its own node_exporter over loopback so host
            # metrics survive even if the mesh is down; ns1/ns2 go over the mesh.
            [ (target "control" "127.0.0.1" 9100) ] ++ map (meshTarget 9100) nameservers;
        }
        {
          # Knot DNS metrics (knot-exporter, port 9433) from the nameservers.
          job_name = "knot";
          static_configs = map (meshTarget 9433) nameservers;
        }
      ];
    };
  };
# Grafana admin password: a clan vars generator writes it to a secret file
# owned by the grafana user. To read it back:
#   clan vars get control grafana-admin/password
clan.core.vars.generators.grafana-admin = {
  runtimeInputs = [ pkgs.openssl ];
  # 24 random bytes, base64-encoded, with the trailing newline stripped.
  script = ''
    openssl rand -base64 24 | tr -d "\n" > "$out"/password
  '';
  files.password.secret = true;
  files.password.owner = "grafana";
  files.password.group = "grafana";
};
# Grafana: UI served on all addresses at grafanaPort (firewalled elsewhere),
# with the datasource and dashboards provisioned declaratively.
services.grafana.enable = true;

services.grafana.settings = {
  server = {
    http_addr = "::";
    http_port = grafanaPort;
    # Public URL as seen by mesh clients: control's bracketed mesh address.
    root_url = "http://${v6 controlV6}:${toString grafanaPort}/";
  };
  security = {
    admin_user = "admin";
    # $__file{...} is Grafana's file-expansion syntax: the password is read
    # from the generated secret file instead of being embedded in the config.
    admin_password = "$__file{${adminPasswordFile}}";
  };
  # No anonymous access and no self-service account creation.
  "auth.anonymous" = {
    enabled = false;
  };
  users = {
    allow_sign_up = false;
  };
};

services.grafana.provision = {
  enable = true;

  # Single default datasource: the local VictoriaMetrics instance, queried
  # through Grafana's backend ("proxy") using the Prometheus datasource type.
  # The uid is the one referenced by the provisioned dashboards.
  datasources.settings = {
    apiVersion = 1;
    datasources = [
      {
        uid = "victoriametrics";
        name = "VictoriaMetrics";
        type = "prometheus";
        access = "proxy";
        url = "http://127.0.0.1:${toString vmPort}";
        isDefault = true;
      }
    ];
  };

  # Dashboards are loaded from the ./dashboards directory next to this module.
  dashboards.settings = {
    apiVersion = 1;
    providers = [
      {
        name = "cnx";
        options = {
          path = ./dashboards;
          foldersFromFilesStructure = false;
        };
      }
    ];
  };
};
# extraInputRules is only supported by the nftables firewall backend. Enable it
# here as well, so this module does not depend on another module having
# switched the backend; an identical `true` definition elsewhere merges cleanly.
networking.nftables.enable = true;

# Grafana reachable only from the ZeroTier mesh (admin laptops + servers):
# accept TCP to the Grafana port when the source lies inside the mesh prefix.
networking.firewall.extraInputRules = ''
  ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept
'';
}