Add VictoriaMetrics + Grafana DNS monitoring over the mesh
The control host runs VictoriaMetrics (bound to loopback) and Grafana; every machine exports node metrics, and the nameservers additionally export Knot stats (mod-stats + knot-exporter). Remote scraping and the Grafana UI ride the ZeroTier mesh only, scoped by nftables to the mesh /88; the public side stays closed by the Hetzner cloud firewall. The provisioned DNS dashboard includes a per-zone SOA serial table to catch primary/secondary drift. ZeroTier ULAs are centralised in mesh-hosts.nix.
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
imports = [
|
||||
../../modules/hetzner-firewall.nix
|
||||
../../modules/static-ipv6.nix
|
||||
../../modules/monitoring/exporters.nix
|
||||
../../modules/monitoring/server.nix
|
||||
];
|
||||
|
||||
clan.core.sops.defaultGroups = [ "admins" ];
|
||||
|
||||
@@ -6,6 +6,7 @@ in
|
||||
imports = [
|
||||
../../modules/dns/authoritative.nix
|
||||
../../modules/static-ipv6.nix
|
||||
../../modules/monitoring/exporters.nix
|
||||
];
|
||||
|
||||
clan.core.sops.defaultGroups = [ "admins" ];
|
||||
|
||||
@@ -6,6 +6,7 @@ in
|
||||
imports = [
|
||||
../../modules/dns/authoritative.nix
|
||||
../../modules/static-ipv6.nix
|
||||
../../modules/monitoring/exporters.nix
|
||||
];
|
||||
|
||||
clan.core.sops.defaultGroups = [ "admins" ];
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
{ config, pkgs, ... }:
|
||||
let
|
||||
# ZeroTier addresses — zone transfers run over the mesh, not the public net.
|
||||
ns1zt = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
|
||||
ns2zt = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
|
||||
mesh = import ../mesh-hosts.nix;
|
||||
ns1zt = mesh.hosts.ns1;
|
||||
ns2zt = mesh.hosts.ns2;
|
||||
in
|
||||
{
|
||||
# Shared TSIG key, generated once and copied to every machine that imports
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# ZeroTier (clan mesh) addresses — the private IPv6 overlay every machine shares.
|
||||
# DNS zone transfers and metrics scraping ride this mesh, never the public net.
|
||||
rec {
|
||||
hosts = {
|
||||
control = "fd06:1bad:ece2:92ad:ba99:9306:1bad:ece2";
|
||||
ns1 = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
|
||||
ns2 = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
|
||||
};
|
||||
|
||||
# RFC 4193 /88 prefix of this ZeroTier network (fd + 8-byte network id + the
|
||||
# 0x9993 marker). Covers every mesh peer — servers and admin laptops alike —
|
||||
# and is used to scope mesh-only firewall rules.
|
||||
subnet = "fd06:1bad:ece2:92ad:ba99:9300::/88";
|
||||
}
|
||||
@@ -0,0 +1,171 @@
|
||||
{
|
||||
"uid": "cnx-dns",
|
||||
"title": "CNX DNS",
|
||||
"tags": ["dns", "knot", "cnx"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"templating": { "list": [] },
|
||||
"annotations": { "list": [] },
|
||||
"panels": [
|
||||
{
|
||||
"type": "row",
|
||||
"title": "DNS / Zones",
|
||||
"id": 1,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"title": "Zone SOA serial (per nameserver)",
|
||||
"description": "ns1 and ns2 should report the same serial per zone. A divergence here is the secondary-out-of-sync condition.",
|
||||
"id": 2,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
||||
"options": { "showHeader": true },
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "auto" } },
|
||||
"overrides": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "knot_zone_serial",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{zone}} @ {{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Seconds until zone expiry",
|
||||
"description": "On secondaries this counts down between successful transfers; a steady decline toward zero means transfers are failing.",
|
||||
"id": 3,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "knot_zone_status_expiration",
|
||||
"legendFormat": "{{zone}} @ {{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Query rate by nameserver",
|
||||
"id": 4,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
||||
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "sum by (instance) (rate(knot_stats_request_protocol_total[5m]))",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Response codes",
|
||||
"id": 5,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
||||
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "sum by (type) (rate(knot_stats_response_code_total[5m]))",
|
||||
"legendFormat": "{{type}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"title": "Hosts",
|
||||
"id": 6,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "CPU busy %",
|
||||
"id": 7,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||
"overrides": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Memory used %",
|
||||
"id": 8,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||
"overrides": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem used %",
|
||||
"id": 9,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||
"overrides": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "100 * (1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Load average (1m)",
|
||||
"id": 10,
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
|
||||
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||
"expr": "node_load1",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
# Metric exporters, imported by every machine. Host metrics everywhere; Knot DNS
|
||||
# metrics on the nameservers. Everything is reachable only over the ZeroTier mesh
|
||||
# (see the firewall rule at the bottom); the public side is already closed by the
|
||||
# Hetzner cloud firewall.
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
...
|
||||
}:
|
||||
let
|
||||
mesh = import ../mesh-hosts.nix;
|
||||
knotEnabled = config.services.knot.enable;
|
||||
# node_exporter on every host; knot-exporter only where Knot runs.
|
||||
ports = [ 9100 ] ++ lib.optional knotEnabled 9433;
|
||||
in
|
||||
{
|
||||
# extraInputRules (the mesh-scoped accept below) needs the nftables firewall
|
||||
# backend. allowedTCPPorts/allowedUDPPorts used elsewhere (53, 9993) translate unchanged.
|
||||
networking.nftables.enable = true;
|
||||
|
||||
# Host metrics: CPU, memory, disk, network, systemd unit state.
|
||||
services.prometheus.exporters.node = {
|
||||
enable = true;
|
||||
# Listen on all interfaces (incl. the v6 mesh). We deliberately do NOT bind
|
||||
# to the ZeroTier ULA: the node module renders --web.listen-address without
|
||||
# IPv6 brackets, and binding a single ULA would also race ZeroTier bring-up
|
||||
# at boot. Reachability is constrained by the firewall rule instead.
|
||||
listenAddress = "";
|
||||
port = 9100;
|
||||
enabledCollectors = [ "systemd" ];
|
||||
};
|
||||
|
||||
# Knot DNS metrics. The exporter reads Knot's control socket, so it runs as the
|
||||
# knot user; mod-stats (below) populates the query/response counters it exports.
|
||||
systemd.services.knot-exporter = lib.mkIf knotEnabled {
|
||||
description = "Prometheus exporter for Knot DNS";
|
||||
after = [ "knot.service" ];
|
||||
wants = [ "knot.service" ];
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
serviceConfig = {
|
||||
ExecStart = lib.concatStringsSep " " [
|
||||
"${pkgs.prometheus-knot-exporter}/bin/knot-exporter"
|
||||
"--web-listen-addr ::"
|
||||
"--web-listen-port 9433"
|
||||
"--knot-library-path ${pkgs.knot-dns.out}/lib/libknot.so"
|
||||
"--knot-socket-path /run/knot/knot.sock"
|
||||
];
|
||||
User = "knot";
|
||||
Group = "knot";
|
||||
Restart = "on-failure";
|
||||
RestartSec = 5;
|
||||
NoNewPrivileges = true;
|
||||
ProtectSystem = "strict";
|
||||
ProtectHome = true;
|
||||
PrivateTmp = true;
|
||||
ProtectKernelTunables = true;
|
||||
ProtectControlGroups = true;
|
||||
RestrictAddressFamilies = [
|
||||
"AF_INET"
|
||||
"AF_INET6"
|
||||
"AF_UNIX"
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
# mod-stats: per-query/zone counters exposed over the control socket and read
|
||||
# by knot-exporter. Loaded as a global module on the default template so it
|
||||
# applies to every zone. (Merges with the zone/acl/policy settings elsewhere.)
|
||||
services.knot.settings = lib.mkIf knotEnabled {
|
||||
"mod-stats" = [
|
||||
{
|
||||
id = "default";
|
||||
"request-protocol" = true;
|
||||
"server-operation" = true;
|
||||
"response-code" = true;
|
||||
"query-type" = true;
|
||||
"reply-nodata" = true;
|
||||
}
|
||||
];
|
||||
template = [
|
||||
{
|
||||
id = "default";
|
||||
global-module = [ "mod-stats/default" ];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
# Scrape ports reachable only from the ZeroTier mesh.
|
||||
networking.firewall.extraInputRules = ''
|
||||
ip6 saddr ${mesh.subnet} tcp dport { ${lib.concatMapStringsSep ", " toString ports} } accept
|
||||
'';
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
# Monitoring server, imported by control only: VictoriaMetrics (TSDB + scraper)
|
||||
# and Grafana. VictoriaMetrics binds loopback (only Grafana, on the same host,
|
||||
# reads it). Grafana is reachable over the ZeroTier mesh, scoped by the firewall
|
||||
# rule at the bottom; the Hetzner cloud firewall keeps it off the public net.
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
...
|
||||
}:
|
||||
let
|
||||
mesh = import ../mesh-hosts.nix;
|
||||
vmPort = 8428;
|
||||
grafanaPort = 3000;
|
||||
controlV6 = mesh.hosts.control;
|
||||
|
||||
# A single scrape target with a friendly instance label. IPv6 mesh addresses
|
||||
# must be bracketed (use the v6 helper below) for Prometheus-style host:port targets.
|
||||
target = name: addr: port: {
|
||||
targets = [ "${addr}:${toString port}" ];
|
||||
labels.instance = name;
|
||||
};
|
||||
v6 = addr: "[${addr}]";
|
||||
|
||||
adminPasswordFile = config.clan.core.vars.generators.grafana-admin.files."password".path;
|
||||
in
|
||||
{
|
||||
services.victoriametrics = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1:${toString vmPort}";
|
||||
retentionPeriod = "180d";
|
||||
prometheusConfig = {
|
||||
global.scrape_interval = "30s";
|
||||
scrape_configs = [
|
||||
{
|
||||
job_name = "node";
|
||||
static_configs = [
|
||||
# control scrapes its own node_exporter over loopback so host metrics
|
||||
# survive even if the mesh is down; ns1/ns2 are scraped over the mesh.
|
||||
(target "control" "127.0.0.1" 9100)
|
||||
(target "ns1" (v6 mesh.hosts.ns1) 9100)
|
||||
(target "ns2" (v6 mesh.hosts.ns2) 9100)
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "knot";
|
||||
static_configs = [
|
||||
(target "ns1" (v6 mesh.hosts.ns1) 9433)
|
||||
(target "ns2" (v6 mesh.hosts.ns2) 9433)
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
# Admin password generated once and stored as a clan secret. Retrieve with:
|
||||
# clan vars get control grafana-admin/password
|
||||
clan.core.vars.generators.grafana-admin = {
|
||||
files."password" = {
|
||||
secret = true;
|
||||
owner = "grafana";
|
||||
group = "grafana";
|
||||
};
|
||||
runtimeInputs = [ pkgs.openssl ];
|
||||
script = ''
|
||||
openssl rand -base64 24 | tr -d "\n" > "$out"/password
|
||||
'';
|
||||
};
|
||||
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
settings = {
|
||||
server = {
|
||||
http_addr = "::";
|
||||
http_port = grafanaPort;
|
||||
root_url = "http://${v6 controlV6}:${toString grafanaPort}/";
|
||||
};
|
||||
security = {
|
||||
admin_user = "admin";
|
||||
admin_password = "$__file{${adminPasswordFile}}";
|
||||
};
|
||||
"auth.anonymous".enabled = false;
|
||||
users.allow_sign_up = false;
|
||||
};
|
||||
provision = {
|
||||
enable = true;
|
||||
datasources.settings = {
|
||||
apiVersion = 1;
|
||||
datasources = [
|
||||
{
|
||||
name = "VictoriaMetrics";
|
||||
type = "prometheus";
|
||||
uid = "victoriametrics";
|
||||
access = "proxy";
|
||||
url = "http://127.0.0.1:${toString vmPort}";
|
||||
isDefault = true;
|
||||
}
|
||||
];
|
||||
};
|
||||
dashboards.settings = {
|
||||
apiVersion = 1;
|
||||
providers = [
|
||||
{
|
||||
name = "cnx";
|
||||
options.path = ./dashboards;
|
||||
options.foldersFromFilesStructure = false;
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
# Grafana reachable only from the ZeroTier mesh (admin laptops + servers).
|
||||
networking.firewall.extraInputRules = ''
|
||||
ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept
|
||||
'';
|
||||
}
|
||||
Reference in New Issue
Block a user