Add VictoriaMetrics + Grafana DNS monitoring over the mesh
The control host runs VictoriaMetrics (bound to loopback) and Grafana; every machine exports node metrics, and the nameservers additionally export Knot stats (mod-stats + knot-exporter). Scraping of ns1/ns2 and access to the Grafana UI ride the ZeroTier mesh only, scoped by nftables to the mesh /88 prefix (control scrapes its own node_exporter over loopback); the public side stays closed by the Hetzner cloud firewall. The provisioned DNS dashboard includes a per-zone SOA serial table to catch primary/secondary drift. ZeroTier ULAs are centralised in mesh-hosts.nix.
This commit is contained in:
@@ -2,6 +2,8 @@
|
|||||||
imports = [
|
imports = [
|
||||||
../../modules/hetzner-firewall.nix
|
../../modules/hetzner-firewall.nix
|
||||||
../../modules/static-ipv6.nix
|
../../modules/static-ipv6.nix
|
||||||
|
../../modules/monitoring/exporters.nix
|
||||||
|
../../modules/monitoring/server.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
clan.core.sops.defaultGroups = [ "admins" ];
|
clan.core.sops.defaultGroups = [ "admins" ];
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ in
|
|||||||
imports = [
|
imports = [
|
||||||
../../modules/dns/authoritative.nix
|
../../modules/dns/authoritative.nix
|
||||||
../../modules/static-ipv6.nix
|
../../modules/static-ipv6.nix
|
||||||
|
../../modules/monitoring/exporters.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
clan.core.sops.defaultGroups = [ "admins" ];
|
clan.core.sops.defaultGroups = [ "admins" ];
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ in
|
|||||||
imports = [
|
imports = [
|
||||||
../../modules/dns/authoritative.nix
|
../../modules/dns/authoritative.nix
|
||||||
../../modules/static-ipv6.nix
|
../../modules/static-ipv6.nix
|
||||||
|
../../modules/monitoring/exporters.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
clan.core.sops.defaultGroups = [ "admins" ];
|
clan.core.sops.defaultGroups = [ "admins" ];
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
{ config, pkgs, ... }:
|
{ config, pkgs, ... }:
|
||||||
let
|
let
|
||||||
# ZeroTier addresses — zone transfers run over the mesh, not the public net.
|
# ZeroTier addresses — zone transfers run over the mesh, not the public net.
|
||||||
ns1zt = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
|
mesh = import ../mesh-hosts.nix;
|
||||||
ns2zt = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
|
ns1zt = mesh.hosts.ns1;
|
||||||
|
ns2zt = mesh.hosts.ns2;
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
# Shared TSIG key, generated once and copied to every machine that imports
|
# Shared TSIG key, generated once and copied to every machine that imports
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
# ZeroTier (clan mesh) addresses — the private IPv6 overlay every machine shares.
# DNS zone transfers and metrics scraping ride this mesh, never the public net.
#
# This is a plain attribute set: no attribute below refers to a sibling, so the
# set does not need to be `rec`.
{
  # ZeroTier ULA of each machine, keyed by its short host name.
  hosts = {
    control = "fd06:1bad:ece2:92ad:ba99:9306:1bad:ece2";
    ns1 = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
    ns2 = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
  };

  # RFC 4193 /88 prefix of this ZeroTier network (fd + 8-byte network id + the
  # 0x9993 marker). Covers every mesh peer — servers and admin laptops alike —
  # and is used to scope mesh-only firewall rules.
  subnet = "fd06:1bad:ece2:92ad:ba99:9300::/88";
}
|
||||||
@@ -0,0 +1,171 @@
|
|||||||
|
{
|
||||||
|
"uid": "cnx-dns",
|
||||||
|
"title": "CNX DNS",
|
||||||
|
"tags": ["dns", "knot", "cnx"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-6h", "to": "now" },
|
||||||
|
"templating": { "list": [] },
|
||||||
|
"annotations": { "list": [] },
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"title": "DNS / Zones",
|
||||||
|
"id": 1,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "table",
|
||||||
|
"title": "Zone SOA serial (per nameserver)",
|
||||||
|
"description": "ns1 and ns2 should report the same serial per zone. A divergence here is the secondary-out-of-sync condition.",
|
||||||
|
"id": 2,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
||||||
|
"options": { "showHeader": true },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "auto" } },
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "knot_zone_serial",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": "{{zone}} @ {{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Seconds until zone expiry",
|
||||||
|
"description": "On secondaries this counts down between successful transfers; a steady decline toward zero means transfers are failing.",
|
||||||
|
"id": 3,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
||||||
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "knot_zone_status_expiration",
|
||||||
|
"legendFormat": "{{zone}} @ {{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Query rate by nameserver",
|
||||||
|
"id": 4,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
||||||
|
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "sum by (instance) (rate(knot_stats_request_protocol_total[5m]))",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Response codes",
|
||||||
|
"id": 5,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
||||||
|
"fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "sum by (type) (rate(knot_stats_response_code_total[5m]))",
|
||||||
|
"legendFormat": "{{type}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"title": "Hosts",
|
||||||
|
"id": 6,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU busy %",
|
||||||
|
"id": 7,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory used %",
|
||||||
|
"id": 8,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Root filesystem used %",
|
||||||
|
"id": 9,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "percent", "min": 0, "max": 100 },
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "100 * (1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Load average (1m)",
|
||||||
|
"id": 10,
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
|
||||||
|
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"expr": "node_load1",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,93 @@
|
|||||||
|
# Metric exporters shared by every machine: node_exporter on all hosts, plus
# knot-exporter and Knot's mod-stats wherever Knot is enabled. The scrape ports
# are opened to the ZeroTier mesh prefix only (firewall rule below); the public
# side is already closed by the Hetzner cloud firewall.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  meshNet = (import ../mesh-hosts.nix).subnet;
  hasKnot = config.services.knot.enable;

  nodePort = 9100;
  knotPort = 9433;

  # node_exporter is scraped on every host, knot-exporter only where Knot runs.
  scrapePorts = [ nodePort ] ++ lib.optional hasKnot knotPort;
  scrapePortSet = lib.concatMapStringsSep ", " toString scrapePorts;

  # Full knot-exporter command line, joined with single spaces for ExecStart.
  knotExporterCmd = lib.concatStringsSep " " [
    "${pkgs.prometheus-knot-exporter}/bin/knot-exporter"
    "--web-listen-addr ::"
    "--web-listen-port ${toString knotPort}"
    "--knot-library-path ${pkgs.knot-dns.out}/lib/libknot.so"
    "--knot-socket-path /run/knot/knot.sock"
  ];
in
{
  networking = {
    # The mesh-scoped accept rule is written in nftables syntax, so the firewall
    # must use the nftables backend. allowedTCP/UDPPorts set elsewhere (53, 9993)
    # translate unchanged.
    nftables.enable = true;

    # Accept the scrape ports only from source addresses inside the mesh prefix.
    firewall.extraInputRules = ''
      ip6 saddr ${meshNet} tcp dport { ${scrapePortSet} } accept
    '';
  };

  services = {
    # Host metrics: CPU, memory, disk, network, systemd unit state.
    prometheus.exporters.node = {
      enable = true;
      port = nodePort;
      # An empty address means "all interfaces", including the v6 mesh. Binding
      # the ZeroTier ULA directly is avoided on purpose: the node module renders
      # --web.listen-address without IPv6 brackets, and a single-ULA bind would
      # race ZeroTier bring-up at boot. The firewall rule restricts reachability.
      listenAddress = "";
      enabledCollectors = [ "systemd" ];
    };

    # mod-stats supplies the per-query/zone counters that knot-exporter reads
    # over the control socket. It is attached to the default template as a
    # global module so it covers every zone; these settings merge with the
    # zone/acl/policy settings defined elsewhere.
    knot.settings = lib.mkIf hasKnot {
      mod-stats = [
        {
          id = "default";
          request-protocol = true;
          server-operation = true;
          response-code = true;
          query-type = true;
          reply-nodata = true;
        }
      ];
      template = [
        {
          id = "default";
          global-module = [ "mod-stats/default" ];
        }
      ];
    };
  };

  # Knot DNS metrics. The exporter talks to Knot's control socket, hence it runs
  # as the knot user and group.
  systemd.services.knot-exporter = lib.mkIf hasKnot {
    description = "Prometheus exporter for Knot DNS";
    wantedBy = [ "multi-user.target" ];
    wants = [ "knot.service" ];
    after = [ "knot.service" ];
    serviceConfig = {
      User = "knot";
      Group = "knot";
      ExecStart = knotExporterCmd;
      Restart = "on-failure";
      RestartSec = 5;

      # Sandboxing.
      NoNewPrivileges = true;
      PrivateTmp = true;
      ProtectHome = true;
      ProtectSystem = "strict";
      ProtectControlGroups = true;
      ProtectKernelTunables = true;
      RestrictAddressFamilies = [
        "AF_INET"
        "AF_INET6"
        "AF_UNIX"
      ];
    };
  };
}
|
||||||
@@ -0,0 +1,117 @@
|
|||||||
|
# Monitoring server (control only): VictoriaMetrics as TSDB + scraper, and
# Grafana as the UI. VictoriaMetrics listens on loopback because Grafana on the
# same host is its only reader. Grafana is opened to the ZeroTier mesh prefix by
# the firewall rule at the end; the Hetzner cloud firewall keeps it off the
# public net.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  mesh = import ../mesh-hosts.nix;
  inherit (mesh) hosts;

  vmPort = 8428;
  grafanaPort = 3000;

  # IPv6 literals have to be wrapped in brackets in host:port strings and URLs.
  bracket = addr: "[${addr}]";

  # One static scrape target carrying a friendly `instance` label.
  scrape = instance: host: port: {
    targets = [ "${host}:${toString port}" ];
    labels = {
      inherit instance;
    };
  };

  # Same, for a machine reached at its mesh address from mesh-hosts.nix.
  overMesh = name: port: scrape name (bracket hosts.${name}) port;

  grafanaPasswordPath = config.clan.core.vars.generators.grafana-admin.files.password.path;
in
{
  services.victoriametrics = {
    enable = true;
    listenAddress = "127.0.0.1:${toString vmPort}";
    retentionPeriod = "180d";
    prometheusConfig = {
      global.scrape_interval = "30s";
      scrape_configs = [
        {
          job_name = "node";
          static_configs = [
            # control reads its own node_exporter via loopback so its host
            # metrics survive a mesh outage; ns1/ns2 are scraped over the mesh.
            (scrape "control" "127.0.0.1" 9100)
            (overMesh "ns1" 9100)
            (overMesh "ns2" 9100)
          ];
        }
        {
          job_name = "knot";
          static_configs = map (name: overMesh name 9433) [
            "ns1"
            "ns2"
          ];
        }
      ];
    };
  };

  # Grafana admin password: generated once and kept as a clan secret owned by
  # the grafana user. Retrieve with:
  #   clan vars get control grafana-admin/password
  clan.core.vars.generators.grafana-admin = {
    runtimeInputs = [ pkgs.openssl ];
    files.password = {
      secret = true;
      owner = "grafana";
      group = "grafana";
    };
    script = ''
      openssl rand -base64 24 | tr -d "\n" > "$out"/password
    '';
  };

  services.grafana = {
    enable = true;

    settings = {
      server = {
        http_addr = "::";
        http_port = grafanaPort;
        root_url = "http://${bracket hosts.control}:${toString grafanaPort}/";
      };
      security = {
        admin_user = "admin";
        # Grafana's $__file{} provider reads the value from the secret file.
        admin_password = "$__file{${grafanaPasswordPath}}";
      };
      users.allow_sign_up = false;
      "auth.anonymous".enabled = false;
    };

    provision = {
      enable = true;

      # The datasource uid must match the one referenced by the dashboards.
      datasources.settings = {
        apiVersion = 1;
        datasources = [
          {
            name = "VictoriaMetrics";
            uid = "victoriametrics";
            type = "prometheus";
            access = "proxy";
            url = "http://127.0.0.1:${toString vmPort}";
            isDefault = true;
          }
        ];
      };

      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "cnx";
            options = {
              path = ./dashboards;
              foldersFromFilesStructure = false;
            };
          }
        ];
      };
    };
  };

  # Grafana is reachable only from the ZeroTier mesh (admin laptops + servers).
  networking.firewall.extraInputRules = ''
    ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept
  '';
}
|
||||||
Reference in New Issue
Block a user