diff --git a/machines/control/configuration.nix b/machines/control/configuration.nix
index 6f72146..805dbc8 100644
--- a/machines/control/configuration.nix
+++ b/machines/control/configuration.nix
@@ -2,6 +2,8 @@
   imports = [
     ../../modules/hetzner-firewall.nix
     ../../modules/static-ipv6.nix
+    ../../modules/monitoring/exporters.nix
+    ../../modules/monitoring/server.nix
   ];
 
   clan.core.sops.defaultGroups = [ "admins" ];
diff --git a/machines/ns1/configuration.nix b/machines/ns1/configuration.nix
index 82b2a5d..6de65c0 100644
--- a/machines/ns1/configuration.nix
+++ b/machines/ns1/configuration.nix
@@ -6,6 +6,7 @@ in
   imports = [
     ../../modules/dns/authoritative.nix
     ../../modules/static-ipv6.nix
+    ../../modules/monitoring/exporters.nix
   ];
 
   clan.core.sops.defaultGroups = [ "admins" ];
diff --git a/machines/ns2/configuration.nix b/machines/ns2/configuration.nix
index 0277a31..cd4dc59 100644
--- a/machines/ns2/configuration.nix
+++ b/machines/ns2/configuration.nix
@@ -6,6 +6,7 @@ in
   imports = [
     ../../modules/dns/authoritative.nix
     ../../modules/static-ipv6.nix
+    ../../modules/monitoring/exporters.nix
   ];
 
   clan.core.sops.defaultGroups = [ "admins" ];
diff --git a/modules/dns/authoritative.nix b/modules/dns/authoritative.nix
index 2c95f9f..f6d9d40 100644
--- a/modules/dns/authoritative.nix
+++ b/modules/dns/authoritative.nix
@@ -1,8 +1,9 @@
 { config, pkgs, ... }:
 let
   # ZeroTier addresses — zone transfers run over the mesh, not the public net.
-  ns1zt = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
-  ns2zt = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
+  mesh = import ../mesh-hosts.nix;
+  ns1zt = mesh.hosts.ns1;
+  ns2zt = mesh.hosts.ns2;
 in
 {
   # Shared TSIG key, generated once and copied to every machine that imports
diff --git a/modules/mesh-hosts.nix b/modules/mesh-hosts.nix
new file mode 100644
index 0000000..8696a38
--- /dev/null
+++ b/modules/mesh-hosts.nix
@@ -0,0 +1,14 @@
+# ZeroTier (clan mesh) addresses — the private IPv6 overlay every machine shares.
+# DNS zone transfers and metrics scraping ride this mesh, never the public net.
+rec {
+  hosts = {
+    control = "fd06:1bad:ece2:92ad:ba99:9306:1bad:ece2";
+    ns1 = "fd06:1bad:ece2:92ad:ba99:939d:766d:8974";
+    ns2 = "fd06:1bad:ece2:92ad:ba99:9323:61be:a09e";
+  };
+
+  # RFC 4193 /88 prefix of this ZeroTier network (fd + 8-byte network id + the
+  # 0x9993 marker). Covers every mesh peer — servers and admin laptops alike —
+  # and is used to scope mesh-only firewall rules.
+  subnet = "fd06:1bad:ece2:92ad:ba99:9300::/88";
+}
diff --git a/modules/monitoring/dashboards/dns.json b/modules/monitoring/dashboards/dns.json
new file mode 100644
index 0000000..d67e951
--- /dev/null
+++ b/modules/monitoring/dashboards/dns.json
@@ -0,0 +1,171 @@
+{
+  "uid": "cnx-dns",
+  "title": "CNX DNS",
+  "tags": ["dns", "knot", "cnx"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-6h", "to": "now" },
+  "templating": { "list": [] },
+  "annotations": { "list": [] },
+  "panels": [
+    {
+      "type": "row",
+      "title": "DNS / Zones",
+      "id": 1,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+    },
+    {
+      "type": "table",
+      "title": "Zone SOA serial (per nameserver)",
+      "description": "ns1 and ns2 should report the same serial per zone. A divergence here is the secondary-out-of-sync condition.",
+      "id": 2,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "options": { "showHeader": true },
+      "fieldConfig": {
+        "defaults": { "custom": { "align": "auto" } },
+        "overrides": []
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "knot_zone_serial",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "{{zone}} @ {{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Seconds until zone expiry",
+      "description": "On secondaries this counts down between successful transfers; a steady decline toward zero means transfers are failing.",
+      "id": 3,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "knot_zone_status_expiration",
+          "legendFormat": "{{zone}} @ {{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Query rate by nameserver",
+      "id": 4,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "sum by (instance) (rate(knot_stats_request_protocol_total[5m]))",
+          "legendFormat": "{{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Response codes",
+      "id": 5,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+      "fieldConfig": { "defaults": { "unit": "qps" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "sum by (type) (rate(knot_stats_response_code_total[5m]))",
+          "legendFormat": "{{type}}"
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Hosts",
+      "id": 6,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }
+    },
+    {
+      "type": "timeseries",
+      "title": "CPU busy %",
+      "id": 7,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
+      "fieldConfig": {
+        "defaults": { "unit": "percent", "min": 0, "max": 100 },
+        "overrides": []
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+          "legendFormat": "{{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Memory used %",
+      "id": 8,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
+      "fieldConfig": {
+        "defaults": { "unit": "percent", "min": 0, "max": 100 },
+        "overrides": []
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
+          "legendFormat": "{{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Root filesystem used %",
+      "id": 9,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
+      "fieldConfig": {
+        "defaults": { "unit": "percent", "min": 0, "max": 100 },
+        "overrides": []
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "100 * (1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})",
+          "legendFormat": "{{instance}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Load average (1m)",
+      "id": 10,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "node_load1",
+          "legendFormat": "{{instance}}"
+        }
+      ]
+    }
+  ]
+}
diff --git a/modules/monitoring/exporters.nix b/modules/monitoring/exporters.nix
new file mode 100644
index 0000000..a8a4869
--- /dev/null
+++ b/modules/monitoring/exporters.nix
@@ -0,0 +1,93 @@
+# Metric exporters, imported by every machine. Host metrics everywhere; Knot DNS
+# metrics on the nameservers. Everything is reachable only over the ZeroTier mesh
+# (see the firewall rule at the bottom); the public side is already closed by the
+# Hetzner cloud firewall.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  mesh = import ../mesh-hosts.nix;
+  knotEnabled = config.services.knot.enable;
+  # node_exporter on every host; knot-exporter only where Knot runs.
+  ports = [ 9100 ] ++ lib.optional knotEnabled 9433; # keep in sync with the listener ports set below
+in
+{
+  # extraInputRules (the mesh-scoped accept below) needs the nftables firewall
+  # backend. allowedTCP/UDPPorts used elsewhere (53, 9993) translate unchanged.
+  networking.nftables.enable = true;
+
+  # Host metrics: CPU, memory, disk, network, systemd unit state.
+  services.prometheus.exporters.node = {
+    enable = true;
+    # Listen on all interfaces (incl. the v6 mesh). We deliberately do NOT bind
+    # to the ZeroTier ULA: the node module renders --web.listen-address without
+    # IPv6 brackets, and binding a single ULA would also race ZeroTier bring-up
+    # at boot. Reachability is constrained by the firewall rule instead.
+    listenAddress = "";
+    port = 9100; # same literal as in `ports` above and the "node" scrape targets in server.nix
+    enabledCollectors = [ "systemd" ];
+  };
+
+  # Knot DNS metrics. The exporter reads Knot's control socket, so it runs as the
+  # knot user; mod-stats (below) populates the query/response counters it exports.
+  systemd.services.knot-exporter = lib.mkIf knotEnabled {
+    description = "Prometheus exporter for Knot DNS";
+    after = [ "knot.service" ];
+    wants = [ "knot.service" ];
+    wantedBy = [ "multi-user.target" ];
+    serviceConfig = {
+      ExecStart = lib.concatStringsSep " " [
+        "${pkgs.prometheus-knot-exporter}/bin/knot-exporter"
+        "--web-listen-addr ::"
+        "--web-listen-port 9433" # same literal as in `ports` above and the "knot" scrape targets in server.nix
+        "--knot-library-path ${pkgs.knot-dns.out}/lib/libknot.so"
+        "--knot-socket-path /run/knot/knot.sock"
+      ];
+      User = "knot";
+      Group = "knot";
+      Restart = "on-failure";
+      RestartSec = 5;
+      NoNewPrivileges = true;
+      ProtectSystem = "strict";
+      ProtectHome = true;
+      PrivateTmp = true;
+      ProtectKernelTunables = true;
+      ProtectControlGroups = true;
+      RestrictAddressFamilies = [
+        "AF_INET"
+        "AF_INET6"
+        "AF_UNIX"
+      ];
+    };
+  };
+
+  # mod-stats: per-query/zone counters exposed over the control socket and read
+  # by knot-exporter. Loaded as a global module on the default template so it
+  # applies to every zone. (Merges with the zone/acl/policy settings elsewhere.)
+  services.knot.settings = lib.mkIf knotEnabled {
+    "mod-stats" = [
+      {
+        id = "default";
+        "request-protocol" = true;
+        "server-operation" = true;
+        "response-code" = true;
+        "query-type" = true;
+        "reply-nodata" = true;
+      }
+    ];
+    template = [ # NOTE(review): confirm no other module also defines a template entry with id = "default"
+      {
+        id = "default";
+        global-module = [ "mod-stats/default" ];
+      }
+    ];
+  };
+
+  # Accept the scrape ports from source addresses inside the mesh prefix (mesh.subnet).
+  networking.firewall.extraInputRules = ''
+    ip6 saddr ${mesh.subnet} tcp dport { ${lib.concatMapStringsSep ", " toString ports} } accept
+  '';
+}
diff --git a/modules/monitoring/server.nix b/modules/monitoring/server.nix
new file mode 100644
index 0000000..37a2afa
--- /dev/null
+++ b/modules/monitoring/server.nix
@@ -0,0 +1,117 @@
+# Monitoring server, imported by control only: VictoriaMetrics (TSDB + scraper)
+# and Grafana. VictoriaMetrics binds loopback (only Grafana, on the same host,
+# reads it). Grafana is reachable over the ZeroTier mesh, scoped by the firewall
+# rule at the bottom; the Hetzner cloud firewall keeps it off the public net.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  mesh = import ../mesh-hosts.nix;
+  vmPort = 8428;
+  grafanaPort = 3000;
+  controlV6 = mesh.hosts.control; # control's mesh address; used for Grafana's root_url
+
+  # A single scrape target with a friendly instance label. `addr` is used
+  # verbatim, so IPv6 mesh addresses must be bracketed first (see `v6` below).
+  target = name: addr: port: {
+    targets = [ "${addr}:${toString port}" ];
+    labels.instance = name;
+  };
+  v6 = addr: "[${addr}]"; # wrap an IPv6 literal in brackets for host:port strings and URLs
+
+  adminPasswordFile = config.clan.core.vars.generators.grafana-admin.files."password".path;
+in
+{
+  services.victoriametrics = {
+    enable = true;
+    listenAddress = "127.0.0.1:${toString vmPort}";
+    retentionPeriod = "180d";
+    prometheusConfig = {
+      global.scrape_interval = "30s";
+      scrape_configs = [
+        {
+          job_name = "node";
+          static_configs = [
+            # control scrapes its own node_exporter over loopback so host metrics
+            # survive even if the mesh is down; ns1/ns2 are scraped over the mesh.
+            (target "control" "127.0.0.1" 9100)
+            (target "ns1" (v6 mesh.hosts.ns1) 9100)
+            (target "ns2" (v6 mesh.hosts.ns2) 9100)
+          ];
+        }
+        {
+          job_name = "knot";
+          static_configs = [
+            (target "ns1" (v6 mesh.hosts.ns1) 9433)
+            (target "ns2" (v6 mesh.hosts.ns2) 9433)
+          ];
+        }
+      ];
+    };
+  };
+
+  # Admin password generated once and stored as a clan secret. Retrieve with:
+  #   clan vars get control grafana-admin/password
+  clan.core.vars.generators.grafana-admin = {
+    files."password" = {
+      secret = true;
+      owner = "grafana";
+      group = "grafana";
+    };
+    runtimeInputs = [ pkgs.openssl ];
+    script = ''
+      openssl rand -base64 24 | tr -d "\n" > "$out"/password
+    '';
+  };
+
+  services.grafana = {
+    enable = true;
+    settings = {
+      server = {
+        http_addr = "::";
+        http_port = grafanaPort;
+        root_url = "http://${v6 controlV6}:${toString grafanaPort}/";
+      };
+      security = {
+        admin_user = "admin";
+        admin_password = "$__file{${adminPasswordFile}}";
+      };
+      "auth.anonymous".enabled = false;
+      users.allow_sign_up = false;
+    };
+    provision = {
+      enable = true;
+      datasources.settings = {
+        apiVersion = 1;
+        datasources = [
+          {
+            name = "VictoriaMetrics";
+            type = "prometheus";
+            uid = "victoriametrics";
+            access = "proxy";
+            url = "http://127.0.0.1:${toString vmPort}";
+            isDefault = true;
+          }
+        ];
+      };
+      dashboards.settings = {
+        apiVersion = 1;
+        providers = [
+          {
+            name = "cnx";
+            options.path = ./dashboards;
+            options.foldersFromFilesStructure = false;
+          }
+        ];
+      };
+    };
+  };
+
+  # Accept Grafana from the mesh prefix (admin laptops + servers); relies on the nftables backend enabled in exporters.nix.
+  networking.firewall.extraInputRules = ''
+    ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept
+  '';
+}