Files
cnx-network-clan/modules/monitoring/exporters.nix
T
Berwn 0eb883061b Keep systemd-journal-upload retrying instead of failing a deploy
The uploader exits when VictoriaLogs is unreachable. Upstream already sets
Restart=always/RestartSec=3sec, but the default start-rate limit lets the unit
give up permanently and trip switch-to-configuration when the sink is briefly
down. Disable the limit (startLimitIntervalSec=0) so logging stays best-effort
and never wedges a host or a deploy.
2026-06-17 17:09:30 +07:00

118 lines
4.1 KiB
Nix

# Per-host observability agents, imported by every machine. Host metrics
# everywhere; Knot DNS metrics on the nameservers; journald shipped to
# VictoriaLogs on control. Everything is reachable only over the ZeroTier mesh
# (see the firewall rule at the bottom); the public side is already closed by the
# Hetzner cloud firewall.
{
config,
lib,
pkgs,
...
}:
let
# True on hosts where the Knot DNS service is enabled.
knotEnabled = config.services.knot.enable;
# Mesh data loaded from ../mesh-hosts.nix; this module reads `hosts.control`
# and `subnet` from it.
mesh = import ../mesh-hosts.nix { inherit lib config; };
# Scrape ports to open: 9100 (node_exporter) on every host, plus 9433
# (knot-exporter) only where Knot runs.
ports = [ 9100 ] ++ lib.optionals knotEnabled [ 9433 ];
in
{
# The mesh-scoped accept rule in this module is written through
# extraInputRules, which requires the nftables firewall backend.
# allowedTCP/UDPPorts used elsewhere (53, 9993) translate unchanged.
networking.nftables = {
  enable = true;
};
# node_exporter: host metrics (CPU, memory, disk, network, systemd unit state).
services.prometheus.exporters.node = {
  enable = true;
  port = 9100;
  # Empty address = listen on all interfaces, including the v6 mesh. Binding to
  # the ZeroTier ULA is deliberately avoided: the node module renders
  # --web.listen-address without IPv6 brackets, and a single-ULA bind would
  # also race ZeroTier bring-up at boot. The firewall rule is what restricts
  # reachability.
  listenAddress = "";
  enabledCollectors = [ "systemd" ];
};
# Prometheus exporter for Knot DNS, defined only where Knot is enabled. It reads
# Knot's control socket and therefore runs as the knot user; the query/response
# counters it exports are populated by mod-stats (see services.knot.settings).
systemd.services.knot-exporter = lib.mkIf knotEnabled {
  description = "Prometheus exporter for Knot DNS";
  wantedBy = [ "multi-user.target" ];
  wants = [ "knot.service" ];
  after = [ "knot.service" ];
  serviceConfig =
    let
      exporterBin = "${pkgs.prometheus-knot-exporter}/bin/knot-exporter";
      libknot = "${pkgs.knot-dns.out}/lib/libknot.so";
    in
    {
      User = "knot";
      Group = "knot";
      # Each flag and its value are separate items, joined with single spaces
      # into one command line.
      ExecStart = lib.concatStringsSep " " [
        exporterBin
        "--web-listen-addr"
        "::"
        "--web-listen-port"
        "9433"
        "--knot-library-path"
        libknot
        "--knot-socket-path"
        "/run/knot/knot.sock"
      ];
      Restart = "on-failure";
      RestartSec = 5;
      # Sandboxing.
      NoNewPrivileges = true;
      PrivateTmp = true;
      ProtectHome = true;
      ProtectSystem = "strict";
      ProtectControlGroups = true;
      ProtectKernelTunables = true;
      RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" ];
    };
};
# mod-stats exposes per-query/zone counters over the control socket, where
# knot-exporter reads them. It is attached as a global module to the default
# template so that it applies to every zone. (These settings merge with the
# zone/acl/policy settings defined elsewhere.)
services.knot.settings = lib.mkIf knotEnabled {
  template = [
    {
      id = "default";
      global-module = [ "mod-stats/default" ];
    }
  ];
  mod-stats = [
    (
      { id = "default"; }
      # Every counter group listed here is switched on.
      // lib.genAttrs [
        "request-protocol"
        "server-operation"
        "response-code"
        "query-type"
        "reply-nodata"
      ] (_: true)
    )
  ];
};
# Forward journald to VictoriaLogs on control (services.victorialogs in
# server.nix). The control host targets loopback so its own logs survive a mesh
# outage; ns1/ns2 push over the mesh to control's ZeroTier address.
services.journald.upload = {
  enable = true;
  settings = {
    Upload.URL =
      let
        isControl = config.networking.hostName == "control";
        # host:port of the sink; the mesh address is IPv6, hence the brackets.
        sink = if isControl then "127.0.0.1:9428" else "[${mesh.hosts.control}]:9428";
      in
      "http://${sink}/insert/journald";
  };
};
# Logging is best-effort. systemd-journal-upload exits whenever the sink is
# unreachable; the upstream module already sets Restart=always/RestartSec=3sec,
# but the default start-rate limit (5 tries / 10s) would still let the unit give
# up permanently and fail a deploy while VictoriaLogs is briefly down. Setting
# the interval to 0 disables that limit, so the unit retries forever instead of
# wedging the host (or switch-to-configuration).
systemd.services.systemd-journal-upload = {
  startLimitIntervalSec = 0;
};
# Accept TCP to the scrape ports only when the source address is inside the
# ZeroTier mesh subnet.
networking.firewall.extraInputRules =
  let
    # Comma-separated port list for the nftables set literal.
    scrapePorts = lib.concatStringsSep ", " (map toString ports);
  in
  ''
    ip6 saddr ${mesh.subnet} tcp dport { ${scrapePorts} } accept
  '';
}