diff --git a/TODO.md b/TODO.md index 52b6117..a7309c2 100644 --- a/TODO.md +++ b/TODO.md @@ -57,7 +57,13 @@ ns1/ns2: SOA + DNSKEY succeed on both servers over v4 and v6. - [ ] A secondary nameserver on a different provider/network so a single-provider outage doesn't take all authoritative DNS down (architectural — new machine) -## 5. Centralized logs +## 5. Centralized logs (done — pending deploy) -- [ ] VictoriaLogs on control to grep journald across all three hosts, pairing +VictoriaLogs on control (`:9428`, 30d retention, mesh-scoped) in +`modules/monitoring/server.nix`, plus a VictoriaLogs Grafana datasource. All +three hosts ship journald with systemd's own `services.journald.upload` to the +`/insert/journald` endpoint (`modules/monitoring/exporters.nix`) — no extra +agent. control uploads over loopback; ns1/ns2 over the mesh. + +- [x] VictoriaLogs on control to grep journald across all three hosts, pairing with the existing VictoriaMetrics setup diff --git a/docs/src/monitoring.md b/docs/src/monitoring.md index e93c296..675974a 100644 --- a/docs/src/monitoring.md +++ b/docs/src/monitoring.md @@ -42,6 +42,25 @@ Dashboards are provisioned from `modules/monitoring/dashboards/` (any JSON file there is picked up): - **CNX DNS** (`dns.json`) — firing alerts, per-nameserver SOA serials, zone - expiry countdowns, query/response rates, and host CPU/memory/disk/load. + expiry countdowns, query/response rates, host CPU/memory/disk/load, and the + outside-in DNS probes. - **CNX Backups** (`backups.json`) — borgbackup job health, time since the last run, and per-job state. See [Backups](./backups.md). + +## Logs + +**VictoriaLogs** on `control` (`:9428`), 30-day retention +(`modules/monitoring/server.nix`). All three hosts ship journald to it via +systemd's own `services.journald.upload` → the `/insert/journald` endpoint +(`modules/monitoring/exporters.nix`); no extra agent. 
`control` uploads over +loopback so its logs survive a mesh outage, `ns1`/`ns2` push over the mesh, and +9428 is firewall-scoped to the mesh like everything else. + +Query logs from Grafana via the provisioned **VictoriaLogs** datasource (Explore +view, LogsQL), or directly in the built-in UI at `http://[control]:9428/select/vmui`. +Logs are tagged with `_HOSTNAME` and `_SYSTEMD_UNIT`, so to follow one service +across hosts: + +``` +_SYSTEMD_UNIT:"knot.service" +``` diff --git a/modules/monitoring/exporters.nix b/modules/monitoring/exporters.nix index 2877923..40d564c 100644 --- a/modules/monitoring/exporters.nix +++ b/modules/monitoring/exporters.nix @@ -1,5 +1,6 @@ -# Metric exporters, imported by every machine. Host metrics everywhere; Knot DNS -# metrics on the nameservers. Everything is reachable only over the ZeroTier mesh +# Per-host observability agents, imported by every machine. Host metrics +# everywhere; Knot DNS metrics on the nameservers; journald shipped to +# VictoriaLogs on control. Everything is reachable only over the ZeroTier mesh # (see the firewall rule at the bottom); the public side is already closed by the # Hetzner cloud firewall. { @@ -86,6 +87,22 @@ in ]; }; + # Ship journald to VictoriaLogs on control (services.victorialogs in + # server.nix). control uploads to loopback so its own logs survive a mesh + # outage; ns1/ns2 push over the mesh to control's ZeroTier address. + services.journald.upload = { + enable = true; + settings.Upload.URL = + let + dest = + if config.networking.hostName == "control" then + "127.0.0.1:9428" + else + "[${mesh.hosts.control}]:9428"; + in + "http://${dest}/insert/journald"; + }; + # Scrape ports reachable only from the ZeroTier mesh. 
networking.firewall.extraInputRules = '' ip6 saddr ${mesh.subnet} tcp dport { ${lib.concatMapStringsSep ", " toString ports} } accept diff --git a/modules/monitoring/server.nix b/modules/monitoring/server.nix index c49d759..3eae4e0 100644 --- a/modules/monitoring/server.nix +++ b/modules/monitoring/server.nix @@ -12,6 +12,7 @@ let mesh = import ../mesh-hosts.nix { inherit config lib; }; probes = import ./blackbox-probes.nix { inherit lib; }; vmPort = 8428; + logsPort = 9428; grafanaPort = 3000; controlV6 = mesh.hosts.control; @@ -60,6 +61,17 @@ in }; }; + # Centralized logs: VictoriaLogs ingests journald from all three hosts, each + # of which runs systemd-journal-upload against /insert/journald (exporters.nix). + # Binds all interfaces because ns1/ns2 push over the mesh; the firewall rule at + # the bottom scopes 9428 to the mesh subnet and the Hetzner firewall closes the + # public side. Retention is set via extraOptions (no dedicated NixOS option). + services.victorialogs = { + enable = true; + listenAddress = ":${toString logsPort}"; + extraOptions = [ "-retentionPeriod=30d" ]; + }; + # Admin password generated once and stored as a clan secret. Retrieve with: # clan vars get control grafana-admin/password clan.core.vars.generators.grafana-admin = { @@ -76,6 +88,9 @@ in services.grafana = { enable = true; + # VictoriaLogs datasource plugin so journald is greppable from Grafana, + # alongside the metrics datasource. + declarativePlugins = [ pkgs.grafanaPlugins.victoriametrics-logs-datasource ]; settings = { server = { http_addr = "::"; @@ -102,6 +117,13 @@ in url = "http://127.0.0.1:${toString vmPort}"; isDefault = true; } + { + name = "VictoriaLogs"; + type = "victoriametrics-logs-datasource"; + uid = "victorialogs"; + access = "proxy"; + url = "http://127.0.0.1:${toString logsPort}"; + } ]; }; dashboards.settings = { @@ -117,8 +139,9 @@ in }; }; - # Grafana reachable only from the ZeroTier mesh (admin laptops + servers). 
+ # Grafana (admin laptops + servers) and VictoriaLogs (journald ingestion from + # ns1/ns2 plus the query API/vmui) reachable only from the ZeroTier mesh. networking.firewall.extraInputRules = '' - ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept + ip6 saddr ${mesh.subnet} tcp dport { ${toString grafanaPort}, ${toString logsPort} } accept ''; }