Add VictoriaLogs for centralized journald across all hosts

control runs VictoriaLogs (:9428, 30d, mesh-scoped) with a matching
Grafana datasource. Each host ships journald via systemd's own
journald.upload to the /insert/journald endpoint -- no extra agent.
control uploads over loopback so its logs survive a mesh outage; ns1
and ns2 push over the mesh.
This commit is contained in:
Berwn
2026-06-17 16:53:52 +07:00
parent c7b0f206c8
commit d4a171640b
4 changed files with 72 additions and 7 deletions
+8 -2
View File
@@ -57,7 +57,13 @@ ns1/ns2: SOA + DNSKEY succeed on both servers over v4 and v6.
- [ ] A secondary nameserver on a different provider/network so a single-provider
outage doesn't take all authoritative DNS down (architectural — new machine)
## 5. Centralized logs
## 5. Centralized logs (done — pending deploy)
- [ ] VictoriaLogs on control to grep journald across all three hosts, pairing
VictoriaLogs on control (`:9428`, 30d retention, mesh-scoped) in
`modules/monitoring/server.nix`, plus a VictoriaLogs Grafana datasource. All
three hosts ship journald with systemd's own `services.journald.upload` to the
`/insert/journald` endpoint (`modules/monitoring/exporters.nix`) — no extra
agent. control uploads over loopback; ns1/ns2 over the mesh.
- [x] VictoriaLogs on control to grep journald across all three hosts, pairing
with the existing VictoriaMetrics setup
+20 -1
View File
@@ -42,6 +42,25 @@ Dashboards are provisioned from `modules/monitoring/dashboards/` (any JSON file
there is picked up):
- **CNX DNS** (`dns.json`) — firing alerts, per-nameserver SOA serials, zone
expiry countdowns, query/response rates, and host CPU/memory/disk/load.
expiry countdowns, query/response rates, host CPU/memory/disk/load, and the
outside-in DNS probes.
- **CNX Backups** (`backups.json`) — borgbackup job health, time since the last
run, and per-job state. See [Backups](./backups.md).
## Logs
**VictoriaLogs** on `control` (`:9428`), 30-day retention
(`modules/monitoring/server.nix`). All three hosts ship journald to it via
systemd's own `services.journald.upload` → the `/insert/journald` endpoint
(`modules/monitoring/exporters.nix`); no extra agent. `control` uploads over
loopback so its logs survive a mesh outage, `ns1`/`ns2` push over the mesh, and
9428 is firewall-scoped to the mesh like everything else.
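Query logs from Grafana via the provisioned **VictoriaLogs** datasource (Explore
view, LogsQL), or directly in the built-in UI at `http://[control]:9428/select/vmui`
(replace `control` with control's ZeroTier mesh IPv6 address; the port is only
reachable from the mesh).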
Logs are tagged with `_HOSTNAME` and `_SYSTEMD_UNIT`, so to follow one service
across hosts:
```
_SYSTEMD_UNIT:"knot.service"
```
+19 -2
View File
@@ -1,5 +1,6 @@
# Metric exporters, imported by every machine. Host metrics everywhere; Knot DNS
# metrics on the nameservers. Everything is reachable only over the ZeroTier mesh
# Per-host observability agents, imported by every machine. Host metrics
# everywhere; Knot DNS metrics on the nameservers; journald pushed to
# VictoriaLogs on control. Scrape ports are reachable only over the ZeroTier mesh
# (see the firewall rule at the bottom); the public side is already closed by the
# Hetzner cloud firewall.
{
@@ -86,6 +87,22 @@ in
];
};
# Ship journald to VictoriaLogs on control (services.victorialogs in
# server.nix). control uploads to loopback so its own logs survive a mesh
# outage; ns1/ns2 push over the mesh to control's ZeroTier address.
services.journald.upload = {
  enable = true;
  # Destination for the journal upload: loopback when this machine is control,
  # otherwise control's mesh address (bracketed for use in a URL).
  settings.Upload.URL =
    let
      onControl = config.networking.hostName == "control";
      hostPort = if onControl then "127.0.0.1:9428" else "[${mesh.hosts.control}]:9428";
    in
    "http://${hostPort}/insert/journald";
};
# Scrape ports reachable only from the ZeroTier mesh.
networking.firewall.extraInputRules = ''
ip6 saddr ${mesh.subnet} tcp dport { ${lib.concatMapStringsSep ", " toString ports} } accept
+25 -2
View File
@@ -12,6 +12,7 @@ let
mesh = import ../mesh-hosts.nix { inherit config lib; };
probes = import ./blackbox-probes.nix { inherit lib; };
vmPort = 8428;
logsPort = 9428;
grafanaPort = 3000;
controlV6 = mesh.hosts.control;
@@ -60,6 +61,17 @@ in
};
};
# Centralized logs: VictoriaLogs ingests journald from all three hosts, each
# of which runs systemd-journal-upload against /insert/journald (exporters.nix).
# Binds all interfaces because ns1/ns2 push over the mesh; the firewall rule at
# the bottom scopes 9428 to the mesh subnet and the Hetzner firewall closes the
# public side. Retention is set via extraOptions (no dedicated NixOS option).
# Listen on logsPort with no host part in the address; retention is passed as
# a raw command-line flag.
services.victorialogs.enable = true;
services.victorialogs.listenAddress = ":" + toString logsPort;
services.victorialogs.extraOptions = [ "-retentionPeriod=30d" ];
# Admin password generated once and stored as a clan secret. Retrieve with:
# clan vars get control grafana-admin/password
clan.core.vars.generators.grafana-admin = {
@@ -76,6 +88,9 @@ in
services.grafana = {
enable = true;
# VictoriaLogs datasource plugin so journald is greppable from Grafana,
# alongside the metrics datasource.
declarativePlugins = [ pkgs.grafanaPlugins.victoriametrics-logs-datasource ];
settings = {
server = {
http_addr = "::";
@@ -102,6 +117,13 @@ in
url = "http://127.0.0.1:${toString vmPort}";
isDefault = true;
}
{
  # Logs datasource with a fixed uid, pointed at the VictoriaLogs listener on
  # loopback (same port as logsPort).
  uid = "victorialogs";
  name = "VictoriaLogs";
  type = "victoriametrics-logs-datasource";
  url = "http://127.0.0.1:" + toString logsPort;
  access = "proxy";
}
];
};
dashboards.settings = {
@@ -117,8 +139,9 @@ in
};
};
# Grafana reachable only from the ZeroTier mesh (admin laptops + servers).
# Grafana (admin laptops + servers) and VictoriaLogs ingestion (ns1/ns2 push
# journald over the mesh) reachable only from the ZeroTier mesh.
networking.firewall.extraInputRules = ''
ip6 saddr ${mesh.subnet} tcp dport ${toString grafanaPort} accept
ip6 saddr ${mesh.subnet} tcp dport { ${toString grafanaPort}, ${toString logsPort} } accept
'';
}