Files
cnx-network-clan/modules/monitoring/server.nix
T
Berwn 6e4178df04 Onboard mx1 mail host and factor out per-host public IPs
- Register mx1 in the inventory and as a direct-SSH `internet` host; give it
  a static public IPv6 (2a01:4ff:2f0:1963::1).
- Point the cnx.email MX (plus SPF/DMARC) at mx1 and add its A record.
- Bring mx1 into monitoring: import exporters, add it to the mesh map and the
  node scrape job so its host metrics and journald reach control.
- Add a clan-mx1 Hetzner firewall: inbound SMTP + ZeroTier + ICMP, no public
  SSH (admin rides the mesh like the other hosts). 587/465/993 held for now.
- Extract per-host public IPv4/IPv6 into modules/hosts.nix, consumed by
  clan.nix's internet hosts and each machine's cnx.staticIPv6, so each address
  is declared once instead of being duplicated across configs.
- docs: add mx1 to the machines table.
2026-06-18 11:53:14 +07:00

156 lines
5.2 KiB
Nix

# Monitoring server, imported by control only: VictoriaMetrics (TSDB + scraper),
# VictoriaLogs (journald ingestion) and Grafana. VictoriaMetrics binds loopback
# (only Grafana, on the same host, reads it). Grafana and VictoriaLogs are
# reachable over the ZeroTier mesh, scoped by the firewall rule at the bottom;
# the Hetzner cloud firewall keeps them off the public net.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Mesh data (mesh.hosts.<name>, mesh.subnet) and the blackbox probe
  # definitions (probes.scrapeConfigs) used by the configuration below.
  mesh = import ../mesh-hosts.nix { inherit config lib; };
  probes = import ./blackbox-probes.nix { inherit lib; };

  # Listen ports of the three services configured in this module.
  vmPort = 8428;
  logsPort = 9428;
  grafanaPort = 3000;

  # control's own mesh address; used to build Grafana's root_url.
  controlV6 = mesh.hosts.control;

  # Wrap an IPv6 literal in brackets so it can be combined with ":port".
  v6 = ip: "[${ip}]";

  # Build one static_configs entry: a single "host:port" target that carries a
  # friendly `instance` label. IPv6 mesh addresses must be passed in already
  # bracketed (see v6) for Prometheus-style targets.
  target = label: host: port: {
    targets = [ "${host}:${builtins.toString port}" ];
    labels = {
      instance = label;
    };
  };

  # Path of the file holding the generated Grafana admin password (the
  # grafana-admin clan vars generator defined further down).
  adminPasswordFile = config.clan.core.vars.generators.grafana-admin.files.password.path;
in
{
services.victoriametrics =
  let
    nodeExporterPort = 9100;
    knotExporterPort = 9433;
    # Scrape target for a host reached over the mesh: its address is looked up
    # by name in mesh.hosts and bracketed as an IPv6 literal.
    meshTarget = port: host: target host (v6 mesh.hosts.${host}) port;
  in
  {
    enable = true;
    listenAddress = "127.0.0.1:${toString vmPort}";
    retentionPeriod = "180d";
    # The scraper dials IPv4-only by default; every remote target here is a mesh
    # IPv6 address, so without this flag VM drops them with
    # "no suitable address found (try -enableTCP6)".
    extraOptions = [ "-enableTCP6" ];
    prometheusConfig = {
      global.scrape_interval = "30s";
      scrape_configs = [
        {
          job_name = "node";
          # control scrapes its own node_exporter over loopback so host metrics
          # survive even if the mesh is down; ns1, ns2 and mx1 are scraped over
          # the mesh.
          static_configs = [
            (target "control" "127.0.0.1" nodeExporterPort)
          ]
          ++ map (meshTarget nodeExporterPort) [
            "ns1"
            "ns2"
            "mx1"
          ];
        }
        {
          job_name = "knot";
          # Only ns1 and ns2 are scraped on the knot exporter port.
          static_configs = map (meshTarget knotExporterPort) [
            "ns1"
            "ns2"
          ];
        }
      ]
      # Outside-in DNS probes via the blackbox exporter (blackbox.nix). The job
      # list is generated from the same probe definitions the exporter uses.
      ++ probes.scrapeConfigs;
    };
  };
# Centralized logs: VictoriaLogs ingests the journald stream of every monitored
# host, each of which runs systemd-journal-upload against /insert/journald
# (exporters.nix). It binds all interfaces because the remote hosts push over
# the mesh; the firewall rule at the bottom scopes 9428 to the mesh subnet and
# the Hetzner firewall closes the public side.
services.victorialogs = {
  enable = true;
  listenAddress = ":" + toString logsPort;
  extraOptions = [
    # Retention has no dedicated NixOS option, so it is passed as a raw flag.
    "-retentionPeriod=30d"
    # Like the VictoriaMetrics scraper, VictoriaLogs is IPv4-only by default for
    # *listening* too: ":9428" binds 0.0.0.0 only, so hosts pushing over the
    # IPv6 mesh get "connection refused". This flag makes it bind [::]
    # (dual-stack) so the mesh can reach it.
    "-enableTCP6"
  ];
};
# Grafana admin password: generated once and stored as a clan secret.
# Retrieve it with:
#   clan vars get control grafana-admin/password
clan.core.vars.generators.grafana-admin = {
  # Writes 24 random bytes, base64-encoded, with the trailing newline removed.
  script = ''
    openssl rand -base64 24 | tr -d "\n" > "$out"/password
  '';
  runtimeInputs = [ pkgs.openssl ];
  # The secret file is owned by the grafana user and group.
  files.password = {
    secret = true;
    owner = "grafana";
    group = "grafana";
  };
};
services.grafana =
  let
    # Both datasources run on this host and are addressed over loopback.
    localUrl = port: "http://127.0.0.1:${toString port}";
  in
  {
    enable = true;
    # VictoriaLogs datasource plugin so journald is greppable from Grafana,
    # alongside the metrics datasource.
    declarativePlugins = [ pkgs.grafanaPlugins.victoriametrics-logs-datasource ];

    settings = {
      # Listen on the IPv6 wildcard; users reach Grafana via control's mesh
      # address, which is also what root_url advertises.
      server.http_addr = "::";
      server.http_port = grafanaPort;
      server.root_url = "http://${v6 controlV6}:${toString grafanaPort}/";

      security.admin_user = "admin";
      # $__file{...} is Grafana's file provider: the password is read from the
      # generated secret file instead of being written into the config.
      security.admin_password = "$__file{${adminPasswordFile}}";

      # No anonymous access and no self-service sign-up.
      "auth.anonymous".enabled = false;
      users.allow_sign_up = false;
    };

    provision = {
      enable = true;

      datasources.settings = {
        apiVersion = 1;
        datasources = [
          {
            name = "VictoriaMetrics";
            uid = "victoriametrics";
            type = "prometheus";
            access = "proxy";
            url = localUrl vmPort;
            isDefault = true;
          }
          {
            name = "VictoriaLogs";
            uid = "victorialogs";
            type = "victoriametrics-logs-datasource";
            access = "proxy";
            url = localUrl logsPort;
          }
        ];
      };

      # Dashboards are loaded from the ./dashboards directory next to this file.
      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "cnx";
            options = {
              path = ./dashboards;
              foldersFromFilesStructure = false;
            };
          }
        ];
      };
    };
  };
# Grafana (admin laptops + servers) and VictoriaLogs ingestion (journald pushed
# over the mesh by the monitored hosts) are reachable only from the ZeroTier
# mesh subnet.
networking.firewall.extraInputRules =
  let
    # Comma-separated port set for the rule below.
    meshOnlyPorts = lib.concatMapStringsSep ", " toString [
      grafanaPort
      logsPort
    ];
  in
  ''
    ip6 saddr ${mesh.subnet} tcp dport { ${meshOnlyPorts} } accept
  '';
}