diff --git a/TODO.md b/TODO.md index df62644..efbc235 100644 --- a/TODO.md +++ b/TODO.md @@ -18,11 +18,24 @@ alert state back to VM, so firing alerts show up as the `ALERTS` series in Grafana. Wiring a real notifier (Matrix) is a later step — drop `blackhole` and set `settings."notifier.url"` to an Alertmanager. -## 2. Backups of critical state +## 2. Backups of critical state (DNSSEC done — pending vars + deploy) -- [ ] DNSSEC key material on ns1 (KSK/ZSK in Knot's KASP store) — losing it forces +clan `borgbackup` instance in `clan.nix`: control is the server (repos under +`/var/lib/borgbackup/`), ns1 the client. ns1 declares +`clan.core.state.knot.folders = [ "/var/lib/knot" ]`, so the Knot KASP keystore +is backed up nightly (01:00) over the mesh with repokey encryption — control +never holds plaintext. ns1 maps the `control` machine name to its mesh IP via +`networking.hosts` so the `borg@control` repo resolves. + +Before deploy: `clan vars generate ns1` (YubiKey) to mint the borgbackup ssh +keypair + repokey; control won't evaluate until ns1's public key exists. Then +deploy ns1 and control. + +- [x] DNSSEC key material on ns1 (KSK/ZSK in Knot's KASP store) — losing it forces an emergency DS rollover at the registrar -- [ ] VictoriaMetrics TSDB on control (optional, retention is 180d) +- [ ] VictoriaMetrics TSDB on control (optional, retention is 180d) — deferred; + regenerable over time and control is the backup server, so this needs a + second client→server pair (e.g. control→ns2) rather than the same topology ## 3. Blackbox DNS probing diff --git a/clan.nix b/clan.nix index 6e132e0..cdad86d 100644 --- a/clan.nix +++ b/clan.nix @@ -64,6 +64,16 @@ in emergency-access = { roles.default.tags.nixos = { }; }; + + # Encrypted, deduplicating backups. control hosts the repos; ns1 is the + # only client, backing up its declared clan.core.state (the Knot DNSSEC + # keystore) over the mesh. Repo lives at /var/lib/borgbackup/ns1 on control. 
+ # Cross-host so an ns1 loss is recoverable; repokey encryption means control + # never holds plaintext. Run `clan vars generate ns1` (YubiKey) before deploy. + borgbackup = { + roles.server.machines.control = { }; + roles.client.machines.ns1 = { }; + }; }; machines = { diff --git a/machines/ns1/configuration.nix b/machines/ns1/configuration.nix index 6de65c0..0f0a238 100644 --- a/machines/ns1/configuration.nix +++ b/machines/ns1/configuration.nix @@ -1,6 +1,7 @@ -{ config, pkgs, ... }: +{ config, lib, pkgs, ... }: let domains = import ../../modules/dns/domains.nix; + mesh = import ../../modules/mesh-hosts.nix { inherit config lib; }; in { imports = [ @@ -11,6 +12,16 @@ in clan.core.sops.defaultGroups = [ "admins" ]; + # Knot's state dir holds the non-regenerable DNSSEC key material (KSK/ZSK + # private keys in the KASP keystore). Declaring it as clan state makes the + # borgbackup client back it up; losing it forces an emergency DS rollover at + # the registrar. Mode 0700, owned by knot, but borg runs as root so it can read it. + clan.core.state.knot.folders = [ "/var/lib/knot" ]; + + # The borgbackup repo is addressed as `borg@control`; mesh peers have no name + # resolution, so map the control machine name to its ZeroTier mesh address. + networking.hosts.${mesh.hosts.control} = [ "control" ]; + # Public IPv6 (matches the ns1 AAAA glue); SLAAC doesn't bring it up here. cnx.staticIPv6 = { enable = true;