From a7d4c0e56734430d998c03708ade2e62436dcc51 Mon Sep 17 00:00:00 2001 From: Berwn Date: Wed, 17 Jun 2026 14:26:21 +0700 Subject: [PATCH] Add mdBook infra runbook served by Caddy on control Docs live in docs/ (DNS, ZeroTier mesh, monitoring), built at Nix-build time and served as static files over the ZeroTier mesh on control:8080. Commit-to-edit: change the markdown and redeploy to publish. --- docs/book.toml | 12 ++++++ docs/src/SUMMARY.md | 6 +++ docs/src/dns.md | 59 ++++++++++++++++++++++++++++++ docs/src/mesh.md | 39 ++++++++++++++++++++ docs/src/monitoring.md | 38 +++++++++++++++++++ docs/src/overview.md | 26 +++++++++++++ machines/control/configuration.nix | 1 + modules/docs.nix | 40 ++++++++++++++++++++ 8 files changed, 221 insertions(+) create mode 100644 docs/book.toml create mode 100644 docs/src/SUMMARY.md create mode 100644 docs/src/dns.md create mode 100644 docs/src/mesh.md create mode 100644 docs/src/monitoring.md create mode 100644 docs/src/overview.md create mode 100644 modules/docs.nix diff --git a/docs/book.toml b/docs/book.toml new file mode 100644 index 0000000..6eecc75 --- /dev/null +++ b/docs/book.toml @@ -0,0 +1,12 @@ +[book] +title = "CNX Infra Runbook" +description = "Operational docs for the cnx-network clan: DNS, ZeroTier mesh, monitoring." 
+authors = ["B4L"] +src = "src" +language = "en" + +[output.html] +default-theme = "navy" +preferred-dark-theme = "navy" +git-repository-url = "https://git.b4l.co.th/B4L/cnx-network-clan" +edit-url-template = "https://git.b4l.co.th/B4L/cnx-network-clan/_edit/main/docs/{path}" diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md new file mode 100644 index 0000000..88abf0c --- /dev/null +++ b/docs/src/SUMMARY.md @@ -0,0 +1,6 @@ +# Summary + +- [Overview](./overview.md) +- [ZeroTier mesh](./mesh.md) +- [DNS](./dns.md) +- [Monitoring](./monitoring.md) diff --git a/docs/src/dns.md b/docs/src/dns.md new file mode 100644 index 0000000..38186a1 --- /dev/null +++ b/docs/src/dns.md @@ -0,0 +1,59 @@ +# DNS + +Authoritative DNS for three zones, served by Knot: + +- `cnx.network` +- `buildfor.life` +- `cnx.email` + +Add a zone in `modules/dns/domains.nix` **and** drop a matching `.zone` +file in `modules/dns/zones/`. + +## Primary / secondary + +- **`ns1` = primary (master).** Loads each zone from its file, signs it, and + notifies `ns2`. Config in `machines/ns1/configuration.nix`. +- **`ns2` = secondary (slave).** Pulls every zone from `ns1` (AXFR/IXFR) and + accepts its NOTIFY. Config in `machines/ns2/configuration.nix`. + +Zone transfers run **over the ZeroTier mesh**, authenticated with a shared TSIG +key (`dns-tsig`, a clan var copied to both machines). + +## Serial handling + +`ns1` uses `zonefile-load = difference-no-serial` with `serial-policy = unixtime`: +edit records without touching the SOA serial — Knot diffs the file, assigns a +strictly-monotonic unixtime serial, signs, and transfers. `journal-content = all` +holds the live signed zone (required by `difference-no-serial`). + +## DNSSEC + +Automatic signing on `ns1` only, policy `cnx`: ECDSA P-256/SHA-256. The ZSK +auto-rolls; the KSK is kept stable, so the DS at the registrar only changes on a +manual KSK rollover. 
+ +> **Pending (manual):** submit DS records for `buildfor.life` and `cnx.email` +> once they're at a DNSSEC-capable registrar. + +## ACME DNS-01 + +A dedicated TSIG key (`acme_ddns`), scoped by `acl_acme` to `TXT` updates at or +under `_acme-challenge.<zone>` on `ns1` only. Knot signs the record and transfers +it to `ns2`, which never needs this key. Retrieve the client config with: + +``` +clan vars get ns1 dns-acme-tsig/acme.conf +``` + +## Runbook: stale secondary + +If `ns2` serves stale records while SOA serials match (e.g. after a manual zone +edit that didn't bump the serial as expected), force a fresh transfer on `ns2`: + +``` +knotc zone-retransfer +``` + +Watch the **CNX DNS** Grafana dashboard: the per-nameserver SOA serial table +should agree across `ns1`/`ns2`, and "seconds until zone expiry" on the secondary +should reset on each successful transfer rather than counting toward zero. diff --git a/docs/src/mesh.md b/docs/src/mesh.md new file mode 100644 index 0000000..17ecaf0 --- /dev/null +++ b/docs/src/mesh.md @@ -0,0 +1,39 @@ +# ZeroTier mesh + +A private IPv6 overlay that every machine (and admin laptops) shares. DNS zone +transfers and metrics scraping ride this mesh, never the public net. + +- **Controller:** `control` (the `zerotier` instance in `clan.nix`). +- **Peers:** every machine (`roles.peer.tags.all`). +- **Prefix:** `fd06:1bad:ece2:92ad:ba99:9300::/88` (RFC 4193: `fd` + network id + `0x9993`). + +## The mesh map + +`modules/mesh-hosts.nix` does **not** hardcode addresses. It reads each machine's +IP from the public clan vars that clan-core's zerotier generator already writes +(`vars/per-machine/<machine>/zerotier/zerotier-ip/value`) and derives the `/88` subnet +from `control`'s `zerotier-network-id`. Regenerate or re-key a node and the map +follows automatically. + +Consumers: `modules/dns/authoritative.nix` (transfer ACLs), `modules/monitoring/*` +(scrape targets and firewall scoping). 
+ +## Admitting external members + +Inventory machines are auto-accepted. External devices (admin laptops) are listed +in `clan.nix` under the controller's `allowedIps`. Because this clan-core pins the +`allowedIps` interface (admit by network IPv6), we keep a **node-id** list and a +`ztMemberIp` helper derives each device's IP on this network: + +```nix +roles.controller.settings.allowedIps = map ztMemberIp [ + "8802c8d7e0" # alex-nixos + "2bd36db8cc" # kurogeek-thinkpad +]; +``` + +A device's 10-char node id comes from `zerotier-cli info` on that device. After +editing, deploy `control`; the controller admits the new member on its next run. + +> A newer clan-core exposes `allowedIds` (admit by node id directly), but adopting +> it means a zerotier vars-schema migration, so we stay on the IP-derivation path. diff --git a/docs/src/monitoring.md b/docs/src/monitoring.md new file mode 100644 index 0000000..fe9bfcb --- /dev/null +++ b/docs/src/monitoring.md @@ -0,0 +1,38 @@ +# Monitoring + +Metrics and dashboards live on `control`, reachable only over the ZeroTier mesh. + +## Collection + +- **node_exporter** (`:9100`) on every machine — CPU, memory, disk, systemd units. + Binds all interfaces; the scrape ports are firewall-scoped to the mesh subnet + (`modules/monitoring/exporters.nix`). +- **knot-exporter** (`:9433`) on `ns1`/`ns2` only — reads Knot's control socket, + fed by the `mod-stats` module (query/response counters per zone). + +## Storage & scraping + +**VictoriaMetrics** on `control`, bound to `127.0.0.1:8428`, 180-day retention +(`modules/monitoring/server.nix`). It scrapes `control` over loopback and `ns1`/ +`ns2` over the mesh. + +> The scraper dials IPv4-only by default, so mesh (IPv6) targets need +> `extraOptions = [ "-enableTCP6" ]`. Without it, ns1/ns2 are dropped with +> "no suitable address found". 
Check live target health on `control`: +> +> ``` +> curl -s http://127.0.0.1:8428/api/v1/targets | jq '.data.activeTargets[] | {i:.labels.instance, h:.health, e:.lastError}' +> ``` + +## Dashboards + +**Grafana** on `control` (`:3000`), mesh-only, anonymous access disabled. The +admin password is a clan var: + +``` +clan vars get control grafana-admin/password +``` + +The provisioned **CNX DNS** dashboard (`modules/monitoring/dashboards/dns.json`) +shows per-nameserver SOA serials, zone expiry countdowns, query/response rates, +and host CPU/memory/disk/load. diff --git a/docs/src/overview.md b/docs/src/overview.md new file mode 100644 index 0000000..ebe1479 --- /dev/null +++ b/docs/src/overview.md @@ -0,0 +1,26 @@ +# Overview + +This is the operational runbook for the **cnx-network** clan. Everything here is +managed declaratively from the [clan repo](https://git.b4l.co.th/B4L/cnx-network-clan); +this book is built from `docs/` and served on `control` over the ZeroTier mesh. + +## Machines + +| Machine | Role | Public IPv4 | Public IPv6 | +| --------- | -------------------------------------- | ---------------- | --------------------------- | +| `control` | ZeroTier controller, monitoring, docs | `77.42.68.181` | `2a01:4f9:c013:e6d0::1` | +| `ns1` | Knot DNS **primary** (master) | `46.224.170.206` | `2a01:4f8:c014:b5c5::1` | +| `ns2` | Knot DNS **secondary** (slave) | `157.180.70.82` | `2a01:4f9:c014:6d87::1` | + +## Access + +- Admin SSH and all internal services ride the **ZeroTier mesh**, not the public + net. Public SSH (22) is intentionally closed at the Hetzner cloud firewall. +- clan reaches machines by their public IPs first (the `internet` instance), with + the mesh and Tor as automatic fallbacks. + +## Editing these docs + +Commit-to-edit: change the markdown under `docs/src/`, commit, and redeploy +`control`. There is no in-browser editor by design — the docs are versioned and +reviewed alongside the config that they describe. 
diff --git a/machines/control/configuration.nix b/machines/control/configuration.nix
index 805dbc8..6775bd4 100644
--- a/machines/control/configuration.nix
+++ b/machines/control/configuration.nix
@@ -4,6 +4,7 @@
     ../../modules/static-ipv6.nix
     ../../modules/monitoring/exporters.nix
     ../../modules/monitoring/server.nix
+    ../../modules/docs.nix
   ];
 
   clan.core.sops.defaultGroups = [ "admins" ];
diff --git a/modules/docs.nix b/modules/docs.nix
new file mode 100644
index 0000000..c9ab38d
--- /dev/null
+++ b/modules/docs.nix
@@ -0,0 +1,40 @@
+# Infra runbook (mdBook): built at Nix-build time from ../docs, served by Caddy.
+# Reachable only over the ZeroTier mesh (firewall rule below); the public side is
+# already closed by the Hetzner cloud firewall. Imported by control only.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  mesh = import ./mesh-hosts.nix { inherit config lib; }; # supplies mesh.subnet for the firewall rule
+  port = 8080; # shared by the Caddy site address and the firewall rule
+
+  site = pkgs.stdenvNoCC.mkDerivation {
+    name = "cnx-infra-docs";
+    src = ../docs;
+    nativeBuildInputs = [ pkgs.mdbook ];
+    # mdbook writes a state dir under $HOME; the build sandbox has none.
+    buildPhase = ''
+      export HOME=$TMPDIR
+      mdbook build -d $out
+    '';
+    dontInstall = true; # buildPhase already rendered the book straight into $out
+  };
+in
+{
+  # A host-less ":port" site address makes Caddy serve plain HTTP (no automatic
+  # TLS) on all interfaces; the mesh-scoped firewall rule below limits who can reach it.
+  services.caddy = {
+    enable = true;
+    virtualHosts.":${toString port}".extraConfig = ''
+      root * ${site}
+      file_server
+    '';
+  };
+
+  networking.firewall.extraInputRules = ''
+    ip6 saddr ${mesh.subnet} tcp dport ${toString port} accept
+  ''; # NOTE(review): extraInputRules is documented as nftables-only; confirm nftables is enabled on control.
+}