From 1ea5bda23f498c2915215eabbcbe3a8f0b1dff0b Mon Sep 17 00:00:00 2001 From: Berwn Date: Wed, 17 Jun 2026 15:13:47 +0700 Subject: [PATCH] Add CNX Backups dashboard and document the backup setup Grafana dashboard (auto-provisioned from the dashboards dir) tracks borgbackup job health, time since last run, and per-job systemd state from the node_exporter systemd collector on the client. New docs page covers the ns1 -> control topology, secrets flow, and restore commands. --- docs/src/SUMMARY.md | 1 + docs/src/backups.md | 61 +++++++ docs/src/monitoring.md | 10 +- modules/monitoring/dashboards/backups.json | 199 +++++++++++++++++++++ 4 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 docs/src/backups.md create mode 100644 modules/monitoring/dashboards/backups.json diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 88abf0c..45be790 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -4,3 +4,4 @@ - [ZeroTier mesh](./mesh.md) - [DNS](./dns.md) - [Monitoring](./monitoring.md) +- [Backups](./backups.md) diff --git a/docs/src/backups.md b/docs/src/backups.md new file mode 100644 index 0000000..ff6379b --- /dev/null +++ b/docs/src/backups.md @@ -0,0 +1,61 @@ +# Backups + +Encrypted, deduplicating backups via clan's `borgbackup` service, declared in +`clan.nix`. The only critical, non-regenerable state is the **Knot DNSSEC +keystore** on `ns1` (the KSK/ZSK private keys under `/var/lib/knot`); losing it +forces an emergency DS rollover at the registrar. + +## Topology + +- **control** is the borgbackup **server** — it hosts the repos under + `/var/lib/borgbackup/` (so `ns1`'s repo is `/var/lib/borgbackup/ns1`). +- **ns1** is the **client**. It backs up everything it declares as clan state + (`clan.core.state.knot.folders = [ "/var/lib/knot" ]`) once a day at 01:00, + over the ZeroTier mesh. + +The backup is cross-host so that losing `ns1` is recoverable, and stays +self-contained (no third-party storage). 
Encryption is `repokey` with a +generated passphrase, so `control` only ever stores ciphertext. + +Mesh peers have no name resolution, so `ns1` maps the `control` machine name to +its ZeroTier address via `networking.hosts`; that is how the `borg@control` repo +URL resolves. + +## Secrets + +The borgbackup ssh keypair and repokey passphrase are clan vars, generated once +(needs the YubiKey). `control` will not evaluate until `ns1`'s public key +exists, so generate before the first deploy: + +``` +clan vars generate ns1 +clan machines update ns1 +clan machines update control +``` + +## Operating + +Backups are driven by systemd on `ns1` (`borgbackup-job-control.timer`). + +``` +# trigger a backup now (on ns1) +borgbackup-create + +# list archives (on ns1) +borgbackup-list + +# restore selected folders from an archive (on ns1) +NAME='<archive-name>' FOLDERS=/var/lib/knot borgbackup-restore +``` + +Retention is pruned automatically: all archives from the last day, then 7 daily +and 4 weekly. + +## Monitoring + +The **CNX Backups** Grafana dashboard +(`modules/monitoring/dashboards/backups.json`) tracks job health, time since the +last run, and per-job state — all from the node_exporter systemd +collector on the client. There is no dedicated borg metrics exporter; the unit +state and the timer's last-trigger timestamp are enough to catch a backup that +stops running or fails. diff --git a/docs/src/monitoring.md b/docs/src/monitoring.md index fe9bfcb..4ded43d 100644 --- a/docs/src/monitoring.md +++ b/docs/src/monitoring.md @@ -33,6 +33,10 @@ admin password is a clan var: clan vars get control grafana-admin/password ``` -The provisioned **CNX DNS** dashboard (`modules/monitoring/dashboards/dns.json`) -shows per-nameserver SOA serials, zone expiry countdowns, query/response rates, -and host CPU/memory/disk/load. 
+Dashboards are provisioned from `modules/monitoring/dashboards/` (any JSON file +there is picked up): + +- **CNX DNS** (`dns.json`) — firing alerts, per-nameserver SOA serials, zone + expiry countdowns, query/response rates, and host CPU/memory/disk/load. +- **CNX Backups** (`backups.json`) — borgbackup job health, time since the last + run, and per-job state. See [Backups](./backups.md). diff --git a/modules/monitoring/dashboards/backups.json b/modules/monitoring/dashboards/backups.json new file mode 100644 index 0000000..3968c0d --- /dev/null +++ b/modules/monitoring/dashboards/backups.json @@ -0,0 +1,199 @@ +{ + "uid": "cnx-backups", + "title": "CNX Backups", + "tags": ["backup", "borg", "cnx"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { "from": "now-7d", "to": "now" }, + "templating": { "list": [] }, + "annotations": { "list": [] }, + "panels": [ + { + "type": "row", + "title": "Backups", + "id": 1, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + { + "type": "stat", + "title": "Backup health", + "description": "1 if any borgbackup job is in the failed state, 0 otherwise. A successful run leaves the oneshot unit inactive (still OK); only a real failure shows FAILED. 
Derived from the node_exporter systemd collector on the backup client (ns1).", + "id": 2, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 5, "w": 8, "x": 0, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "noValue": "no data", + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "OK", "color": "green", "index": 0 }, + "1": { "text": "FAILED", "color": "red", "index": 1 } + } + } + ] + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto", + "orientation": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "max(node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"})", + "instant": true + } + ] + }, + { + "type": "stat", + "title": "Last backup run", + "description": "When the most recent backup timer last fired (the daily borgbackup job). 
Shows 'never' before the first run.", + "id": 3, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 5, "w": 8, "x": 8, "y": 1 }, + "fieldConfig": { + "defaults": { + "unit": "dateTimeFromNow", + "color": { "mode": "fixed", "fixedColor": "text" }, + "noValue": "never" + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "colorMode": "none", + "graphMode": "none", + "textMode": "auto", + "orientation": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}) * 1000", + "instant": true + } + ] + }, + { + "type": "stat", + "title": "Time since last backup", + "description": "Age of the most recent backup. Backups run daily, so anything past ~25h means a run was missed. Red over 25h.", + "id": 4, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 5, "w": 8, "x": 16, "y": 1 }, + "fieldConfig": { + "defaults": { + "unit": "s", + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 90000 } + ] + }, + "noValue": "never" + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto", + "orientation": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "time() - max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"})", + "instant": true + } + ] + }, + { + "type": "table", + "title": "Backup jobs (current state)", + "description": "Every borgbackup job and the systemd unit state it is currently in, per client. 
'inactive' is the normal resting state of a oneshot job between runs.", + "id": 5, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "options": { "showHeader": true }, + "fieldConfig": { + "defaults": { "custom": { "align": "auto" } }, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\"} == 1", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "job": true, + "type": true + } + } + } + ] + }, + { + "type": "timeseries", + "title": "Failed state over time", + "description": "1 while a backup job is in the failed state. A spike here is a backup that did not complete and was not retried before the next scrape.", + "id": 6, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "fieldConfig": { "defaults": { "unit": "short", "min": 0, "max": 1 }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"}", + "legendFormat": "{{instance}} {{name}}" + } + ] + }, + { + "type": "timeseries", + "title": "Time since last backup (history)", + "description": "Age of the latest backup over time. 
The sawtooth should reset to near zero once a day; a steady climb without a reset means backups stopped running.", + "id": 7, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "time() - node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}", + "legendFormat": "{{instance}} {{name}}" + } + ] + } + ] +}