From 1ea5bda23f498c2915215eabbcbe3a8f0b1dff0b Mon Sep 17 00:00:00 2001
From: Berwn <alexander@grabowski.email>
Date: Wed, 17 Jun 2026 15:13:47 +0700
Subject: [PATCH] Add CNX Backups dashboard and document the backup setup

Grafana dashboard (auto-provisioned from the dashboards dir) tracks
borgbackup job health, time since last run, and per-job systemd state
from the node_exporter systemd collector on the client. New docs page
covers the ns1 -> control topology, secrets flow, and restore commands.
---
 docs/src/SUMMARY.md                        |   1 +
 docs/src/backups.md                        |  61 +++++++
 docs/src/monitoring.md                     |  10 +-
 modules/monitoring/dashboards/backups.json | 199 +++++++++++++++++++++
 4 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 docs/src/backups.md
 create mode 100644 modules/monitoring/dashboards/backups.json
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
index 88abf0c..45be790 100644
--- a/docs/src/SUMMARY.md
+++ b/docs/src/SUMMARY.md
@@ -4,3 +4,4 @@
 - [ZeroTier mesh](./mesh.md)
 - [DNS](./dns.md)
 - [Monitoring](./monitoring.md)
+- [Backups](./backups.md)
diff --git a/docs/src/backups.md b/docs/src/backups.md
new file mode 100644
index 0000000..ff6379b
--- /dev/null
+++ b/docs/src/backups.md
@@ -0,0 +1,61 @@
+# Backups
+
+Encrypted, deduplicating backups via clan's `borgbackup` service, declared in
+`clan.nix`. The only critical, non-regenerable state is the **Knot DNSSEC
+keystore** on `ns1` (the KSK/ZSK private keys under `/var/lib/knot`); losing it
+forces an emergency DS rollover at the registrar.
+
+## Topology
+
+- **control** is the borgbackup **server** — it hosts the repos under
+  `/var/lib/borgbackup/<client>` (so `ns1`'s repo is `/var/lib/borgbackup/ns1`).
+- **ns1** is the **client**. It backs up everything it declares as clan state
+  (`clan.core.state.knot.folders = [ "/var/lib/knot" ]`) once a day at 01:00,
+  over the ZeroTier mesh.
+
+The backup is cross-host so that losing `ns1` is recoverable, and stays
+self-contained (no third-party storage). Encryption is `repokey` with a
+generated passphrase, so `control` only ever stores ciphertext.
+
+Mesh peers have no name resolution, so `ns1` maps the `control` machine name to
+its ZeroTier address via `networking.hosts`; that is how the `borg@control` repo
+URL resolves.
+
+## Secrets
+
+The borgbackup ssh keypair and repokey passphrase are clan vars, generated once
+(needs the YubiKey). `control` will not evaluate until `ns1`'s public key
+exists, so generate before the first deploy:
+
+```
+clan vars generate ns1
+clan machines update ns1
+clan machines update control
+```
+
+## Operating
+
+Backups are driven by systemd on `ns1` (`borgbackup-job-control.timer`).
+
+```
+# trigger a backup now (on ns1)
+borgbackup-create
+
+# list archives (on ns1)
+borgbackup-list
+
+# restore selected folders from an archive (on ns1)
+NAME='<archive-name>' FOLDERS=/var/lib/knot borgbackup-restore
+```
+
+Old archives are pruned automatically; retention keeps all archives from the
+last day, then 7 daily and 4 weekly.
+
+## Monitoring
+
+The **CNX Backups** Grafana dashboard
+(`modules/monitoring/dashboards/backups.json`) tracks job health, time since the
+last run, and per-job state — all from the node_exporter systemd
+collector on the client. There is no dedicated borg metrics exporter; the unit
+state and the timer's last-trigger timestamp are enough to catch a backup that
+stops running or fails.
diff --git a/docs/src/monitoring.md b/docs/src/monitoring.md
index fe9bfcb..4ded43d 100644
--- a/docs/src/monitoring.md
+++ b/docs/src/monitoring.md
@@ -33,6 +33,10 @@ admin password is a clan var:
 clan vars get control grafana-admin/password
 ```
 
-The provisioned **CNX DNS** dashboard (`modules/monitoring/dashboards/dns.json`)
-shows per-nameserver SOA serials, zone expiry countdowns, query/response rates,
-and host CPU/memory/disk/load.
+Dashboards are provisioned from `modules/monitoring/dashboards/` (any JSON file
+there is picked up):
+
+- **CNX DNS** (`dns.json`) — firing alerts, per-nameserver SOA serials, zone
+  expiry countdowns, query/response rates, and host CPU/memory/disk/load.
+- **CNX Backups** (`backups.json`) — borgbackup job health, time since the last
+  run, and per-job state. See [Backups](./backups.md).
diff --git a/modules/monitoring/dashboards/backups.json b/modules/monitoring/dashboards/backups.json
new file mode 100644
index 0000000..3968c0d
--- /dev/null
+++ b/modules/monitoring/dashboards/backups.json
@@ -0,0 +1,199 @@
+{
+  "uid": "cnx-backups",
+  "title": "CNX Backups",
+  "tags": ["backup", "borg", "cnx"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "1m",
+  "time": { "from": "now-7d", "to": "now" },
+  "templating": { "list": [] },
+  "annotations": { "list": [] },
+  "panels": [
+    {
+      "type": "row",
+      "title": "Backups",
+      "id": 1,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
+    },
+    {
+      "type": "stat",
+      "title": "Backup health",
+      "description": "1 if any borgbackup job is in the failed state, 0 otherwise. A successful run leaves the oneshot unit inactive (still OK); only a real failure shows FAILED. Derived from the node_exporter systemd collector on the backup client (ns1).",
+      "id": 2,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 5, "w": 8, "x": 0, "y": 1 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
+          "noValue": "no data",
+          "mappings": [
+            {
+              "type": "value",
+              "options": {
+                "0": { "text": "OK", "color": "green", "index": 0 },
+                "1": { "text": "FAILED", "color": "red", "index": 1 }
+              }
+            }
+          ]
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "background",
+        "graphMode": "none",
+        "textMode": "auto",
+        "orientation": "auto"
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "max(node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"})",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Last backup run",
+      "description": "When the most recent backup timer last fired (the daily borgbackup job). Shows 'never' before the first run.",
+      "id": 3,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 5, "w": 8, "x": 8, "y": 1 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "dateTimeFromNow",
+          "color": { "mode": "fixed", "fixedColor": "text" },
+          "noValue": "never"
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "none",
+        "graphMode": "none",
+        "textMode": "auto",
+        "orientation": "auto"
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}) * 1000",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Time since last backup",
+      "description": "Age of the most recent backup. Backups run daily, so anything past ~25h means a run was missed. Red over 25h.",
+      "id": 4,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 5, "w": 8, "x": 16, "y": 1 },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 90000 }
+            ]
+          },
+          "noValue": "never"
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "background",
+        "graphMode": "none",
+        "textMode": "auto",
+        "orientation": "auto"
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "time() - max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"})",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "table",
+      "title": "Backup jobs (current state)",
+      "description": "Every borgbackup job and the systemd unit state it is currently in, per client. 'inactive' is the normal resting state of a oneshot job between runs.",
+      "id": 5,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+      "options": { "showHeader": true },
+      "fieldConfig": {
+        "defaults": { "custom": { "align": "auto" } },
+        "overrides": []
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\"} == 1",
+          "format": "table",
+          "instant": true
+        }
+      ],
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "Value": true,
+              "__name__": true,
+              "job": true,
+              "type": true
+            }
+          }
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Failed state over time",
+      "description": "1 while a backup job is in the failed state. A spike here is a backup that did not complete and was not retried before the next scrape.",
+      "id": 6,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+      "fieldConfig": { "defaults": { "unit": "short", "min": 0, "max": 1 }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"}",
+          "legendFormat": "{{instance}} {{name}}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Time since last backup (history)",
+      "description": "Age of the latest backup over time. The sawtooth should reset to near zero once a day; a steady climb without a reset means backups stopped running.",
+      "id": 7,
+      "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 },
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "targets": [
+        {
+          "refId": "A",
+          "datasource": { "type": "prometheus", "uid": "victoriametrics" },
+          "expr": "time() - node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}",
+          "legendFormat": "{{instance}} {{name}}"
+        }
+      ]
+    }
+  ]
+}