1ea5bda23f
Grafana dashboard (auto-provisioned from the dashboards dir) tracks borgbackup job health, time since last run, and per-job systemd state from the node_exporter systemd collector on the client. New docs page covers the ns1 -> control topology, secrets flow, and restore commands.
200 lines
6.8 KiB
JSON
200 lines
6.8 KiB
JSON
{
|
|
"uid": "cnx-backups",
|
|
"title": "CNX Backups",
|
|
"tags": ["backup", "borg", "cnx"],
|
|
"timezone": "browser",
|
|
"schemaVersion": 39,
|
|
"version": 1,
|
|
"refresh": "1m",
|
|
"time": { "from": "now-7d", "to": "now" },
|
|
"templating": { "list": [] },
|
|
"annotations": { "list": [] },
|
|
"panels": [
|
|
{
|
|
"type": "row",
|
|
"title": "Backups",
|
|
"id": 1,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
|
},
|
|
{
|
|
"type": "stat",
|
|
"title": "Backup health",
|
|
"description": "Shows FAILED (value 1) if any borgbackup job is in the systemd failed state, OK (value 0) otherwise. A successful run leaves the oneshot unit inactive (still OK); only a real failure shows FAILED. Derived from the node_exporter systemd collector on the backup client (ns1).",
|
|
"id": 2,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 5, "w": 8, "x": 0, "y": 1 },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "thresholds" },
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
|
"noValue": "no data",
|
|
"mappings": [
|
|
{
|
|
"type": "value",
|
|
"options": {
|
|
"0": { "text": "OK", "color": "green", "index": 0 },
|
|
"1": { "text": "FAILED", "color": "red", "index": 1 }
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
|
"colorMode": "background",
|
|
"graphMode": "none",
|
|
"textMode": "auto",
|
|
"orientation": "auto"
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "max(node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"})",
|
|
"instant": true
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "stat",
|
|
"title": "Last backup run",
|
|
"description": "When the most recent backup timer last fired (the daily borgbackup job). Shows 'never' when there is no data, e.g. before the first run.",
|
|
"id": 3,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 5, "w": 8, "x": 8, "y": 1 },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "dateTimeFromNow",
|
|
"color": { "mode": "fixed", "fixedColor": "text" },
|
|
"noValue": "never"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
|
"colorMode": "none",
|
|
"graphMode": "none",
|
|
"textMode": "auto",
|
|
"orientation": "auto"
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}) * 1000",
|
|
"instant": true
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "stat",
|
|
"title": "Time since last backup",
|
|
"description": "Time since the most recent backup timer last fired. Backups run daily, so anything past ~25h means a run was missed. Turns red above 25h.",
|
|
"id": 4,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 5, "w": 8, "x": 16, "y": 1 },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "s",
|
|
"color": { "mode": "thresholds" },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "red", "value": 90000 }
|
|
]
|
|
},
|
|
"noValue": "never"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
|
"colorMode": "background",
|
|
"graphMode": "none",
|
|
"textMode": "auto",
|
|
"orientation": "auto"
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "time() - max(node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"})",
|
|
"instant": true
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "table",
|
|
"title": "Backup jobs (current state)",
|
|
"description": "Every borgbackup job and the systemd unit state it is currently in, per client. 'inactive' is the normal resting state of a oneshot job between runs.",
|
|
"id": 5,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
|
"options": { "showHeader": true },
|
|
"fieldConfig": {
|
|
"defaults": { "custom": { "align": "auto" } },
|
|
"overrides": []
|
|
},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\"} == 1",
|
|
"format": "table",
|
|
"instant": true
|
|
}
|
|
],
|
|
"transformations": [
|
|
{
|
|
"id": "organize",
|
|
"options": {
|
|
"excludeByName": {
|
|
"Time": true,
|
|
"Value": true,
|
|
"__name__": true,
|
|
"job": true,
|
|
"type": true
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Failed state over time",
|
|
"description": "1 while a backup job is in the failed state. A spike here is a backup that did not complete and was not retried before the next scrape.",
|
|
"id": 6,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
|
"fieldConfig": { "defaults": { "unit": "short", "min": 0, "max": 1 }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "node_systemd_unit_state{name=~\"borgbackup-job-.+\\\\.service\",state=\"failed\"}",
|
|
"legendFormat": "{{instance}} {{name}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"type": "timeseries",
|
|
"title": "Time since last backup (history)",
|
|
"description": "Time since each backup timer last fired, plotted over time. The sawtooth should reset to near zero once a day; a steady climb without a reset means backups stopped running.",
|
|
"id": 7,
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 },
|
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
|
"expr": "time() - node_systemd_timer_last_trigger_seconds{name=~\"borgbackup-job-.+\\\\.timer\"}",
|
|
"legendFormat": "{{instance}} {{name}}"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|