Add vmalert rules for failed and stale backups

BackupJobFailed fires when a borgbackup job enters the systemd failed
state; BackupStale fires when the daily timer has not run in over 26h
(or has never run). Both read the node_exporter systemd collector on
the backup client, matching the CNX Backups dashboard.
This commit is contained in:
Berwn
2026-06-17 15:17:12 +07:00
parent 1ea5bda23f
commit 0544bf95e5
+21
View File
@@ -59,6 +59,27 @@ in
} }
]; ];
} }
# Rule group "backup": alerts on borgbackup job units and their timers,
# selected by systemd unit name from node_systemd_* metrics.
{
  name = "backup";
  rules = [
    # A borgbackup job service unit has been reported in the "failed" state
    # for at least 5 minutes.
    {
      alert = "BackupJobFailed";
      # Backslashes are literal inside ''…'' strings, so PromQL receives `\\.`
      # and its double-quoted string unescapes that to the regex `\.`.
      expr = ''node_systemd_unit_state{name=~"borgbackup-job-.+\\.service",state="failed"} == 1'';
      for = "5m";
      labels = {
        severity = "warning";
      };
      annotations = {
        summary = "Backup job {{ $labels.name }} on {{ $labels.instance }} failed";
        description = "The borgbackup run did not complete. Check `systemctl status {{ $labels.name }}` and `journalctl -u {{ $labels.name }}` on the client; `borgbackup-create` re-runs it.";
      };
    }

    # The borgbackup timer's last trigger is more than 93600 s (26 h) in the
    # past, and has stayed that way for a further 30 minutes.
    {
      alert = "BackupStale";
      expr = ''time() - node_systemd_timer_last_trigger_seconds{name=~"borgbackup-job-.+\\.timer"} > 93600'';
      for = "30m";
      labels = {
        severity = "warning";
      };
      annotations = {
        summary = "No successful backup on {{ $labels.instance }} for over 26h";
        description = "The daily backup timer {{ $labels.name }} has not fired within its expected window; the most recent archive is stale. A value far above 26h (or no data) means backups have stopped entirely.";
      };
    }
  ];
}
]; ];
}; };
} }