Add vmalert rules for failed and stale backups
BackupJobFailed fires when a borgbackup job enters the systemd failed state; BackupStale fires when the daily timer has not run in over 26h (or has never run). Both read the node_exporter systemd collector on the backup client, matching the CNX Backups dashboard.
This commit is contained in:
@@ -59,6 +59,27 @@ in
|
||||
}
|
||||
];
|
||||
}
|
||||
{
  # Rule group covering the borgbackup jobs. Both rules read systemd unit
  # metrics (node_systemd_*) and match units named borgbackup-job-<name>.
  name = "backup";
  rules = [
    {
      # Fires when a borgbackup service unit has been in the systemd
      # "failed" state for 5 minutes.
      alert = "BackupJobFailed";
      # The ''…'' string passes the backslashes through untouched, so PromQL
      # receives "\\." inside its double-quoted matcher, i.e. a literal dot.
      expr = ''node_systemd_unit_state{name=~"borgbackup-job-.+\\.service",state="failed"} == 1'';
      for = "5m";
      labels.severity = "warning";
      annotations.summary = "Backup job {{ $labels.name }} on {{ $labels.instance }} failed";
      annotations.description = "The borgbackup run did not complete. Check `systemctl status {{ $labels.name }}` and `journalctl -u {{ $labels.name }}` on the client; `borgbackup-create` re-runs it.";
    }
    {
      # Fires when a borgbackup timer's last trigger is more than 26h old
      # (93600 s = 26 * 3600) and stays that way for 30 minutes.
      # NOTE(review): this expression only matches existing series; if the
      # metric is absent altogether (collector disabled, timer unit removed)
      # the alert cannot fire. Confirm whether a separate absent() rule is
      # wanted for that case.
      alert = "BackupStale";
      expr = ''time() - node_systemd_timer_last_trigger_seconds{name=~"borgbackup-job-.+\\.timer"} > 93600'';
      for = "30m";
      labels.severity = "warning";
      # The summary names the timer, like BackupJobFailed names the service,
      # so hosts with several backup jobs get distinguishable alerts. It
      # describes what the expression measures (timer trigger age), not job
      # success, which BackupJobFailed covers.
      annotations.summary = "Backup timer {{ $labels.name }} on {{ $labels.instance }} has not fired for over 26h";
      # $value is the age of the last trigger in seconds; showing it lets the
      # reader tell a slightly late run from backups that stopped entirely.
      annotations.description = "The daily backup timer {{ $labels.name }} last fired {{ $value | humanizeDuration }} ago, outside its expected window; the most recent scheduled archive is stale. A value far above 26h means backups have stopped entirely or the timer has never run.";
    }
  ];
}
|
||||
];
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user