diff --git a/modules/monitoring/alerts.nix b/modules/monitoring/alerts.nix
index f71e4b6..311873f 100644
--- a/modules/monitoring/alerts.nix
+++ b/modules/monitoring/alerts.nix
@@ -59,6 +59,33 @@ in
           }
         ];
       }
+      # Backup alerts built on node_systemd_* metrics (presumably node_exporter's systemd collector — confirm it is enabled).
+      {
+        name = "backup";
+        rules = [
+          # A borgbackup-job-*.service unit has stayed in the "failed" state for 5 minutes.
+          # Nix '' strings keep backslashes literally, so PromQL receives \\. (an escaped literal dot).
+          {
+            alert = "BackupJobFailed";
+            expr = ''node_systemd_unit_state{name=~"borgbackup-job-.+\\.service",state="failed"} == 1'';
+            for = "5m";
+            labels.severity = "warning";
+            annotations.summary = "Backup job {{ $labels.name }} on {{ $labels.instance }} failed";
+            annotations.description = "The borgbackup run did not complete. Check `systemctl status {{ $labels.name }}` and `journalctl -u {{ $labels.name }}` on the client; `systemctl start {{ $labels.name }}` re-runs it.";
+          }
+          # A borgbackup-job-*.timer last triggered more than 93600 s (26 h) ago.
+          # The metric only records the timer firing, not whether the run succeeded,
+          # so the annotations talk about the timer rather than a "successful backup".
+          {
+            alert = "BackupStale";
+            expr = ''time() - node_systemd_timer_last_trigger_seconds{name=~"borgbackup-job-.+\\.timer"} > 93600'';
+            for = "30m";
+            labels.severity = "warning";
+            annotations.summary = "Backup timer {{ $labels.name }} on {{ $labels.instance }} has not fired for over 26h";
+            annotations.description = "The daily backup timer {{ $labels.name }} has not fired within its expected window, so no new archive has been started and the most recent one is stale. This tracks the timer only; a run that started but failed is reported by BackupJobFailed. A value far above 26h means backups have stopped entirely; if the timer metric is missing altogether, this alert cannot fire.";
+          }
+        ];
+      }
     ];
   };
 }