0544bf95e5
BackupJobFailed fires when a borgbackup job enters the systemd failed state; BackupStale fires when the daily timer has not run in over 26h (or has never run). Both read the node_exporter systemd collector on the backup client, matching the CNX Backups dashboard.
86 lines
3.8 KiB
Nix
86 lines
3.8 KiB
Nix
# Alerting rules, evaluated by vmalert against VictoriaMetrics on control.
# Everything is declared here in git. vmalert remote-writes alert state back to
# VM, so firing alerts surface as the `ALERTS{alertstate="firing"}` series and
# can be viewed in Grafana. No notifier is wired yet: notifier.blackhole makes
# that explicit (vmalert evaluates rules but sends nowhere). To deliver alerts
# later, drop blackhole and set settings."notifier.url" to an Alertmanager.
{ ... }:
let
  # VictoriaMetrics endpoint on loopback; vmalert both queries it and writes
  # alert state back to it.
  vmUrl = "http://127.0.0.1:8428";
in
{
  # One vmalert instance ("cnx") carrying every alerting rule, grouped by area.
  services.vmalert.instances.cnx = {
    enable = true;
    settings = {
      "datasource.url" = vmUrl;
      "remoteWrite.url" = vmUrl; # persists ALERTS / ALERTS_FOR_STATE back to VM
      "notifier.blackhole" = true;
      "httpListenAddr" = "127.0.0.1:8880"; # vmalert UI/API, loopback only (like VM)
    };
    rules.groups = [
      {
        name = "dns";
        rules = [
          {
            # Fires when the highest and lowest serial reported for the same
            # zone have differed for 15 minutes.
            alert = "DNSSecondaryOutOfSync";
            expr = "max by (zone) (knot_zone_serial) - min by (zone) (knot_zone_serial) > 0";
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Zone {{ $labels.zone }} SOA serial differs between nameservers";
            annotations.description = "The secondary is out of sync with the primary for {{ $labels.zone }}. `knotc zone-retransfer {{ $labels.zone }}` on ns2 forces a fresh pull.";
          }
          {
            # 3600 matches the "within 1h" wording below, i.e. the metric is
            # presumably the remaining expire time in seconds.
            alert = "ZoneExpiryLow";
            expr = "knot_zone_status_expiration < 3600";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "Zone {{ $labels.zone }} on {{ $labels.instance }} is within 1h of expiry";
            annotations.description = "Transfers to the secondary appear to be failing; the zone stops being served when the SOA expire timer hits zero.";
          }
        ];
      }
      {
        name = "host";
        rules = [
          {
            # Any scrape target whose `up` series has been 0 for 5 minutes.
            alert = "ScrapeTargetDown";
            expr = "up == 0";
            for = "5m";
            labels.severity = "critical";
            annotations.summary = "{{ $labels.job }} exporter on {{ $labels.instance }} is down";
            annotations.description = "VictoriaMetrics cannot scrape this target; its metrics are missing.";
          }
          {
            # Used percentage of "/" = 100 * (1 - avail / size).
            alert = "RootFilesystemFull";
            expr = ''100 * (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) > 90'';
            for = "15m";
            labels.severity = "warning";
            annotations.summary = "Root filesystem on {{ $labels.instance }} is over 90% full";
            annotations.description = "Usage of / has stayed above 90% for 15 minutes. Free up or extend space before writes on the host start failing.";
          }
        ];
      }
      {
        name = "backup";
        rules = [
          {
            # Backslashes are literal inside a Nix ''…'' string, so the query
            # receives `\\.`; inside the double-quoted label matcher that is an
            # escaped backslash, giving the regex `\.` (a literal dot).
            alert = "BackupJobFailed";
            expr = ''node_systemd_unit_state{name=~"borgbackup-job-.+\\.service",state="failed"} == 1'';
            for = "5m";
            labels.severity = "warning";
            annotations.summary = "Backup job {{ $labels.name }} on {{ $labels.instance }} failed";
            annotations.description = "The borgbackup run did not complete. Check `systemctl status {{ $labels.name }}` and `journalctl -u {{ $labels.name }}` on the client; `borgbackup-create` re-runs it.";
          }
          {
            # 93600 s = 26 h: one daily interval plus two hours of slack.
            # This measures when the timer last *fired*, not whether the run
            # succeeded; failed runs are covered by BackupJobFailed above.
            # NOTE(review): a timer that never triggered is presumably exported
            # with a last-trigger time of 0 and therefore fires this rule, but a
            # series that is missing altogether does not (the comparison returns
            # nothing). Consider a separate absent() rule if that matters.
            alert = "BackupStale";
            expr = ''time() - node_systemd_timer_last_trigger_seconds{name=~"borgbackup-job-.+\\.timer"} > 93600'';
            for = "30m";
            labels.severity = "warning";
            annotations.summary = "Backup timer {{ $labels.name }} on {{ $labels.instance }} has not fired for over 26h";
            annotations.description = "The daily backup timer {{ $labels.name }} has not fired within its expected window; the most recent archive is stale. A value far above 26h means backups have stopped entirely.";
          }
        ];
      }
    ];
  };
}
|