diff --git a/inventories/default.nix b/inventories/default.nix index d871bbb..9bbcfc6 100644 --- a/inventories/default.nix +++ b/inventories/default.nix @@ -360,23 +360,6 @@ }; services.prometheus = { - # rules = [ - # (builtins.toJSON { - # groups = [ - # { - # name = "default"; - # rules = [ - # { - # alert = "NodeDown"; - # expr = "up == 0"; - # for = "1m"; - # annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes. {{ $labels.instance }}"; - # } - # ]; - # } - # ]; - # }) - # ]; alertmanager = { enable = true; diff --git a/modules/clan/prometheus/default.nix b/modules/clan/prometheus/default.nix index a63abbe..e54baae 100644 --- a/modules/clan/prometheus/default.nix +++ b/modules/clan/prometheus/default.nix @@ -103,11 +103,24 @@ expr = "smartctl_device_error_log_count > 0"; for = "5m"; labels = { - severity = "medium"; + severity = "critical"; }; annotations.summary = '' Errors occur on {{ $labels.job }} - Disk {{ $labels.device }} + Disk {{ $labels.device }} {{ $value }} + {{ $labels.instance }} + ''; + } + { + alert = "ZFSPoolsHealth"; + expr = "zfs_pool_health > 0"; + for = "5m"; + labels = { + severity = "critical"; + }; + annotations.summary = '' + Unhealthy Pool at {{ $labels.job }} + Pool {{ $labels.pool }} value {{ $value }} {{ $labels.instance }} ''; }