Move the default Prometheus alert rules into the clan prometheus module.

* inventories/default.nix: comment out the locally defined "NodeDown" rule group.
* modules/clan/prometheus/default.nix: add an `extraRules` option and define the
  built-in "default" rule group (NodesDown, SmartCtlErrors) with `extraRules` appended.

Note: the `index` blob ids below were recorded before the review edits to the
second file; regenerate the patch with `git diff` if exact ids are needed.

diff --git a/inventories/default.nix b/inventories/default.nix
index b2028c2..bb426fa 100644
--- a/inventories/default.nix
+++ b/inventories/default.nix
@@ -360,23 +360,23 @@
         };
 
         services.prometheus = {
-          rules = [
-            (builtins.toJSON {
-              groups = [
-                {
-                  name = "default";
-                  rules = [
-                    {
-                      alert = "NodeDown";
-                      expr = "up == 0";
-                      for = "1m";
-                      annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes. {{ $labels.instance }}";
-                    }
-                  ];
-                }
-              ];
-            })
-          ];
+          # rules = [
+          #   (builtins.toJSON {
+          #     groups = [
+          #       {
+          #         name = "default";
+          #         rules = [
+          #           {
+          #             alert = "NodeDown";
+          #             expr = "up == 0";
+          #             for = "1m";
+          #             annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes. {{ $labels.instance }}";
+          #           }
+          #         ];
+          #       }
+          #     ];
+          #   })
+          # ];
 
           alertmanager = {
             enable = true;
diff --git a/modules/clan/prometheus/default.nix b/modules/clan/prometheus/default.nix
index 62fae66..a63abbe 100644
--- a/modules/clan/prometheus/default.nix
+++ b/modules/clan/prometheus/default.nix
@@ -18,6 +18,13 @@
             default = "1m";
             description = "How often to scrape targets. Default is 1 minutes";
           };
+          # Extra rule attribute sets; they are appended to the rules of the built-in "default" group.
+          # `attrsOf anything` is used because the free-form `types.attrs` is discouraged by the nixpkgs manual.
+          extraRules = lib.mkOption {
+            type = with lib.types; listOf (attrsOf anything);
+            default = [ ];
+            description = "Additional rules for Prometheus";
+          };
         };
       };
 
@@ -78,6 +85,42 @@
               ) machineVal.settings.exporters;
             }) roles.nodes.machines;
 
+            # Built-in alert rules, serialised with builtins.toJSON; settings.extraRules is appended to the "default" group.
+            rules = [
+              (builtins.toJSON {
+                groups = [
+                  {
+                    name = "default";
+                    rules = [
+                      {
+                        alert = "NodesDown";
+                        expr = "up == 0";
+                        for = "1m";
+                        labels = {
+                          severity = "critical";
+                        };
+                        annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minute. {{ $labels.instance }}";
+                      }
+                      {
+                        alert = "SmartCtlErrors";
+                        expr = "smartctl_device_error_log_count > 0";
+                        for = "5m";
+                        labels = {
+                          severity = "medium";
+                        };
+                        annotations.summary = ''
+                          Errors occur on {{ $labels.job }}
+                          Disk {{ $labels.device }}
+                          {{ $labels.instance }}
+                        '';
+                      }
+                    ]
+                    ++ settings.extraRules;
+                  }
+                ];
+              })
+            ];
+
           };
         };
 