# Clan service module for Prometheus monitoring.
#
# Roles:
#   server - enables Prometheus and generates one scrape job per machine in
#            the `nodes` role, addressing each machine by its published
#            yggdrasil address.
#   nodes  - enables every exporter listed in the role settings and opens its
#            firewall port.
{ clanLib, ... }:
{
  _class = "clan.service";
  manifest.name = "prometheus";
  manifest.description = "The Prometheus monitoring system and time series database.";
  manifest.readme = builtins.readFile ./README.md;
  manifest.categories = [ "System" ];

  roles.server = {
    description = "Prometheus server that scraps all data from nodes";

    interface =
      { lib, ... }:
      {
        options = {
          # Passed through unchanged to services.prometheus.globalConfig.scrape_interval.
          scrape_interval = lib.mkOption {
            type = with lib.types; nullOr str;
            default = "1m";
            description = "How often to scrape targets. Default is 1 minutes";
          };

          # Appended to the rules of the built-in "default" rule group below.
          extraRules = lib.mkOption {
            type = with lib.types; listOf attrs;
            default = [ ];
            description = "Additional rules for Prometheus";
          };
        };
      };

    perInstance =
      { settings, roles, ... }:
      {
        nixosModule =
          { config, lib, ... }:
          let
            # Returns the public value of the yggdrasil generator's `address`
            # file for `machineName`, or null when getPublicValue falls back
            # to its default.
            #
            # The guard tests the whole attribute path with `?` so that a
            # machine without the yggdrasil generator reaches the explicit
            # throw below instead of failing earlier with a generic
            # "attribute missing" error.
            getYggdrasilIP =
              machineName:
              if config.clan.core.vars.generators ? yggdrasil.files.address.value then
                clanLib.getPublicValue {
                  flake = config.clan.core.settings.directory;
                  machine = machineName;
                  generator = "yggdrasil";
                  file = "address";
                  default = null;
                }
              else
                throw "clanService/yggdrasil is required";

            # Builds the static_configs entry for one exporter of one machine.
            mkExporterTarget =
              machineName: exporterName: exporterVal:
              let
                # Prefer a port given in the node's settings; otherwise use the
                # port option of the same exporter from this machine's config.
                targetPort = exporterVal.port or config.services.prometheus.exporters.${exporterName}.port;

                address = getYggdrasilIP machineName;

                # A null address cannot be interpolated into the target string;
                # fail with a message that names the machine instead of a bare
                # string-coercion error.
                targetHost =
                  if address == null then
                    throw "prometheus: no yggdrasil address is available for machine '${machineName}'"
                  else
                    address;
              in
              {
                # The host is wrapped in brackets (IPv6 literal notation).
                targets = [ "[${targetHost}]:${toString targetPort}" ];
              };

            # Builds the scrape job for one machine of the `nodes` role; the
            # job name is the machine name and there is one static config per
            # configured exporter.
            mkScrapeConfig = machineName: machineVal: {
              tls_config.insecure_skip_verify = true;
              job_name = machineName;
              static_configs = lib.mapAttrsToList (mkExporterTarget machineName) machineVal.settings.exporters;
            };

            # Alerting rules that are always installed.
            defaultRules = [
              {
                alert = "NodesDown";
                expr = "count by (job) (up == 0) > 0";
                for = "1m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
              }
              {
                alert = "SmartCtlErrors";
                expr = "smartctl_device_error_log_count > 0";
                for = "5m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = '' Errors occur on {{ $labels.job }} Disk {{ $labels.device }} {{ $value }} '';
              }
              {
                alert = "ZFSPoolsHealth";
                expr = "zfs_pool_health > 0";
                for = "5m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = '' Unhealthy Pool at {{ $labels.job }} Pool {{ $labels.pool }} value {{ $value }} '';
              }
            ];
          in
          {
            # NOTE(review): 9090 is presumably the Prometheus listen port;
            # services.prometheus.port is not set in this module - confirm.
            networking.firewall.allowedTCPPorts = [ 9090 ];

            services.prometheus = {
              enable = true;

              globalConfig = {
                scrape_interval = settings.scrape_interval;
              };

              # Alerts are delivered to an Alertmanager at localhost:9093.
              alertmanagers = [
                {
                  scheme = "http";
                  path_prefix = "/";
                  static_configs = [
                    {
                      targets = [ "localhost:9093" ];
                    }
                  ];
                }
              ];

              scrapeConfigs = lib.mapAttrsToList mkScrapeConfig roles.nodes.machines;

              # A single JSON-encoded rule file holding the built-in rules
              # followed by the user-supplied extraRules.
              rules = [
                (builtins.toJSON {
                  groups = [
                    {
                      name = "default";
                      rules = defaultRules ++ settings.extraRules;
                    }
                  ];
                })
              ];
            };
          };
      };
  };

  roles.nodes = {
    description = "A node will expose metrics for server to harvest";

    interface =
      { lib, ... }:
      {
        options = {
          # Attribute names are exporter names.
          # NOTE(review): `submodule { }` declares no options, so per-exporter
          # settings such as `port` are presumably rejected - confirm whether
          # a freeform type was intended.
          exporters = lib.mkOption {
            type = lib.types.attrsOf (lib.types.submodule { });
            default = { };
            description = "Mirror of services.prometheus.exporters";
          };
        };
      };

    perInstance =
      { settings, ... }:
      let
        # Force `enable` and `openFirewall` on for every listed exporter,
        # overriding any value given in the settings.
        enabledExporters = builtins.mapAttrs (
          _name: value:
          value
          // {
            enable = true;
            openFirewall = true;
          }
        ) settings.exporters;
      in
      {
        nixosModule =
          { ... }:
          {
            services.prometheus.exporters = enabledExporters;
          };
      };
  };
}