# Clan service definition for Prometheus.
#
# Two roles are declared:
#   * `server` - enables Prometheus and Alertmanager and generates one scrape
#     job per machine that carries the `nodes` role.
#   * `nodes`  - enables the exporters listed in the role settings and opens
#     their firewall ports.
{ clanLib, ... }:
{
  _class = "clan.service";
  manifest.name = "prometheus";
  manifest.description = "The Prometheus monitoring system and time series database.";
  manifest.readme = builtins.readFile ./README.md;
  manifest.categories = [ "System" ];

  roles.server = {
    description = "Prometheus server that scraps all data from nodes";

    # Settings accepted by the `server` role.
    interface =
      { lib, options, ... }:
      {
        options = {
          # Forwarded to `services.prometheus.globalConfig.scrape_interval`.
          scrape_interval = lib.mkOption {
            type = with lib.types; nullOr str;
            default = "1m";
            description = "How often to scrape targets. Default is 1 minutes";
          };

          # Appended to the built-in alert rules of the "default" rule group.
          extra_rules = lib.mkOption {
            type = with lib.types; listOf attrs;
            default = [ ];
            description = "Additional rules for Prometheus";
          };

          # Used both as the Alertmanager route target (via its `name`) and as
          # the single entry in Alertmanager's `receivers` list.
          default_receiver = lib.mkOption {
            type = with lib.types; attrs;
            default = {
              name = "default";
            };
            description = "Definition of a default receiver, default is doing nothing";
          };

          matrix-alertmanager = {
            enable = lib.mkOption {
              type = with lib.types; bool;
              default = false;
              description = "Whether to enable `services.matrix-alertmanager`";
            };

            homeserverUrl = lib.mkOption {
              type = with lib.types; str;
              default = "https://matrix-client.matrix.org";
              description = "URL of the Matrix homeserver to use";
            };

            matrixUser = lib.mkOption {
              type = with lib.types; str;
              description = "Matrix user for the bot";
            };

            matrixRooms = lib.mkOption {
              type = lib.types.listOf (
                lib.types.submodule {
                  options = {
                    receivers = lib.mkOption {
                      type = lib.types.listOf lib.types.str;
                      description = "List of receivers for this room";
                    };
                    roomId = lib.mkOption {
                      type = lib.types.str;
                      description = "Matrix room ID";
                      # Reject values that do not start with "!" at evaluation time.
                      apply =
                        x:
                        assert lib.assertMsg (lib.hasPrefix "!" x) "Matrix room ID must start with a '!'. Got: ${x}";
                        x;
                    };
                  };
                }
              );
              description = ''
                Combination of Alertmanager receiver(s) and rooms for the bot to join.
                Each Alertmanager receiver can be mapped to post to a matrix room.
                Note, you must use a room ID and not a room alias/name. Room IDs start
                with a "!".
              '';
              example = [
                {
                  receivers = [
                    "receiver1"
                    "receiver2"
                  ];
                  roomId = "!roomid@example.com";
                }
                {
                  receivers = [ "receiver3" ];
                  roomId = "!differentroomid@example.com";
                }
              ];
            };
          };
        };
      };

    perInstance =
      { settings, roles, ... }:
      {
        # `pkgs` is taken here (not in a nested function) so that the optional
        # matrix-alertmanager part below can be a plain attribute set.
        nixosModule =
          {
            config,
            lib,
            pkgs,
            ...
          }:
          let
            # Look up the public yggdrasil address of `machineName`.
            # Throws when this machine has no yggdrasil `address` value, i.e.
            # the yggdrasil clan service is not set up.
            # NOTE(review): the lookup passes `default = null`; a null result
            # would fail when interpolated into the target string - confirm.
            getYggdrasilIP =
              machineName:
              if config.clan.core.vars.generators.yggdrasil.files.address ? value then
                clanLib.getPublicValue {
                  flake = config.clan.core.settings.directory;
                  machine = machineName;
                  generator = "yggdrasil";
                  file = "address";
                  default = null;
                }
              else
                throw "clanService/yggdrasil is required";

            matrixSettings = settings.matrix-alertmanager;
            generatedFiles = config.clan.core.vars.generators.prometheus.files;

            # Build one `static_configs` entry for an exporter of a machine.
            mkExporterTarget =
              machineName: exporterName: exporterVal:
              let
                # Prefer an explicit port; otherwise use the port this machine's
                # own NixOS configuration reports for that exporter.
                targetPort =
                  if exporterVal ? port then
                    exporterVal.port
                  else
                    config.services.prometheus.exporters.${exporterName}.port;
                targetHost = getYggdrasilIP machineName;
              in
              {
                # Host is wrapped in brackets (IPv6 literal syntax).
                targets = [ "[${targetHost}]:${toString targetPort}" ];
              };

            # Build one scrape job per machine in the `nodes` role.
            mkScrapeJob = machineName: machineVal: {
              tls_config.insecure_skip_verify = true;
              job_name = machineName;
              static_configs = lib.mapAttrsToList (mkExporterTarget machineName) machineVal.settings.exporters;
            };

            # Alert rules that are always installed.
            builtinRules = [
              {
                alert = "NodesDown";
                expr = "count by (job) (up == 0) > 0";
                for = "1m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
              }
              {
                alert = "SmartCtlErrors";
                expr = "smartctl_device_error_log_count > 0";
                for = "5m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = ''
                  Errors occur on {{ $labels.job }} Disk {{ $labels.device }} {{ $value }}
                '';
              }
              {
                alert = "ZFSPoolsHealth";
                expr = "zfs_pool_health > 0";
                for = "5m";
                labels = {
                  severity = "critical";
                };
                annotations.summary = ''
                  Unhealthy Pool at {{ $labels.job }} Pool {{ $labels.pool }} value {{ $value }}
                '';
              }
            ];
          in
          {
            # The two parts are combined with `lib.mkMerge` instead of `//`.
            # `//` only replaces top-level attributes, so a second `services`
            # attribute would discard `services.prometheus`, and its right-hand
            # side must be an attribute set, not a function.
            config = lib.mkMerge [
              {
                networking.firewall.allowedTCPPorts = [ 9090 ];

                services.prometheus = {
                  enable = true;

                  globalConfig = {
                    scrape_interval = settings.scrape_interval;
                  };

                  # Point Prometheus at the Alertmanager enabled below.
                  alertmanagers = [
                    {
                      scheme = "http";
                      path_prefix = "/";
                      static_configs = [
                        {
                          targets = [ "localhost:9093" ];
                        }
                      ];
                    }
                  ];

                  alertmanager = {
                    enable = true;
                    configuration = {
                      global = {
                        resolve_timeout = "5m";
                      };
                      # Honour the `default_receiver` setting; with its default
                      # value this yields the receiver `{ name = "default"; }`.
                      route = {
                        receiver = settings.default_receiver.name;
                      };
                      receivers = [ settings.default_receiver ];
                    };
                  };

                  scrapeConfigs = lib.mapAttrsToList mkScrapeJob roles.nodes.machines;

                  # A single JSON-encoded rule file with one group, "default".
                  rules = [
                    (builtins.toJSON {
                      groups = [
                        {
                          name = "default";
                          rules = builtinRules ++ settings.extra_rules;
                        }
                      ];
                    })
                  ];
                };
              }

              # Only present when the Matrix bridge is enabled. `settings` comes
              # from `perInstance`, not from `config`, so a plain
              # `optionalAttrs` is sufficient here.
              (lib.optionalAttrs matrixSettings.enable {
                clan.core.vars.generators.prometheus = {
                  files.matrix-alertmanager-token.secret = true;
                  files.matrix-alertmanager-secret.secret = true;
                  files.matrix-alertmanager-urlfile = {
                    secret = true;
                    owner = "alertmanager";
                    group = "alertmanager";
                  };
                  # Writes an empty token file, a random 32-byte hex secret and
                  # a URL file that embeds that secret.
                  script = ''
                    echo "" > $out/matrix-alertmanager-token
                    openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
                    echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
                  '';
                  runtimeInputs = [ pkgs.openssl ];
                };

                services.matrix-alertmanager = {
                  enable = true;
                  tokenFile = generatedFiles.matrix-alertmanager-token.path;
                  secretFile = generatedFiles.matrix-alertmanager-secret.path;
                  homeserverUrl = matrixSettings.homeserverUrl;
                  matrixUser = matrixSettings.matrixUser;
                  matrixRooms = matrixSettings.matrixRooms;
                };
              })
            ];
          };
      };
  };

  roles.nodes = {
    description = "A node will expose metrics for server to harvest";

    # Settings accepted by the `nodes` role.
    interface =
      { lib, ... }:
      {
        options = {
          # Attribute names are exporter names; each value is merged into
          # `services.prometheus.exporters.<name>` on the node.
          # NOTE(review): the submodule declares no options, so it is unclear
          # whether per-exporter attributes such as `port` can actually be set
          # here - confirm.
          exporters = lib.mkOption {
            type = lib.types.attrsOf (lib.types.submodule { });
            default = { };
            description = "Mirror of services.prometheus.exporters";
          };
        };
      };

    perInstance =
      { settings, ... }:
      let
        # Force every configured exporter on and open its firewall port.
        enabledExporters = builtins.mapAttrs (
          _name: value:
          value
          // {
            enable = true;
            openFirewall = true;
          }
        ) settings.exporters;
      in
      {
        nixosModule =
          { ... }:
          {
            services.prometheus.exporters = enabledExporters;
          };
      };
  };
}