Files
infra/modules/clan/prometheus/default.nix
T
kurogeek 693a97ec26 mob next [ci-skip] [ci skip] [skip ci]
lastFile:modules/clan/prometheus/default.nix
2026-06-19 21:54:51 +07:00

205 lines
6.1 KiB
Nix

{ clanLib, ... }:
{
_class = "clan.service";
manifest.name = "prometheus";
manifest.description = "The Prometheus monitoring system and time series database.";
manifest.readme = builtins.readFile ./README.md;
manifest.categories = [ "System" ];
roles.server = {
description = "Prometheus server that scraps all data from nodes";
interface =
{ lib, ... }:
{
options = {
scrape_interval = lib.mkOption {
type = with lib.types; nullOr str;
default = "1m";
description = "How often to scrape targets. Default is 1 minutes";
};
extra_rules = lib.mkOption {
type = with lib.types; listOf attrs;
default = [ ];
description = "Additional rules for Prometheus";
};
default_receiver = lib.mkOption {
type = with lib.types; attrs;
default = {
name = "default";
};
description = "Definition of a default receiver, default is doing nothing";
};
};
};
perInstance =
{
settings,
roles,
...
}:
{
nixosModule =
{
config,
lib,
...
}:
let
getYggdrasilIP =
machineName:
if config.clan.core.vars.generators.yggdrasil.files.address ? value then
clanLib.getPublicValue {
flake = config.clan.core.settings.directory;
machine = machineName;
generator = "yggdrasil";
file = "address";
default = null;
}
else
throw "clanService/yggdrasil is required";
in
{
networking.firewall.allowedTCPPorts = [
9090
];
services.prometheus = {
enable = true;
globalConfig = {
scrape_interval = settings.scrape_interval;
};
alertmanagers = [
{
scheme = "http";
path_prefix = "/";
static_configs = [ { targets = [ "localhost:9093" ]; } ];
}
];
alertmanager = {
enable = true;
configuration = {
global = {
resolve_timeout = "5m";
};
route = {
receiver = "default";
};
receivers = [
{ name = "default"; }
];
};
};
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
tls_config.insecure_skip_verify = true;
job_name = "${machineName}";
static_configs = lib.mapAttrsToList (
exporterName: exporterVal:
let
targetPort =
if exporterVal ? port then
exporterVal.port
else
config.services.prometheus.exporters."${exporterName}".port;
targetHost = getYggdrasilIP machineName;
in
{
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
}
) machineVal.settings.exporters;
}) roles.nodes.machines;
rules = [
(builtins.toJSON {
groups = [
{
name = "default";
rules = [
{
alert = "NodesDown";
expr = "count by (job) (up == 0) > 0";
for = "1m";
labels = {
severity = "critical";
};
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
}
{
alert = "SmartCtlErrors";
expr = "smartctl_device_error_log_count > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Errors occur on {{ $labels.job }}
Disk {{ $labels.device }} {{ $value }}
'';
}
{
alert = "ZFSPoolsHealth";
expr = "zfs_pool_health > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Unhealthy Pool at {{ $labels.job }}
Pool {{ $labels.pool }} value {{ $value }}
'';
}
]
++ settings.extraRules;
}
];
})
];
};
};
};
};
roles.nodes = {
description = "A node will expose metrics for server to harvest";
interface =
{ lib, ... }:
{
options = {
exporters = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule { });
default = { };
description = "Mirror of services.prometheus.exporters";
};
};
};
perInstance =
{ settings, ... }:
let
enabledExporters = builtins.mapAttrs (
name: value:
value
// {
enable = true;
openFirewall = true;
}
) settings.exporters;
in
{
nixosModule =
{ ... }:
{
services.prometheus.exporters = enabledExporters;
};
};
};
}