Files
infra/modules/clan/prometheus/default.nix
T
kurogeek 4b5f2aa68b mob next [ci-skip] [ci skip] [skip ci]
lastFile:modules/clan/prometheus/default.nix
2026-06-22 14:10:03 +07:00

308 lines
10 KiB
Nix

{ clanLib, ... }:
{
# Marks this file as a clan service definition.
_class = "clan.service";

# Service metadata, grouped into a single attribute set.
manifest = {
  name = "prometheus";
  description = "The Prometheus monitoring system and time series database.";
  # The README that sits next to this file is embedded verbatim.
  readme = builtins.readFile ./README.md;
  categories = [ "System" ];
};
roles.server = {
description = "Prometheus server that scraps all data from nodes";
interface =
  { lib, ... }:
  {
    # Settings accepted by the `server` role.
    options = {
      scrape_interval = lib.mkOption {
        type = with lib.types; nullOr str;
        default = "1m";
        description = "How often to scrape targets. Default is 1 minutes";
      };

      extra_rules = lib.mkOption {
        type = with lib.types; listOf attrs;
        default = [ ];
        description = "Additional rules for Prometheus";
      };

      default_receiver = lib.mkOption {
        type = with lib.types; attrs;
        default = {
          name = "default";
        };
        description = "Definition of a default receiver, default is doing nothing";
      };

      # Optional bridge that forwards alerts to Matrix rooms.
      matrix-alertmanager = {
        enable = lib.mkOption {
          type = with lib.types; bool;
          default = false;
          description = "Whether to enable `services.matrix-alertmanager`";
        };

        homeserverUrl = lib.mkOption {
          type = with lib.types; str;
          default = "https://matrix-client.matrix.org";
          description = "URL of the Matrix homeserver to use";
        };

        # No default: has to be provided whenever the integration is enabled.
        matrixUser = lib.mkOption {
          type = with lib.types; str;
          description = "Matrix user for the bot";
        };

        matrixRooms = lib.mkOption {
          type = lib.types.listOf (
            lib.types.submodule {
              options = {
                receivers = lib.mkOption {
                  type = lib.types.listOf lib.types.str;
                  description = "List of receivers for this room";
                };
                roomId = lib.mkOption {
                  type = lib.types.str;
                  description = "Matrix room ID";
                  # Fail evaluation early when something other than a room ID
                  # (e.g. a room alias) is supplied.
                  apply =
                    x:
                    assert lib.assertMsg (lib.hasPrefix "!" x) "Matrix room ID must start with a '!'. Got: ${x}";
                    x;
                };
              };
            }
          );
          # An empty default keeps the option readable when the Matrix
          # integration is left disabled; without it, any consumer that reads
          # the room list aborts with "option used but not defined".
          default = [ ];
          description = ''
            Combination of Alertmanager receiver(s) and rooms for the bot to join.
            Each Alertmanager receiver can be mapped to post to a matrix room.
            Note, you must use a room ID and not a room alias/name. Room IDs start
            with a "!".
          '';
          example = [
            {
              receivers = [
                "receiver1"
                "receiver2"
              ];
              roomId = "!roomid@example.com";
            }
            {
              receivers = [ "receiver3" ];
              roomId = "!differentroomid@example.com";
            }
          ];
        };
      };
    };
  };
# Builds the NixOS module for a machine carrying the `server` role:
# Prometheus itself, Alertmanager, one scrape job per `nodes` machine and,
# when enabled, the matrix-alertmanager bridge with its generated secrets.
perInstance =
  {
    settings,
    roles,
    ...
  }:
  {
    nixosModule =
      {
        config,
        lib,
        pkgs,
        ...
      }:
      let
        matrixCfg = settings.matrix-alertmanager;
        generators = config.clan.core.vars.generators;

        # Looks up the public yggdrasil address of `machineName`.
        # The whole attribute path is tested with `?`, so a missing yggdrasil
        # generator reaches the explicit error below instead of failing with
        # a bare "attribute missing".
        getYggdrasilIP =
          machineName:
          if generators ? yggdrasil.files.address.value then
            clanLib.getPublicValue {
              flake = config.clan.core.settings.directory;
              machine = machineName;
              generator = "yggdrasil";
              file = "address";
              default = null;
            }
          else
            throw "clanService/yggdrasil is required";

        # De-duplicated receiver names referenced by the configured rooms.
        # Empty while the Matrix integration is disabled, so neither the room
        # list nor the url-file generator (which only exists when enabled) is
        # touched in that case.
        matrixRoomReceivers = lib.optionals matrixCfg.enable (
          lib.unique (lib.concatMap (entry: entry.receivers) matrixCfg.matrixRooms)
        );

        # The user-supplied default receiver; `name` falls back to "default"
        # because the route below has to reference it by name.
        defaultReceiver = {
          name = "default";
        }
        // settings.default_receiver;
      in
      {
        config = lib.mkMerge [
          {
            networking.firewall.allowedTCPPorts = [
              9090
            ];

            services.prometheus = {
              enable = true;

              globalConfig = {
                scrape_interval = settings.scrape_interval;
              };

              # Prometheus pushes alerts to the local Alertmanager.
              alertmanagers = [
                {
                  scheme = "http";
                  path_prefix = "/";
                  static_configs = [ { targets = [ "localhost:9093" ]; } ];
                }
              ];

              alertmanager = {
                enable = true;
                configuration = {
                  global = {
                    resolve_timeout = "5m";
                  };
                  route = {
                    receiver = defaultReceiver.name;
                    # NOTE(review): these child routes carry no matchers and no
                    # `continue`, so the first one presumably captures every
                    # alert — confirm this is the intended routing.
                    routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
                  };
                  receivers = [
                    defaultReceiver
                  ]
                  ++ map (mReceiver: {
                    name = mReceiver;
                    # Alertmanager's receiver key is the plural `webhook_configs`.
                    webhook_configs = [
                      {
                        url_file = generators.prometheus.files.matrix-alertmanager-urlfile.path;
                        send_resolved = true;
                      }
                    ];
                  }) matrixRoomReceivers;
                };
              };

              # One scrape job per `nodes` machine, one static target per exporter.
              scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
                tls_config.insecure_skip_verify = true;
                job_name = machineName;
                static_configs = lib.mapAttrsToList (
                  exporterName: exporterVal:
                  let
                    # Prefer a port set on the node; otherwise use the port this
                    # machine's own exporter option reports for that exporter.
                    targetPort =
                      if exporterVal ? port then
                        exporterVal.port
                      else
                        config.services.prometheus.exporters.${exporterName}.port;
                    address = getYggdrasilIP machineName;
                    # A null address cannot be interpolated; fail with a message
                    # that names the offending machine.
                    targetHost =
                      if address == null then
                        throw "prometheus: no yggdrasil address available for machine '${machineName}'"
                      else
                        address;
                  in
                  {
                    targets = [ "[${targetHost}]:${toString targetPort}" ];
                  }
                ) machineVal.settings.exporters;
              }) roles.nodes.machines;

              # Built-in alert rules followed by the user's `extra_rules`.
              rules = [
                (builtins.toJSON {
                  groups = [
                    {
                      name = "default";
                      rules = [
                        {
                          alert = "NodesDown";
                          expr = "count by (job) (up == 0) > 0";
                          for = "1m";
                          labels = {
                            severity = "critical";
                          };
                          annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
                        }
                        {
                          alert = "SmartCtlErrors";
                          expr = "smartctl_device_error_log_count > 0";
                          for = "5m";
                          labels = {
                            severity = "critical";
                          };
                          annotations.summary = ''
                            Errors occur on {{ $labels.job }}
                            Disk {{ $labels.device }} {{ $value }}
                          '';
                        }
                        {
                          alert = "ZFSPoolsHealth";
                          expr = "zfs_pool_health > 0";
                          for = "5m";
                          labels = {
                            severity = "critical";
                          };
                          annotations.summary = ''
                            Unhealthy Pool at {{ $labels.job }}
                            Pool {{ $labels.pool }} value {{ $value }}
                          '';
                        }
                      ]
                      ++ settings.extra_rules;
                    }
                  ];
                })
              ];
            };
          }

          # Matrix bridge and its secrets. Combined through the module system
          # (`mkMerge`/`mkIf`) so `services.prometheus` above is kept intact.
          (lib.mkIf matrixCfg.enable {
            clan.core.vars.generators.prometheus = {
              files.matrix-alertmanager-token.secret = true;
              files.matrix-alertmanager-secret.secret = true;
              files.matrix-alertmanager-urlfile = {
                secret = true;
                # NOTE(review): assumes an `alertmanager` user and group exist
                # on the target machine — confirm.
                owner = "alertmanager";
                group = "alertmanager";
              };
              # The token is generated empty; presumably it is filled in by
              # hand afterwards — TODO confirm. The webhook URL embeds the
              # freshly generated shared secret.
              script = ''
                echo "" > $out/matrix-alertmanager-token
                openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
                echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
              '';
              runtimeInputs = [
                pkgs.openssl
              ];
            };

            services.matrix-alertmanager = {
              enable = true;
              tokenFile = generators.prometheus.files.matrix-alertmanager-token.path;
              secretFile = generators.prometheus.files.matrix-alertmanager-secret.path;
              homeserverUrl = matrixCfg.homeserverUrl;
              matrixUser = matrixCfg.matrixUser;
              matrixRooms = matrixCfg.matrixRooms;
            };
          })
        ];
      };
  };
};
# Role for machines that expose metrics to the Prometheus server.
roles.nodes = {
  description = "A node will expose metrics for server to harvest";
  interface =
    { lib, ... }:
    {
      options = {
        # Keyed by exporter name. Each value is a free-form attribute set so
        # that exporter settings such as `port` can actually be supplied; an
        # option-less submodule would reject every attribute.
        exporters = lib.mkOption {
          type = lib.types.attrsOf lib.types.attrs;
          default = { };
          description = "Mirror of services.prometheus.exporters";
        };
      };
    };
  perInstance =
    { settings, ... }:
    let
      # Every listed exporter is switched on and its firewall port opened;
      # these two flags override whatever the caller supplied for them.
      enabledExporters = builtins.mapAttrs (
        _: exporterSettings:
        exporterSettings
        // {
          enable = true;
          openFirewall = true;
        }
      ) settings.exporters;
    in
    {
      nixosModule = {
        services.prometheus.exporters = enabledExporters;
      };
    };
};
}