mob next [ci-skip] [ci skip] [skip ci]

lastFile:modules/clan/prometheus/default.nix
This commit is contained in:
2026-06-22 16:49:41 +07:00
parent f2cdea1aec
commit 106591d374
2 changed files with 223 additions and 206 deletions
+89 -75
View File
@@ -320,81 +320,95 @@
name = "prometheus";
input = "self";
};
roles.server.machines."rigel".settings = { };
roles.server.extraModules = [
(
{ config, pkgs, ... }:
{
# clan.core.vars.generators.prometheus = {
# files.matrix-alertmanager-token.secret = true;
# files.matrix-alertmanager-secret.secret = true;
# files.matrix-alertmanager-urlfile = {
# secret = true;
# owner = "alertmanager";
# group = "alertmanager";
# };
# script = ''
# echo "" > $out/matrix-alertmanager-token
# openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
#
# echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
# '';
# runtimeInputs = [
# pkgs.openssl
# ];
# };
#
# services.matrix-alertmanager = {
# enable = true;
# tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
# secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
# homeserverUrl = "https://matrix-client.matrix.org";
# matrixUser = "@kuroiris:matrix.org";
# matrixRooms = [
# {
# receivers = [
# "matrix"
# ];
# roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
# }
# ];
# };
# services.prometheus = {
#
# alertmanager = {
# enable = true;
# configuration = {
# global = {
# resolve_timeout = "5m";
# };
# route = {
# receiver = "default";
# routes = [
# {
# receiver = "matrix";
# }
# ];
# };
# receivers = [
# { name = "default"; }
# {
# name = "matrix";
# webhook_configs = [
# {
# url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
# send_resolved = true;
# }
# ];
# }
# ];
# };
# };
#
# };
}
)
];
roles.server.machines."rigel".settings = {
matrix-alertmanager = {
enable = true;
homeserverUrl = "https://matrix-client.matrix.org";
matrixUser = "@kuroiris:matrix.org";
matrixRooms = [
{
receivers = [
"matrix"
];
roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
}
];
};
};
# roles.server.extraModules = [
# (
# { config, pkgs, ... }:
# {
# # clan.core.vars.generators.prometheus = {
# # files.matrix-alertmanager-token.secret = true;
# # files.matrix-alertmanager-secret.secret = true;
# # files.matrix-alertmanager-urlfile = {
# # secret = true;
# # owner = "alertmanager";
# # group = "alertmanager";
# # };
# # script = ''
# # echo "" > $out/matrix-alertmanager-token
# # openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
# #
# # echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
# # '';
# # runtimeInputs = [
# # pkgs.openssl
# # ];
# # };
# #
# # services.matrix-alertmanager = {
# # enable = true;
# # tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
# # secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
# # homeserverUrl = "https://matrix-client.matrix.org";
# # matrixUser = "@kuroiris:matrix.org";
# # matrixRooms = [
# # {
# # receivers = [
# # "matrix"
# # ];
# # roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
# # }
# # ];
# # };
#
# # services.prometheus = {
# #
# # alertmanager = {
# # enable = true;
# # configuration = {
# # global = {
# # resolve_timeout = "5m";
# # };
# # route = {
# # receiver = "default";
# # routes = [
# # {
# # receiver = "matrix";
# # }
# # ];
# # };
# # receivers = [
# # { name = "default"; }
# # {
# # name = "matrix";
# # webhook_configs = [
# # {
# # url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
# # send_resolved = true;
# # }
# # ];
# # }
# # ];
# # };
# # };
# #
# # };
# }
# )
# ];
roles.nodes.machines = {
vega.settings = {
+134 -131
View File
@@ -122,147 +122,150 @@
lib.concatMap (entry: entry.receivers) settings.matrix-alertmanager.matrixRooms
);
in
{
networking.firewall.allowedTCPPorts = [
9090
];
services.prometheus = {
enable = true;
globalConfig = {
scrape_interval = settings.scrape_interval;
};
alertmanagers = [
{
scheme = "http";
path_prefix = "/";
static_configs = [ { targets = [ "localhost:9093" ]; } ];
}
lib.mkMerge [
{
networking.firewall.allowedTCPPorts = [
9090
];
alertmanager = {
services.prometheus = {
enable = true;
configuration = {
global = {
resolve_timeout = "5m";
globalConfig = {
scrape_interval = settings.scrape_interval;
};
alertmanagers = [
{
scheme = "http";
path_prefix = "/";
static_configs = [ { targets = [ "localhost:9093" ]; } ];
}
];
alertmanager = {
enable = true;
configuration = {
global = {
resolve_timeout = "5m";
};
route = {
receiver = "default";
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
};
receivers = [
{ name = "default"; }
]
++ map (mReceiver: {
name = mReceiver;
webhook_configs = [
{
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
send_resolved = true;
}
];
}) matrixRoomReceivers;
};
route = {
receiver = "default";
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
};
receivers = [
{ name = "default"; }
]
++ map (mReceiver: {
name = mReceiver;
webhook_config = [
};
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
tls_config.insecure_skip_verify = true;
job_name = "${machineName}";
static_configs = lib.mapAttrsToList (
exporterName: exporterVal:
let
targetPort =
if exporterVal ? port then
exporterVal.port
else
config.services.prometheus.exporters."${exporterName}".port;
targetHost = getYggdrasilIP machineName;
in
{
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
}
) machineVal.settings.exporters;
}) roles.nodes.machines;
rules = [
(builtins.toJSON {
groups = [
{
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
send_resolved = true;
name = "default";
rules = [
{
alert = "NodesDown";
expr = "count by (job) (up == 0) > 0";
for = "1m";
labels = {
severity = "critical";
};
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
}
{
alert = "SmartCtlErrors";
expr = "smartctl_device_error_log_count > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Errors occur on {{ $labels.job }}
Disk {{ $labels.device }} {{ $value }}
'';
}
{
alert = "ZFSPoolsHealth";
expr = "zfs_pool_health > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Unhealthy Pool at {{ $labels.job }}
Pool {{ $labels.pool }} value {{ $value }}
'';
}
]
++ settings.extra_rules;
}
];
}) matrixRoomReceivers;
})
];
};
}
(lib.optionalAttrs settings.matrix-alertmanager.enable {
clan.core.vars.generators.prometheus = {
files.matrix-alertmanager-token.secret = true;
files.matrix-alertmanager-secret.secret = true;
files.matrix-alertmanager-urlfile = {
secret = true;
owner = "alertmanager";
group = "alertmanager";
};
script = ''
echo "" > $out/matrix-alertmanager-token
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
'';
runtimeInputs = [
pkgs.openssl
];
};
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
tls_config.insecure_skip_verify = true;
job_name = "${machineName}";
static_configs = lib.mapAttrsToList (
exporterName: exporterVal:
let
targetPort =
if exporterVal ? port then
exporterVal.port
else
config.services.prometheus.exporters."${exporterName}".port;
targetHost = getYggdrasilIP machineName;
in
{
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
}
) machineVal.settings.exporters;
}) roles.nodes.machines;
rules = [
(builtins.toJSON {
groups = [
{
name = "default";
rules = [
{
alert = "NodesDown";
expr = "count by (job) (up == 0) > 0";
for = "1m";
labels = {
severity = "critical";
};
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
}
{
alert = "SmartCtlErrors";
expr = "smartctl_device_error_log_count > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Errors occur on {{ $labels.job }}
Disk {{ $labels.device }} {{ $value }}
'';
}
{
alert = "ZFSPoolsHealth";
expr = "zfs_pool_health > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Unhealthy Pool at {{ $labels.job }}
Pool {{ $labels.pool }} value {{ $value }}
'';
}
]
++ settings.extra_rules;
}
];
})
];
};
}
// lib.optionalAttrs settings.matrix-alertmanager.enable {
clan.core.vars.generators.prometheus = {
files.matrix-alertmanager-token.secret = true;
files.matrix-alertmanager-secret.secret = true;
files.matrix-alertmanager-urlfile = {
secret = true;
owner = "alertmanager";
group = "alertmanager";
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
enable = true;
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
matrixUser = settings.matrix-alertmanager.matrixUser;
matrixRooms = settings.matrix-alertmanager.matrixRooms;
};
script = ''
echo "" > $out/matrix-alertmanager-token
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
'';
runtimeInputs = [
pkgs.openssl
];
};
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
enable = true;
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
matrixUser = settings.matrix-alertmanager.matrixUser;
matrixRooms = settings.matrix-alertmanager.matrixRooms;
};
};
})
];
};
};