mob next [ci-skip] [ci skip] [skip ci]

lastFile:modules/clan/prometheus/default.nix
This commit is contained in:
2026-06-22 16:49:41 +07:00
parent f2cdea1aec
commit 106591d374
2 changed files with 223 additions and 206 deletions
+89 -75
View File
@@ -320,81 +320,95 @@
name = "prometheus"; name = "prometheus";
input = "self"; input = "self";
}; };
roles.server.machines."rigel".settings = { }; roles.server.machines."rigel".settings = {
roles.server.extraModules = [ matrix-alertmanager = {
( enable = true;
{ config, pkgs, ... }: homeserverUrl = "https://matrix-client.matrix.org";
{ matrixUser = "@kuroiris:matrix.org";
# clan.core.vars.generators.prometheus = { matrixRooms = [
# files.matrix-alertmanager-token.secret = true; {
# files.matrix-alertmanager-secret.secret = true; receivers = [
# files.matrix-alertmanager-urlfile = { "matrix"
# secret = true; ];
# owner = "alertmanager"; roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
# group = "alertmanager"; }
# }; ];
# script = '' };
# echo "" > $out/matrix-alertmanager-token };
# openssl rand -hex 32 > "$out"/matrix-alertmanager-secret # roles.server.extraModules = [
# # (
# echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile # { config, pkgs, ... }:
# ''; # {
# runtimeInputs = [ # # clan.core.vars.generators.prometheus = {
# pkgs.openssl # # files.matrix-alertmanager-token.secret = true;
# ]; # # files.matrix-alertmanager-secret.secret = true;
# }; # # files.matrix-alertmanager-urlfile = {
# # # secret = true;
# services.matrix-alertmanager = { # # owner = "alertmanager";
# enable = true; # # group = "alertmanager";
# tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path; # # };
# secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path; # # script = ''
# homeserverUrl = "https://matrix-client.matrix.org"; # # echo "" > $out/matrix-alertmanager-token
# matrixUser = "@kuroiris:matrix.org"; # # openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
# matrixRooms = [ # #
# { # # echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
# receivers = [ # # '';
# "matrix" # # runtimeInputs = [
# ]; # # pkgs.openssl
# roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org"; # # ];
# } # # };
# ]; # #
# }; # # services.matrix-alertmanager = {
# # enable = true;
# services.prometheus = { # # tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
# # # secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
# alertmanager = { # # homeserverUrl = "https://matrix-client.matrix.org";
# enable = true; # # matrixUser = "@kuroiris:matrix.org";
# configuration = { # # matrixRooms = [
# global = { # # {
# resolve_timeout = "5m"; # # receivers = [
# }; # # "matrix"
# route = { # # ];
# receiver = "default"; # # roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
# routes = [ # # }
# { # # ];
# receiver = "matrix"; # # };
# } #
# ]; # # services.prometheus = {
# }; # #
# receivers = [ # # alertmanager = {
# { name = "default"; } # # enable = true;
# { # # configuration = {
# name = "matrix"; # # global = {
# webhook_configs = [ # # resolve_timeout = "5m";
# { # # };
# url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path; # # route = {
# send_resolved = true; # # receiver = "default";
# } # # routes = [
# ]; # # {
# } # # receiver = "matrix";
# ]; # # }
# }; # # ];
# }; # # };
# # # receivers = [
# }; # # { name = "default"; }
} # # {
) # # name = "matrix";
]; # # webhook_configs = [
# # {
# # url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
# # send_resolved = true;
# # }
# # ];
# # }
# # ];
# # };
# # };
# #
# # };
# }
# )
# ];
roles.nodes.machines = { roles.nodes.machines = {
vega.settings = { vega.settings = {
+134 -131
View File
@@ -122,147 +122,150 @@
lib.concatMap (entry: entry.receivers) settings.matrix-alertmanager.matrixRooms lib.concatMap (entry: entry.receivers) settings.matrix-alertmanager.matrixRooms
); );
in in
{ lib.mkMerge [
networking.firewall.allowedTCPPorts = [ {
9090 networking.firewall.allowedTCPPorts = [
]; 9090
services.prometheus = {
enable = true;
globalConfig = {
scrape_interval = settings.scrape_interval;
};
alertmanagers = [
{
scheme = "http";
path_prefix = "/";
static_configs = [ { targets = [ "localhost:9093" ]; } ];
}
]; ];
services.prometheus = {
alertmanager = {
enable = true; enable = true;
configuration = {
global = { globalConfig = {
resolve_timeout = "5m"; scrape_interval = settings.scrape_interval;
};
alertmanagers = [
{
scheme = "http";
path_prefix = "/";
static_configs = [ { targets = [ "localhost:9093" ]; } ];
}
];
alertmanager = {
enable = true;
configuration = {
global = {
resolve_timeout = "5m";
};
route = {
receiver = "default";
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
};
receivers = [
{ name = "default"; }
]
++ map (mReceiver: {
name = mReceiver;
webhook_configs = [
{
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
send_resolved = true;
}
];
}) matrixRoomReceivers;
}; };
route = { };
receiver = "default";
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers; scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
}; tls_config.insecure_skip_verify = true;
receivers = [ job_name = "${machineName}";
{ name = "default"; } static_configs = lib.mapAttrsToList (
] exporterName: exporterVal:
++ map (mReceiver: { let
name = mReceiver; targetPort =
webhook_config = [ if exporterVal ? port then
exporterVal.port
else
config.services.prometheus.exporters."${exporterName}".port;
targetHost = getYggdrasilIP machineName;
in
{
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
}
) machineVal.settings.exporters;
}) roles.nodes.machines;
rules = [
(builtins.toJSON {
groups = [
{ {
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path; name = "default";
send_resolved = true; rules = [
{
alert = "NodesDown";
expr = "count by (job) (up == 0) > 0";
for = "1m";
labels = {
severity = "critical";
};
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
}
{
alert = "SmartCtlErrors";
expr = "smartctl_device_error_log_count > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Errors occur on {{ $labels.job }}
Disk {{ $labels.device }} {{ $value }}
'';
}
{
alert = "ZFSPoolsHealth";
expr = "zfs_pool_health > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Unhealthy Pool at {{ $labels.job }}
Pool {{ $labels.pool }} value {{ $value }}
'';
}
]
++ settings.extra_rules;
} }
]; ];
}) matrixRoomReceivers; })
];
};
}
(lib.optionalAttrs settings.matrix-alertmanager.enable {
clan.core.vars.generators.prometheus = {
files.matrix-alertmanager-token.secret = true;
files.matrix-alertmanager-secret.secret = true;
files.matrix-alertmanager-urlfile = {
secret = true;
owner = "alertmanager";
group = "alertmanager";
}; };
script = ''
echo "" > $out/matrix-alertmanager-token
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
'';
runtimeInputs = [
pkgs.openssl
];
}; };
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: { services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
tls_config.insecure_skip_verify = true; enable = true;
job_name = "${machineName}"; tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
static_configs = lib.mapAttrsToList ( secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
exporterName: exporterVal: homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
let matrixUser = settings.matrix-alertmanager.matrixUser;
targetPort = matrixRooms = settings.matrix-alertmanager.matrixRooms;
if exporterVal ? port then
exporterVal.port
else
config.services.prometheus.exporters."${exporterName}".port;
targetHost = getYggdrasilIP machineName;
in
{
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
}
) machineVal.settings.exporters;
}) roles.nodes.machines;
rules = [
(builtins.toJSON {
groups = [
{
name = "default";
rules = [
{
alert = "NodesDown";
expr = "count by (job) (up == 0) > 0";
for = "1m";
labels = {
severity = "critical";
};
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
}
{
alert = "SmartCtlErrors";
expr = "smartctl_device_error_log_count > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Errors occur on {{ $labels.job }}
Disk {{ $labels.device }} {{ $value }}
'';
}
{
alert = "ZFSPoolsHealth";
expr = "zfs_pool_health > 0";
for = "5m";
labels = {
severity = "critical";
};
annotations.summary = ''
Unhealthy Pool at {{ $labels.job }}
Pool {{ $labels.pool }} value {{ $value }}
'';
}
]
++ settings.extra_rules;
}
];
})
];
};
}
// lib.optionalAttrs settings.matrix-alertmanager.enable {
clan.core.vars.generators.prometheus = {
files.matrix-alertmanager-token.secret = true;
files.matrix-alertmanager-secret.secret = true;
files.matrix-alertmanager-urlfile = {
secret = true;
owner = "alertmanager";
group = "alertmanager";
}; };
script = '' })
echo "" > $out/matrix-alertmanager-token ];
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
'';
runtimeInputs = [
pkgs.openssl
];
};
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
enable = true;
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
matrixUser = settings.matrix-alertmanager.matrixUser;
matrixRooms = settings.matrix-alertmanager.matrixRooms;
};
};
}; };
}; };