mob next [ci-skip] [ci skip] [skip ci]
lastFile:modules/clan/prometheus/default.nix
This commit is contained in:
+134
-131
@@ -122,147 +122,150 @@
|
||||
lib.concatMap (entry: entry.receivers) settings.matrix-alertmanager.matrixRooms
|
||||
);
|
||||
in
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
9090
|
||||
];
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
|
||||
globalConfig = {
|
||||
scrape_interval = settings.scrape_interval;
|
||||
};
|
||||
|
||||
alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [ { targets = [ "localhost:9093" ]; } ];
|
||||
}
|
||||
lib.mkMerge [
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
9090
|
||||
];
|
||||
|
||||
alertmanager = {
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
global = {
|
||||
resolve_timeout = "5m";
|
||||
|
||||
globalConfig = {
|
||||
scrape_interval = settings.scrape_interval;
|
||||
};
|
||||
|
||||
alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [ { targets = [ "localhost:9093" ]; } ];
|
||||
}
|
||||
];
|
||||
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
global = {
|
||||
resolve_timeout = "5m";
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
|
||||
};
|
||||
receivers = [
|
||||
{ name = "default"; }
|
||||
]
|
||||
++ map (mReceiver: {
|
||||
name = mReceiver;
|
||||
webhook_configs = [
|
||||
{
|
||||
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}) matrixRoomReceivers;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
|
||||
};
|
||||
receivers = [
|
||||
{ name = "default"; }
|
||||
]
|
||||
++ map (mReceiver: {
|
||||
name = mReceiver;
|
||||
webhook_config = [
|
||||
};
|
||||
|
||||
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
|
||||
tls_config.insecure_skip_verify = true;
|
||||
job_name = "${machineName}";
|
||||
static_configs = lib.mapAttrsToList (
|
||||
exporterName: exporterVal:
|
||||
let
|
||||
targetPort =
|
||||
if exporterVal ? port then
|
||||
exporterVal.port
|
||||
else
|
||||
config.services.prometheus.exporters."${exporterName}".port;
|
||||
targetHost = getYggdrasilIP machineName;
|
||||
in
|
||||
{
|
||||
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
|
||||
}
|
||||
) machineVal.settings.exporters;
|
||||
}) roles.nodes.machines;
|
||||
|
||||
rules = [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
send_resolved = true;
|
||||
name = "default";
|
||||
rules = [
|
||||
{
|
||||
alert = "NodesDown";
|
||||
expr = "count by (job) (up == 0) > 0";
|
||||
for = "1m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
|
||||
}
|
||||
{
|
||||
alert = "SmartCtlErrors";
|
||||
expr = "smartctl_device_error_log_count > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Errors occur on {{ $labels.job }}
|
||||
Disk {{ $labels.device }} {{ $value }}
|
||||
'';
|
||||
}
|
||||
{
|
||||
alert = "ZFSPoolsHealth";
|
||||
expr = "zfs_pool_health > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Unhealthy Pool at {{ $labels.job }}
|
||||
Pool {{ $labels.pool }} value {{ $value }}
|
||||
'';
|
||||
}
|
||||
]
|
||||
++ settings.extra_rules;
|
||||
}
|
||||
];
|
||||
}) matrixRoomReceivers;
|
||||
})
|
||||
];
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
(lib.optionalAttrs settings.matrix-alertmanager.enable {
|
||||
|
||||
clan.core.vars.generators.prometheus = {
|
||||
files.matrix-alertmanager-token.secret = true;
|
||||
files.matrix-alertmanager-secret.secret = true;
|
||||
files.matrix-alertmanager-urlfile = {
|
||||
secret = true;
|
||||
owner = "alertmanager";
|
||||
group = "alertmanager";
|
||||
};
|
||||
script = ''
|
||||
echo "" > $out/matrix-alertmanager-token
|
||||
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
|
||||
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
'';
|
||||
runtimeInputs = [
|
||||
pkgs.openssl
|
||||
];
|
||||
};
|
||||
|
||||
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
|
||||
tls_config.insecure_skip_verify = true;
|
||||
job_name = "${machineName}";
|
||||
static_configs = lib.mapAttrsToList (
|
||||
exporterName: exporterVal:
|
||||
let
|
||||
targetPort =
|
||||
if exporterVal ? port then
|
||||
exporterVal.port
|
||||
else
|
||||
config.services.prometheus.exporters."${exporterName}".port;
|
||||
targetHost = getYggdrasilIP machineName;
|
||||
in
|
||||
{
|
||||
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
|
||||
}
|
||||
) machineVal.settings.exporters;
|
||||
}) roles.nodes.machines;
|
||||
|
||||
rules = [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
name = "default";
|
||||
rules = [
|
||||
{
|
||||
alert = "NodesDown";
|
||||
expr = "count by (job) (up == 0) > 0";
|
||||
for = "1m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
|
||||
}
|
||||
{
|
||||
alert = "SmartCtlErrors";
|
||||
expr = "smartctl_device_error_log_count > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Errors occur on {{ $labels.job }}
|
||||
Disk {{ $labels.device }} {{ $value }}
|
||||
'';
|
||||
}
|
||||
{
|
||||
alert = "ZFSPoolsHealth";
|
||||
expr = "zfs_pool_health > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Unhealthy Pool at {{ $labels.job }}
|
||||
Pool {{ $labels.pool }} value {{ $value }}
|
||||
'';
|
||||
}
|
||||
]
|
||||
++ settings.extra_rules;
|
||||
}
|
||||
];
|
||||
})
|
||||
];
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
// lib.optionalAttrs settings.matrix-alertmanager.enable {
|
||||
clan.core.vars.generators.prometheus = {
|
||||
files.matrix-alertmanager-token.secret = true;
|
||||
files.matrix-alertmanager-secret.secret = true;
|
||||
files.matrix-alertmanager-urlfile = {
|
||||
secret = true;
|
||||
owner = "alertmanager";
|
||||
group = "alertmanager";
|
||||
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
|
||||
enable = true;
|
||||
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
|
||||
matrixUser = settings.matrix-alertmanager.matrixUser;
|
||||
matrixRooms = settings.matrix-alertmanager.matrixRooms;
|
||||
};
|
||||
script = ''
|
||||
echo "" > $out/matrix-alertmanager-token
|
||||
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
|
||||
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
'';
|
||||
runtimeInputs = [
|
||||
pkgs.openssl
|
||||
];
|
||||
};
|
||||
|
||||
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
|
||||
enable = true;
|
||||
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
|
||||
matrixUser = settings.matrix-alertmanager.matrixUser;
|
||||
matrixRooms = settings.matrix-alertmanager.matrixRooms;
|
||||
};
|
||||
};
|
||||
})
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user