mob next [ci-skip] [ci skip] [skip ci]
lastFile:modules/clan/prometheus/default.nix
This commit is contained in:
+89
-75
@@ -320,81 +320,95 @@
|
||||
name = "prometheus";
|
||||
input = "self";
|
||||
};
|
||||
roles.server.machines."rigel".settings = { };
|
||||
roles.server.extraModules = [
|
||||
(
|
||||
{ config, pkgs, ... }:
|
||||
{
|
||||
# clan.core.vars.generators.prometheus = {
|
||||
# files.matrix-alertmanager-token.secret = true;
|
||||
# files.matrix-alertmanager-secret.secret = true;
|
||||
# files.matrix-alertmanager-urlfile = {
|
||||
# secret = true;
|
||||
# owner = "alertmanager";
|
||||
# group = "alertmanager";
|
||||
# };
|
||||
# script = ''
|
||||
# echo "" > $out/matrix-alertmanager-token
|
||||
# openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
#
|
||||
# echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
# '';
|
||||
# runtimeInputs = [
|
||||
# pkgs.openssl
|
||||
# ];
|
||||
# };
|
||||
#
|
||||
# services.matrix-alertmanager = {
|
||||
# enable = true;
|
||||
# tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
# secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
# homeserverUrl = "https://matrix-client.matrix.org";
|
||||
# matrixUser = "@kuroiris:matrix.org";
|
||||
# matrixRooms = [
|
||||
# {
|
||||
# receivers = [
|
||||
# "matrix"
|
||||
# ];
|
||||
# roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
|
||||
# services.prometheus = {
|
||||
#
|
||||
# alertmanager = {
|
||||
# enable = true;
|
||||
# configuration = {
|
||||
# global = {
|
||||
# resolve_timeout = "5m";
|
||||
# };
|
||||
# route = {
|
||||
# receiver = "default";
|
||||
# routes = [
|
||||
# {
|
||||
# receiver = "matrix";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# receivers = [
|
||||
# { name = "default"; }
|
||||
# {
|
||||
# name = "matrix";
|
||||
# webhook_configs = [
|
||||
# {
|
||||
# url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
# send_resolved = true;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# };
|
||||
#
|
||||
# };
|
||||
}
|
||||
)
|
||||
];
|
||||
roles.server.machines."rigel".settings = {
|
||||
matrix-alertmanager = {
|
||||
enable = true;
|
||||
homeserverUrl = "https://matrix-client.matrix.org";
|
||||
matrixUser = "@kuroiris:matrix.org";
|
||||
matrixRooms = [
|
||||
{
|
||||
receivers = [
|
||||
"matrix"
|
||||
];
|
||||
roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
# roles.server.extraModules = [
|
||||
# (
|
||||
# { config, pkgs, ... }:
|
||||
# {
|
||||
# # clan.core.vars.generators.prometheus = {
|
||||
# # files.matrix-alertmanager-token.secret = true;
|
||||
# # files.matrix-alertmanager-secret.secret = true;
|
||||
# # files.matrix-alertmanager-urlfile = {
|
||||
# # secret = true;
|
||||
# # owner = "alertmanager";
|
||||
# # group = "alertmanager";
|
||||
# # };
|
||||
# # script = ''
|
||||
# # echo "" > $out/matrix-alertmanager-token
|
||||
# # openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
# #
|
||||
# # echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
# # '';
|
||||
# # runtimeInputs = [
|
||||
# # pkgs.openssl
|
||||
# # ];
|
||||
# # };
|
||||
# #
|
||||
# # services.matrix-alertmanager = {
|
||||
# # enable = true;
|
||||
# # tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
# # secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
# # homeserverUrl = "https://matrix-client.matrix.org";
|
||||
# # matrixUser = "@kuroiris:matrix.org";
|
||||
# # matrixRooms = [
|
||||
# # {
|
||||
# # receivers = [
|
||||
# # "matrix"
|
||||
# # ];
|
||||
# # roomId = "!rqIrWqPvsXqMgYpcNZ:matrix.org";
|
||||
# # }
|
||||
# # ];
|
||||
# # };
|
||||
#
|
||||
# # services.prometheus = {
|
||||
# #
|
||||
# # alertmanager = {
|
||||
# # enable = true;
|
||||
# # configuration = {
|
||||
# # global = {
|
||||
# # resolve_timeout = "5m";
|
||||
# # };
|
||||
# # route = {
|
||||
# # receiver = "default";
|
||||
# # routes = [
|
||||
# # {
|
||||
# # receiver = "matrix";
|
||||
# # }
|
||||
# # ];
|
||||
# # };
|
||||
# # receivers = [
|
||||
# # { name = "default"; }
|
||||
# # {
|
||||
# # name = "matrix";
|
||||
# # webhook_configs = [
|
||||
# # {
|
||||
# # url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
# # send_resolved = true;
|
||||
# # }
|
||||
# # ];
|
||||
# # }
|
||||
# # ];
|
||||
# # };
|
||||
# # };
|
||||
# #
|
||||
# # };
|
||||
# }
|
||||
# )
|
||||
# ];
|
||||
|
||||
roles.nodes.machines = {
|
||||
vega.settings = {
|
||||
|
||||
+134
-131
@@ -122,147 +122,150 @@
|
||||
lib.concatMap (entry: entry.receivers) settings.matrix-alertmanager.matrixRooms
|
||||
);
|
||||
in
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
9090
|
||||
];
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
|
||||
globalConfig = {
|
||||
scrape_interval = settings.scrape_interval;
|
||||
};
|
||||
|
||||
alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [ { targets = [ "localhost:9093" ]; } ];
|
||||
}
|
||||
lib.mkMerge [
|
||||
{
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
9090
|
||||
];
|
||||
|
||||
alertmanager = {
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
global = {
|
||||
resolve_timeout = "5m";
|
||||
|
||||
globalConfig = {
|
||||
scrape_interval = settings.scrape_interval;
|
||||
};
|
||||
|
||||
alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [ { targets = [ "localhost:9093" ]; } ];
|
||||
}
|
||||
];
|
||||
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
global = {
|
||||
resolve_timeout = "5m";
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
|
||||
};
|
||||
receivers = [
|
||||
{ name = "default"; }
|
||||
]
|
||||
++ map (mReceiver: {
|
||||
name = mReceiver;
|
||||
webhook_configs = [
|
||||
{
|
||||
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}) matrixRoomReceivers;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = map (mReceiver: { receiver = mReceiver; }) matrixRoomReceivers;
|
||||
};
|
||||
receivers = [
|
||||
{ name = "default"; }
|
||||
]
|
||||
++ map (mReceiver: {
|
||||
name = mReceiver;
|
||||
webhook_config = [
|
||||
};
|
||||
|
||||
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
|
||||
tls_config.insecure_skip_verify = true;
|
||||
job_name = "${machineName}";
|
||||
static_configs = lib.mapAttrsToList (
|
||||
exporterName: exporterVal:
|
||||
let
|
||||
targetPort =
|
||||
if exporterVal ? port then
|
||||
exporterVal.port
|
||||
else
|
||||
config.services.prometheus.exporters."${exporterName}".port;
|
||||
targetHost = getYggdrasilIP machineName;
|
||||
in
|
||||
{
|
||||
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
|
||||
}
|
||||
) machineVal.settings.exporters;
|
||||
}) roles.nodes.machines;
|
||||
|
||||
rules = [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
|
||||
send_resolved = true;
|
||||
name = "default";
|
||||
rules = [
|
||||
{
|
||||
alert = "NodesDown";
|
||||
expr = "count by (job) (up == 0) > 0";
|
||||
for = "1m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
|
||||
}
|
||||
{
|
||||
alert = "SmartCtlErrors";
|
||||
expr = "smartctl_device_error_log_count > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Errors occur on {{ $labels.job }}
|
||||
Disk {{ $labels.device }} {{ $value }}
|
||||
'';
|
||||
}
|
||||
{
|
||||
alert = "ZFSPoolsHealth";
|
||||
expr = "zfs_pool_health > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Unhealthy Pool at {{ $labels.job }}
|
||||
Pool {{ $labels.pool }} value {{ $value }}
|
||||
'';
|
||||
}
|
||||
]
|
||||
++ settings.extra_rules;
|
||||
}
|
||||
];
|
||||
}) matrixRoomReceivers;
|
||||
})
|
||||
];
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
(lib.optionalAttrs settings.matrix-alertmanager.enable {
|
||||
|
||||
clan.core.vars.generators.prometheus = {
|
||||
files.matrix-alertmanager-token.secret = true;
|
||||
files.matrix-alertmanager-secret.secret = true;
|
||||
files.matrix-alertmanager-urlfile = {
|
||||
secret = true;
|
||||
owner = "alertmanager";
|
||||
group = "alertmanager";
|
||||
};
|
||||
script = ''
|
||||
echo "" > $out/matrix-alertmanager-token
|
||||
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
|
||||
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
'';
|
||||
runtimeInputs = [
|
||||
pkgs.openssl
|
||||
];
|
||||
};
|
||||
|
||||
scrapeConfigs = lib.mapAttrsToList (machineName: machineVal: {
|
||||
tls_config.insecure_skip_verify = true;
|
||||
job_name = "${machineName}";
|
||||
static_configs = lib.mapAttrsToList (
|
||||
exporterName: exporterVal:
|
||||
let
|
||||
targetPort =
|
||||
if exporterVal ? port then
|
||||
exporterVal.port
|
||||
else
|
||||
config.services.prometheus.exporters."${exporterName}".port;
|
||||
targetHost = getYggdrasilIP machineName;
|
||||
in
|
||||
{
|
||||
targets = [ "[${targetHost}]:${lib.toString targetPort}" ];
|
||||
}
|
||||
) machineVal.settings.exporters;
|
||||
}) roles.nodes.machines;
|
||||
|
||||
rules = [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
name = "default";
|
||||
rules = [
|
||||
{
|
||||
alert = "NodesDown";
|
||||
expr = "count by (job) (up == 0) > 0";
|
||||
for = "1m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
|
||||
}
|
||||
{
|
||||
alert = "SmartCtlErrors";
|
||||
expr = "smartctl_device_error_log_count > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Errors occur on {{ $labels.job }}
|
||||
Disk {{ $labels.device }} {{ $value }}
|
||||
'';
|
||||
}
|
||||
{
|
||||
alert = "ZFSPoolsHealth";
|
||||
expr = "zfs_pool_health > 0";
|
||||
for = "5m";
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
annotations.summary = ''
|
||||
Unhealthy Pool at {{ $labels.job }}
|
||||
Pool {{ $labels.pool }} value {{ $value }}
|
||||
'';
|
||||
}
|
||||
]
|
||||
++ settings.extra_rules;
|
||||
}
|
||||
];
|
||||
})
|
||||
];
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
// lib.optionalAttrs settings.matrix-alertmanager.enable {
|
||||
clan.core.vars.generators.prometheus = {
|
||||
files.matrix-alertmanager-token.secret = true;
|
||||
files.matrix-alertmanager-secret.secret = true;
|
||||
files.matrix-alertmanager-urlfile = {
|
||||
secret = true;
|
||||
owner = "alertmanager";
|
||||
group = "alertmanager";
|
||||
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
|
||||
enable = true;
|
||||
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
|
||||
matrixUser = settings.matrix-alertmanager.matrixUser;
|
||||
matrixRooms = settings.matrix-alertmanager.matrixRooms;
|
||||
};
|
||||
script = ''
|
||||
echo "" > $out/matrix-alertmanager-token
|
||||
openssl rand -hex 32 > "$out"/matrix-alertmanager-secret
|
||||
|
||||
echo "http://localhost:3000/alerts?secret=$(cat $out/matrix-alertmanager-secret)" > $out/matrix-alertmanager-urlfile
|
||||
'';
|
||||
runtimeInputs = [
|
||||
pkgs.openssl
|
||||
];
|
||||
};
|
||||
|
||||
services.matrix-alertmanager = lib.mkIf settings.matrix-alertmanager.enable {
|
||||
enable = true;
|
||||
tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
|
||||
secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
|
||||
homeserverUrl = settings.matrix-alertmanager.homeserverUrl;
|
||||
matrixUser = settings.matrix-alertmanager.matrixUser;
|
||||
matrixRooms = settings.matrix-alertmanager.matrixRooms;
|
||||
};
|
||||
};
|
||||
})
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user