clanService/prometheus: init monitoring system
This commit is contained in:
@@ -0,0 +1,308 @@
|
||||
{ clanLib, ... }:
|
||||
{
|
||||
# Module class identifier.
_class = "clan.service";

# Service metadata, grouped into a single attribute set.
manifest = {
  name = "prometheus";
  description = "The Prometheus monitoring system and time series database.";
  # The README text is read from the file next to this module.
  readme = builtins.readFile ./README.md;
  categories = [ "System" ];
};
|
||||
|
||||
roles.server = {
|
||||
description = "Prometheus server that scrapes all data from nodes";

# Settings accepted by the `server` role.
interface =
  { lib, ... }:
  let
    inherit (lib) mkOption types;
  in
  {
    options = {
      # Forwarded to `services.prometheus.globalConfig.scrape_interval`.
      scrape_interval = mkOption {
        type = types.nullOr types.str;
        default = "1m";
        description = "How often to scrape targets. Defaults to 1 minute.";
      };

      # Appended to the built-in alerting rules of the "default" rule group.
      extra_rules = mkOption {
        type = types.listOf types.attrs;
        default = [ ];
        description = "Additional rules for Prometheus";
      };

      default_receiver = mkOption {
        type = types.attrs;
        default = {
          name = "default";
        };
        description = "Definition of a default receiver, default is doing nothing";
      };

      matrix-alertmanager = {
        enable = mkOption {
          type = types.bool;
          default = false;
          description = "Whether to enable `services.matrix-alertmanager`";
        };

        homeserverUrl = mkOption {
          type = types.str;
          default = "https://matrix-client.matrix.org";
          description = "URL of the Matrix homeserver to use";
        };

        # No default: the value is only read when `enable` is true.
        matrixUser = mkOption {
          type = types.str;
          description = "Matrix user for the bot";
        };

        matrixRooms = mkOption {
          type = types.listOf (
            types.submodule {
              options = {
                receivers = mkOption {
                  type = types.listOf types.str;
                  description = "List of receivers for this room";
                };
                roomId = mkOption {
                  type = types.str;
                  description = "Matrix room ID";
                  # Reject room aliases/names early: only IDs (leading "!") are accepted.
                  apply =
                    roomId:
                    assert lib.assertMsg (lib.hasPrefix "!" roomId) "Matrix room ID must start with a '!'. Got: ${roomId}";
                    roomId;
                };
              };
            }
          );
          # An empty default keeps the option evaluable when no rooms are configured.
          default = [ ];
          description = ''
            Combination of Alertmanager receiver(s) and rooms for the bot to join.
            Each Alertmanager receiver can be mapped to post to a matrix room.

            Note, you must use a room ID and not a room alias/name. Room IDs start
            with a "!".
          '';
          example = [
            {
              receivers = [
                "receiver1"
                "receiver2"
              ];
              roomId = "!roomid:example.com";
            }
            {
              receivers = [ "receiver3" ];
              roomId = "!differentroomid:example.com";
            }
          ];
        };
      };
    };
  };
|
||||
|
||||
perInstance =
|
||||
{
|
||||
settings,
|
||||
roles,
|
||||
...
|
||||
}:
|
||||
{
|
||||
nixosModule =
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
...
|
||||
}:
|
||||
let
|
||||
# Look up the public `yggdrasil`/`address` value stored for `machineName`.
#
# Throws when this machine's own `yggdrasil` generator has no `address`
# value, or when the lookup for `machineName` yields `null`.
getYggdrasilIP =
  machineName:
  let
    # `default = null` is passed to the lookup; presumably that is what comes
    # back when nothing is stored for the machine (verify against clanLib).
    address = clanLib.getPublicValue {
      flake = config.clan.core.settings.directory;
      machine = machineName;
      generator = "yggdrasil";
      file = "address";
      default = null;
    };
  in
  if !(config.clan.core.vars.generators.yggdrasil.files.address ? value) then
    throw "clanService/yggdrasil is required"
  else if address == null then
    # Fail with a readable message instead of the generic coercion error that
    # interpolating `null` into a scrape target string would raise.
    throw "clanService/prometheus: no yggdrasil address found for machine '${machineName}'"
  else
    address;
|
||||
|
||||
# Unique Alertmanager receiver names referenced by the configured Matrix rooms.
# The list is empty while the bridge is disabled: the receivers built from it
# point at the webhook URL file, which is only generated when the bridge is
# enabled, and the rooms option is not read at all in that case.
matrixRoomReceivers = lib.optionals settings.matrix-alertmanager.enable (
  lib.unique (lib.concatMap (room: room.receivers) settings.matrix-alertmanager.matrixRooms)
);
|
||||
in
|
||||
lib.mkMerge [
|
||||
{
|
||||
# Open the port the Prometheus server listens on. Reading it from the
# service configuration keeps the firewall in sync if the port is overridden.
networking.firewall.allowedTCPPorts = [ config.services.prometheus.port ];
|
||||
services.prometheus = {
|
||||
enable = true;

# Scrape cadence comes from the `scrape_interval` setting.
globalConfig.scrape_interval = settings.scrape_interval;

# Deliver alerts to the Alertmanager enabled on this same host. The port is
# taken from the Alertmanager service configuration instead of being
# hard-coded, so both sides stay in agreement.
alertmanagers = [
  {
    scheme = "http";
    path_prefix = "/";
    static_configs = [
      { targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ]; }
    ];
  }
];
|
||||
|
||||
# Alertmanager running next to Prometheus. Its receivers are the one from the
# `default_receiver` setting plus one webhook receiver per Matrix receiver name.
alertmanager = {
  enable = true;
  configuration = {
    global.resolve_timeout = "5m";

    route = {
      # Top-level receiver: the one described by the `default_receiver` setting.
      receiver = settings.default_receiver.name;
      # The child routes carry no matchers, so each one matches every alert.
      # `continue = true` lets an alert proceed to the following sibling
      # routes; without it only the first Matrix receiver would be notified.
      routes = map (receiverName: {
        receiver = receiverName;
        continue = true;
      }) matrixRoomReceivers;
    };

    receivers = [
      settings.default_receiver
    ]
    ++ map (receiverName: {
      name = receiverName;
      webhook_configs = [
        {
          # The webhook URL is read from a generated file rather than being
          # written into the configuration.
          url_file = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-urlfile.path;
          send_resolved = true;
        }
      ];
    }) matrixRoomReceivers;
  };
};
|
||||
|
||||
# One scrape job per machine of the `nodes` role, with one static target per
# exporter listed in that machine's settings.
scrapeConfigs = lib.mapAttrsToList (machineName: machine: {
  job_name = machineName;
  tls_config.insecure_skip_verify = true;
  static_configs = lib.mapAttrsToList (
    exporterName: exporter:
    let
      # An explicit `port` in the node's exporter settings wins; otherwise
      # fall back to the port this machine's own configuration holds for the
      # exporter of the same name.
      port = exporter.port or config.services.prometheus.exporters.${exporterName}.port;
      host = getYggdrasilIP machineName;
    in
    {
      # The host is wrapped in brackets before the port is appended
      # (presumably because the address is IPv6 — confirm).
      targets = [ "[${host}]:${toString port}" ];
    }
  ) machine.settings.exporters;
}) roles.nodes.machines;
|
||||
|
||||
# A single rule document, rendered as JSON: the built-in alerts of the
# "default" group followed by whatever the `extra_rules` setting supplies.
rules =
  let
    # All built-in alerts carry the same `severity = "critical"` label.
    criticalAlert = alert: expr: duration: summary: {
      inherit alert expr;
      for = duration;
      labels = {
        severity = "critical";
      };
      annotations = {
        inherit summary;
      };
    };

    builtinAlerts = [
      # A job has at least one target reporting `up == 0`.
      (criticalAlert "NodesDown" "count by (job) (up == 0) > 0" "1m"
        "Node **{{ $labels.job }}** has been down for more than 1 minutes."
      )
      # The smartctl error log count of a device is non-zero.
      (criticalAlert "SmartCtlErrors" "smartctl_device_error_log_count > 0" "5m" ''
        Errors occur on **{{ $labels.job }}**
        Disk {{ $labels.device }} {{ $value }}
      '')
      # A ZFS pool reports a health value above zero.
      (criticalAlert "ZFSPoolsHealth" "zfs_pool_health > 0" "5m" ''
        Unhealthy Pool at **{{ $labels.job }}**
        Pool {{ $labels.pool }} value {{ $value }}
      '')
    ];
  in
  [
    (builtins.toJSON {
      groups = [
        {
          name = "default";
          rules = builtinAlerts ++ settings.extra_rules;
        }
      ];
    })
  ];
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
# Everything in this set exists only when the Matrix bridge is enabled.
(lib.optionalAttrs settings.matrix-alertmanager.enable {

  # Generated files: the bridge token, the shared webhook secret, and the
  # webhook URL (which embeds that secret).
  clan.core.vars.generators.prometheus = {
    files.matrix-alertmanager-token.secret = true;
    files.matrix-alertmanager-secret.secret = true;
    # Owned by the alertmanager user/group; Alertmanager references this file
    # as the `url_file` of its webhook receivers.
    files.matrix-alertmanager-urlfile = {
      secret = true;
      owner = "alertmanager";
      group = "alertmanager";
    };
    runtimeInputs = [ pkgs.openssl ];
    # NOTE(review): the token file is created empty; presumably the real
    # Matrix access token has to be supplied separately — confirm.
    # "$out" is quoted everywhere so the script survives unusual paths.
    script = ''
      echo "" > "$out"/matrix-alertmanager-token
      openssl rand -hex 32 > "$out"/matrix-alertmanager-secret

      echo "http://localhost:3000/alerts?secret=$(cat "$out"/matrix-alertmanager-secret)" > "$out"/matrix-alertmanager-urlfile
    '';
  };

  # No extra `mkIf` is needed here: the enclosing `optionalAttrs` already
  # applies the same condition.
  services.matrix-alertmanager = {
    enable = true;
    tokenFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-token.path;
    secretFile = config.clan.core.vars.generators.prometheus.files.matrix-alertmanager-secret.path;
    inherit (settings.matrix-alertmanager) homeserverUrl matrixUser matrixRooms;
  };
})
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
roles.nodes = {
|
||||
description = "A node will expose metrics for server to harvest";

# Settings accepted by the `nodes` role.
interface =
  { lib, ... }:
  {
    options = {
      # Keyed by exporter name; each value is a free-form attribute set.
      # An empty submodule declares no options and would therefore reject
      # every exporter setting (for example the `port` the server role checks
      # for), so plain attribute sets are accepted instead.
      exporters = lib.mkOption {
        type = lib.types.attrsOf lib.types.attrs;
        default = { };
        description = "Mirror of services.prometheus.exporters";
      };
    };
  };
|
||||
|
||||
# Turns every exporter listed in the role settings into an enabled exporter
# on the machine.
perInstance =
  { settings, ... }:
  let
    # Attributes laid over each exporter's settings; they take precedence over
    # whatever the settings contain for the same keys.
    forced = {
      enable = true;
      openFirewall = true;
    };
    exportersForMachine = builtins.mapAttrs (
      _exporterName: exporterSettings: exporterSettings // forced
    ) settings.exporters;
  in
  {
    nixosModule =
      { ... }:
      {
        services.prometheus.exporters = exportersForMachine;
      };
  };
|
||||
};
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user