27a3c29335
lastFile:modules/clan/prometheus/default.nix
175 lines
5.2 KiB
Nix
175 lines
5.2 KiB
Nix
{ clanLib, ... }:
|
|
{
|
|
# Marks this attribute set as a clan service definition.
_class = "clan.service";

# Service metadata, grouped in one set instead of separate attrpath bindings.
manifest = {
  name = "prometheus";
  description = "The Prometheus monitoring system and time series database.";
  # The README shipped next to this file is read at evaluation time.
  readme = builtins.readFile ./README.md;
  categories = [ "System" ];
};
|
|
|
|
# Role for the machine that runs the Prometheus server and scrapes every
# machine holding the `nodes` role of the same instance.
roles.server = {
  description = "Prometheus server that scrapes all data from nodes";

  # Per-instance settings accepted by the server role.
  interface =
    { lib, ... }:
    {
      options = {
        scrape_interval = lib.mkOption {
          type = with lib.types; nullOr str;
          default = "1m";
          description = "How often to scrape targets. Default is 1 minute";
        };
        extraRules = lib.mkOption {
          type = with lib.types; listOf attrs;
          default = [ ];
          description = "Additional rules for Prometheus";
        };
      };
    };

  # `settings` are this role's evaluated options; `roles` gives access to the
  # machines (and their settings) of the other roles, here `roles.nodes`.
  perInstance =
    {
      settings,
      roles,
      ...
    }:
    {
      nixosModule =
        {
          config,
          lib,
          ...
        }:
        let
          # Resolve the yggdrasil address published for `machineName`.
          #
          # The presence check runs against *this* machine's `config`; it is
          # written as a single `?` attrpath test so that a missing yggdrasil
          # generator reaches the explicit error below instead of failing on
          # an intermediate attribute selection.
          #
          # NOTE(review): `clanLib.getPublicValue` is assumed to return
          # `default` (null) when the target machine has no such value --
          # confirm against clan-core.
          getYggdrasilIP =
            machineName:
            let
              address =
                if config.clan.core.vars.generators ? yggdrasil.files.address.value then
                  clanLib.getPublicValue {
                    flake = config.clan.core.settings.directory;
                    machine = machineName;
                    generator = "yggdrasil";
                    file = "address";
                    default = null;
                  }
                else
                  throw "clanService/yggdrasil is required";
            in
            # A null address would otherwise surface as an opaque
            # "cannot coerce null to a string" when building the target.
            if address == null then
              throw "prometheus: no yggdrasil address available for machine '${machineName}'"
            else
              address;
        in
        {
          # Open whatever port the Prometheus server is configured to listen
          # on, so the firewall rule follows `services.prometheus.port`.
          networking.firewall.allowedTCPPorts = [
            config.services.prometheus.port
          ];

          services.prometheus = {
            enable = true;

            globalConfig = {
              scrape_interval = settings.scrape_interval;
            };

            # One scrape job per node machine, one static config per exporter.
            scrapeConfigs = lib.mapAttrsToList (
              machineName: machineVal:
              let
                # Depends only on the machine, so resolve it once per job.
                targetHost = getYggdrasilIP machineName;
              in
              {
                tls_config.insecure_skip_verify = true;
                job_name = machineName;
                static_configs = lib.mapAttrsToList (
                  exporterName: exporterVal:
                  let
                    # Prefer a port set on the node's exporter settings; fall
                    # back to the port of the same-named exporter option in
                    # this (server) machine's configuration.
                    targetPort = exporterVal.port or config.services.prometheus.exporters.${exporterName}.port;
                  in
                  {
                    # Brackets around the host: the address is formatted as an
                    # IPv6 literal.
                    targets = [ "[${targetHost}]:${toString targetPort}" ];
                  }
                ) machineVal.settings.exporters;
              }
            ) roles.nodes.machines;

            # A single rule file, serialized as JSON: the built-in alerts
            # followed by the user supplied `extraRules`.
            rules = [
              (builtins.toJSON {
                groups = [
                  {
                    name = "default";
                    rules = [
                      {
                        alert = "NodesDown";
                        expr = "up == 0";
                        for = "1m";
                        labels = {
                          severity = "critical";
                        };
                        annotations.summary = "Node {{ $labels.job }} has been down for more than 1 minutes.";
                      }
                      {
                        alert = "SmartCtlErrors";
                        expr = "smartctl_device_error_log_count > 0";
                        for = "5m";
                        labels = {
                          severity = "critical";
                        };
                        annotations.summary = ''
                          Errors occur on {{ $labels.job }}
                          Disk {{ $labels.device }} {{ $value }}
                        '';
                      }
                      {
                        alert = "ZFSPoolsHealth";
                        expr = "zfs_pool_health > 0";
                        for = "5m";
                        labels = {
                          severity = "critical";
                        };
                        annotations.summary = ''
                          Unhealthy Pool at {{ $labels.job }}
                          Pool {{ $labels.pool }} value {{ $value }}
                        '';
                      }
                    ]
                    ++ settings.extraRules;
                  }
                ];
              })
            ];
          };
        };
    };
};
|
|
|
|
# Role for machines that run Prometheus exporters for the server to scrape.
roles.nodes = {
  description = "A node will expose metrics for server to harvest";

  # Per-instance settings accepted by the nodes role.
  interface =
    { lib, ... }:
    let
      inherit (lib) mkOption types;
    in
    {
      options.exporters = mkOption {
        type = types.attrsOf (types.submodule { });
        default = { };
        description = "Mirror of services.prometheus.exporters";
      };
    };

  perInstance =
    { settings, ... }:
    {
      nixosModule =
        { ... }:
        let
          # Attributes merged over every declared exporter; being on the
          # right-hand side of `//`, they win over the user's settings.
          forcedOn = {
            enable = true;
            openFirewall = true;
          };
        in
        {
          # Each exporter named in the settings is enabled and has its
          # firewall port opened.
          services.prometheus.exporters = builtins.mapAttrs (
            _: exporterCfg: exporterCfg // forcedOn
          ) settings.exporters;
        };
    };
};
|
|
|
|
}
|