# cnx-network-clan/modules/monitoring/server.nix

# Monitoring server, imported by control only: VictoriaMetrics (TSDB + scraper),
# VictoriaLogs (journald ingestion) and Grafana. VictoriaMetrics binds loopback
# (only Grafana, on the same host, reads it). Grafana and VictoriaLogs are
# reachable over the ZeroTier mesh, scoped by the firewall rule at the bottom;
# the Hetzner cloud firewall keeps them off the public net.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Definitions shared with sibling files: the blackbox probe list (provides
  # `scrapeConfigs`) and the mesh host table (provides `hosts` and `subnet`).
  probes = import ./blackbox-probes.nix { inherit lib; };
  mesh = import ../mesh-hosts.nix { inherit config lib; };

  # TCP ports of the three services configured in this module.
  grafanaPort = 3000;
  vmPort = 8428;
  logsPort = 9428;

  # Mesh address of control itself; used to build Grafana's root_url.
  controlV6 = mesh.hosts.control;

  # On-disk path of the generated Grafana admin password (see the
  # grafana-admin vars generator further down).
  adminPasswordFile = config.clan.core.vars.generators.grafana-admin.files.password.path;

  # Wrap an IPv6 literal in square brackets. Mesh addresses must be bracketed
  # before a ":port" suffix is appended (Prometheus-style targets, URLs).
  v6 = ip: "[${ip}]";

  # Build one static_configs entry: a single "host:port" scrape target whose
  # `instance` label is the friendly name instead of the raw address.
  target = label: host: portNumber: {
    labels = {
      instance = label;
    };
    targets = [ "${host}:${toString portNumber}" ];
  };
in
{
  services.victoriametrics =
    let
      # Ports scraped by the "node" and "knot" jobs respectively.
      nodePort = 9100;
      knotPort = 9433;

      # Scrape target for a host listed in mesh.hosts, reached at its bracketed
      # mesh IPv6 address and labelled with the host name.
      overMesh = port: host: target host (v6 mesh.hosts.${host}) port;
    in
    {
      enable = true;
      # Loopback only: Grafana on this same host is the sole reader.
      listenAddress = "127.0.0.1:${toString vmPort}";
      retentionPeriod = "180d";
      # The scraper dials IPv4-only by default; the remote targets are mesh
      # ULAs, so without this flag VM drops them with
      # "no suitable address found (try -enableTCP6)".
      extraOptions = [ "-enableTCP6" ];
      prometheusConfig = {
        global.scrape_interval = "30s";
        scrape_configs =
          [
            {
              job_name = "node";
              # control scrapes its own node_exporter over loopback so host
              # metrics survive even if the mesh is down; ns1, ns2 and mx1 are
              # scraped over the mesh.
              static_configs =
                [ (target "control" "127.0.0.1" nodePort) ]
                ++ map (overMesh nodePort) [
                  "ns1"
                  "ns2"
                  "mx1"
                ];
            }
            {
              job_name = "knot";
              static_configs = map (overMesh knotPort) [
                "ns1"
                "ns2"
              ];
            }
          ]
          # Outside-in DNS probes via the blackbox exporter (blackbox.nix). The
          # job list is generated from the same probe definitions the exporter
          # uses.
          ++ probes.scrapeConfigs;
      };
    };

  # Centralized logs. VictoriaLogs receives journald entries that the hosts
  # push with systemd-journal-upload to /insert/journald (see exporters.nix).
  # The listener is not loopback-only because ns1/ns2 push over the mesh; the
  # firewall rule at the bottom of this file admits 9428 from the mesh subnet
  # only, and the Hetzner firewall closes the public side.
  # NOTE(review): the "node" scrape job in this file also targets mx1 — confirm
  # whether mx1 uploads its journal here as well.
  services.victorialogs = {
    enable = true;
    extraOptions = [
      # Retention has no dedicated NixOS option, so it is passed as a raw flag.
      "-retentionPeriod=30d"
      # Like the VictoriaMetrics scraper, VictoriaLogs is IPv4-only by default,
      # and that applies to *listening* too: ":9428" alone binds 0.0.0.0 only,
      # so pushes arriving over the IPv6 mesh get "connection refused". This
      # flag makes it bind [::] (dual-stack) so the mesh can reach it.
      "-enableTCP6"
    ];
    listenAddress = ":" + toString logsPort;
  };

  # Grafana admin password: generated once and stored as a clan secret. The
  # file is owned by grafana:grafana; services.grafana reads it through the
  # $__file{...} provider. Retrieve with:
  #   clan vars get control grafana-admin/password
  clan.core.vars.generators.grafana-admin = {
    files."password" = {
      secret = true;
      owner = "grafana";
      group = "grafana";
    };
    # Every external command the script runs is listed here: openssl for the
    # random bytes, coreutils for `tr`. coreutils is named explicitly so the
    # script does not depend on whatever PATH the generator runner happens to
    # provide once this list overrides the option's default.
    runtimeInputs = [
      pkgs.openssl
      pkgs.coreutils
    ];
    # 24 random bytes, base64-encoded; the trailing newline is stripped so the
    # file contains exactly the password and nothing else.
    script = ''
      openssl rand -base64 24 | tr -d "\n" > "$out"/password
    '';
  };

  services.grafana =
    let
      # URL of a service listening on this host's loopback interface.
      loopbackUrl = port: "http://127.0.0.1:${toString port}";

      # Both datasources are queried by the Grafana backend (proxy access).
      viaProxy = datasource: datasource // { access = "proxy"; };
    in
    {
      enable = true;

      # VictoriaLogs datasource plugin so journald is greppable from Grafana,
      # alongside the metrics datasource.
      declarativePlugins = [ pkgs.grafanaPlugins.victoriametrics-logs-datasource ];

      # Listen on every address; access is narrowed to the mesh by the firewall
      # rule at the bottom of this file. root_url advertises control's
      # bracketed mesh address.
      settings.server = {
        root_url = "http://${v6 controlV6}:${toString grafanaPort}/";
        http_port = grafanaPort;
        http_addr = "::";
      };

      # The admin password is read from the generated secret file rather than
      # being written into the configuration.
      settings.security = {
        admin_user = "admin";
        admin_password = "$__file{${adminPasswordFile}}";
      };

      # No anonymous access and no self-service sign-up.
      settings."auth.anonymous".enabled = false;
      settings.users.allow_sign_up = false;

      provision.enable = true;

      # Metrics (default) and logs datasources, both on loopback.
      provision.datasources.settings = {
        apiVersion = 1;
        datasources = map viaProxy [
          {
            name = "VictoriaMetrics";
            type = "prometheus";
            uid = "victoriametrics";
            url = loopbackUrl vmPort;
            isDefault = true;
          }
          {
            name = "VictoriaLogs";
            type = "victoriametrics-logs-datasource";
            uid = "victorialogs";
            url = loopbackUrl logsPort;
          }
        ];
      };

      # Dashboards are loaded from the ./dashboards directory next to this file.
      provision.dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "cnx";
            options = {
              path = ./dashboards;
              foldersFromFilesStructure = false;
            };
          }
        ];
      };
    };

  # Mesh-only ingress: Grafana (admin laptops + servers) and VictoriaLogs
  # ingestion (ns1/ns2 push journald over the mesh) accept TCP connections only
  # when the source address lies inside the ZeroTier mesh subnet.
  networking.firewall.extraInputRules =
    let
      # Rendered as a comma-separated port set, e.g. "3000, 9428".
      meshOnlyPorts = builtins.concatStringsSep ", " (
        map toString [
          grafanaPort
          logsPort
        ]
      );
    in
    ''
      ip6 saddr ${mesh.subnet} tcp dport { ${meshOnlyPorts} } accept
    '';
}