diff --git a/modules/monitoring/dashboards/uptime.json b/modules/monitoring/dashboards/uptime.json new file mode 100644 index 0000000..2340658 --- /dev/null +++ b/modules/monitoring/dashboards/uptime.json @@ -0,0 +1,194 @@ +{ + "uid": "cnx-uptime", + "title": "CNX Uptime", + "tags": ["uptime", "availability", "cnx"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { "from": "now-24h", "to": "now" }, + "templating": { "list": [] }, + "annotations": { "list": [] }, + "panels": [ + { + "type": "row", + "title": "Uptime", + "id": 1, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + { + "type": "stat", + "title": "Host status", + "description": "Whether VictoriaMetrics is currently able to scrape each host's node_exporter. UP means the host (and its mesh path) is reachable; DOWN means the scrape failed. One tile per machine.", + "id": 2, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "noValue": "no data", + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red", "index": 0 }, + "1": { "text": "UP", "color": "green", "index": 1 } + } + } + ] + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "background", + "graphMode": "none", + "textMode": "value_and_name", + "orientation": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "up{job=\"node\"}", + "legendFormat": "{{instance}}", + "instant": true + } + ] + }, + { + "type": "stat", + "title": "Current uptime", + "description": "Time since each host last booted (now - node_boot_time_seconds). 
A value that drops back to near zero means the host rebooted.", + "id": 3, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 }, + "fieldConfig": { + "defaults": { + "unit": "dtdurations", + "color": { "mode": "fixed", "fixedColor": "text" }, + "noValue": "no data" + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "none", + "graphMode": "none", + "textMode": "value_and_name", + "orientation": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "time() - node_boot_time_seconds{job=\"node\"}", + "legendFormat": "{{instance}}", + "instant": true + } + ] + }, + { + "type": "bargauge", + "title": "Availability over window", + "description": "Fraction of successful scrapes over the selected time range, per host (avg of up over $__range). 100% means every scrape in the window succeeded; dips reveal flapping or outages. 
Red below 99%, yellow from 99% up to 99.9%, green at 99.9% and above.", + "id": 4, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 99 }, + { "color": "green", "value": 99.9 } + ] + }, + "noValue": "no data" + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "avg_over_time(up{job=\"node\"}[$__range]) * 100", + "legendFormat": "{{instance}}", + "instant": true + } + ] + }, + { + "type": "timeseries", + "title": "Uptime over time", + "description": "Host uptime across the window. The line should climb steadily; a reset to zero marks a reboot.", + "id": 5, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 }, + "fieldConfig": { + "defaults": { "unit": "s", "custom": { "fillOpacity": 0 } }, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "time() - node_boot_time_seconds{job=\"node\"}", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "Up/down history", + "description": "1 while a host's node_exporter was scrapeable, 0 while it was not. 
Drops to zero are outages or lost mesh connectivity.", + "id": 6, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 15 }, + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0, + "max": 1, + "custom": { "fillOpacity": 20, "lineInterpolation": "stepAfter" } + }, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "expr": "up{job=\"node\"}", + "legendFormat": "{{instance}}" + } + ] + } + ] +}