diff --git a/setup/pi-monitoring/ARCHITECTURE.md b/setup/pi-monitoring/ARCHITECTURE.md new file mode 100644 index 0000000..e742a15 --- /dev/null +++ b/setup/pi-monitoring/ARCHITECTURE.md @@ -0,0 +1,33 @@ +# Pi Monitoring Architecture + +## Goal + +Use Raspberry Pi as lightweight control/observability plane for Docker hosts in this project. + +## Topology + +```text + +------------------------------+ + | Raspberry Pi (ops node) | + | Portainer + Grafana | + | Prometheus + Uptime Kuma | + +---------------+--------------+ + | + Portainer Agent (9001) + +----------------+----------------+ + | | + +---------v---------+ +---------v---------+ + | Unraid Docker host| | Fedora Docker host| + | Gitea primary | | Gitea backup | + +-------------------+ +-------------------+ +``` + +## Data paths + +- Pi local metrics: `node-exporter` + `cadvisor` -> Prometheus -> Grafana +- Remote host control: Portainer -> remote `portainer_agent` +- Availability checks: Uptime Kuma -> HTTP/TCP checks for Gitea and other services + +## Persistence + +All service data is stored under `OPS_ROOT` (default `/srv/ops`) on SSD. diff --git a/setup/pi-monitoring/backup_stack.sh b/setup/pi-monitoring/backup_stack.sh new file mode 100755 index 0000000..a621e90 --- /dev/null +++ b/setup/pi-monitoring/backup_stack.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=./lib.sh +source "$SCRIPT_DIR/lib.sh" + +ENV_FILE="$SCRIPT_DIR/stack.env" +RETENTION_DAYS=14 +AUTO_YES=false + +usage() { + cat </dev/null 2>&1; then + log_info "Installing Docker Engine..." + curl -fsSL https://get.docker.com | sh +else + log_success "Docker already installed" +fi + +log_info "Enabling Docker service..." +sudo systemctl enable --now docker + +log_info "Configuring Docker daemon defaults..." 
+sudo mkdir -p /etc/docker +sudo tee /etc/docker/daemon.json >/dev/null <<'JSON' +{ + "log-driver": "json-file", + "log-opts": { + "max-size": "10m", + "max-file": "3" + }, + "live-restore": true +} +JSON +sudo systemctl restart docker + +if id -nG "$USER" | grep -qw docker; then + log_success "User '$USER' is already in docker group" +else + log_info "Adding '$USER' to docker group..." + sudo usermod -aG docker "$USER" + log_warn "Log out and back in once for docker group membership to apply." +fi + +if [[ "$ENABLE_UFW" == "true" ]]; then + log_info "Configuring UFW firewall rules..." + sudo ufw allow "${SSH_PORT}/tcp" + sudo ufw allow 9443/tcp # Portainer HTTPS + sudo ufw allow 8000/tcp # Portainer Edge + sudo ufw allow 3000/tcp # Grafana + sudo ufw allow 3001/tcp # Uptime Kuma + sudo ufw allow 9090/tcp # Prometheus + sudo ufw --force enable +fi + +log_success "Bootstrap complete" +log_info "Recommended next steps:" +log_info "1) Re-login to apply docker group membership" +log_info "2) Run setup/pi-monitoring/mount_ssd.sh" +log_info "3) Copy stack.env.example to stack.env and run deploy_stack.sh" diff --git a/setup/pi-monitoring/deploy_stack.sh b/setup/pi-monitoring/deploy_stack.sh new file mode 100755 index 0000000..683d626 --- /dev/null +++ b/setup/pi-monitoring/deploy_stack.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=./lib.sh +source "$SCRIPT_DIR/lib.sh" + +ENV_FILE="$SCRIPT_DIR/stack.env" +AUTO_YES=false + +usage() { + cat </dev/null 2>&1; then + log_error "docker compose plugin not found" + exit 1 +fi + +if ! confirm_action "Deploy monitoring stack using $ENV_FILE?" "$AUTO_YES"; then + log_info "Cancelled" + exit 0 +fi + +ensure_ops_dirs "$OPS_ROOT" +prepare_permissions "$OPS_ROOT" + +if [[ ! 
-f "$OPS_ROOT/prometheus/targets/external.yml" ]]; then + cp "$SCRIPT_DIR/prometheus/targets/external.yml.example" "$OPS_ROOT/prometheus/targets/external.yml" + log_info "Created $OPS_ROOT/prometheus/targets/external.yml from example" +fi + +compose_file_path="$(compose_file)" + +log_info "Pulling container images..." +docker compose \ + --project-name "$COMPOSE_PROJECT_NAME" \ + --env-file "$ENV_FILE" \ + -f "$compose_file_path" \ + pull + +log_info "Starting stack..." +docker compose \ + --project-name "$COMPOSE_PROJECT_NAME" \ + --env-file "$ENV_FILE" \ + -f "$compose_file_path" \ + up -d + +services=(portainer grafana prometheus uptime-kuma node-exporter cadvisor) +for svc in "${services[@]}"; do + cid="$(docker compose \ + --project-name "$COMPOSE_PROJECT_NAME" \ + --env-file "$ENV_FILE" \ + -f "$compose_file_path" \ + ps -q "$svc")" + + if [[ -z "$cid" ]]; then + log_error "Service did not start: $svc" + exit 1 + fi + + if ! wait_for_container_running "$cid" 60; then + log_error "Service failed to reach running state: $svc" + docker logs "$cid" --tail 80 || true + exit 1 + fi +done + +log_success "Monitoring stack is up" +log_info "Portainer: https://:${PORTAINER_HTTPS_PORT}" +log_info "Grafana: http://:${GRAFANA_PORT}" +log_info "Prometheus:http://:${PROMETHEUS_PORT}" +log_info "Uptime Kuma:http://:${UPTIME_KUMA_PORT}" diff --git a/setup/pi-monitoring/docker-compose.yml b/setup/pi-monitoring/docker-compose.yml new file mode 100644 index 0000000..d9ce783 --- /dev/null +++ b/setup/pi-monitoring/docker-compose.yml @@ -0,0 +1,96 @@ +name: ${COMPOSE_PROJECT_NAME:-pi-monitoring} + +services: + portainer: + image: ${PORTAINER_IMAGE:-portainer/portainer-ce:latest} + restart: unless-stopped + command: -H unix:///var/run/docker.sock + ports: + - "${BIND_IP:-0.0.0.0}:${PORTAINER_HTTPS_PORT:-9443}:9443" + - "${BIND_IP:-0.0.0.0}:${PORTAINER_EDGE_PORT:-8000}:8000" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ${OPS_ROOT:-/srv/ops}/portainer/data:/data + 
networks: + - monitoring + + grafana: + image: ${GRAFANA_IMAGE:-grafana/grafana-oss:latest} + restart: unless-stopped + environment: + - TZ=${TZ:-America/New_York} + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_INSTALL_PLUGINS=${GRAFANA_PLUGINS:-} + ports: + - "${BIND_IP:-0.0.0.0}:${GRAFANA_PORT:-3000}:3000" + volumes: + - ${OPS_ROOT:-/srv/ops}/grafana/data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + - prometheus + networks: + - monitoring + + prometheus: + image: ${PROMETHEUS_IMAGE:-prom/prometheus:latest} + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' + ports: + - "${BIND_IP:-0.0.0.0}:${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ${OPS_ROOT:-/srv/ops}/prometheus/targets:/etc/prometheus/targets:ro + - ${OPS_ROOT:-/srv/ops}/prometheus/data:/prometheus + networks: + - monitoring + + node-exporter: + image: ${NODE_EXPORTER_IMAGE:-prom/node-exporter:latest} + restart: unless-stopped + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/host/root' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($|/)' + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/host/root:ro + networks: + - monitoring + + cadvisor: + image: ${CADVISOR_IMAGE:-gcr.io/cadvisor/cadvisor:latest} + restart: unless-stopped + privileged: true + devices: + - /dev/kmsg:/dev/kmsg + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + networks: + - monitoring + + uptime-kuma: + image: ${UPTIME_KUMA_IMAGE:-louislam/uptime-kuma:1} + restart: unless-stopped + 
#!/usr/bin/env bash
# Shared helpers for the pi-monitoring scripts: logging, confirmation prompts,
# env-file loading, data-directory preparation and container readiness checks.
# This file is meant to be sourced; sourcing it enables strict mode.
set -euo pipefail

# Colour escapes are only emitted when stderr is a terminal, so redirected
# or piped log output stays free of control sequences.
if [[ -t 2 ]]; then
  _C_RESET='\033[0m'
  _C_RED='\033[0;31m'
  _C_GREEN='\033[0;32m'
  _C_YELLOW='\033[0;33m'
  _C_BLUE='\033[0;34m'
else
  _C_RESET='' _C_RED='' _C_GREEN='' _C_YELLOW='' _C_BLUE=''
fi

# Log helpers. Each one joins its arguments into a single message and writes
# it to stderr with a level tag, leaving stdout free for command output.
log_info() {
  printf '%b[INFO]%b %s\n' "$_C_BLUE" "$_C_RESET" "$*" >&2
}

log_warn() {
  printf '%b[WARN]%b %s\n' "$_C_YELLOW" "$_C_RESET" "$*" >&2
}

log_error() {
  printf '%b[ERROR]%b %s\n' "$_C_RED" "$_C_RESET" "$*" >&2
}

log_success() {
  printf '%b[OK]%b %s\n' "$_C_GREEN" "$_C_RESET" "$*" >&2
}

# Verify that every named command is available on PATH.
# Arguments: $@ - command names
# Returns:   0 if all are found, 1 otherwise. Every missing command is
#            reported (not just the first) so one run shows the full list.
require_cmd() {
  local cmd
  local missing=0

  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      log_error "Required command not found: $cmd"
      missing=1
    fi
  done

  return "$missing"
}

# Ask a yes/no question on stdout and read the answer from stdin.
# Arguments: $1 - prompt text (default "Continue?")
#            $2 - "true" to skip the prompt and answer yes (default "false")
# Returns:   0 for y/Y/yes (any letter case), 1 for anything else,
#            including an empty answer or end of input.
confirm_action() {
  local prompt="${1:-Continue?}"
  local auto_yes="${2:-false}"
  local reply=""

  if [[ "$auto_yes" == "true" ]]; then
    return 0
  fi

  printf '%s [y/N] ' "$prompt"
  # read returns non-zero at end of input; swallow that so a closed stdin
  # counts as "no" instead of aborting the caller under `set -e`.
  read -r reply || true
  [[ "$reply" =~ ^[Yy]([Ee][Ss])?$ ]]
}

# Print the absolute directory of the file containing the calling function
# (falls back to this file when there is no caller frame).
# The cd runs in a subshell so the caller's working directory is untouched.
stack_script_dir() {
  local caller_src="${BASH_SOURCE[1]:-${BASH_SOURCE[0]}}"
  (cd -- "$(dirname -- "$caller_src")" >/dev/null && pwd)
}

# Read KEY=VALUE pairs from an env file and export them into the current
# shell, then default OPS_ROOT and COMPOSE_PROJECT_NAME if still unset/empty.
#
# Parsing rules:
#   - blank lines, comment lines (optionally indented) and lines without '='
#     are skipped;
#   - whitespace around the key and the value is ignored, as is a trailing
#     carriage return;
#   - a value wrapped in single or double quotes is taken verbatim, so it may
#     contain '#'; anything after the closing quote must be blank or a comment;
#   - for an unquoted value, a '#' at the start or after whitespace begins an
#     inline comment.
#
# Arguments: $1 - path to the env file
# Returns:   0 on success; 1 if the file is missing or a key is not a valid
#            shell variable name.
load_stack_env() {
  local env_file="$1"

  if [[ ! -f "$env_file" ]]; then
    log_error "Missing env file: $env_file"
    log_info "Copy stack.env.example to stack.env and update secrets first."
    return 1
  fi

  # Regexes live in variables so the quote characters need no escaping.
  local key_re='^[A-Za-z_][A-Za-z0-9_]*$'
  local dq_re='^"([^"]*)"[[:space:]]*(#.*)?$'
  local sq_re="^'([^']*)'[[:space:]]*(#.*)?\$"
  local dq_wrap_re='^"(.*)"$'
  local sq_wrap_re="^'(.*)'\$"
  local line key value
  local line_no=0

  while IFS= read -r line || [[ -n "$line" ]]; do
    line_no=$((line_no + 1))

    # Trim leading and trailing whitespace (this also drops a trailing CR).
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"

    if [[ -z "$line" || "$line" == \#* ]]; then
      continue
    fi
    if [[ "$line" != *=* ]]; then
      continue
    fi

    key="${line%%=*}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${line#*=}"
    value="${value#"${value%%[![:space:]]*}"}"

    # Fail with a pointer to the offending line rather than letting `export`
    # abort the script with a bare "not a valid identifier" message.
    if [[ ! "$key" =~ $key_re ]]; then
      log_error "Invalid variable name in $env_file line $line_no: $key"
      return 1
    fi

    if [[ "$value" =~ $dq_re || "$value" =~ $sq_re ]]; then
      # Quoted value, optionally followed by an inline comment.
      value="${BASH_REMATCH[1]}"
    else
      if [[ "$value" == \#* ]]; then
        value=""
      else
        value="${value%%[[:space:]]#*}"
      fi
      value="${value%"${value##*[![:space:]]}"}"
      # A value that is wrapped in quotes but also contains the same quote
      # character inside: strip only the outer pair.
      if [[ "$value" =~ $dq_wrap_re || "$value" =~ $sq_wrap_re ]]; then
        value="${BASH_REMATCH[1]}"
      fi
    fi

    export "$key=$value"
  done < "$env_file"

  : "${OPS_ROOT:=/srv/ops}"
  : "${COMPOSE_PROJECT_NAME:=pi-monitoring}"
}

# Print the path of docker-compose.yml located next to this library.
compose_file() {
  local dir
  dir="$(stack_script_dir)"
  printf '%s/docker-compose.yml' "$dir"
}

# Create the per-service data directories under the ops root (via sudo).
# Arguments: $1 - ops root directory (e.g. /srv/ops)
# Returns:   1 if the root is empty, otherwise the status of mkdir.
ensure_ops_dirs() {
  local root="${1:-}"

  # An empty root would make sudo create /portainer, /grafana, ... under "/".
  if [[ -z "$root" ]]; then
    log_error "ensure_ops_dirs: ops root directory must not be empty"
    return 1
  fi

  sudo mkdir -p -- \
    "$root/portainer/data" \
    "$root/grafana/data" \
    "$root/prometheus/data" \
    "$root/prometheus/targets" \
    "$root/uptime-kuma/data" \
    "$root/backups"
}

# Set ownership of the data directories so each service can write its data.
# Arguments: $1 - ops root directory
# Returns:   1 if the root is empty, otherwise the status of the last chown.
prepare_permissions() {
  local root="${1:-}"
  local owner

  # Same guard as ensure_ops_dirs: never run a recursive chown relative to "/".
  if [[ -z "$root" ]]; then
    log_error "prepare_permissions: ops root directory must not be empty"
    return 1
  fi

  # Numeric uid:gid of the invoking user. This does not depend on $USER being
  # set, nor on a group existing with the same name as the user.
  owner="$(id -u):$(id -g)"

  # Keep operational directories writable by the current admin user.
  sudo chown -R -- "$owner" \
    "$root/portainer/data" \
    "$root/uptime-kuma/data" \
    "$root/prometheus/targets" \
    "$root/backups"

  # Grafana UID/GID in official image is usually 472
  sudo chown -R -- 472:472 "$root/grafana/data"
  # Prometheus runs as nobody (65534) in official image
  sudo chown -R -- 65534:65534 "$root/prometheus/data"
}

# Poll `docker inspect` until the container reports the "running" state.
# Arguments: $1 - container id or name
#            $2 - timeout in whole seconds (0 means: check once, do not wait)
#            $3 - poll interval in whole seconds, >= 1 (default 2)
# Returns:   0 once the container is running; 1 on timeout or invalid numbers.
wait_for_container_running() {
  local container_id="$1"
  local timeout_sec="$2"
  local poll_sec="${3:-2}"
  local elapsed=0
  local state

  if [[ ! "$timeout_sec" =~ ^[0-9]+$ || ! "$poll_sec" =~ ^[0-9]+$ ]]; then
    log_error "wait_for_container_running: timeout and poll interval must be whole seconds"
    return 1
  fi
  # Force base 10 so values such as "08" are not parsed as invalid octal.
  timeout_sec=$((10#$timeout_sec))
  poll_sec=$((10#$poll_sec))
  if ((poll_sec < 1)); then
    log_error "wait_for_container_running: poll interval must be at least 1 second"
    return 1
  fi

  # Inspect first and sleep afterwards: the container is always checked at
  # least once, and once more when the deadline is reached.
  while true; do
    # A failing inspect (e.g. container not created yet) counts as "not running".
    state="$(docker inspect -f '{{.State.Status}}' "$container_id" 2>/dev/null || true)"
    if [[ "$state" == "running" ]]; then
      return 0
    fi
    if ((elapsed >= timeout_sec)); then
      return 1
    fi
    sleep "$poll_sec"
    elapsed=$((elapsed + poll_sec))
  done
}
+ sudo mkfs -t "$FS_TYPE" "$TARGET_DEVICE" +fi + +uuid="$(sudo blkid -s UUID -o value "$TARGET_DEVICE" 2>/dev/null || true)" +if [[ -z "$uuid" ]]; then + log_error "Could not read UUID from $TARGET_DEVICE" + exit 1 +fi + +sudo mkdir -p "$MOUNT_POINT" + +fstab_line="UUID=${uuid} ${MOUNT_POINT} ${FS_TYPE} defaults,noatime,nofail 0 2" +if grep -q "UUID=${uuid}" /etc/fstab; then + log_info "Existing fstab entry found for UUID=${uuid}; leaving as-is" +else + log_info "Adding mount to /etc/fstab" + printf '%s\n' "$fstab_line" | sudo tee -a /etc/fstab >/dev/null +fi + +log_info "Mounting $MOUNT_POINT..." +if mountpoint -q "$MOUNT_POINT"; then + log_info "$MOUNT_POINT is already mounted" +else + sudo mount "$MOUNT_POINT" +fi + +log_info "Creating monitoring data directories..." +sudo mkdir -p \ + "$MOUNT_POINT/portainer/data" \ + "$MOUNT_POINT/grafana/data" \ + "$MOUNT_POINT/prometheus/data" \ + "$MOUNT_POINT/prometheus/targets" \ + "$MOUNT_POINT/uptime-kuma/data" \ + "$MOUNT_POINT/backups" + +log_success "SSD mount ready at $MOUNT_POINT" diff --git a/setup/pi-monitoring/prometheus/prometheus.yml b/setup/pi-monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..7334bda --- /dev/null +++ b/setup/pi-monitoring/prometheus/prometheus.yml @@ -0,0 +1,24 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ["prometheus:9090"] + + - job_name: pi-node + static_configs: + - targets: ["node-exporter:9100"] + + - job_name: pi-cadvisor + static_configs: + - targets: ["cadvisor:8080"] + + # Add Fedora/Unraid (or any remote host) targets in: + # /srv/ops/prometheus/targets/external.yml + # A template is generated from external.yml.example by deploy_stack.sh. 
+ - job_name: external-node-exporters + file_sd_configs: + - files: + - /etc/prometheus/targets/*.yml diff --git a/setup/pi-monitoring/prometheus/targets/external.yml.example b/setup/pi-monitoring/prometheus/targets/external.yml.example new file mode 100644 index 0000000..4e7f0c3 --- /dev/null +++ b/setup/pi-monitoring/prometheus/targets/external.yml.example @@ -0,0 +1,15 @@ +# Example scrape targets for external hosts. +# Copy to /srv/ops/prometheus/targets/external.yml and edit real IPs. + +- labels: + job: unraid-node + targets: + - 192.168.1.82:9100 + +- labels: + job: fedora-node + targets: + - 192.168.1.90:9100 + +# If you expose cAdvisor on remote hosts, add additional targets here. +# Keep this file in YAML list format. diff --git a/setup/pi-monitoring/restore_stack.sh b/setup/pi-monitoring/restore_stack.sh new file mode 100755 index 0000000..924f4b7 --- /dev/null +++ b/setup/pi-monitoring/restore_stack.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=./lib.sh +source "$SCRIPT_DIR/lib.sh" + +ARCHIVE="" +AUTO_YES=false + +usage() { + cat <