feat: add Pi monitoring stack with deployment scripts and architecture documentation

This commit is contained in:
S
2026-03-02 21:12:24 -05:00
parent ca4f4924b6
commit 780748083f
16 changed files with 1106 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
# Pi Monitoring Architecture
## Goal
Use Raspberry Pi as lightweight control/observability plane for Docker hosts in this project.
## Topology
```text
+------------------------------+
| Raspberry Pi (ops node) |
| Portainer + Grafana |
| Prometheus + Uptime Kuma |
+---------------+--------------+
|
Portainer Agent (9001)
+----------------+----------------+
| |
+---------v---------+ +---------v---------+
| Unraid Docker host| | Fedora Docker host|
| Gitea primary | | Gitea backup |
+-------------------+ +-------------------+
```
## Data paths
- Pi local metrics: `node-exporter` + `cadvisor` -> Prometheus -> Grafana
- Remote host control: Portainer -> remote `portainer_agent`
- Availability checks: Uptime Kuma -> HTTP/TCP checks for Gitea and other services
## Persistence
All service data is stored under `OPS_ROOT` (default `/srv/ops`) on SSD.

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env bash
# backup_stack.sh — create a compressed archive of the Pi monitoring stack
# state (service data under OPS_ROOT plus local compose/config files) and
# prune archives older than the retention window.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ENV_FILE="$SCRIPT_DIR/stack.env"
RETENTION_DAYS=14
AUTO_YES=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Create a compressed backup archive for the Pi monitoring stack state.
Options:
  --env-file=PATH      Env file path (default: setup/pi-monitoring/stack.env)
  --retention-days=N   Delete backups older than N days (default: 14)
  --yes, -y            Skip confirmation prompt
  --help, -h           Show help
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --env-file=*) ENV_FILE="${arg#*=}" ;;
    --retention-days=*) RETENTION_DAYS="${arg#*=}" ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

require_cmd tar find date
load_stack_env "$ENV_FILE"
# Canonicalize ENV_FILE to an absolute path: the tar invocation below runs
# from / with leading slashes stripped, which silently broke when a relative
# --env-file path was supplied.
ENV_FILE="$(cd "$(dirname "$ENV_FILE")" && pwd)/$(basename "$ENV_FILE")"

backup_dir="$OPS_ROOT/backups"
mkdir -p "$backup_dir"
timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
archive_path="$backup_dir/pi-monitoring-${timestamp}.tar.gz"

if ! confirm_action "Create backup archive at $archive_path?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

# Candidate sources; missing ones are skipped with a warning so a partial
# deployment can still be backed up.
paths=(
  "$OPS_ROOT/portainer/data"
  "$OPS_ROOT/grafana/data"
  "$OPS_ROOT/prometheus/data"
  "$OPS_ROOT/prometheus/targets"
  "$OPS_ROOT/uptime-kuma/data"
  "$SCRIPT_DIR/docker-compose.yml"
  "$SCRIPT_DIR/prometheus/prometheus.yml"
  "$ENV_FILE"
)
include=()
for p in "${paths[@]}"; do
  if [[ -e "$p" ]]; then
    include+=("${p#/}")  # store paths relative to / so restore extracts in place
  else
    log_warn "Skipping missing path: $p"
  fi
done
if [[ ${#include[@]} -eq 0 ]]; then
  log_error "No backup sources found"
  exit 1
fi

log_info "Creating backup archive..."
(
  cd /
  tar -czf "$archive_path" "${include[@]}"
)
log_success "Backup created: $archive_path"

# Prune old archives only when the retention value is a plain integer.
if [[ "$RETENTION_DAYS" =~ ^[0-9]+$ ]]; then
  log_info "Pruning backups older than ${RETENTION_DAYS} days..."
  find "$backup_dir" -type f -name 'pi-monitoring-*.tar.gz' -mtime "+${RETENTION_DAYS}" -delete
else
  log_warn "Invalid retention value '${RETENTION_DAYS}'; skipping pruning"
fi

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env bash
# bootstrap.sh — prepare a brand-new Raspberry Pi OS host for monitoring
# stack workloads: OS updates, base packages, timezone, Docker Engine,
# log-rotation defaults, docker group membership, and UFW firewall rules.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
TIMEZONE="America/New_York"
SSH_PORT="22"
AUTO_YES=false
ENABLE_UFW=true

usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Prepare a brand-new Raspberry Pi OS host for monitoring stack workloads.
Options:
  --timezone=ZONE   Set system timezone (default: America/New_York)
  --ssh-port=PORT   SSH port allowed by firewall (default: 22)
  --skip-firewall   Skip UFW configuration
  --yes, -y         Non-interactive; skip confirmation prompts
  --help, -h        Show help
Example:
  $(basename "$0") --timezone=America/New_York --yes
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --timezone=*) TIMEZONE="${arg#*=}" ;;
    --ssh-port=*) SSH_PORT="${arg#*=}" ;;
    --skip-firewall) ENABLE_UFW=false ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

require_cmd sudo apt systemctl timedatectl curl
# Validate the SSH port before it is ever fed to ufw.
if ! [[ "$SSH_PORT" =~ ^[0-9]+$ ]] || (( SSH_PORT < 1 || SSH_PORT > 65535 )); then
  log_error "Invalid SSH port: $SSH_PORT"
  exit 1
fi

if ! confirm_action "This will install/update OS packages and Docker on this Pi. Continue?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

arch="$(uname -m)"
if [[ "$arch" != "aarch64" && "$arch" != "arm64" ]]; then
  log_warn "Detected architecture '$arch' (expected ARM64 for Raspberry Pi 4)."
fi

log_info "Updating OS packages..."
sudo apt update
sudo apt full-upgrade -y

log_info "Installing base packages..."
sudo apt install -y \
  ca-certificates \
  curl \
  fail2ban \
  git \
  htop \
  jq \
  ufw \
  unattended-upgrades

log_info "Setting timezone to $TIMEZONE..."
sudo timedatectl set-timezone "$TIMEZONE"

log_info "Enabling security services..."
sudo systemctl enable --now fail2ban
# Unit name differs across releases; best effort.
sudo systemctl enable --now unattended-upgrades || true

if ! command -v docker >/dev/null 2>&1; then
  log_info "Installing Docker Engine..."
  # Download the installer to a temp file before executing it so a truncated
  # download can never be partially run (safer than `curl | sh`).
  installer="$(mktemp)"
  trap 'rm -f "$installer"' EXIT
  curl -fsSL https://get.docker.com -o "$installer"
  sh "$installer"
  rm -f "$installer"
  trap - EXIT
else
  log_success "Docker already installed"
fi

log_info "Enabling Docker service..."
sudo systemctl enable --now docker

log_info "Configuring Docker daemon defaults..."
sudo mkdir -p /etc/docker
# Cap container log size (SD/SSD friendly) and keep containers alive across
# daemon restarts. NOTE(review): this overwrites any existing daemon.json.
sudo tee /etc/docker/daemon.json >/dev/null <<'JSON'
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  },
  "live-restore": true
}
JSON
sudo systemctl restart docker

if id -nG "$USER" | grep -qw docker; then
  log_success "User '$USER' is already in docker group"
else
  log_info "Adding '$USER' to docker group..."
  sudo usermod -aG docker "$USER"
  log_warn "Log out and back in once for docker group membership to apply."
fi

if [[ "$ENABLE_UFW" == "true" ]]; then
  # NOTE(review): these rules expose the web UIs on all interfaces; restrict
  # to the LAN subnet if this host is ever reachable from untrusted networks.
  log_info "Configuring UFW firewall rules..."
  sudo ufw allow "${SSH_PORT}/tcp"
  sudo ufw allow 9443/tcp   # Portainer HTTPS
  sudo ufw allow 8000/tcp   # Portainer Edge
  sudo ufw allow 3000/tcp   # Grafana
  sudo ufw allow 3001/tcp   # Uptime Kuma
  sudo ufw allow 9090/tcp   # Prometheus
  sudo ufw --force enable
fi

log_success "Bootstrap complete"
log_info "Recommended next steps:"
log_info "1) Re-login to apply docker group membership"
log_info "2) Run setup/pi-monitoring/mount_ssd.sh"
log_info "3) Copy stack.env.example to stack.env and run deploy_stack.sh"

View File

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
# deploy_stack.sh — deploy the Portainer + Grafana + Prometheus + Uptime Kuma
# monitoring stack via docker compose and wait for every service to reach
# the "running" state.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ENV_FILE="$SCRIPT_DIR/stack.env"
AUTO_YES=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Deploy Portainer + Grafana + Prometheus + Uptime Kuma monitoring stack.
Options:
  --env-file=PATH  Env file path (default: setup/pi-monitoring/stack.env)
  --yes, -y        Skip confirmation prompt
  --help, -h       Show help
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --env-file=*) ENV_FILE="${arg#*=}" ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

require_cmd docker sudo
load_stack_env "$ENV_FILE"
if ! docker compose version >/dev/null 2>&1; then
  log_error "docker compose plugin not found"
  exit 1
fi

if ! confirm_action "Deploy monitoring stack using $ENV_FILE?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

ensure_ops_dirs "$OPS_ROOT"
prepare_permissions "$OPS_ROOT"

# Seed the file_sd targets file from the example on first deploy. Guard the
# copy: under `set -e` an unguarded cp of a missing example aborted the run.
example_targets="$SCRIPT_DIR/prometheus/targets/external.yml.example"
if [[ ! -f "$OPS_ROOT/prometheus/targets/external.yml" ]]; then
  if [[ -f "$example_targets" ]]; then
    cp "$example_targets" "$OPS_ROOT/prometheus/targets/external.yml"
    log_info "Created $OPS_ROOT/prometheus/targets/external.yml from example"
  else
    log_warn "Example targets file missing: $example_targets (skipping seed)"
  fi
fi

compose_file_path="$(compose_file)"
# Shared flags for every docker compose invocation below.
compose=(docker compose
  --project-name "$COMPOSE_PROJECT_NAME"
  --env-file "$ENV_FILE"
  -f "$compose_file_path")

log_info "Pulling container images..."
"${compose[@]}" pull

log_info "Starting stack..."
"${compose[@]}" up -d

# Verify every service actually reaches "running" (compose returns success
# even when a container immediately crashes afterwards).
services=(portainer grafana prometheus uptime-kuma node-exporter cadvisor)
for svc in "${services[@]}"; do
  cid="$("${compose[@]}" ps -q "$svc")"
  if [[ -z "$cid" ]]; then
    log_error "Service did not start: $svc"
    exit 1
  fi
  if ! wait_for_container_running "$cid" 60; then
    log_error "Service failed to reach running state: $svc"
    docker logs "$cid" --tail 80 || true
    exit 1
  fi
done

log_success "Monitoring stack is up"
log_info "Portainer:   https://<pi-ip>:${PORTAINER_HTTPS_PORT}"
log_info "Grafana:     http://<pi-ip>:${GRAFANA_PORT}"
log_info "Prometheus:  http://<pi-ip>:${PROMETHEUS_PORT}"
log_info "Uptime Kuma: http://<pi-ip>:${UPTIME_KUMA_PORT}"

View File

@@ -0,0 +1,96 @@
# Docker Compose stack for the Raspberry Pi monitoring/ops node:
#   - Portainer: container management UI for this host + remote agents
#   - Grafana + Prometheus: metrics storage and dashboards
#   - node-exporter + cAdvisor: Pi host and container metrics
#   - Uptime Kuma: HTTP/TCP availability checks
# All ${VAR:-default} values are supplied by stack.env (see stack.env.example).
name: ${COMPOSE_PROJECT_NAME:-pi-monitoring}
services:
  portainer:
    image: ${PORTAINER_IMAGE:-portainer/portainer-ce:latest}
    restart: unless-stopped
    # Manage the local Docker engine through the mounted socket.
    command: -H unix:///var/run/docker.sock
    ports:
      - "${BIND_IP:-0.0.0.0}:${PORTAINER_HTTPS_PORT:-9443}:9443"
      - "${BIND_IP:-0.0.0.0}:${PORTAINER_EDGE_PORT:-8000}:8000"
    volumes:
      # NOTE(review): socket access is root-equivalent on the host.
      - /var/run/docker.sock:/var/run/docker.sock
      - ${OPS_ROOT:-/srv/ops}/portainer/data:/data
    networks:
      - monitoring
  grafana:
    image: ${GRAFANA_IMAGE:-grafana/grafana-oss:latest}
    restart: unless-stopped
    environment:
      - TZ=${TZ:-America/New_York}
      # Bootstrap admin credentials come from stack.env; no default password
      # on purpose so an unset secret fails loudly.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_AUTH_ANONYMOUS_ENABLED=false
      - GF_INSTALL_PLUGINS=${GRAFANA_PLUGINS:-}
    ports:
      - "${BIND_IP:-0.0.0.0}:${GRAFANA_PORT:-3000}:3000"
    volumes:
      - ${OPS_ROOT:-/srv/ops}/grafana/data:/var/lib/grafana
      # Datasource provisioning (Prometheus) ships with the repo, read-only.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    depends_on:
      - prometheus
    networks:
      - monitoring
  prometheus:
    image: ${PROMETHEUS_IMAGE:-prom/prometheus:latest}
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # Allows config reload via HTTP POST /-/reload.
      - '--web.enable-lifecycle'
    ports:
      - "${BIND_IP:-0.0.0.0}:${PROMETHEUS_PORT:-9090}:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # file_sd directory: deploy_stack.sh seeds external.yml here.
      - ${OPS_ROOT:-/srv/ops}/prometheus/targets:/etc/prometheus/targets:ro
      - ${OPS_ROOT:-/srv/ops}/prometheus/data:/prometheus
    networks:
      - monitoring
  node-exporter:
    image: ${NODE_EXPORTER_IMAGE:-prom/node-exporter:latest}
    restart: unless-stopped
    # Host filesystems are bind-mounted under /host so the exporter reads
    # real host metrics instead of the container's own.
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/host/root'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro
    networks:
      - monitoring
  cadvisor:
    image: ${CADVISOR_IMAGE:-gcr.io/cadvisor/cadvisor:latest}
    restart: unless-stopped
    # NOTE(review): privileged + /dev/kmsg is the upstream-recommended cAdvisor
    # setup for full container metrics; it grants broad host access.
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - monitoring
  uptime-kuma:
    image: ${UPTIME_KUMA_IMAGE:-louislam/uptime-kuma:1}
    restart: unless-stopped
    ports:
      - "${BIND_IP:-0.0.0.0}:${UPTIME_KUMA_PORT:-3001}:3001"
    volumes:
      - ${OPS_ROOT:-/srv/ops}/uptime-kuma/data:/app/data
    networks:
      - monitoring
networks:
  monitoring:
    driver: bridge

View File

@@ -0,0 +1,9 @@
# Grafana provisioning: register the in-stack Prometheus instance as the
# default data source. Mounted read-only into the Grafana container by
# docker-compose.yml under /etc/grafana/provisioning.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    # "proxy": the Grafana backend performs queries, so browsers never need
    # direct network access to Prometheus.
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    # Locked in the UI so the provisioned config stays the source of truth.
    editable: false

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# install_remote_agent.sh — install or refresh the Portainer Agent container
# on a remote Docker host over SSH, so the Pi's Portainer can manage it.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
HOST=""
USER_NAME=""
SSH_PORT="22"
AGENT_IMAGE="portainer/agent:latest"
AUTO_YES=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") --host=IP --user=NAME [options]
Install or refresh Portainer Agent on a remote Docker host (Fedora/Unraid).
Options:
  --host=IP        Remote host IP or hostname
  --user=NAME      SSH username
  --port=PORT      SSH port (default: 22)
  --agent-image=I  Portainer agent image (default: portainer/agent:latest)
  --yes, -y        Skip confirmation prompt
  --help, -h       Show help
Example:
  $(basename "$0") --host=192.168.1.90 --user=admin --port=22
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --host=*) HOST="${arg#*=}" ;;
    --user=*) USER_NAME="${arg#*=}" ;;
    --port=*) SSH_PORT="${arg#*=}" ;;
    --agent-image=*) AGENT_IMAGE="${arg#*=}" ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

require_cmd ssh
if [[ -z "$HOST" || -z "$USER_NAME" ]]; then
  log_error "--host and --user are required"
  usage
  exit 1
fi
# Validate all inputs before prompting, so the user never confirms a no-op.
if [[ -z "$AGENT_IMAGE" ]]; then
  log_error "Agent image cannot be empty"
  exit 1
fi

if ! confirm_action "Install/update Portainer Agent on ${USER_NAME}@${HOST}?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

# Shell-quote the image reference before splicing it into the remote command:
# the previous unquoted interpolation allowed a value containing shell
# metacharacters to inject arbitrary commands on the remote host.
agent_image_quoted="$(printf '%q' "$AGENT_IMAGE")"
ssh -p "$SSH_PORT" "$USER_NAME@$HOST" "
set -euo pipefail
docker rm -f portainer_agent >/dev/null 2>&1 || true
docker run -d \\
  --name portainer_agent \\
  --restart=unless-stopped \\
  -p 9001:9001 \\
  -v /var/run/docker.sock:/var/run/docker.sock \\
  -v /var/lib/docker/volumes:/var/lib/docker/volumes \\
  -v /:/host \\
  $agent_image_quoted
"
log_success "Portainer Agent running on $HOST:9001"
log_info "Add endpoint in Portainer: tcp://$HOST:9001"

135
setup/pi-monitoring/lib.sh Executable file
View File

@@ -0,0 +1,135 @@
#!/usr/bin/env bash
# lib.sh — shared helpers for the Pi monitoring stack scripts (logging,
# env-file loading, directory setup, container readiness checks).
# Intended to be sourced by the sibling scripts, not executed directly.
set -euo pipefail
# Use ANSI colours only when stderr is a terminal; otherwise disable them so
# redirected logs stay free of escape sequences.
if [[ -t 2 ]]; then
  _C_RESET='\033[0m'
  _C_RED='\033[0;31m'
  _C_GREEN='\033[0;32m'
  _C_YELLOW='\033[0;33m'
  _C_BLUE='\033[0;34m'
else
  _C_RESET='' _C_RED='' _C_GREEN='' _C_YELLOW='' _C_BLUE=''
fi
# Leveled loggers. All diagnostics go to stderr so stdout stays clean for
# command substitution. Colour variables are empty when stderr is not a TTY.
_log_line() {
  local colour="$1" tag="$2"
  shift 2
  printf '%b%s%b %s\n' "$colour" "$tag" "$_C_RESET" "$*" >&2
}
log_info()    { _log_line "$_C_BLUE"   '[INFO]'  "$@"; }
log_warn()    { _log_line "$_C_YELLOW" '[WARN]'  "$@"; }
log_error()   { _log_line "$_C_RED"    '[ERROR]' "$@"; }
log_success() { _log_line "$_C_GREEN"  '[OK]'    "$@"; }
# Verify that every named command is available on PATH.
# Logs the first missing command and returns non-zero; returns 0 otherwise.
require_cmd() {
  local wanted
  for wanted in "$@"; do
    command -v "$wanted" >/dev/null 2>&1 && continue
    log_error "Required command not found: $wanted"
    return 1
  done
  return 0
}
# Ask the user to confirm an action on stdin.
#   $1 - prompt text (default: "Continue?")
#   $2 - "true" to auto-approve without prompting (non-interactive mode)
# Returns 0 on approval (auto-yes, or a reply of y/Y), non-zero otherwise
# (including EOF on stdin).
confirm_action() {
  local prompt="${1:-Continue?}"
  local auto_yes="${2:-false}"
  # `reply` was previously not declared local and leaked into the caller's
  # scope, clobbering any variable of the same name.
  local reply
  if [[ "$auto_yes" == "true" ]]; then
    return 0
  fi
  printf '%s [y/N] ' "$prompt"
  read -r reply || return 1
  [[ "$reply" =~ ^[Yy]$ ]]
}
# Absolute directory of the shell file that called into this function.
# BASH_SOURCE[1] is, from inside this function, the source file of the
# calling function; for helpers defined in this library (e.g. compose_file)
# that resolves to lib.sh's own directory, which is where docker-compose.yml
# lives. Falls back to BASH_SOURCE[0] when there is no caller frame.
# NOTE(review): relies on this library staying next to docker-compose.yml —
# confirm if the files are ever moved apart.
stack_script_dir() {
  cd "$(dirname "${BASH_SOURCE[1]:-${BASH_SOURCE[0]}}")" && pwd
}
# Load KEY=VALUE pairs from an env file and export them into the environment.
# Blank lines and full-line comments are ignored. Single- or double-quoted
# values are taken verbatim (surrounding quotes removed) — a '#' inside
# quotes is data, not a comment. For unquoted values an inline " # comment"
# suffix and trailing whitespace are stripped. Applies defaults for OPS_ROOT
# and COMPOSE_PROJECT_NAME afterwards.
#   $1 - path to the env file
# Returns non-zero (with a hint) when the file does not exist.
load_stack_env() {
  local env_file="$1"
  if [[ ! -f "$env_file" ]]; then
    log_error "Missing env file: $env_file"
    log_info "Copy stack.env.example to stack.env and update secrets first."
    return 1
  fi
  local line key value
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -z "$line" || "$line" == \#* ]] && continue
    [[ "$line" == *=* ]] || continue
    key="${line%%=*}"
    value="${line#*=}"
    # Trim whitespace around the key so indented assignments still export.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    if [[ "$value" =~ ^\"(.*)\"$ ]] || [[ "$value" =~ ^\'(.*)\'$ ]]; then
      # Quoted value: keep verbatim. (Previously the inline-comment strip ran
      # first, truncating quoted secrets containing "# ".)
      value="${BASH_REMATCH[1]}"
    else
      # Unquoted value: drop an inline comment and trailing whitespace.
      value="${value%%# *}"
      value="${value%"${value##*[![:space:]]}"}"
    fi
    export "$key=$value"
  done < "$env_file"
  : "${OPS_ROOT:=/srv/ops}"
  : "${COMPOSE_PROJECT_NAME:=pi-monitoring}"
}
# Absolute path of the stack's docker-compose.yml, resolved relative to the
# directory containing this library (via the caller-aware stack_script_dir).
compose_file() {
  printf '%s/docker-compose.yml' "$(stack_script_dir)"
}
# Create the persistent data directory tree for all stack services under the
# given root (requires sudo; the root typically lives on the SSD).
#   $1 - data root, e.g. /srv/ops
ensure_ops_dirs() {
  local root="$1" sub
  local dirs=()
  for sub in \
    portainer/data \
    grafana/data \
    prometheus/data \
    prometheus/targets \
    uptime-kuma/data \
    backups; do
    dirs+=("$root/$sub")
  done
  sudo mkdir -p "${dirs[@]}"
}
# Assign ownership of the stack's data directories so each container can
# write its own volume.
#   $1 - data root, e.g. /srv/ops
prepare_permissions() {
  local root="$1"
  # Keep operational directories writable by the current admin user.
  sudo chown -R "$USER:$USER" \
    "$root/portainer/data" \
    "$root/uptime-kuma/data" \
    "$root/prometheus/targets" \
    "$root/backups"
  # Grafana UID/GID in official image is usually 472
  # NOTE(review): hard-coded IDs — confirm against the pinned image versions.
  sudo chown -R 472:472 "$root/grafana/data"
  # Prometheus runs as nobody (65534) in official image
  sudo chown -R 65534:65534 "$root/prometheus/data"
}
# Poll `docker inspect` until the container reports state "running".
#   $1 - container id
#   $2 - timeout in seconds (polled in 2-second steps)
# Returns 0 once running, 1 if the timeout elapses first.
wait_for_container_running() {
  local container_id="$1"
  local timeout_sec="$2"
  local waited status
  for (( waited = 0; waited < timeout_sec; waited += 2 )); do
    status="$(docker inspect -f '{{.State.Status}}' "$container_id" 2>/dev/null || true)"
    if [[ "$status" == "running" ]]; then
      return 0
    fi
    sleep 2
  done
  return 1
}

124
setup/pi-monitoring/mount_ssd.sh Executable file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env bash
# mount_ssd.sh — prepare and mount an SSD for persistent monitoring data.
# Optionally formats the device (destructive, always confirmed), records an
# fstab entry keyed by UUID, mounts it, and creates the stack's data tree.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
DEVICE=""
MOUNT_POINT="/srv/ops"
FS_TYPE="ext4"
AUTO_YES=false
FORCE_FORMAT=false
usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Prepare and mount an SSD for persistent monitoring data.
Options:
  --device=/dev/sda1   Block device or partition to mount
  --mount-point=PATH   Mount path (default: /srv/ops)
  --fs=ext4            Filesystem to create if blank (default: ext4)
  --force-format       Reformat even if filesystem exists (destructive)
  --yes, -y            Skip confirmation prompts
  --help, -h           Show help
Examples:
  $(basename "$0") --device=/dev/sda1
  $(basename "$0") --device=/dev/sda --force-format --yes
USAGE
}
for arg in "$@"; do
  case "$arg" in
    --device=*) DEVICE="${arg#*=}" ;;
    --mount-point=*) MOUNT_POINT="${arg#*=}" ;;
    --fs=*) FS_TYPE="${arg#*=}" ;;
    --force-format) FORCE_FORMAT=true ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done
require_cmd sudo lsblk blkid findmnt mount mountpoint grep awk sed
if [[ -z "$DEVICE" ]]; then
  # Auto-detect: pick the first whole disk that is NOT the disk backing /,
  # so the boot medium can never be selected by accident.
  root_parent="/dev/$(lsblk -no PKNAME "$(findmnt -n -o SOURCE /)")"
  DEVICE="$(lsblk -dpno NAME,TYPE | awk '$2 == "disk" {print $1}' | grep -v "^${root_parent}$" | head -n1 || true)"
  if [[ -z "$DEVICE" ]]; then
    log_error "Could not auto-detect a non-root disk. Specify --device=/dev/sdX1"
    exit 1
  fi
  log_info "Auto-detected device: $DEVICE"
fi
if [[ ! -b "$DEVICE" ]]; then
  log_error "Device does not exist or is not a block device: $DEVICE"
  exit 1
fi
TARGET_DEVICE="$DEVICE"
# If a whole disk was given, operate on its first partition when one exists;
# otherwise the raw (unpartitioned) disk itself is formatted and mounted.
if [[ "$(lsblk -no TYPE "$DEVICE")" == "disk" ]]; then
  first_part="$(lsblk -ln -o NAME "$DEVICE" | sed -n '2p' || true)"
  if [[ -n "$first_part" ]]; then
    TARGET_DEVICE="/dev/${first_part}"
    log_info "Using first partition on disk: $TARGET_DEVICE"
  fi
fi
# Format only when the device is blank or --force-format was given; both
# destructive paths pass through an explicit confirmation prompt.
existing_fs="$(sudo blkid -s TYPE -o value "$TARGET_DEVICE" 2>/dev/null || true)"
if [[ -n "$existing_fs" && "$FORCE_FORMAT" != "true" ]]; then
  log_info "Existing filesystem detected on $TARGET_DEVICE: $existing_fs"
  log_info "Skipping format. Use --force-format to recreate filesystem."
else
  if [[ -n "$existing_fs" && "$FORCE_FORMAT" == "true" ]]; then
    prompt="Reformat $TARGET_DEVICE (current: $existing_fs)? This destroys existing data. Continue?"
  else
    prompt="Create new $FS_TYPE filesystem on $TARGET_DEVICE?"
  fi
  if ! confirm_action "$prompt" "$AUTO_YES"; then
    log_info "Cancelled"
    exit 0
  fi
  log_info "Formatting $TARGET_DEVICE as $FS_TYPE..."
  sudo mkfs -t "$FS_TYPE" "$TARGET_DEVICE"
fi
# Reference the filesystem by UUID so the fstab entry survives device
# renames (e.g. sda -> sdb after a reboot with another drive attached).
uuid="$(sudo blkid -s UUID -o value "$TARGET_DEVICE" 2>/dev/null || true)"
if [[ -z "$uuid" ]]; then
  log_error "Could not read UUID from $TARGET_DEVICE"
  exit 1
fi
sudo mkdir -p "$MOUNT_POINT"
# nofail keeps the Pi bootable if the SSD is absent; noatime reduces writes.
fstab_line="UUID=${uuid} ${MOUNT_POINT} ${FS_TYPE} defaults,noatime,nofail 0 2"
if grep -q "UUID=${uuid}" /etc/fstab; then
  log_info "Existing fstab entry found for UUID=${uuid}; leaving as-is"
else
  log_info "Adding mount to /etc/fstab"
  printf '%s\n' "$fstab_line" | sudo tee -a /etc/fstab >/dev/null
fi
log_info "Mounting $MOUNT_POINT..."
if mountpoint -q "$MOUNT_POINT"; then
  log_info "$MOUNT_POINT is already mounted"
else
  # Mount by target path, which resolves through the fstab entry added above.
  sudo mount "$MOUNT_POINT"
fi
log_info "Creating monitoring data directories..."
sudo mkdir -p \
  "$MOUNT_POINT/portainer/data" \
  "$MOUNT_POINT/grafana/data" \
  "$MOUNT_POINT/prometheus/data" \
  "$MOUNT_POINT/prometheus/targets" \
  "$MOUNT_POINT/uptime-kuma/data" \
  "$MOUNT_POINT/backups"
log_success "SSD mount ready at $MOUNT_POINT"

View File

@@ -0,0 +1,24 @@
# Prometheus configuration for the Pi monitoring stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
scrape_configs:
  # Prometheus self-monitoring.
  - job_name: prometheus
    static_configs:
      - targets: ["prometheus:9090"]
  # Pi host metrics (node-exporter sidecar in the same compose network).
  - job_name: pi-node
    static_configs:
      - targets: ["node-exporter:9100"]
  # Pi container metrics (cAdvisor sidecar).
  - job_name: pi-cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]
  # Add Fedora/Unraid (or any remote host) targets in:
  # /srv/ops/prometheus/targets/external.yml
  # A template is generated from external.yml.example by deploy_stack.sh.
  # file_sd target files are re-read at runtime without a restart.
  - job_name: external-node-exporters
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.yml

View File

@@ -0,0 +1,15 @@
# Example scrape targets for external hosts.
# Copy to /srv/ops/prometheus/targets/external.yml and edit real IPs.
# Each list entry pairs a `job` label with host:port scrape targets
# (9100 is the node-exporter default port).
- labels:
    job: unraid-node
  targets:
    - 192.168.1.82:9100
- labels:
    job: fedora-node
  targets:
    - 192.168.1.90:9100
# If you expose cAdvisor on remote hosts, add additional targets here.
# Keep this file in YAML list format.

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# restore_stack.sh — restore monitoring stack data/config from an archive
# produced by backup_stack.sh (archive member paths are relative to /).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ARCHIVE=""
AUTO_YES=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") --archive=/path/to/pi-monitoring-*.tar.gz [options]
Restore monitoring stack data/config from a backup archive created by backup_stack.sh.
Options:
  --archive=PATH   Backup archive to restore (required)
  --yes, -y        Skip confirmation prompt
  --help, -h       Show help
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --archive=*) ARCHIVE="${arg#*=}" ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

# sudo is mandatory for the extraction step, so require it up front instead
# of failing halfway through the restore.
require_cmd tar sudo
if [[ -z "$ARCHIVE" ]]; then
  log_error "--archive is required"
  usage
  exit 1
fi
if [[ ! -f "$ARCHIVE" ]]; then
  log_error "Archive not found: $ARCHIVE"
  exit 1
fi

if ! confirm_action "Restore from $ARCHIVE? Existing stack data may be overwritten." "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

log_info "Stopping compose services before restore (best effort)..."
# Only attempt the compose shutdown when both the env file and docker are
# available; the restore itself must still work on a freshly re-imaged host.
if [[ -f "$SCRIPT_DIR/stack.env" ]] && command -v docker >/dev/null 2>&1; then
  load_stack_env "$SCRIPT_DIR/stack.env"
  docker compose \
    --project-name "$COMPOSE_PROJECT_NAME" \
    --env-file "$SCRIPT_DIR/stack.env" \
    -f "$(compose_file)" \
    down || true
fi

log_info "Extracting backup archive to / ..."
# NOTE(review): extraction runs as root relative to / and overwrites whatever
# paths the archive contains — only restore archives you created yourself.
sudo tar -xzf "$ARCHIVE" -C /
log_success "Restore complete"
log_info "Run deploy_stack.sh to bring services back up"

View File

@@ -0,0 +1,39 @@
# -----------------------------------------------------------------------------
# Pi monitoring stack environment
# Copy this file to stack.env and set real secrets before deployment.
# NOTE: stack.env contains secrets — keep it out of version control.
# -----------------------------------------------------------------------------
# Compose project name
COMPOSE_PROJECT_NAME=pi-monitoring
# Persistent data root (place on SSD)
OPS_ROOT=/srv/ops
# Host timezone for containers
TZ=America/New_York
# Bind IP for published ports (0.0.0.0 = all interfaces; set a LAN IP to
# restrict exposure)
BIND_IP=0.0.0.0
# Published service ports
PORTAINER_HTTPS_PORT=9443
PORTAINER_EDGE_PORT=8000
GRAFANA_PORT=3000
PROMETHEUS_PORT=9090
UPTIME_KUMA_PORT=3001
# Container images (override for pinning)
PORTAINER_IMAGE=portainer/portainer-ce:latest
GRAFANA_IMAGE=grafana/grafana-oss:latest
PROMETHEUS_IMAGE=prom/prometheus:latest
NODE_EXPORTER_IMAGE=prom/node-exporter:latest
CADVISOR_IMAGE=gcr.io/cadvisor/cadvisor:latest
UPTIME_KUMA_IMAGE=louislam/uptime-kuma:1
# Grafana bootstrap admin credentials (change before deploy)
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=replace-with-strong-password
# Optional comma-separated plugin list for Grafana
# Example: grafana-piechart-panel,grafana-clock-panel
GRAFANA_PLUGINS=

53
setup/pi-monitoring/status.sh Executable file
View File

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# status.sh — show compose service status and probe the stack's local HTTP
# endpoints to report reachability.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ENV_FILE="$SCRIPT_DIR/stack.env"

usage() {
  cat <<USAGE
Usage: $(basename "$0") [--env-file=PATH]
Show compose service status and local endpoint checks.
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --env-file=*) ENV_FILE="${arg#*=}" ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

# hostname/awk are used below for local IP discovery; require them up front
# (they were previously assumed present).
require_cmd docker curl hostname awk
load_stack_env "$ENV_FILE"
compose_file_path="$(compose_file)"

log_info "Compose services"
docker compose \
  --project-name "$COMPOSE_PROJECT_NAME" \
  --env-file "$ENV_FILE" \
  -f "$compose_file_path" \
  ps

# check_http NAME URL — report whether URL answers with an HTTP status.
# -k tolerates Portainer's self-signed certificate; any 2xx/3xx/4xx counts
# as reachable because an auth/redirect page still proves the service is up.
check_http() {
  local name="$1"
  local url="$2"
  local code
  code="$(curl -k -sS -o /dev/null -w '%{http_code}' "$url" || true)"
  if [[ "$code" =~ ^[234] ]]; then
    log_success "$name reachable ($code): $url"
  else
    log_warn "$name not reachable ($code): $url"
  fi
}

# First address reported by the kernel; guard against an empty result so the
# checks do not silently probe malformed URLs.
pi_ip="$(hostname -I | awk '{print $1}')"
if [[ -z "$pi_ip" ]]; then
  log_error "Could not determine local IP address"
  exit 1
fi
check_http "Portainer" "https://${pi_ip}:${PORTAINER_HTTPS_PORT}"
check_http "Grafana" "http://${pi_ip}:${GRAFANA_PORT}/login"
check_http "Prometheus" "http://${pi_ip}:${PROMETHEUS_PORT}/-/ready"
check_http "Uptime Kuma" "http://${pi_ip}:${UPTIME_KUMA_PORT}"

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# teardown.sh — stop and remove the monitoring stack containers, optionally
# deleting the persisted data directories under OPS_ROOT.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ENV_FILE="$SCRIPT_DIR/stack.env"
AUTO_YES=false
REMOVE_DATA=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Stop and remove monitoring stack containers.
Options:
  --env-file=PATH  Env file path (default: setup/pi-monitoring/stack.env)
  --remove-data    Also remove data directories under OPS_ROOT (destructive)
  --yes, -y        Skip confirmation prompt
  --help, -h       Show help
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --env-file=*) ENV_FILE="${arg#*=}" ;;
    --remove-data) REMOVE_DATA=true ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

# sudo is needed for the --remove-data path.
require_cmd docker sudo
load_stack_env "$ENV_FILE"
compose_file_path="$(compose_file)"

if ! confirm_action "Tear down monitoring stack containers?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

log_info "Stopping/removing compose services..."
docker compose \
  --project-name "$COMPOSE_PROJECT_NAME" \
  --env-file "$ENV_FILE" \
  -f "$compose_file_path" \
  down

if [[ "$REMOVE_DATA" == "true" ]]; then
  if ! confirm_action "Remove data under ${OPS_ROOT}/{portainer,grafana,prometheus,uptime-kuma}?" "$AUTO_YES"; then
    log_info "Data removal skipped"
  else
    # ${OPS_ROOT:?} aborts if OPS_ROOT is ever empty/unset, so this can never
    # expand to rm -rf of top-level paths like /portainer.
    sudo rm -rf \
      "${OPS_ROOT:?}/portainer" \
      "${OPS_ROOT:?}/grafana" \
      "${OPS_ROOT:?}/prometheus" \
      "${OPS_ROOT:?}/uptime-kuma"
    log_success "Data directories removed"
  fi
fi
log_success "Teardown complete"

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
# upgrade_stack.sh — pull newer images and recreate the monitoring stack
# containers from the current compose configuration.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=./lib.sh
source "$SCRIPT_DIR/lib.sh"
ENV_FILE="$SCRIPT_DIR/stack.env"
AUTO_YES=false
PRUNE_IMAGES=false

usage() {
  cat <<USAGE
Usage: $(basename "$0") [options]
Upgrade stack images and recreate containers with current compose config.
Options:
  --env-file=PATH  Env file path (default: setup/pi-monitoring/stack.env)
  --prune          Prune dangling Docker images after upgrade
  --yes, -y        Skip confirmation prompt
  --help, -h       Show help
USAGE
}

for arg in "$@"; do
  case "$arg" in
    --env-file=*) ENV_FILE="${arg#*=}" ;;
    --prune) PRUNE_IMAGES=true ;;
    --yes|-y) AUTO_YES=true ;;
    --help|-h) usage; exit 0 ;;
    *) log_error "Unknown argument: $arg"; usage; exit 1 ;;
  esac
done

require_cmd docker
load_stack_env "$ENV_FILE"
compose_file_path="$(compose_file)"

if ! confirm_action "Upgrade monitoring stack now?" "$AUTO_YES"; then
  log_info "Cancelled"
  exit 0
fi

# Shared flags for every docker compose invocation below.
compose=(docker compose
  --project-name "$COMPOSE_PROJECT_NAME"
  --env-file "$ENV_FILE"
  -f "$compose_file_path")

log_info "Pulling latest images..."
"${compose[@]}" pull

log_info "Recreating services..."
"${compose[@]}" up -d --remove-orphans

if [[ "$PRUNE_IMAGES" == "true" ]]; then
  log_info "Pruning dangling images..."
  docker image prune -f
fi
log_success "Upgrade complete"