#!/usr/bin/env bash
#
# monitor-pr-checks.sh - poll the status checks of a GitHub pull request until
# they finish, printing one summary line per poll.
#
# Polling policy:
#   * Poll every CHECK_FAST_INTERVAL_SECONDS at first.
#   * Once CHECK_MIN_FAST_WINDOW_SECONDS have elapsed AND the set of
#     (check name, state) pairs has been unchanged for at least
#     CHECK_STABLE_CYCLES_FOR_SLOW consecutive polls, back off to
#     CHECK_SLOW_INTERVAL_SECONDS.
#
# Exit status:
#   0    no check is active and every check is SUCCESS, SKIPPED or NEUTRAL
#   1    a check failed, or checks finished cancelled / in another
#        non-success state
#   2    usage error or invalid configuration value
#   127  a required tool (gh, jq) is not installed
set -euo pipefail

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
Usage:
  scripts/monitor-pr-checks.sh <pr-number>

Environment overrides:
  CHECK_FAST_INTERVAL_SECONDS      default: 60
  CHECK_SLOW_INTERVAL_SECONDS      default: 180
  CHECK_MIN_FAST_WINDOW_SECONDS    default: 900
  CHECK_STABLE_CYCLES_FOR_SLOW     default: 5
EOF
}

# Abort with status 2 unless the value is a plain non-negative decimal integer.
# Arguments: $1 - name of the setting (for the message), $2 - its value
# Why: these values are later used inside (( )), where anything else would be
# evaluated as an arithmetic expression (or read as octal with a leading 0).
require_uint() {
  local name=$1 value=$2
  if [[ ! "$value" =~ ^(0|[1-9][0-9]*)$ ]]; then
    printf 'error: %s must be a non-negative integer, got: %s\n' \
      "$name" "$value" >&2
    exit 2
  fi
}

# Print one timestamped progress line.
# Arguments: $1 - seconds elapsed since start, $2 - message text
log_status() {
  local elapsed_total=$1 message=$2
  printf '[%s] elapsed %sm%ss | %s\n' \
    "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" \
    "$((elapsed_total / 60))" "$((elapsed_total % 60))" "$message"
}

# jq predicates that classify a check object by its .state, defined once so
# the counting and the listing below cannot drift apart.
readonly JQ_STATE_DEFS='
  def is_success:   .state == "SUCCESS";
  def is_failed:    .state == "FAILURE" or .state == "ERROR"
                 or .state == "STARTUP_FAILURE" or .state == "TIMED_OUT";
  def is_cancelled: .state == "CANCELLED";
  def is_skipped:   .state == "SKIPPED" or .state == "NEUTRAL";
  def is_active:    .state == "PENDING" or .state == "QUEUED"
                 or .state == "IN_PROGRESS" or .state == "WAITING"
                 or .state == "REQUESTED";
'

# Print how many checks in a JSON array satisfy one of the predicates above.
# Arguments: $1 - JSON array of checks, $2 - predicate name (e.g. is_failed)
count_checks() {
  local json=$1 predicate=$2
  jq "${JQ_STATE_DEFS} [.[] | select(${predicate})] | length" <<<"$json"
}

if [[ "${1:-}" == "-h" ]] || [[ "${1:-}" == "--help" ]]; then
  usage
  exit 0
fi

PR_NUMBER="${1:-}"
if [[ -z "$PR_NUMBER" ]]; then
  usage >&2
  exit 2
fi

# Fail fast when a tool is missing. Without this, a missing gh would be
# reported as a failed query on every cycle and the loop would never end.
for tool in gh jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'error: required command not found: %s\n' "$tool" >&2
    exit 127
  fi
done

# The two interval values are handed to sleep(1) unchanged.
readonly FAST_INTERVAL_SECONDS="${CHECK_FAST_INTERVAL_SECONDS:-60}"
readonly SLOW_INTERVAL_SECONDS="${CHECK_SLOW_INTERVAL_SECONDS:-180}"
readonly MIN_FAST_WINDOW_SECONDS="${CHECK_MIN_FAST_WINDOW_SECONDS:-900}"
readonly STABLE_CYCLES_FOR_SLOW="${CHECK_STABLE_CYCLES_FOR_SLOW:-5}"

require_uint CHECK_MIN_FAST_WINDOW_SECONDS "$MIN_FAST_WINDOW_SECONDS"
require_uint CHECK_STABLE_CYCLES_FOR_SLOW "$STABLE_CYCLES_FOR_SLOW"

start_ts="$(date +%s)"
stable_cycles=0      # consecutive polls whose fingerprint matched the previous
last_fingerprint=""  # "name=state;..." summary of the previous poll

# Scratch file that captures gh's stderr; removed on any exit path.
err_file="$(mktemp)"
trap 'rm -f -- "$err_file"' EXIT

echo "Monitoring PR #${PR_NUMBER} checks"
echo "Policy: fast=${FAST_INTERVAL_SECONDS}s, slow=${SLOW_INTERVAL_SECONDS}s, min-fast-window=${MIN_FAST_WINDOW_SECONDS}s, stable-cycles-for-slow=${STABLE_CYCLES_FOR_SLOW}"

while true; do
  now_ts="$(date +%s)"
  elapsed="$((now_ts - start_ts))"

  # A failed query is reported and retried at the fast interval.
  if ! checks_json="$(gh pr checks "$PR_NUMBER" --json name,state,link 2>"$err_file")"; then
    # Flatten gh's stderr to one trimmed line. The POSIX interval \{1,\} is
    # used instead of the GNU-only \+ so this also works with BSD/macOS sed.
    err_msg="$(tr '\n' ' ' <"$err_file" \
      | sed 's/[[:space:]]\{1,\}/ /g; s/^ //; s/ $//')"
    log_status "$elapsed" "check query failed: ${err_msg:-unknown error}"
    sleep "$FAST_INTERVAL_SECONDS"
    continue
  fi

  # Ask jq for the length rather than string-matching "[]", so any empty
  # result is treated as "no checks yet" and never as "all passed".
  total_count="$(jq 'length' <<<"$checks_json")"
  if [[ "${total_count:-0}" -eq 0 ]]; then
    log_status "$elapsed" "no checks yet"
    sleep "$FAST_INTERVAL_SECONDS"
    continue
  fi

  success_count="$(count_checks "$checks_json" is_success)"
  failure_count="$(count_checks "$checks_json" is_failed)"
  cancelled_count="$(count_checks "$checks_json" is_cancelled)"
  skipped_count="$(count_checks "$checks_json" is_skipped)"
  active_count="$(count_checks "$checks_json" is_active)"

  # Order-independent summary of every check's state, used to detect whether
  # anything changed since the previous poll.
  fingerprint="$(jq -r 'sort_by(.name) | map("\(.name)=\(.state)") | join(";")' <<<"$checks_json")"
  if [[ "$fingerprint" == "$last_fingerprint" ]]; then
    stable_cycles="$((stable_cycles + 1))"
  else
    stable_cycles=0
    last_fingerprint="$fingerprint"
  fi

  log_status "$elapsed" "total=${total_count} success=${success_count} skipped=${skipped_count} active=${active_count} failed=${failure_count} cancelled=${cancelled_count}"

  # Any failure ends the run immediately, even while other checks are active.
  if [[ "$failure_count" -gt 0 ]]; then
    echo "Failing checks:"
    jq -r "${JQ_STATE_DEFS}"'.[] | select(is_failed) | " - \(.name): \(.state) \(.link)"' <<<"$checks_json"
    exit 1
  fi

  # Nothing is running or queued any more: decide the final outcome.
  if [[ "$active_count" -eq 0 ]]; then
    if [[ "$cancelled_count" -gt 0 ]]; then
      echo "Checks ended with cancellations."
      jq -r "${JQ_STATE_DEFS}"'.[] | select(is_cancelled) | " - \(.name): \(.link)"' <<<"$checks_json"
      exit 1
    fi

    if [[ "$((success_count + skipped_count))" -eq "$total_count" ]]; then
      echo "All checks passed."
      exit 0
    fi

    # Some check is in a state none of the predicates recognise.
    echo "Checks finished with non-success states."
    jq -r '.[] | " - \(.name): \(.state) \(.link)"' <<<"$checks_json"
    exit 1
  fi

  # Back off only after the minimum fast window has passed and the check
  # states have stopped changing; otherwise keep polling quickly.
  if (( elapsed >= MIN_FAST_WINDOW_SECONDS && stable_cycles >= STABLE_CYCLES_FOR_SLOW )); then
    sleep "$SLOW_INTERVAL_SECONDS"
  else
    sleep "$FAST_INTERVAL_SECONDS"
  fi
done