From 66be5d83ff51a5e5bfb66b55210e7779384a5b67 Mon Sep 17 00:00:00 2001 From: paulh Date: Thu, 9 Apr 2026 21:56:03 -0400 Subject: [PATCH] fix(ralph-loop): replace with three-tier session reset detection from master MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supersedes the simple polling version written this session. The master harness version (from the other machine) has: Tier 1 — Anthropic API probe via ANTHROPIC_API_KEY if available Tier 2 — Parse reset time from agent output ("resets 11am (America/New_York)") Tier 3 — Seeded --session-ends timestamp argument Tier 4 — Fixed fallback sleep (--retry-wait, default 1800s) Agent: human Tests: N/A Tests-Added: 0 TypeScript: N/A --- ralph-loop.sh | 517 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 409 insertions(+), 108 deletions(-) diff --git a/ralph-loop.sh b/ralph-loop.sh index 7eb9931..deb6289 100755 --- a/ralph-loop.sh +++ b/ralph-loop.sh @@ -1,54 +1,62 @@ #!/usr/bin/env bash # -# Ralph Wiggum Loop — Autonomous agent iteration +# Ralph Wiggum Loop — Script-Orchestrated Autonomous Agent Iteration # -# Based on Geoffrey Huntley's approach: -# - Each iteration spawns a FRESH agent with clean context -# - Agent reads the plan, picks ONE task, implements, tests, commits, exits -# - Loop restarts until all tasks are done +# This runtime is for the "script is the orchestrator" model: +# - The shell loop spawns a fresh agent every iteration +# - The shell loop interprets runtime signals and failures +# - The shell loop decides when to retry, stop, or wait for token reset # -# Session limit handling: -# - Detects Claude Pro usage limit messages in agent output -# - Polls every SESSION_POLL_INTERVAL seconds until the session resets -# - Resumes the same iteration automatically — no manual intervention needed +# This is different from the "agent is the orchestrator" model used in +# OpenClaw/manual orchestration, where a supervising agent evaluates results, +# 
watches execution boards, and decides what to do next. # # Usage: -# ./ralph-loop.sh # Build mode (default) -# ./ralph-loop.sh plan # Planning mode (create IMPLEMENTATION_PLAN.md) -# ./ralph-loop.sh --max 20 # Limit to 20 iterations -# ./ralph-loop.sh --agent claude # Use claude (default) -# ./ralph-loop.sh --agent codex # Use OpenAI Codex CLI -# ./ralph-loop.sh --agent aider # Use Aider -# ./ralph-loop.sh --agent gemini # Use Gemini CLI -# ./ralph-loop.sh --agent custom # Use custom agent (see below) +# ./ralph-loop.sh # Build mode (default) +# ./ralph-loop.sh plan # Planning mode +# ./ralph-loop.sh --max 20 # Limit iterations +# ./ralph-loop.sh --agent claude # Use claude (default) +# ./ralph-loop.sh --session-ends 2026-04-09T16:00:00 +# ./ralph-loop.sh --retry-wait 1800 +# ./ralph-loop.sh --board .harness/foo/execution-board.md +# ./ralph-loop.sh --no-require-pro +# +# Token / rate-limit handling: +# Tier 1 — Anthropic API probe if ANTHROPIC_API_KEY is available +# Tier 2 — Parse "resets 11am (America/New_York)" from agent output +# Tier 3 — Use seeded --session-ends time +# Tier 4 — Fixed fallback sleep # set -euo pipefail -MODE="${1:-build}" +MODE="build" MAX_ITERATIONS=50 AGENT="claude" PLAN_FILE="IMPLEMENTATION_PLAN.md" SPEC_FILE="PROJECT-SPEC.md" AGENT_FILE="AGENT.md" +BOARD_FILE="" LOG_DIR=".ralph-logs" +RATE_LIMIT_WAIT=1800 +SESSION_ENDS="" +REQUIRE_PRO=1 -# How often (in seconds) to probe whether the session has reset. -# Default: 10 minutes. Adjust down if you want faster recovery. 
-SESSION_POLL_INTERVAL="${SESSION_POLL_INTERVAL:-600}" - -# Parse arguments -shift 2>/dev/null || true while [[ $# -gt 0 ]]; do case "$1" in - --max) MAX_ITERATIONS="$2"; shift 2 ;; - --agent) AGENT="$2"; shift 2 ;; + plan) MODE="plan"; shift ;; + build) MODE="build"; shift ;; + --max) MAX_ITERATIONS="$2"; shift 2 ;; + --agent) AGENT="$2"; shift 2 ;; + --retry-wait) RATE_LIMIT_WAIT="$2"; shift 2 ;; + --session-ends) SESSION_ENDS="$2"; shift 2 ;; + --board) BOARD_FILE="$2"; shift 2 ;; + --no-require-pro) REQUIRE_PRO=0; shift ;; *) echo "Unknown option: $1"; exit 1 ;; esac done mkdir -p "$LOG_DIR" -# Colors GREEN='\033[0;32m' YELLOW='\033[1;33m' RED='\033[0;31m' @@ -62,16 +70,282 @@ warn() { echo -e "${YELLOW}[ralph]${NC} $1"; } error() { echo -e "${RED}[ralph]${NC} $1"; } info() { echo -e "${CYAN}[ralph]${NC} $1"; } -# Check prerequisites +AGENT_EXIT_CODE=0 + +get_claude_analysis_auth_json() { + env -u ANTHROPIC_API_KEY bash -ic 'claude auth status' 2>/dev/null | tail -n +1 +} + +verify_claude_pro_auth() { + local auth_json + auth_json=$(get_claude_analysis_auth_json) + if [[ -z "$auth_json" ]]; then + error "Could not determine Claude analysis auth status." 
+ return 1 + fi + + AUTH_JSON="$auth_json" python3 - <<'PY' +import json +import os +import sys + +data = json.loads(os.environ["AUTH_JSON"]) +if data.get("loggedIn") and data.get("subscriptionType") == "pro": + print("ok") + sys.exit(0) + +print(json.dumps(data, ensure_ascii=True)) +sys.exit(1) +PY +} + +log_agent_runtime() { + case "$AGENT" in + claude) + local claude_path claude_version auth_json + claude_path=$(bash -ic 'command -v claude' 2>/dev/null | tail -n 1 || true) + claude_version=$(bash -ic 'claude --version' 2>/dev/null | tail -n 1 || true) + auth_json=$(get_claude_analysis_auth_json) + log "Claude binary: ${claude_path:-not found}" + log "Claude version: ${claude_version:-unknown}" + if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then + log "Claude auth hint: ANTHROPIC_API_KEY is set (API probe enabled)" + else + log "Claude auth hint: ANTHROPIC_API_KEY is not set" + fi + if [[ -n "$auth_json" ]]; then + log "Claude analysis auth: $(AUTH_JSON="$auth_json" python3 - <<'PY' +import json +import os + +data = json.loads(os.environ["AUTH_JSON"]) +print(f"authMethod={data.get('authMethod')} subscriptionType={data.get('subscriptionType')} apiKeySource={data.get('apiKeySource')}") +PY +)" + fi + ;; + esac +} + if [[ ! -f "$SPEC_FILE" ]]; then error "Missing $SPEC_FILE — create your project spec first." exit 1 fi - if [[ ! -f "$AGENT_FILE" ]]; then warn "No $AGENT_FILE found. Using default agent instructions." 
fi +probe_rate_limit() { + if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then + return 1 + fi + + local headers + headers=$(curl -s -D - -o /dev/null \ + --max-time 10 \ + -X POST "https://api.anthropic.com/v1/messages" \ + -H "x-api-key: $ANTHROPIC_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -H "content-type: application/json" \ + -d '{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{"role":"user","content":"hi"}]}' \ + 2>/dev/null) || return 1 + + local reset_str remaining + reset_str=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-reset:" | awk '{print $2}' | tr -d '\r\n') + remaining=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-remaining:" | awk '{print $2}' | tr -d '\r\n') + + if [[ -z "$reset_str" ]]; then + return 1 + fi + + local reset_epoch + reset_epoch=$(date -d "$reset_str" +%s 2>/dev/null) \ + || reset_epoch=$(python3 -c " +from datetime import datetime, timezone +import sys +s = sys.argv[1].strip() +for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S+00:00', '%Y-%m-%dT%H:%M:%S%z'): + try: + dt = datetime.strptime(s, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + print(int(dt.timestamp())) + break + except Exception: + pass +" "$reset_str" 2>/dev/null) || return 1 + + [[ "$reset_epoch" =~ ^[0-9]+$ ]] && echo "${reset_epoch}|${remaining:-unknown}" # status 1 when no parser yielded an epoch, so the caller falls through to the next tier +} + +parse_epoch() { + local ts="$1" + date -d "$ts" +%s 2>/dev/null \ + || python3 -c " +from datetime import datetime, timezone +import sys +s = sys.argv[1] +for fmt in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S', + '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S+00:00'): + try: + dt = datetime.strptime(s, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + print(int(dt.timestamp())) + break + except Exception: + pass +" "$ts" 2>/dev/null || true +} + +format_session_end() { + local epoch="$1" + date -d "@$epoch" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null \ + || date -r "$epoch" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null \ + || echo "" +} + 
+infer_reset_epoch_from_log() { + local logfile="$1" + + python3 - "$logfile" <<'PY' 2>/dev/null || true +from datetime import datetime, timedelta +from pathlib import Path +import re +import sys + +try: + from zoneinfo import ZoneInfo +except Exception: + ZoneInfo = None + +logfile = Path(sys.argv[1]) +if not logfile.exists(): + raise SystemExit(0) + +text = logfile.read_text(encoding="utf-8", errors="ignore") +matches = list(re.finditer(r"resets\s+(\d{1,2})(?::(\d{2}))?\s*(am|pm)\s*\(([^)]+)\)", text, re.IGNORECASE)) +if not matches: + raise SystemExit(0) + +match = matches[-1] +hour = int(match.group(1)) +minute = int(match.group(2) or "0") +ampm = match.group(3).lower() +tz_name = match.group(4).strip() + +if hour == 12: + hour = 0 +if ampm == "pm": + hour += 12 + +if ZoneInfo is None: + raise SystemExit(0) + +tz = ZoneInfo(tz_name) +now = datetime.now(tz) +candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0) +if candidate <= now: + candidate += timedelta(days=1) + +print(int(candidate.timestamp())) +PY +} + +countdown_sleep() { + local target_epoch=$1 + local label="${2:-token reset}" + local now + while true; do + now=$(date +%s) + local remaining=$(( target_epoch - now )) + if [[ $remaining -le 0 ]]; then + break + fi + local h=$(( remaining / 3600 )) + local m=$(( (remaining % 3600) / 60 )) + local s=$(( remaining % 60 )) + printf "\r${YELLOW}[ralph]${NC} Waiting for %s... %02dh%02dm%02ds remaining " "$label" "$h" "$m" "$s" + sleep 5 + done + echo "" +} + +wait_for_tokens() { + local logfile="${1:-}" + warn "Rate limit / token exhaustion detected." + echo "" + + local wake_epoch="" wake_source="" + + info "Tier 1 — probing Anthropic API for exact reset time..." 
+ local probe_result + if probe_result=$(probe_rate_limit); then + local probe_epoch probe_remaining + probe_epoch="${probe_result%%|*}" + probe_remaining="${probe_result##*|}" + local now + now=$(date +%s) + if [[ -n "$probe_epoch" && "$probe_epoch" -gt "$now" ]]; then + wake_epoch=$probe_epoch + wake_source="API probe" + info "Tokens remaining: ${probe_remaining}. Reset at: $(date -d "@$probe_epoch" 2>/dev/null || date -r "$probe_epoch" 2>/dev/null || echo "$probe_epoch")" + else + info "Probe succeeded but reset time is already past — tokens may have reset. Retrying immediately." + return 0 + fi + else + warn "Tier 1 unavailable (no ANTHROPIC_API_KEY or probe failed)." + fi + + if [[ -z "$wake_epoch" && -n "$logfile" ]]; then + info "Tier 2 — parsing reset time from agent output..." + local log_epoch + log_epoch=$(infer_reset_epoch_from_log "$logfile") || true + if [[ -n "$log_epoch" ]]; then + wake_epoch=$(( log_epoch + 60 )) + wake_source="agent output" + SESSION_ENDS=$(format_session_end "$log_epoch") + info "Detected reset at: $(date -d "@$log_epoch" 2>/dev/null || date -r "$log_epoch" 2>/dev/null || echo "$log_epoch")" + if [[ -n "$SESSION_ENDS" ]]; then + info "Updated --session-ends seed to $SESSION_ENDS" + fi + else + warn "Could not extract a reset time from $logfile." + fi + fi + + if [[ -z "$wake_epoch" && -n "$SESSION_ENDS" ]]; then + info "Tier 3 — using --session-ends $SESSION_ENDS..." + local seed_epoch + seed_epoch=$(parse_epoch "$SESSION_ENDS") || true + if [[ -n "$seed_epoch" ]]; then + local now + now=$(date +%s) + if [[ "$seed_epoch" -gt "$now" ]]; then + wake_epoch=$(( seed_epoch + 60 )) + wake_source="session seed (--session-ends)" + info "Will wake at: $(date -d "@$wake_epoch" 2>/dev/null || date -r "$wake_epoch" 2>/dev/null || echo "$wake_epoch") (+60s buffer)" + else + warn "--session-ends is stale (already past). Ignoring it for this retry." 
+ fi + else + warn "Could not parse --session-ends value: '$SESSION_ENDS'" + fi + fi + + if [[ -z "$wake_epoch" ]]; then + warn "Tier 4 — no reset time available. Sleeping ${RATE_LIMIT_WAIT}s ($(( RATE_LIMIT_WAIT / 60 )) min)." + warn "Tip: set ANTHROPIC_API_KEY or pass --session-ends for a smarter wake-up." + wake_epoch=$(( $(date +%s) + RATE_LIMIT_WAIT )) + wake_source="fixed wait" + fi + + info "Strategy: $wake_source. Press Ctrl+C to cancel." + countdown_sleep "$wake_epoch" "token reset" + log "Wake-up time reached. Retrying..." +} + run_agent() { local iteration=$1 local mode=$2 @@ -86,12 +360,23 @@ run_agent() { log "Iteration $iteration ($mode mode) — starting fresh agent..." - # Disable pipefail around the agent call so a non-zero claude exit doesn't - # kill the script. We inspect the log content instead. + if [[ "$AGENT" == "claude" && "$REQUIRE_PRO" == "1" ]]; then + if ! verify_claude_pro_auth >/tmp/ralph-auth-check.out 2>/tmp/ralph-auth-check.err; then + error "Claude analysis auth is not using Pro. Refusing to run." + if [[ -s /tmp/ralph-auth-check.out ]]; then + error "Auth details: $(tail -n 1 /tmp/ralph-auth-check.out)" + fi + if [[ -s /tmp/ralph-auth-check.err ]]; then + error "Auth check stderr: $(tail -n 1 /tmp/ralph-auth-check.err)" + fi + exit 1 + fi + fi + set +e case "$AGENT" in claude) - echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile" + echo "$prompt" | env -u ANTHROPIC_API_KEY claude -p --dangerously-skip-permissions --output-format text 2>&1 | tee "$logfile" ;; codex) echo "$prompt" | codex 2>&1 | tee "$logfile" @@ -107,87 +392,72 @@ run_agent() { ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile" else error "Custom agent selected but ./custom-agent.sh not found or not executable" + set -e exit 1 fi ;; *) - error "Unknown agent: $AGENT" - error "Supported agents: claude, codex, aider, gemini, custom" + error "Unknown agent: $AGENT. 
Supported: claude, codex, aider, gemini, custom" + set -e exit 1 ;; esac + AGENT_EXIT_CODE=$? set -e - - return 0 -} - -# Probe whether claude is available by sending a trivial request. -# Returns 0 if available, 1 if still rate-limited or erroring. -probe_session() { - local probe_log="$LOG_DIR/probe.log" - set +e - echo "Reply with the single word OK and nothing else." \ - | claude -p --output-format text > "$probe_log" 2>&1 - local rc=$? - set -e - - if [[ $rc -ne 0 ]]; then - return 1 - fi - # Also check the output doesn't contain a limit message - if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit' "$probe_log" 2>/dev/null; then - return 1 - fi return 0 } check_output() { local logfile="$1" - # Session / usage limit — must check BEFORE generic promise checks - if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit\|Claude AI usage' "$logfile" 2>/dev/null; then - return 4 # Rate limited - fi - if grep -q 'DONE' "$logfile" 2>/dev/null; then - return 0 # Done + return 0 elif grep -q 'STUCK' "$logfile" 2>/dev/null; then - return 2 # Stuck — needs human intervention + return 2 elif grep -q 'ERROR' "$logfile" 2>/dev/null; then - return 3 # Unrecoverable error + return 3 + elif grep -Eqi "rate.limit|rate_limit|too many requests|exceeded.*quota|usage limit|out of tokens|overloaded|you'?ve hit your limit|resets [0-9]{1,2}(:[0-9]{2})?(am|pm)" "$logfile" 2>/dev/null; then + return 4 else - return 1 # Normal iteration — continue + return 1 fi } -wait_for_session_reset() { - local iteration=$1 - warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - warn "Session usage limit hit during iteration $iteration." - warn "Will probe every ${SESSION_POLL_INTERVAL}s until session resets." - warn "No manual action needed — loop will resume automatically." - warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +plan_has_remaining_work() { + if [[ ! 
-f "$PLAN_FILE" ]]; then + return 1 + fi - local attempt=0 - while true; do - ((attempt++)) - local next_check - next_check=$(date -d "+${SESSION_POLL_INTERVAL} seconds" '+%H:%M:%S' 2>/dev/null \ - || date -v "+${SESSION_POLL_INTERVAL}S" '+%H:%M:%S' 2>/dev/null \ - || echo "soon") - info "Probe attempt $attempt — next check at $next_check..." - sleep "$SESSION_POLL_INTERVAL" + if grep -Eq '^- \[ \]' "$PLAN_FILE" 2>/dev/null; then + return 0 + fi - if probe_session; then - success "Session available! Resuming iteration $iteration..." - return 0 - else - warn "Still rate-limited (attempt $attempt). Waiting another ${SESSION_POLL_INTERVAL}s..." - fi - done + return 1 } -# ─── Main ──────────────────────────────────────────────────────────────────── +board_has_remaining_work() { + if [[ -z "$BOARD_FILE" || ! -f "$BOARD_FILE" ]]; then + return 1 + fi + + if grep -Eq '\| .*⬜ Pending .* \||\| .*🔄 In Progress .* \|' "$BOARD_FILE" 2>/dev/null; then + return 0 + fi + + return 1 +} + +has_remaining_work() { + if board_has_remaining_work; then + return 0 + fi + + if plan_has_remaining_work; then + return 0 + fi + + return 1 +} if [[ "$MODE" == "plan" ]]; then log "Planning mode — creating implementation plan..." 
@@ -197,44 +467,75 @@ if [[ "$MODE" == "plan" ]]; then fi log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)" -log "Agent: $AGENT" -log "Spec: $SPEC_FILE" -log "Plan: $PLAN_FILE" -log "Poll interval: ${SESSION_POLL_INTERVAL}s (session limit recovery)" +log "Runtime model: script-orchestrated" +log "Agent: $AGENT" +log "Spec: $SPEC_FILE" +log "Plan: $PLAN_FILE" +if [[ -n "$BOARD_FILE" ]]; then + log "Board: $BOARD_FILE" +fi +if [[ -n "$SESSION_ENDS" ]]; then + log "Tier 3 (session seed): $SESSION_ENDS" +fi +if [[ "$AGENT" == "claude" ]]; then + log_agent_runtime + if [[ "$REQUIRE_PRO" == "1" ]]; then + log "Pro guard: enabled" + else + warn "Pro guard: disabled (--no-require-pro)" + fi +fi echo "" -i=1 -while [[ $i -le $MAX_ITERATIONS ]]; do +for i in $(seq 1 "$MAX_ITERATIONS"); do run_agent "$i" build logfile="$LOG_DIR/iteration-${i}.log" - # Capture return value without triggering set -e - check_output "$logfile" || status=$? - status=${status:-0} + status=0 + check_output "$logfile" || status=$? # capture the code without tripping set -e (codes 1-4 are normal signals) case $status in 0) - success "ALL TASKS COMPLETE after $i iterations!" - exit 0 + if has_remaining_work; then + warn "Agent reported DONE, but the tracking artifacts still show work remaining." + warn "Ignoring false DONE and restarting with fresh context." + echo "" + sleep 2 + else + success "All tracked work appears complete after $i iterations." + exit 0 + fi ;; 2) - warn "Agent is stuck on iteration $i. Review $logfile and intervene." + warn "Agent is stuck. Review $logfile and intervene." exit 1 ;; 3) - error "Agent encountered an error on iteration $i. Review $logfile." + error "Agent encountered an error. Review $logfile." exit 1 ;; 4) - # Rate limited — wait for reset, then retry the SAME iteration - wait_for_session_reset "$i" - # Do NOT increment i — retry the same task + warn "Token/rate limit hit on iteration $i." + wait_for_tokens "$logfile" + echo "" ;; 1) - log "Iteration $i complete. Restarting with fresh context..." 
- echo "" - sleep 2 - ((i++)) + if [[ $AGENT_EXIT_CODE -ne 0 ]]; then + warn "Agent exited with code $AGENT_EXIT_CODE but did not emit a recognized promise signal." + if has_remaining_work; then + warn "Tracked work remains. Restarting fresh." + echo "" + sleep 2 + else + error "No work remains in tracking artifacts, but agent did not finish cleanly." + error "Review $logfile." + exit 1 + fi + else + log "Iteration $i complete. Restarting with fresh context..." + echo "" + sleep 2 + fi ;; esac done