fix(ralph-loop): replace with three-tier session reset detection from master

Supersedes the simple polling version written this session. The master harness version (from the other machine) resolves the wake-up time in tiers: Tier 1 — Anthropic API probe via ANTHROPIC_API_KEY, if available; Tier 2 — parse the reset time from agent output ("resets 11am (America/New_York)"); Tier 3 — seeded --session-ends timestamp argument; Tier 4 — fixed fallback sleep (--retry-wait, default 1800s), used when none of the three detection tiers yields a time. Agent: human; Tests: N/A; Tests-Added: 0; TypeScript: N/A
2026-04-09 21:56:03 -04:00 · 2026-04-09 21:56:03 -04:00 · 66be5d83ff
parent 82e10ff810
commit 66be5d83ff
1 changed files with 409 additions and 108 deletions
--- a/ralph-loop.sh
+++ b/ralph-loop.sh
@ -1,54 +1,62 @@
 #!/usr/bin/env bash
 #
-# Ralph Wiggum Loop — Autonomous agent iteration
+# Ralph Wiggum Loop — Script-Orchestrated Autonomous Agent Iteration
 #
-# Based on Geoffrey Huntley's approach:
+# This runtime is for the "script is the orchestrator" model:
-# - Each iteration spawns a FRESH agent with clean context
+# - The shell loop spawns a fresh agent every iteration
-# - Agent reads the plan, picks ONE task, implements, tests, commits, exits
+# - The shell loop interprets runtime signals and failures
-# - Loop restarts until all tasks are done
+# - The shell loop decides when to retry, stop, or wait for token reset
 #
-# Session limit handling:
+# This is different from the "agent is the orchestrator" model used in
-# - Detects Claude Pro usage limit messages in agent output
+# OpenClaw/manual orchestration, where a supervising agent evaluates results,
-# - Polls every SESSION_POLL_INTERVAL seconds until the session resets
+# watches execution boards, and decides what to do next.
 # - Resumes the same iteration automatically — no manual intervention needed
 #
 # Usage:
 #   ./ralph-loop.sh                               # Build mode (default)
-#   ./ralph-loop.sh plan         # Planning mode (create IMPLEMENTATION_PLAN.md)
+#   ./ralph-loop.sh plan                          # Planning mode
-#   ./ralph-loop.sh --max 20     # Limit to 20 iterations
+#   ./ralph-loop.sh --max 20                      # Limit iterations
 #   ./ralph-loop.sh --agent claude                # Use claude (default)
-#   ./ralph-loop.sh --agent codex   # Use OpenAI Codex CLI
+#   ./ralph-loop.sh --session-ends 2026-04-09T16:00:00
-#   ./ralph-loop.sh --agent aider   # Use Aider
+#   ./ralph-loop.sh --retry-wait 1800
-#   ./ralph-loop.sh --agent gemini  # Use Gemini CLI
+#   ./ralph-loop.sh --board .harness/foo/execution-board.md
-#   ./ralph-loop.sh --agent custom  # Use custom agent (see below)
+#   ./ralph-loop.sh --no-require-pro
 #
 # Token / rate-limit handling:
 #   Tier 1 — Anthropic API probe if ANTHROPIC_API_KEY is available
 #   Tier 2 — Parse "resets 11am (America/New_York)" from agent output
 #   Tier 3 — Use seeded --session-ends time
 #   Tier 4 — Fixed fallback sleep
 #
 set -euo pipefail
-MODE="${1:-build}"
+MODE="build"
 MAX_ITERATIONS=50
 AGENT="claude"
 PLAN_FILE="IMPLEMENTATION_PLAN.md"
 SPEC_FILE="PROJECT-SPEC.md"
 AGENT_FILE="AGENT.md"
 BOARD_FILE=""
 LOG_DIR=".ralph-logs"
 RATE_LIMIT_WAIT=1800
 SESSION_ENDS=""
 REQUIRE_PRO=1
 # How often (in seconds) to probe whether the session has reset.
 # Default: 10 minutes. Adjust down if you want faster recovery.
 SESSION_POLL_INTERVAL="${SESSION_POLL_INTERVAL:-600}"
 # Parse arguments.
 #
 # The positional words "plan" / "build" select the mode; every other token
 # must be one of the flags below. Nothing is discarded before parsing: MODE
 # already defaults to "build", so an unconditional leading `shift` would
 # silently drop the first argument (`plan` would be ignored and `--max 20`
 # would be reported as the unknown option `20`).
 #
 # Globals written: MODE, MAX_ITERATIONS, AGENT, RATE_LIMIT_WAIT, SESSION_ENDS,
 #                  BOARD_FILE, REQUIRE_PRO
 # Exits with status 1 on an unknown option or a flag that lacks its value.
 parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      plan|build)
        MODE="$1"
        shift
        ;;
      --max|--agent|--retry-wait|--session-ends|--board)
        # Value-taking flags: report a missing value explicitly instead of
        # letting `set -u` abort with an "unbound variable" error on $2.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value"
          exit 1
        fi
        case "$1" in
          --max)           MAX_ITERATIONS="$2" ;;
          --agent)         AGENT="$2" ;;
          --retry-wait)    RATE_LIMIT_WAIT="$2" ;;
          --session-ends)  SESSION_ENDS="$2" ;;
          --board)         BOARD_FILE="$2" ;;
        esac
        shift 2
        ;;
      --no-require-pro)
        REQUIRE_PRO=0
        shift
        ;;
      *)
        echo "Unknown option: $1"
        exit 1
        ;;
    esac
  done
 }
 parse_args "$@"
 mkdir -p "$LOG_DIR"
 # Colors
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 RED='\033[0;31m'
@ -62,16 +70,282 @@ warn()    { echo -e "${YELLOW}[ralph]${NC} $1"; }
 # Print message $1 prefixed with "[ralph]" wrapped in the RED / NC color
 # variables; `echo -e` expands the backslash escapes those variables hold.
 error()   { echo -e "${RED}[ralph]${NC} $1"; }
 # Same as error(), but the prefix is wrapped in the CYAN color variable.
 info()    { echo -e "${CYAN}[ralph]${NC} $1"; }
-# Check prerequisites
+AGENT_EXIT_CODE=0
 # Print the output of `claude auth status`, run inside an interactive bash
 # (`bash -ic`) with ANTHROPIC_API_KEY removed from the environment.
 # stderr is discarded; `tail -n +1` passes every output line through unchanged.
 # NOTE(review): callers hand this output to json.loads, so it is presumably a
 # JSON document — confirm that interactive-shell startup output cannot
 # precede it on stdout.
 get_claude_analysis_auth_json() {
  env -u ANTHROPIC_API_KEY bash -ic 'claude auth status' 2>/dev/null | tail -n +1
 }
 # Check that the Claude CLI reports a logged-in "pro" subscription.
 # Outputs: "ok" on stdout when the check passes; otherwise the parsed auth
 #          data re-serialized as JSON. When no auth output could be obtained,
 #          a message is emitted through error() instead.
 # Returns: 0 when loggedIn is truthy and subscriptionType == "pro";
 #          non-zero otherwise.
 verify_claude_pro_auth() {
  local auth_json
  auth_json=$(get_claude_analysis_auth_json)
  if [[ -z "$auth_json" ]]; then
    error "Could not determine Claude analysis auth status."
    return 1
  fi
  # The JSON travels through the AUTH_JSON environment variable because stdin
  # is taken by the here-document that carries the Python program itself.
  AUTH_JSON="$auth_json" python3 - <<'PY'
 import json
 import os
 import sys
 data = json.loads(os.environ["AUTH_JSON"])
 if data.get("loggedIn") and data.get("subscriptionType") == "pro":
    print("ok")
    sys.exit(0)
 print(json.dumps(data, ensure_ascii=True))
 sys.exit(1)
 PY
 }
 # Log diagnostic details about the selected agent's runtime.
 # Only AGENT=claude produces output: binary path, version, whether
 # ANTHROPIC_API_KEY is set, and (when available) a summary of the auth status.
 # Globals: AGENT (read), ANTHROPIC_API_KEY (read)
 # This helper is purely informational, so every probe is best-effort: a
 # failing probe yields an empty value rather than aborting the script.
 log_agent_runtime() {
  case "$AGENT" in
    claude)
      local claude_path claude_version auth_json
      claude_path=$(bash -ic 'command -v claude' 2>/dev/null | tail -n 1 || true)
      claude_version=$(bash -ic 'claude --version' 2>/dev/null | tail -n 1 || true)
      # Guarded like the two probes above: under `set -e` + pipefail an
      # unguarded failing `claude auth status` would terminate the whole run
      # from inside a logging function.
      auth_json=$(get_claude_analysis_auth_json) || true
      log "Claude binary: ${claude_path:-not found}"
      log "Claude version: ${claude_version:-unknown}"
      if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then
        log "Claude auth hint: ANTHROPIC_API_KEY is set (API probe enabled)"
      else
        log "Claude auth hint: ANTHROPIC_API_KEY is not set"
      fi
      # Summarize the auth fields only when some auth output was captured.
      if [[ -n "$auth_json" ]]; then
        log "Claude analysis auth: $(AUTH_JSON="$auth_json" python3 - <<'PY'
 import json
 import os
 data = json.loads(os.environ["AUTH_JSON"])
 print(f"authMethod={data.get('authMethod')} subscriptionType={data.get('subscriptionType')} apiKeySource={data.get('apiKeySource')}")
 PY
 )"
      fi
      ;;
  esac
 }
 # Prerequisites: the project spec is mandatory, the agent file is optional.
 [[ -f "$SPEC_FILE" ]] || {
  error "Missing $SPEC_FILE — create your project spec first."
  exit 1
 }
 # A missing agent file only produces a warning; the run continues.
 [[ -f "$AGENT_FILE" ]] || warn "No $AGENT_FILE found. Using default agent instructions."
 # Tier 1 probe: read the rate-limit headers of an Anthropic Messages API call.
 # Globals:  ANTHROPIC_API_KEY (read)
 # Outputs:  "<reset_epoch>|<remaining>" on stdout, where <remaining> is the
 #           anthropic-ratelimit-output-tokens-remaining header value, or
 #           "unknown" when that header is absent
 # Returns:  0 when a numeric reset epoch was obtained; 1 when no API key is
 #           set, the request fails, the reset header is missing, or its value
 #           cannot be converted to epoch seconds
 # Note: this sends a real request (max_tokens 1) in order to obtain headers.
 probe_rate_limit() {
  if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
    return 1
  fi
  local headers
  # -D - with -o /dev/null keeps only the response headers; the body is dropped.
  headers=$(curl -s -D - -o /dev/null \
    --max-time 10 \
    -X POST "https://api.anthropic.com/v1/messages" \
    -H "x-api-key: $ANTHROPIC_API_KEY" \
    -H "anthropic-version: 2023-06-01" \
    -H "content-type: application/json" \
    -d '{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{"role":"user","content":"hi"}]}' \
    2>/dev/null) || return 1
  local reset_str remaining
  # Header lines end in CR LF; strip both so the values are clean tokens.
  reset_str=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-reset:" | awk '{print $2}' | tr -d '\r\n')
  remaining=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-remaining:" | awk '{print $2}' | tr -d '\r\n')
  if [[ -z "$reset_str" ]]; then
    return 1
  fi
  local reset_epoch
  # Try `date -d` first; fall back to Python where that option is unsupported.
  reset_epoch=$(date -d "$reset_str" +%s 2>/dev/null) \
    || reset_epoch=$(python3 -c "
 from datetime import datetime, timezone
 import sys
 s = sys.argv[1].strip()
 for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S+00:00', '%Y-%m-%dT%H:%M:%S%z'):
    try:
        dt = datetime.strptime(s, fmt)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        print(int(dt.timestamp()))
        break
    except Exception:
        pass
 " "$reset_str" 2>/dev/null) || return 1
  # The Python fallback exits 0 even when no format matched (it simply prints
  # nothing). Treat a non-numeric result as a failed probe so the caller moves
  # on to the next tier instead of seeing an empty epoch as "already reset".
  if [[ ! "$reset_epoch" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  echo "${reset_epoch}|${remaining:-unknown}"
 }
 # Convert a timestamp string to epoch seconds.
 # Arguments: $1 - timestamp text
 # Outputs:   epoch seconds on stdout, or nothing when the text is not understood
 # Returns:   always 0
 # NOTE(review): the Python fallback pins zone-less timestamps to UTC, whereas
 # `date -d` presumably reads them in the local zone — confirm which is wanted.
 parse_epoch() {
  local raw_ts="$1"
  # Preferred path: let `date -d` do the conversion.
  if date -d "$raw_ts" +%s 2>/dev/null; then
    return 0
  fi
  # Fallback: try a fixed list of formats in Python; failures stay silent.
  python3 -c "
 from datetime import datetime, timezone
 import sys
 s = sys.argv[1]
 for fmt in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S+00:00'):
    try:
        dt = datetime.strptime(s, fmt)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        print(int(dt.timestamp()))
        break
    except Exception:
        pass
 " "$raw_ts" 2>/dev/null || true
 }
 # Render epoch seconds as a "YYYY-MM-DDTHH:MM:SS" timestamp.
 # Arguments: $1 - epoch seconds
 # Outputs:   the formatted timestamp, or an empty line when neither `date`
 #            invocation succeeds
 format_session_end() {
  local epoch_secs="$1"
  if date -d "@$epoch_secs" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null; then
    # `date -d @N` handled it; nothing more to print.
    :
  elif date -r "$epoch_secs" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null; then
    # Second attempt using the `date -r N` form.
    :
  else
    echo ""
  fi
 }
 # Tier 2 helper: derive the next reset time from text in an agent log.
 # Arguments: $1 - path to the log file
 # Outputs:   epoch seconds of the next occurrence of the advertised time, or
 #            nothing when the file is missing, no phrase matches, or the
 #            zoneinfo module cannot be imported
 # Returns:   always 0 (Python's stderr is discarded and failures are masked)
 #
 # The embedded program takes the LAST match of
 #   resets <hour>[:<minute>] am|pm (<time zone name>)
 # converts the 12-hour clock to 24-hour (12am -> 0, 12pm -> 12), places that
 # wall-clock time on the current date in the named zone, and moves it one day
 # forward when that moment is not in the future.
 infer_reset_epoch_from_log() {
  local logfile="$1"
  python3 - "$logfile" <<'PY' 2>/dev/null || true
 from datetime import datetime, timedelta
 from pathlib import Path
 import re
 import sys
 try:
    from zoneinfo import ZoneInfo
 except Exception:
    ZoneInfo = None
 logfile = Path(sys.argv[1])
 if not logfile.exists():
    raise SystemExit(0)
 text = logfile.read_text(encoding="utf-8", errors="ignore")
 matches = list(re.finditer(r"resets\s+(\d{1,2})(?::(\d{2}))?\s*(am|pm)\s*\(([^)]+)\)", text, re.IGNORECASE))
 if not matches:
    raise SystemExit(0)
 match = matches[-1]
 hour = int(match.group(1))
 minute = int(match.group(2) or "0")
 ampm = match.group(3).lower()
 tz_name = match.group(4).strip()
 if hour == 12:
    hour = 0
 if ampm == "pm":
    hour += 12
 if ZoneInfo is None:
    raise SystemExit(0)
 tz = ZoneInfo(tz_name)
 now = datetime.now(tz)
 candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
 if candidate <= now:
    candidate += timedelta(days=1)
 print(int(candidate.timestamp()))
 PY
 }
 # Block until the clock reaches a target epoch, redrawing a one-line countdown
 # between five-second sleeps.
 # Arguments: $1 - target time in epoch seconds
 #            $2 - label shown in the countdown (default: "token reset")
 # Outputs:   the countdown line (redrawn in place via carriage return) and a
 #            final newline on stdout
 countdown_sleep() {
  local wake_at=$1
  local what="${2:-token reset}"
  local now left hrs mins secs
  now=$(date +%s)
  left=$(( wake_at - now ))
  while [[ $left -gt 0 ]]; do
    hrs=$(( left / 3600 ))
    mins=$(( (left % 3600) / 60 ))
    secs=$(( left % 60 ))
    printf "\r${YELLOW}[ralph]${NC} Waiting for %s... %02dh%02dm%02ds remaining   " "$what" "$hrs" "$mins" "$secs"
    sleep 5
    # Re-read the clock after each sleep rather than counting sleeps.
    now=$(date +%s)
    left=$(( wake_at - now ))
  done
  # Finish the in-place countdown line.
  echo ""
 }
 # Decide how long to wait after a rate-limit / token-exhaustion signal, then
 # block until that moment. Sources are tried in order and the first one that
 # yields a wake-up time wins:
 #   Tier 1 - probe_rate_limit (API response headers)
 #   Tier 2 - reset time parsed from the agent log
 #   Tier 3 - the SESSION_ENDS seed
 #   Tier 4 - fixed sleep of RATE_LIMIT_WAIT seconds
 # Arguments: $1 - agent log file to scan in Tier 2 (optional)
 # Globals:   SESSION_ENDS (read; overwritten when Tier 2 succeeds),
 #            RATE_LIMIT_WAIT (read)
 # Returns:   0; returns early without waiting when the Tier 1 probe succeeds
 #            but reports no reset time in the future.
 wait_for_tokens() {
  local logfile="${1:-}"
  warn "Rate limit / token exhaustion detected."
  echo ""
  local wake_epoch="" wake_source=""
  info "Tier 1 — probing Anthropic API for exact reset time..."
  local probe_result
  if probe_result=$(probe_rate_limit); then
    local probe_epoch probe_remaining
    # probe_result has the form "<epoch>|<remaining>"; split on the "|".
    probe_epoch="${probe_result%%|*}"
    probe_remaining="${probe_result##*|}"
    local now
    now=$(date +%s)
    if [[ -n "$probe_epoch" && "$probe_epoch" -gt "$now" ]]; then
      # Unlike Tiers 2 and 3, no 60-second buffer is added here.
      wake_epoch=$probe_epoch
      wake_source="API probe"
      info "Tokens remaining: ${probe_remaining}. Reset at: $(date -d "@$probe_epoch" 2>/dev/null || date -r "$probe_epoch" 2>/dev/null || echo "$probe_epoch")"
    else
      # Also reached when probe_epoch is empty, not only when it is in the past.
      # NOTE(review): the claude agent is launched with ANTHROPIC_API_KEY
      # removed from its environment, so the API-key window probed here may not
      # be the limit the agent actually hit — confirm this early return cannot
      # cause a tight retry loop.
      info "Probe succeeded but reset time is already past — tokens may have reset. Retrying immediately."
      return 0
    fi
  else
    warn "Tier 1 unavailable (no ANTHROPIC_API_KEY or probe failed)."
  fi
  if [[ -z "$wake_epoch" && -n "$logfile" ]]; then
    info "Tier 2 — parsing reset time from agent output..."
    local log_epoch
    log_epoch=$(infer_reset_epoch_from_log "$logfile") || true
    if [[ -n "$log_epoch" ]]; then
      # Wake 60 seconds after the advertised reset.
      wake_epoch=$(( log_epoch + 60 ))
      wake_source="agent output"
      # Remember the detected reset as the new seed so a later call can use it
      # through Tier 3.
      SESSION_ENDS=$(format_session_end "$log_epoch")
      info "Detected reset at: $(date -d "@$log_epoch" 2>/dev/null || date -r "$log_epoch" 2>/dev/null || echo "$log_epoch")"
      if [[ -n "$SESSION_ENDS" ]]; then
        info "Updated --session-ends seed to $SESSION_ENDS"
      fi
    else
      warn "Could not extract a reset time from $logfile."
    fi
  fi
  if [[ -z "$wake_epoch" && -n "$SESSION_ENDS" ]]; then
    info "Tier 3 — using --session-ends $SESSION_ENDS..."
    local seed_epoch
    seed_epoch=$(parse_epoch "$SESSION_ENDS") || true
    if [[ -n "$seed_epoch" ]]; then
      local now
      now=$(date +%s)
      # A seed that is not in the future is ignored, which lets Tier 4 take over.
      if [[ "$seed_epoch" -gt "$now" ]]; then
        wake_epoch=$(( seed_epoch + 60 ))
        wake_source="session seed (--session-ends)"
        info "Will wake at: $(date -d "@$wake_epoch" 2>/dev/null || date -r "$wake_epoch" 2>/dev/null || echo "$wake_epoch") (+60s buffer)"
      else
        warn "--session-ends is stale (already past). Ignoring it for this retry."
      fi
    else
      warn "Could not parse --session-ends value: '$SESSION_ENDS'"
    fi
  fi
  # Last resort: no source produced a wake-up time.
  if [[ -z "$wake_epoch" ]]; then
    warn "Tier 4 — no reset time available. Sleeping ${RATE_LIMIT_WAIT}s ($(( RATE_LIMIT_WAIT / 60 )) min)."
    warn "Tip: set ANTHROPIC_API_KEY or pass --session-ends for a smarter wake-up."
    wake_epoch=$(( $(date +%s) + RATE_LIMIT_WAIT ))
    wake_source="fixed wait"
  fi
  info "Strategy: $wake_source. Press Ctrl+C to cancel."
  countdown_sleep "$wake_epoch" "token reset"
  log "Wake-up time reached. Retrying..."
 }
 run_agent() {
  local iteration=$1
  local mode=$2
@ -86,12 +360,23 @@ run_agent() {
  log "Iteration $iteration ($mode mode) — starting fresh agent..."
-  # Disable pipefail around the agent call so a non-zero claude exit doesn't
+  if [[ "$AGENT" == "claude" && "$REQUIRE_PRO" == "1" ]]; then
-  # kill the script. We inspect the log content instead.
+    if ! verify_claude_pro_auth >/tmp/ralph-auth-check.out 2>/tmp/ralph-auth-check.err; then
      error "Claude analysis auth is not using Pro. Refusing to run."
      if [[ -s /tmp/ralph-auth-check.out ]]; then
        error "Auth details: $(tail -n 1 /tmp/ralph-auth-check.out)"
      fi
      if [[ -s /tmp/ralph-auth-check.err ]]; then
        error "Auth check stderr: $(tail -n 1 /tmp/ralph-auth-check.err)"
      fi
      exit 1
    fi
  fi
  set +e
  case "$AGENT" in
    claude)
-      echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile"
+      echo "$prompt" | env -u ANTHROPIC_API_KEY claude -p --dangerously-skip-permissions --output-format text 2>&1 | tee "$logfile"
      ;;
    codex)
      echo "$prompt" | codex 2>&1 | tee "$logfile"
@ -107,87 +392,72 @@ run_agent() {
        ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile"
      else
        error "Custom agent selected but ./custom-agent.sh not found or not executable"
        set -e
        exit 1
      fi
      ;;
    *)
-      error "Unknown agent: $AGENT"
+      error "Unknown agent: $AGENT. Supported: claude, codex, aider, gemini, custom"
-      error "Supported agents: claude, codex, aider, gemini, custom"
+      set -e
      exit 1
      ;;
  esac
  AGENT_EXIT_CODE=$?
  set -e
  return 0
 }
 # Check whether the claude CLI answers a trivial prompt.
 # Globals:  LOG_DIR (read) — the transcript is written to $LOG_DIR/probe.log
 # Returns:  0 when the CLI exits 0 and its output carries no limit message;
 #           1 otherwise.
 # errexit is switched off around the CLI call and switched on again afterwards.
 probe_session() {
  local transcript="$LOG_DIR/probe.log"
  local cli_status
  set +e
  echo "Reply with the single word OK and nothing else." \
    | claude -p --output-format text > "$transcript" 2>&1
  cli_status=$?
  set -e
  # A failing CLI call counts as "not available".
  [[ $cli_status -eq 0 ]] || return 1
  # A zero exit status can still come with a limit notice in the text.
  if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit' "$transcript" 2>/dev/null; then
    return 1
  fi
  return 0
 }
 # Classify an agent log by the signals it contains.
 # Arguments: $1 - path to the log file
 # Returns:   4 - a usage / rate-limit message is present (checked first, so it
 #                takes precedence over any promise marker)
 #            0 - <promise>DONE</promise> found
 #            2 - <promise>STUCK</promise> found (needs human intervention)
 #            3 - <promise>ERROR</promise> found (unrecoverable error)
 #            1 - none of the above: a normal iteration, continue
 check_output() {
  local log_path="$1"
  if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit\|Claude AI usage' "$log_path" 2>/dev/null; then
    return 4
  fi
  # Promise markers, in priority order; the first one found decides.
  if grep -q '<promise>DONE</promise>' "$log_path" 2>/dev/null; then
    return 0
  fi
  if grep -q '<promise>STUCK</promise>' "$log_path" 2>/dev/null; then
    return 2
  fi
  if grep -q '<promise>ERROR</promise>' "$log_path" 2>/dev/null; then
    return 3
  fi
  return 1
 }
 wait_for_session_reset() {
  local iteration=$1
  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  warn "Session usage limit hit during iteration $iteration."
  warn "Will probe every ${SESSION_POLL_INTERVAL}s until session resets."
  warn "No manual action needed — loop will resume automatically."
  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  local attempt=0
  while true; do
    ((attempt++))
    local next_check
    next_check=$(date -d "+${SESSION_POLL_INTERVAL} seconds" '+%H:%M:%S' 2>/dev/null \
      || date -v "+${SESSION_POLL_INTERVAL}S" '+%H:%M:%S' 2>/dev/null \
      || echo "soon")
    info "Probe attempt $attempt — next check at $next_check..."
    sleep "$SESSION_POLL_INTERVAL"
    if probe_session; then
      success "Session available! Resuming iteration $iteration..."
    return 0
  elif grep -q '<promise>STUCK</promise>' "$logfile" 2>/dev/null; then
    return 2
  elif grep -q '<promise>ERROR</promise>' "$logfile" 2>/dev/null; then
    return 3
  elif grep -Eqi "rate.limit|rate_limit|too many requests|exceeded.*quota|usage limit|out of tokens|overloaded|you'?ve hit your limit|resets [0-9]{1,2}(:[0-9]{2})?(am|pm)" "$logfile" 2>/dev/null; then
    return 4
  else
-      warn "Still rate-limited (attempt $attempt). Waiting another ${SESSION_POLL_INTERVAL}s..."
+    return 1
  fi
  done
 }
-# ─── Main ────────────────────────────────────────────────────────────────────
+plan_has_remaining_work() {
  # Succeeds (0) when $PLAN_FILE exists and contains at least one line that
  # starts with an unchecked Markdown checkbox ("- [ ]"); returns 1 otherwise.
  if [[ ! -f "$PLAN_FILE" ]]; then
    return 1
  fi
  # The pattern is anchored at line start, so indented checkboxes do not count.
  if grep -Eq '^- \[ \]' "$PLAN_FILE" 2>/dev/null; then
    return 0
  fi
  return 1
 }
 # Report whether the execution board still lists unfinished rows.
 # Globals:  BOARD_FILE (read)
 # Returns:  0 when the board file exists and has a table row whose cell reads
 #           "⬜ Pending" or "🔄 In Progress"; 1 when no board is configured,
 #           the file is missing, or no such row is found.
 board_has_remaining_work() {
  # No board configured, or the path is not a regular file: nothing to report.
  [[ -n "$BOARD_FILE" && -f "$BOARD_FILE" ]] || return 1
  grep -Eq '\| .*⬜ Pending .* \||\| .*🔄 In Progress .* \|' "$BOARD_FILE" 2>/dev/null || return 1
  return 0
 }
 # Report whether any tracking artifact still shows open work.
 # Returns: 0 when the board check or the plan check succeeds (the board is
 #          consulted first); 1 when both report nothing left.
 has_remaining_work() {
  if board_has_remaining_work || plan_has_remaining_work; then
    return 0
  fi
  return 1
 }
 if [[ "$MODE" == "plan" ]]; then
  log "Planning mode — creating implementation plan..."
@ -197,44 +467,75 @@ if [[ "$MODE" == "plan" ]]; then
 fi
 log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)"
 log "Runtime model: script-orchestrated"
 log "Agent: $AGENT"
 log "Spec: $SPEC_FILE"
 log "Plan: $PLAN_FILE"
-log "Poll interval:       ${SESSION_POLL_INTERVAL}s (session limit recovery)"
+if [[ -n "$BOARD_FILE" ]]; then
  log "Board: $BOARD_FILE"
 fi
 if [[ -n "$SESSION_ENDS" ]]; then
  log "Tier 3 (session seed): $SESSION_ENDS"
 fi
 if [[ "$AGENT" == "claude" ]]; then
  log_agent_runtime
  if [[ "$REQUIRE_PRO" == "1" ]]; then
    log "Pro guard: enabled"
  else
    warn "Pro guard: disabled (--no-require-pro)"
  fi
 fi
 echo ""
-i=1
+for i in $(seq 1 "$MAX_ITERATIONS"); do
 while [[ $i -le $MAX_ITERATIONS ]]; do
  # One pass: run a fresh agent, classify its log, then act on the result.
  run_agent "$i" build
  logfile="$LOG_DIR/iteration-${i}.log"
-  # Capture return value without triggering set -e
+  # check_output returns 1-4 for ordinary outcomes. Capture the status through
+  # "||" so that `set -e` does not terminate the script before the case
+  # statement runs, and reset it first so a value left over from the previous
+  # iteration cannot leak into this one.
+  status=0
-  check_output "$logfile" || status=$?
+  check_output "$logfile" || status=$?
  status=${status:-0}
  case $status in
    0)
-      success "ALL TASKS COMPLETE after $i iterations!"
+      if has_remaining_work; then
        warn "Agent reported DONE, but the tracking artifacts still show work remaining."
        warn "Ignoring false DONE and restarting with fresh context."
        echo ""
        sleep 2
      else
        success "All tracked work appears complete after $i iterations."
        exit 0
      fi
      ;;
    2)
-      warn "Agent is stuck on iteration $i. Review $logfile and intervene."
+      warn "Agent is stuck. Review $logfile and intervene."
      exit 1
      ;;
    3)
-      error "Agent encountered an error on iteration $i. Review $logfile."
+      error "Agent encountered an error. Review $logfile."
      exit 1
      ;;
    4)
-      # Rate limited — wait for reset, then retry the SAME iteration
+      warn "Token/rate limit hit on iteration $i."
-      wait_for_session_reset "$i"
+      wait_for_tokens "$logfile"
-      # Do NOT increment i — retry the same task
+      echo ""
      ;;
    1)
      # No promise marker in the log: use the agent's exit code and the
      # tracking artifacts to decide between restarting and giving up.
      if [[ $AGENT_EXIT_CODE -ne 0 ]]; then
        warn "Agent exited with code $AGENT_EXIT_CODE but did not emit a recognized promise signal."
        if has_remaining_work; then
          warn "Tracked work remains. Restarting fresh."
          echo ""
          sleep 2
        else
          error "No work remains in tracking artifacts, but agent did not finish cleanly."
          error "Review $logfile."
          exit 1
        fi
      else
        log "Iteration $i complete. Restarting with fresh context..."
        echo ""
        sleep 2
-      ((i++))
+      fi
      ;;
  esac
 done