fix(ralph-loop): add session limit detection and auto-recovery

When claude -p hits the Pro subscription usage limit, the old loop had no detection — it would find no <promise> signal, treat it as a normal continuation, and immediately retry, burning all --max iterations. New behaviour: - check_output() returns status 4 when the log contains any usage/rate limit message (case-insensitive, multiple pattern variants) - wait_for_session_reset() polls via a trivial probe call every SESSION_POLL_INTERVAL seconds (default: 600s / 10 min) until claude responds cleanly again - When rate-limited, the same iteration is retried (i is not incremented) so no task is skipped or double-counted - set -e is temporarily suspended around agent calls so a non-zero claude exit doesn't kill the bash process Also updated the master template in docs/agent-harness/ralph-loop.sh. Agent: human Tests: N/A Tests-Added: 0 TypeScript: N/A
2026-04-09 21:39:29 -04:00 · 2026-04-09 21:39:29 -04:00 · 82e10ff810
parent b1c199d21d
commit 82e10ff810
1 changed files with 92 additions and 47 deletions
--- a/ralph-loop.sh
+++ b/ralph-loop.sh
@ -7,7 +7,10 @@
 # - Agent reads the plan, picks ONE task, implements, tests, commits, exits
 # - Loop restarts until all tasks are done
 #
-# No context compaction. No stale reasoning. Just fresh starts.
+# Session limit handling:
 # - Detects Claude Pro usage limit messages in agent output
 # - Polls every SESSION_POLL_INTERVAL seconds until the session resets
 # - Resumes the same iteration automatically — no manual intervention needed
 #
 # Usage:
 #   ./ralph-loop.sh              # Build mode (default)
@ -19,22 +22,6 @@
 #   ./ralph-loop.sh --agent gemini  # Use Gemini CLI
 #   ./ralph-loop.sh --agent custom  # Use custom agent (see below)
 #
 # Extensibility:
 # To add support for other AI coding agents (aider, cursor, windsurf, etc.):
 # 1. Add a new case in the run_agent() function's agent selection block
 # 2. Format the prompt appropriately for that agent's CLI interface
 # 3. Ensure the agent outputs to the logfile for promise detection
 #
 # Example for Aider:
 #   aider)
 #     aider --message "$prompt" --yes 2>&1 | tee "$logfile"
 #     ;;
 #
 # Example for custom script:
 #   custom)
 #     ./my-agent-wrapper.sh "$prompt" 2>&1 | tee "$logfile"
 #     ;;
 #
 set -euo pipefail
 MODE="${1:-build}"
@ -45,6 +32,10 @@ SPEC_FILE="PROJECT-SPEC.md"
 AGENT_FILE="AGENT.md"
 LOG_DIR=".ralph-logs"
 # How often (in seconds) to probe whether the session has reset.
 # Default: 10 minutes. Adjust down if you want faster recovery.
 SESSION_POLL_INTERVAL="${SESSION_POLL_INTERVAL:-600}"
 # Parse arguments
 shift 2>/dev/null || true
 while [[ $# -gt 0 ]]; do
@ -62,12 +53,14 @@ GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 RED='\033[0;31m'
 BLUE='\033[0;34m'
 CYAN='\033[0;36m'
 NC='\033[0m'
 # Console helpers. Each prints its first argument prefixed with a coloured
 # "[ralph]" tag on stdout; backslash escapes in the colour codes and in the
 # message itself are expanded.
 log() {
  printf '%b\n' "${BLUE}[ralph]${NC} $1"
 }
 success() {
  printf '%b\n' "${GREEN}[ralph]${NC} $1"
 }
 warn() {
  printf '%b\n' "${YELLOW}[ralph]${NC} $1"
 }
 error() {
  printf '%b\n' "${RED}[ralph]${NC} $1"
 }
 info() {
  printf '%b\n' "${CYAN}[ralph]${NC} $1"
 }
 # Check prerequisites
 if [[ ! -f "$SPEC_FILE" ]]; then
@ -93,8 +86,9 @@ run_agent() {
  log "Iteration $iteration ($mode mode) — starting fresh agent..."
-  # Agent selection block
+  # Disable errexit (set -e) around the agent call so a non-zero claude exit doesn't
-  # Extend this case statement to support additional agents
+  # kill the script. We inspect the log content instead.
  set +e
  case "$AGENT" in
    claude)
      echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile"
@ -103,23 +97,12 @@ run_agent() {
      echo "$prompt" | codex 2>&1 | tee "$logfile"
      ;;
    aider)
      # Aider: AI pair programming in your terminal
      # https://aider.chat
      aider --message "$prompt" --yes 2>&1 | tee "$logfile"
      ;;
    gemini)
      # Google Gemini CLI (if available)
      # Adjust command based on actual Gemini CLI interface
      echo "$prompt" | gemini-cli 2>&1 | tee "$logfile"
      ;;
    custom)
      # Custom agent integration
      # Replace this with your own agent wrapper script
      # The script should:
      # 1. Accept prompt as first argument or via stdin
      # 2. Perform the requested work (read files, write code, run tests, commit)
      # 3. Output promise signals: <promise>PLANNED|DONE|STUCK|ERROR</promise>
      # 4. Exit with appropriate code
      if [[ -x "./custom-agent.sh" ]]; then
        ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile"
      else
@ -130,29 +113,82 @@ run_agent() {
    *)
      error "Unknown agent: $AGENT"
      error "Supported agents: claude, codex, aider, gemini, custom"
      error "To add support for other agents, edit the run_agent() function in this script"
      exit 1
      ;;
  esac
  set -e
  return 0
 }
 # Probe whether claude is available by sending a trivial request.
 # Returns 0 if available, 1 if still rate-limited or erroring.
 probe_session() {
  # Probe whether claude is usable again by sending one trivial prompt.
  # Globals:  LOG_DIR (read) — the probe transcript is written to
  #           $LOG_DIR/probe.log, overwriting any previous probe.
  # Returns:  0 if claude exited cleanly and printed no limit message,
  #           1 if the call failed or the output still looks rate-limited.
  local probe_log="$LOG_DIR/probe.log"

  # Test the pipeline inside `if` rather than toggling `set +e` / `set -e`:
  # a failing probe cannot abort the script, and the caller's errexit setting
  # is left exactly as it was.
  if ! echo "Reply with the single word OK and nothing else." \
    | claude -p --output-format text > "$probe_log" 2>&1; then
    return 1
  fi

  # A zero exit is not sufficient: the limit notice can arrive as ordinary
  # output. The pattern list mirrors the one check_output() uses so the probe
  # and the main loop agree on what counts as "rate-limited".
  if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit\|Claude AI usage' "$probe_log" 2>/dev/null; then
    return 1
  fi
  return 0
 }
 # Classify one agent log file by the signals it contains.
 # Arguments: $1 - path to the iteration log to inspect.
 # Returns:   4 - a usage/rate-limit phrase was found (matched case-insensitively)
 #            0 - <promise>DONE</promise> found
 #            2 - <promise>STUCK</promise> found
 #            3 - <promise>ERROR</promise> found
 #            1 - none of the above (also the result when the log cannot be
 #                read, because every grep then fails with its error hidden)
 # The promise tags are matched case-sensitively.
 check_output() {
  local logfile="$1"
  # Session / usage limit — must check BEFORE generic promise checks
  # NOTE(review): this matches anywhere in the log and wins over any promise
  # tag, so a log that merely mentions e.g. "rate limit" is reported as 4.
  if grep -qi 'usage limit\|rate limit\|limit reached\|exceeded.*limit\|Claude AI usage' "$logfile" 2>/dev/null; then
    return 4  # Rate limited
  fi
  # First matching promise tag wins, in the order DONE, STUCK, ERROR.
  if grep -q '<promise>DONE</promise>' "$logfile" 2>/dev/null; then
    return 0  # Done
  elif grep -q '<promise>STUCK</promise>' "$logfile" 2>/dev/null; then
-    return 2  # Stuck
+    return 2  # Stuck — needs human intervention
  elif grep -q '<promise>ERROR</promise>' "$logfile" 2>/dev/null; then
-    return 3  # Error
+    return 3  # Unrecoverable error
  else
-    return 1  # Continue
+    return 1  # Normal iteration — continue
  fi
 }
-# Main loop
+wait_for_session_reset() {
  # Block until probe_session reports that claude is usable again.
  # Globals:   SESSION_POLL_INTERVAL (read) - seconds to sleep between probes.
  # Arguments: $1 - iteration number, used only in the messages.
  # Returns:   0 once a probe succeeds; never returns while probes keep failing.
  local iteration=$1
  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  warn "Session usage limit hit during iteration $iteration."
  warn "Will probe every ${SESSION_POLL_INTERVAL}s until session resets."
  warn "No manual action needed — loop will resume automatically."
  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  local attempt=0
  local next_check
  while true; do
    # Plain assignment, not ((attempt++)): the post-increment evaluates to the
    # old value 0 on the first pass, which gives exit status 1 and would
    # terminate the whole script under `set -e`.
    attempt=$((attempt + 1))
    # GNU date first, then BSD/macOS date, then a fixed word if neither works.
    next_check=$(date -d "+${SESSION_POLL_INTERVAL} seconds" '+%H:%M:%S' 2>/dev/null \
      || date -v "+${SESSION_POLL_INTERVAL}S" '+%H:%M:%S' 2>/dev/null \
      || echo "soon")
    info "Probe attempt $attempt — next check at $next_check..."
    sleep "$SESSION_POLL_INTERVAL"
    if probe_session; then
      success "Session available! Resuming iteration $iteration..."
      return 0
    else
      warn "Still rate-limited (attempt $attempt). Waiting another ${SESSION_POLL_INTERVAL}s..."
    fi
  done
 }
 # ─── Main ────────────────────────────────────────────────────────────────────
 if [[ "$MODE" == "plan" ]]; then
  log "Planning mode — creating implementation plan..."
  run_agent 0 plan
@ -164,32 +200,41 @@ log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)"
 log "Agent:               $AGENT"
 log "Spec:                $SPEC_FILE"
 log "Plan:                $PLAN_FILE"
 log "Poll interval:       ${SESSION_POLL_INTERVAL}s (session limit recovery)"
 echo ""
-for i in $(seq 1 "$MAX_ITERATIONS"); do
+i=1
 # Build loop: run one fresh agent per iteration, classify its log with
 # check_output, and act on the result. The counter advances only after a
 # normal iteration, so a rate-limited iteration is retried under the same number.
 while [[ $i -le $MAX_ITERATIONS ]]; do
  run_agent "$i" build
  logfile="$LOG_DIR/iteration-${i}.log"
-  check_output "$logfile"
+  # Capture return value without triggering set -e
-  status=$?
  # Reset first: `|| status=$?` assigns only on a non-zero return, so without
  # this a stale 1 or 4 from an earlier pass would hide a later DONE (0).
  status=0
+  check_output "$logfile" || status=$?
  case $status in
    0)
-      success "🎉 ALL TASKS COMPLETE after $i iterations!"
+      success "ALL TASKS COMPLETE after $i iterations!"
      exit 0
      ;;
    2)
-      warn "Agent is stuck. Review $logfile and intervene."
+      warn "Agent is stuck on iteration $i. Review $logfile and intervene."
      exit 1
      ;;
    3)
-      error "Agent encountered an error. Review $logfile."
+      error "Agent encountered an error on iteration $i. Review $logfile."
      exit 1
      ;;
    4)
      # Rate limited — wait for reset, then retry the SAME iteration
      wait_for_session_reset "$i"
      # Do NOT increment i — retry the same task
      ;;
    1)
      log "Iteration $i complete. Restarting with fresh context..."
      echo ""
      sleep 2
      # Safe under set -e: i is at least 1 here, so the expression is non-zero.
      ((i++))
      ;;
    *)
      # Any other code would otherwise re-run the same iteration forever.
      error "Unexpected status $status from check_output on iteration $i."
      exit 1
      ;;
  esac
 done