fix(ralph-loop): add session limit detection and auto-recovery

When claude -p hits the Pro subscription usage limit, the old loop had
no detection — it would find no <promise> signal, treat it as a normal
continuation, and immediately retry, burning all --max iterations.

New behaviour:
- check_output() returns status 4 when the log contains any usage/rate
  limit message (case-insensitive, multiple pattern variants)
- wait_for_session_reset() polls via a trivial probe call every
  SESSION_POLL_INTERVAL seconds (default: 600s / 10 min) until claude
  responds cleanly again
- When rate-limited, the same iteration is retried (i is not incremented)
  so no task is skipped or double-counted
- set -e is temporarily suspended around agent calls so a non-zero claude
  exit doesn't kill the bash process

Also updated the master template in docs/agent-harness/ralph-loop.sh.

Agent: human
Tests: N/A
Tests-Added: 0
TypeScript: N/A
This commit is contained in:
paulh 2026-04-09 21:39:29 -04:00
parent b1c199d21d
commit 82e10ff810
1 changed files with 92 additions and 47 deletions

View File

@ -7,7 +7,10 @@
# - Agent reads the plan, picks ONE task, implements, tests, commits, exits # - Agent reads the plan, picks ONE task, implements, tests, commits, exits
# - Loop restarts until all tasks are done # - Loop restarts until all tasks are done
# #
# No context compaction. No stale reasoning. Just fresh starts. # Session limit handling:
# - Detects Claude Pro usage limit messages in agent output
# - Polls every SESSION_POLL_INTERVAL seconds until the session resets
# - Resumes the same iteration automatically — no manual intervention needed
# #
# Usage: # Usage:
# ./ralph-loop.sh # Build mode (default) # ./ralph-loop.sh # Build mode (default)
@ -19,22 +22,6 @@
# ./ralph-loop.sh --agent gemini # Use Gemini CLI # ./ralph-loop.sh --agent gemini # Use Gemini CLI
# ./ralph-loop.sh --agent custom # Use custom agent (see below) # ./ralph-loop.sh --agent custom # Use custom agent (see below)
# #
# Extensibility:
# To add support for other AI coding agents (aider, cursor, windsurf, etc.):
# 1. Add a new case in the run_agent() function's agent selection block
# 2. Format the prompt appropriately for that agent's CLI interface
# 3. Ensure the agent outputs to the logfile for promise detection
#
# Example for Aider:
# aider)
# aider --message "$prompt" --yes 2>&1 | tee "$logfile"
# ;;
#
# Example for custom script:
# custom)
# ./my-agent-wrapper.sh "$prompt" 2>&1 | tee "$logfile"
# ;;
#
set -euo pipefail set -euo pipefail
MODE="${1:-build}" MODE="${1:-build}"
@ -45,6 +32,10 @@ SPEC_FILE="PROJECT-SPEC.md"
AGENT_FILE="AGENT.md" AGENT_FILE="AGENT.md"
LOG_DIR=".ralph-logs" LOG_DIR=".ralph-logs"
# How often (in seconds) to probe whether the session has reset.
# Default: 10 minutes. Adjust down if you want faster recovery.
SESSION_POLL_INTERVAL="${SESSION_POLL_INTERVAL:-600}"
# Parse arguments # Parse arguments
shift 2>/dev/null || true shift 2>/dev/null || true
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
@ -62,12 +53,14 @@ GREEN='\033[0;32m'
YELLOW='\033[1;33m' YELLOW='\033[1;33m'
RED='\033[0;31m' RED='\033[0;31m'
BLUE='\033[0;34m' BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' NC='\033[0m'
# Shared emitter for the "[ralph]" status lines.
#   $1 - colour escape sequence for the tag
#   $2 - message text (backslash escapes are interpreted by echo -e)
_ralph_emit() {
  echo -e "${1}[ralph]${NC} $2"
}

# One wrapper per severity; each prints "$1" on stdout behind a coloured tag.
log() { _ralph_emit "$BLUE" "$1"; }
success() { _ralph_emit "$GREEN" "$1"; }
warn() { _ralph_emit "$YELLOW" "$1"; }
error() { _ralph_emit "$RED" "$1"; }
info() { _ralph_emit "$CYAN" "$1"; }
# Check prerequisites # Check prerequisites
if [[ ! -f "$SPEC_FILE" ]]; then if [[ ! -f "$SPEC_FILE" ]]; then
@ -93,8 +86,9 @@ run_agent() {
log "Iteration $iteration ($mode mode) — starting fresh agent..." log "Iteration $iteration ($mode mode) — starting fresh agent..."
# Agent selection block # Suspend errexit (set -e) around the agent call so a non-zero claude exit doesn't
# Extend this case statement to support additional agents # kill the script. We inspect the log content instead.
set +e
case "$AGENT" in case "$AGENT" in
claude) claude)
echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile" echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile"
@ -103,23 +97,12 @@ run_agent() {
echo "$prompt" | codex 2>&1 | tee "$logfile" echo "$prompt" | codex 2>&1 | tee "$logfile"
;; ;;
aider) aider)
# Aider: AI pair programming in your terminal
# https://aider.chat
aider --message "$prompt" --yes 2>&1 | tee "$logfile" aider --message "$prompt" --yes 2>&1 | tee "$logfile"
;; ;;
gemini) gemini)
# Google Gemini CLI (if available)
# Adjust command based on actual Gemini CLI interface
echo "$prompt" | gemini-cli 2>&1 | tee "$logfile" echo "$prompt" | gemini-cli 2>&1 | tee "$logfile"
;; ;;
custom) custom)
# Custom agent integration
# Replace this with your own agent wrapper script
# The script should:
# 1. Accept prompt as first argument or via stdin
# 2. Perform the requested work (read files, write code, run tests, commit)
# 3. Output promise signals: <promise>PLANNED|DONE|STUCK|ERROR</promise>
# 4. Exit with appropriate code
if [[ -x "./custom-agent.sh" ]]; then if [[ -x "./custom-agent.sh" ]]; then
./custom-agent.sh "$prompt" 2>&1 | tee "$logfile" ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile"
else else
@ -130,29 +113,82 @@ run_agent() {
*) *)
error "Unknown agent: $AGENT" error "Unknown agent: $AGENT"
error "Supported agents: claude, codex, aider, gemini, custom" error "Supported agents: claude, codex, aider, gemini, custom"
error "To add support for other agents, edit the run_agent() function in this script"
exit 1 exit 1
;; ;;
esac esac
set -e
return 0 return 0
} }
# Probe whether claude is available by sending a trivial request.
#
# Globals:
#   LOG_DIR (read) - directory holding probe.log, which is overwritten on
#                    every call
# Arguments:
#   none
# Returns:
#   0 if claude exited successfully and its output has no limit message
#   1 if claude exited non-zero or the output matches a limit pattern
probe_session() {
  local probe_log="$LOG_DIR/probe.log"

  # Running the pipeline as an `if` condition keeps a failed probe from
  # tripping errexit, without switching `set -e` off and back on (which
  # would force it on even for a caller that had it disabled).
  if ! echo "Reply with the single word OK and nothing else." \
    | claude -p --output-format text > "$probe_log" 2>&1; then
    return 1
  fi

  # A zero exit is not enough: reject output that carries a limit message.
  # -E is used because `\|` alternation in a basic regex is not portable.
  # The pattern list mirrors the one in check_output.
  if grep -Eqi 'usage limit|rate limit|limit reached|exceeded.*limit|Claude AI usage' \
    "$probe_log" 2>/dev/null; then
    return 1
  fi

  return 0
}
# Classify an agent log by the signals it contains.
#
# Arguments:
#   $1 - path to the iteration log file
# Returns:
#   4  a usage/rate-limit message is present (checked first, so it wins
#      over any promise marker in the same log)
#   0  <promise>DONE</promise> is present
#   2  <promise>STUCK</promise> is present — needs human intervention
#   3  <promise>ERROR</promise> is present — unrecoverable error
#   1  none of the above — normal iteration, continue. This is also the
#      result for a missing/unreadable log, because grep errors are
#      suppressed.
check_output() {
  local logfile="$1"

  # Session / usage limit — must be checked BEFORE the promise markers.
  # -E is used because `\|` alternation in a basic regex is not portable.
  if grep -Eqi 'usage limit|rate limit|limit reached|exceeded.*limit|Claude AI usage' \
    "$logfile" 2>/dev/null; then
    return 4
  fi

  # Promise markers are literal strings, so match them with -F.
  if grep -qF '<promise>DONE</promise>' "$logfile" 2>/dev/null; then
    return 0
  elif grep -qF '<promise>STUCK</promise>' "$logfile" 2>/dev/null; then
    return 2
  elif grep -qF '<promise>ERROR</promise>' "$logfile" 2>/dev/null; then
    return 3
  fi

  return 1
}
# Block until the agent session is usable again after a usage limit.
#
# Each pass sleeps SESSION_POLL_INTERVAL seconds and then calls
# probe_session; the loop repeats until the probe succeeds. There is no
# upper bound on the number of attempts.
#
# Globals:
#   SESSION_POLL_INTERVAL (read) - seconds to sleep between probes
# Arguments:
#   $1 - iteration number, used only in the status messages
# Returns:
#   0 once probe_session succeeds
wait_for_session_reset() {
  local iteration="$1"
  local attempt=0
  local next_check

  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  warn "Session usage limit hit during iteration $iteration."
  warn "Will probe every ${SESSION_POLL_INTERVAL}s until session resets."
  warn "No manual action needed — loop will resume automatically."
  warn "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  while true; do
    # Plain assignment on purpose: `((attempt++))` evaluates to 0 on the
    # first pass, which gives exit status 1 and aborts under `set -e`.
    attempt=$((attempt + 1))

    # Try GNU date syntax, then BSD date syntax; if neither works, fall
    # back to a placeholder rather than failing.
    next_check=$(date -d "+${SESSION_POLL_INTERVAL} seconds" '+%H:%M:%S' 2>/dev/null \
      || date -v "+${SESSION_POLL_INTERVAL}S" '+%H:%M:%S' 2>/dev/null \
      || echo "soon")

    info "Probe attempt $attempt — next check at $next_check..."
    sleep "$SESSION_POLL_INTERVAL"

    if probe_session; then
      success "Session available! Resuming iteration $iteration..."
      return 0
    fi

    warn "Still rate-limited (attempt $attempt). Waiting another ${SESSION_POLL_INTERVAL}s..."
  done
}
# ─── Main ────────────────────────────────────────────────────────────────────
if [[ "$MODE" == "plan" ]]; then if [[ "$MODE" == "plan" ]]; then
log "Planning mode — creating implementation plan..." log "Planning mode — creating implementation plan..."
run_agent 0 plan run_agent 0 plan
@ -164,32 +200,41 @@ log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)"
log "Agent: $AGENT" log "Agent: $AGENT"
log "Spec: $SPEC_FILE" log "Spec: $SPEC_FILE"
log "Plan: $PLAN_FILE" log "Plan: $PLAN_FILE"
log "Poll interval: ${SESSION_POLL_INTERVAL}s (session limit recovery)"
echo "" echo ""
# Main iteration loop.
#
# Runs the agent once per pass and classifies its log with check_output.
# The counter advances only after a normal iteration (status 1); a
# rate-limited pass (status 4) waits for the session to reset and then
# retries the same iteration number.
i=1
while [[ $i -le $MAX_ITERATIONS ]]; do
  run_agent "$i" build

  logfile="$LOG_DIR/iteration-${i}.log"

  # Reset status on every pass. `cmd || status=$?` only assigns when the
  # command fails, so without the reset a return of 0 (DONE) would leave
  # the previous pass's value in place and completion would go unnoticed.
  # The `||` form also keeps a non-zero return from tripping `set -e`.
  status=0
  check_output "$logfile" || status=$?

  case "$status" in
    0)
      success "ALL TASKS COMPLETE after $i iterations!"
      exit 0
      ;;
    2)
      warn "Agent is stuck on iteration $i. Review $logfile and intervene."
      exit 1
      ;;
    3)
      error "Agent encountered an error on iteration $i. Review $logfile."
      exit 1
      ;;
    4)
      # Rate limited — wait for reset, then retry the SAME iteration.
      # i is deliberately not incremented here.
      wait_for_session_reset "$i"
      ;;
    1)
      log "Iteration $i complete. Restarting with fresh context..."
      echo ""
      sleep 2
      i=$((i + 1))
      ;;
    *)
      # No branch above would advance i, so an unrecognised status must
      # stop the script instead of looping forever.
      error "Unexpected status $status from check_output on iteration $i. Review $logfile."
      exit 1
      ;;
  esac
done