From 5db61dd3218981dbfa7d9f20013ff92be5462a43 Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Wed, 1 Apr 2026 21:20:26 -0400 Subject: [PATCH] =?UTF-8?q?feat:=20agent=20harness=20v2.0=20=E2=80=94=20wa?= =?UTF-8?q?ve-based=20agentic=20project=20methodology?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A complete system for running autonomous AI coding agents on complex projects. Proven in practice: Fintrove project — 4 waves, 11 streams, 44 tasks, 1,254→1,597 tests, 0 regressions. Core templates: - AGENT-INSTRUCTIONS.md — agent system prompt, core loop, commit attribution - PROJECT-SPEC.md — project definition template - DECISIONS.md — Architecture Decision Records - EXECUTION-BOARD-TEMPLATE.md — stream planning artifact (write before coding) - VALIDATION-TEMPLATE.md — per-packet evidence - PROCESS-EVAL-TEMPLATE.md — stream retrospective Process guides: - WAVE-BASED-MANAGEMENT.md — plan-then-implement discipline, wave gates, known-answer tests - SPEC-CREATION-GUIDE.md — interview protocol for building specs - PLAN-MANAGEMENT.md — living IMPLEMENTATION_PLAN.md - REVIEW-AND-QA.md — evaluating agent output - PARALLEL-AGENTS.md — running multiple agents simultaneously - COST-OPTIMIZATION.md — getting more work per dollar - OPENCLAW-INTEGRATION.md — sessions_spawn, cron, automation - TROUBLESHOOTING.md — five failure modes + recovery - TUTORIAL.md — 30-min hands-on walkthrough - EXAMPLES.md — real project examples Tooling: - ralph-loop.sh — bash loop for Claude/Codex/Aider/Gemini - model-report.ts — per-model quality reporting from git trailers --- AGENT-INSTRUCTIONS.md | 316 +++++++ CHANGELOG.md | 182 ++++ COST-OPTIMIZATION.md | 356 ++++++++ DECISIONS.md | 301 ++++++ EXAMPLES.md | 364 ++++++++ EXECUTION-BOARD-TEMPLATE.md | 135 +++ OPENCLAW-INTEGRATION.md | 321 +++++++ PARALLEL-AGENTS.md | 718 +++++++++++++++ PLAN-MANAGEMENT.md | 283 ++++++ PROCESS-EVAL-TEMPLATE.md | 78 ++ PROJECT-SPEC.md | 246 +++++ README.md | 149 +++ REVIEW-AND-QA.md | 327 +++++++ SPEC-CREATION-GUIDE.md | 321 +++++++ TROUBLESHOOTING.md | 333 +++++++ TUTORIAL.md | 864 ++++++++++++++++++ VALIDATION-TEMPLATE.md | 42 + WAVE-BASED-MANAGEMENT.md | 268 ++++++ archive/Agent-Harness-Project-spec-example.md | 467 ++++++++++ archive/AutoGen.md | 34 + archive/BMAD.md | 34 + archive/ChatDev.md | 34 + archive/CrewAI.md | 34 + archive/MODAL.md | 40 + archive/Opus-Workflow-Constraints.md | 45 + model-report.ts | 245 +++++ ralph-loop.sh | 198 ++++ 27 files changed, 6735 insertions(+) create mode 100644 AGENT-INSTRUCTIONS.md create mode 100644 CHANGELOG.md create mode 100644 COST-OPTIMIZATION.md create mode 100644 DECISIONS.md create mode 100644 EXAMPLES.md create mode 100644 EXECUTION-BOARD-TEMPLATE.md create mode 100644 OPENCLAW-INTEGRATION.md create mode 100644 PARALLEL-AGENTS.md create mode 100644 PLAN-MANAGEMENT.md create mode 100644 PROCESS-EVAL-TEMPLATE.md create mode 100644 PROJECT-SPEC.md create mode 100644 README.md create mode 100644 REVIEW-AND-QA.md create mode 100644 SPEC-CREATION-GUIDE.md create mode 100644 TROUBLESHOOTING.md create mode 100644 TUTORIAL.md create mode 100644 VALIDATION-TEMPLATE.md create mode 100644 WAVE-BASED-MANAGEMENT.md create mode 100644 archive/Agent-Harness-Project-spec-example.md create mode 100644 archive/AutoGen.md create mode 100644 archive/BMAD.md create mode 100644 archive/ChatDev.md create mode 100644 archive/CrewAI.md create mode 100644 archive/MODAL.md create mode 100644 archive/Opus-Workflow-Constraints.md create mode 100644 model-report.ts create 
mode 100755 ralph-loop.sh

diff --git a/AGENT-INSTRUCTIONS.md b/AGENT-INSTRUCTIONS.md
new file mode 100644
index 0000000..b5c0f7c
--- /dev/null
+++ b/AGENT-INSTRUCTIONS.md
@@ -0,0 +1,316 @@
+# Agent Instructions Template
+
+> Copy this file into your project root as `AGENT_INSTRUCTIONS.md`.
+> The agent reads this at the start of every iteration.
+
+---
+
+## Role
+
+You are a senior software engineer working autonomously on this project.
+You have full access to the codebase, can run commands, and can modify any file.
+
+## Core Loop
+
+Every time you start, follow this exact sequence:
+
+### 1. Orient
+- Read `PROJECT-SPEC.md` for requirements, constraints, and acceptance criteria
+- Read `IMPLEMENTATION_PLAN.md` for the current task list and status
+- Read recent git log (`git log --oneline -10`) to understand what's been done
+- Check for any failing tests or build errors
+
+### 2. Plan (if no plan exists)
+If `IMPLEMENTATION_PLAN.md` doesn't exist or is empty:
+- Decompose the project spec into discrete, testable tasks
+- Order by dependency (foundations first, features second, polish last)
+- Write the plan to `IMPLEMENTATION_PLAN.md` with checkboxes
+- Output `PLANNED` and exit
+
+### 3. Pick ONE Task
+- Find the first unchecked task in `IMPLEMENTATION_PLAN.md`
+- If all tasks are checked, output `DONE` and exit
+- Focus ONLY on this one task — do not work on anything else
+
+### 4. Implement
+- Write the code for this task
+- Follow the project's coding standards and patterns
+- Keep changes minimal and focused
+- If adding a new utility or helper used in multiple places: **extract it to a shared location**, do not duplicate it
+
+### 5. Verify (BLOCKING — all steps required)
+
+Run ALL of the following. If any fail, fix before proceeding:
+
+```bash
+# 1. All tests must pass with zero failures
+npm test
+
+# 2. Backend TypeScript must compile clean (silence = success)
+npx tsc --noEmit
+
+# 3. Frontend TypeScript must compile clean (if frontend exists)
+cd frontend && npx tsc --noEmit && cd ..
+
+# 4. Confirm new tests were added (see Tests-Added Rule below)
+```
+
+**Do not commit if any step fails. Do not disable or skip failing tests.**
+
+### 6. Commit & Mark Done
+
+Commit with the mandatory attribution format:
+
+```
+<type>(<scope>): <summary>
+
+<body: what changed and why>
+
+Agent: <model-id>
+Tests: <N>/<N> passing
+Tests-Added: <+N | 0>
+TypeScript: clean | <N errors>
+```
+
+**Real example:**
+```
+feat(images): add shared image resolution helpers
+
+Extracts image logic into shared utility so all components use
+identical fallback chains. Fixes detail page showing wrong image.
+
+Agent: github-copilot/claude-sonnet-4.6
+Tests: 129/129 passing
+Tests-Added: +32
+TypeScript: clean
+```
+
+Then mark the task done in `IMPLEMENTATION_PLAN.md` and commit the plan update.
+
+### 7. Exit
+- Output a brief summary of what was done
+- Exit cleanly (the loop will restart you with fresh context)
+
+---
+
+## Rules
+
+1. **One task per iteration.** Never work on multiple tasks. Fresh context each time.
+2. **Tests are mandatory.** Every feature needs new tests. Every bug fix needs a regression test. Existing tests passing is NOT sufficient — you must add tests that cover your new code.
+3. **TypeScript must compile.** Run `npx tsc --noEmit` (backend AND frontend). Never commit with type errors.
+4. **Build must pass.** Never commit code that doesn't build.
+5. **Don't over-engineer.** Implement what the spec asks for, nothing more.
+6. **Don't refactor unrelated code.** Stay focused on the current task.
+7. **Extract shared logic.** If two components need the same logic, create a shared utility. Never duplicate.
+8. **Types first.** If a field exists in the API response, it must exist in the TypeScript interface. Never use `any`, `unknown`, or `Record<string, unknown>` to bypass type safety.
+9. **If stuck, document it.** Add a note to `IMPLEMENTATION_PLAN.md` and move on.
+10. **Read the spec carefully.** The acceptance criteria tell you exactly what "done" means.
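+
+The Verify gate is mechanical, so it can be scripted. A minimal sketch (the `verify.ts` filename and the frontend tsconfig path are illustrative, not part of the harness):
+
+```typescript
+// verify.ts — illustrative pre-commit gate for step 5 (Verify).
+// Runs the same commands listed above and stops on the first failure.
+import { execSync } from 'node:child_process';
+
+function gate(label: string, cmd: string): void {
+  try {
+    execSync(cmd, { stdio: 'pipe', encoding: 'utf-8' });
+    console.log(`PASS  ${label}`);
+  } catch {
+    console.error(`FAIL  ${label} — fix before committing`);
+    process.exit(1);
+  }
+}
+
+gate('tests', 'npm test');
+gate('backend types', 'npx tsc --noEmit');
+gate('frontend types', 'npx tsc --noEmit -p frontend'); // omit if no frontend
+console.log('Verify gate green — safe to commit.');
+```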
+
+---
+
+## The Tests-Added Rule
+
+> **"Tests pass" ≠ "code is tested."**
+> Existing tests passing proves you didn't break anything.
+> New tests are required to prove your new code works.
+> `Tests-Added: 0` on a feature commit is a red flag.
+
+| What you added | Minimum new tests required |
+|----------------|---------------------------|
+| New utility/helper function | ≥ 3 (happy path + edge case + null/empty) |
+| New service method | ≥ 2 unit tests |
+| New API endpoint | ≥ 2 integration tests (success + error) |
+| New React component | ≥ 1 render test |
+| Bug fix | ≥ 1 regression test proving the bug is fixed |
+| Refactor with no new logic | 0 acceptable — but all existing must still pass |
+
+---
+
+## Commit Attribution Trailers
+
+All commits must include these trailers. They enable model performance tracking across the project history.
+
+| Trailer | How to get the value |
+|---------|---------------------|
+| `Agent:` | Your model ID, e.g. `github-copilot/claude-sonnet-4.6` |
+| `Tests:` | Run `npm test` — record as `N/N passing` |
+| `Tests-Added:` | Count your new test cases — use `+N` format |
+| `TypeScript:` | Run `npx tsc --noEmit` — `clean` if silent, else `N errors` |
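+
+These trailers are machine-readable by design — `model-report.ts` builds its per-model quality report from them. A simplified sketch of the aggregation idea (the real script does considerably more):
+
+```typescript
+// Sketch: total Tests-Added per agent, read from commit message trailers.
+// Simplified relative to model-report.ts — illustration only.
+import { execSync } from 'node:child_process';
+
+const log = execSync('git log --format=%B', { encoding: 'utf-8' });
+
+const added = new Map<string, number>();
+let agent = '';
+for (const line of log.split('\n')) {
+  if (line.startsWith('Agent:')) agent = line.slice('Agent:'.length).trim();
+  if (line.startsWith('Tests-Added:') && agent) {
+    const n = parseInt(line.slice('Tests-Added:'.length).replace('+', ''), 10) || 0;
+    added.set(agent, (added.get(agent) ?? 0) + n);
+  }
+}
+for (const [model, n] of added) console.log(`${model}: +${n} tests`);
+```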
+
+---
+
+## Known Anti-Patterns (Do Not Repeat)
+
+These specific failure modes have been observed across projects and required manual remediation:
+
+❌ **Duplicating logic across components** — extract to a shared utility instead
+❌ **Using `Record<string, unknown>` casts** to access fields that should be in the TypeScript interface
+❌ **Adding API fields to code without adding them to the TypeScript type** — type-unsafe casts spread silently
+❌ **Generating responsive image variants for all `/` paths** — only apply to paths you actually serve (e.g. `/images/`)
+❌ **Committing with `WARNING: Tests fail` or `In-progress` in the message** — never acceptable
+❌ **Large blast commits touching 5+ unrelated files** — break into focused, single-purpose commits
+❌ **Zero `Tests-Added` on feature commits** — code without tests is a bug waiting to happen
+❌ **Assuming TypeScript passes because runtime tests pass** — they test different things; run `tsc --noEmit` explicitly
+
+---
+
+## Escalation Protocol
+
+> When you encounter a situation the spec doesn't cover, STOP and escalate.
+> Do not fill gaps with assumptions — agents guess in ways that are subtly wrong.
+
+**You MUST escalate (stop and ask) when:**
+
+1. **Requirement gap:** The spec has no FR-NNN covering what you're being asked to do
+2. **Constraint conflict:** Two constraints contradict each other (MUST vs MUST NOT)
+3. **Ambiguous acceptance criteria:** "Given X, when Y, then Z" — but X, Y, or Z isn't defined
+4. **Missing tech stack decision:** The spec says "use a database" but doesn't specify which
+5. **Destructive action:** Deleting data, removing files, modifying config that could break other systems
+6. **New dependency needed:** The task requires a library not in the spec's tech stack
+7. **ESCALATE constraint triggered:** Any condition listed in the spec's ESCALATE section
+
+**How to escalate:**
+1. Stop work on the current task immediately
+2. Add a comment at the top of `IMPLEMENTATION_PLAN.md`:
+   ```markdown
+   ## ESCALATION REQUIRED
+   - **Task:** [current task name]
+   - **Issue:** [what's ambiguous/missing/conflicting]
+   - **What I need:** [specific question or decision]
+   - **What I'd do if I had to guess:** [your best guess, so the human can say "yes" fast]
+   ```
+3. Output `STUCK` and exit
+
+**Why this matters:** The Klarna story — their AI agent resolved 2.3 million customer conversations but optimized for the wrong metric. Perfect execution, terrible intent alignment. Every assumption you make silently risks the same outcome.
+
+---
+
+## Output Signals
+
+The loop script watches for these signals:
+
+- `PLANNED` — Plan created, ready for build iterations
+- `DONE` — All tasks complete, project finished
+- `STUCK` — Can't proceed, needs human intervention
+- `ERROR` — Unrecoverable error encountered
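+
+On the loop side, watching for these is a string scan plus a decision. A sketch of the idea (illustrative only — `ralph-loop.sh` is the real implementation):
+
+```typescript
+// Sketch: how a driving loop might react to the output signals above.
+type Signal = 'PLANNED' | 'DONE' | 'STUCK' | 'ERROR';
+
+function detectSignal(agentOutput: string): Signal | null {
+  for (const s of ['DONE', 'STUCK', 'ERROR', 'PLANNED'] as const) {
+    if (agentOutput.includes(s)) return s;
+  }
+  return null;
+}
+
+function nextAction(signal: Signal | null): 'restart' | 'stop' | 'page-human' {
+  switch (signal) {
+    case 'DONE':  return 'stop';        // all tasks complete
+    case 'STUCK':
+    case 'ERROR': return 'page-human';  // needs intervention
+    default:      return 'restart';     // PLANNED or no signal: next iteration
+  }
+}
+```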
+
+---
+
+## Context Management
+
+You start fresh each iteration. Your "memory" is:
+- `PROJECT-SPEC.md` — What to build (never changes)
+- `IMPLEMENTATION_PLAN.md` — What's done and what's next (you update this)
+- `git log` — History of changes
+- The codebase itself — Current state of the project
+- Test results — Whether things work
+
+This is intentional. Fresh context prevents confusion from stale reasoning.

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..8d39f1c
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,182 @@
+# Changelog
+
+All notable changes to the Agent Harness project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+---
+
+## [2.0.0] - 2026-04-01
+
+### The Wave-Based Management Release
+
+Patterns developed during the Fintrove project (2026-03-31 → 2026-04-01):
+4 waves, 11 streams, 44 tasks, 1,254 → 1,597 tests, zero regressions.
+
+The key insight: **the harness was missing a planning artifact between "the spec" and "the task."**
+The execution board fills that gap — a stream-level plan written entirely before any code is written.
+
+### Added
+
+#### New Templates
+- **EXECUTION-BOARD-TEMPLATE.md** — Pre-implementation planning artifact for a stream. Defines ALL packets (goal, steps, files, known-answer tests, acceptance criteria) before any code is written. The board is the contract.
+- **VALIDATION-TEMPLATE.md** — Per-packet evidence file. Written immediately after each packet completes. Records: test count delta, known-answer test results, acceptance criteria pass/fail.
+- **PROCESS-EVAL-TEMPLATE.md** — Stream retrospective written after merge. Covers task sizing accuracy, test-first compliance, known-answer coverage, architecture integrity, model attribution. + +#### New Guide +- **WAVE-BASED-MANAGEMENT.md** — Complete guide to the wave/stream/packet hierarchy. The plan-then-implement discipline, execution boards, known-answer tests, EXECUTION_MASTER.md pattern, wave gates, file organization. + +### New Patterns Documented + +#### The Plan-Then-Implement Discipline +Before writing any implementation code for a stream: +1. Write the execution board (all packets, all acceptance criteria, known-answer tests) +2. Only then: start coding + +#### Known-Answer Tests +For domain-specific calculations, every module must include ≥1 test citing an official source: +```typescript +test('CPP at 70 is exactly 42% more than at 65', () => { + // Source: ESDC https://www.canada.ca/en/services/benefits/publicpensions/cpp/benefit-amount.html + expect(at70 / at65).toBeCloseTo(1.42, 5); +}); +``` + +#### Wave Gates +Explicit checklist before Wave N+1: all streams merged, domain accuracy suite passing, process evals written, human sign-off. + +#### EXECUTION_MASTER.md Pattern +Project-level dashboard: wave status, active streams, blockers, parallelism rules. + +### Metrics (Fintrove, 2026-04-01) +- Waves: 4 | Streams: 11 | Tasks: 44/44 +- Test growth: 1,254 → 1,597 (+343) | Regressions: 0 + +--- + +## [1.0.0] - 2024-03-18 + +### Added + +#### Core Templates +- **AGENT-INSTRUCTIONS.md** — The agent's system prompt defining the core loop: Orient → Plan → Pick ONE task → Implement → Verify → Commit → Exit +- **PROJECT-SPEC.md** — Comprehensive template for defining projects with sections for overview, tech stack, requirements with acceptance criteria, data models, API design, constraints, phasing, and anti-patterns +- **DECISIONS.md** — Architecture Decision Record (ADR) template for documenting non-obvious technical choices and preventing agent drift +- **ralph-loop.sh** — Bash script implementing the Ralph Wiggum loop pattern: spawns fresh agent instances, checks for completion signals, restarts until done + +#### Process Guides +- **SPEC-CREATION-GUIDE.md** — Complete interview protocol for creating high-quality specifications through structured conversation between human and agent. Covers vision, requirements extraction, technical discovery, constraint mapping, and spec assembly +- **PLAN-MANAGEMENT.md** — Guide for managing IMPLEMENTATION_PLAN.md as a living document. Covers task decomposition patterns, intervention strategies, progress tracking, and plan anti-patterns +- **REVIEW-AND-QA.md** — Framework for evaluating agent output. Includes review timing, quality checklists, drift detection, course-correction strategies, and review templates +- **COST-OPTIMIZATION.md** — Comprehensive guide to model billing (request-based vs token-based), optimal strategies per provider, model selection, context management, and the hybrid approach +- **OPENCLAW-INTEGRATION.md** — Running the harness in OpenClaw with sessions_spawn, cron jobs, and shell scripts. Covers model selection, monitoring, and OpenClaw-specific agent instructions +- **TROUBLESHOOTING.md** — Failure taxonomy covering five common failure modes (stuck loop, drift, overengineering, test theater, context overflow) with root causes and recovery steps +- **TUTORIAL.md** — Complete 30-minute walkthrough building a markdown link checker CLI tool from zero using the harness. 
Concrete, copy-pasteable example demonstrating the entire workflow + +#### Examples & Documentation +- **EXAMPLES.md** — Worked example of a Fintrove-style personal finance app with complete PROJECT-SPEC.md. Compares three approaches (Ezward, Ralph Wiggum, Nate Jones) and provides best practices +- **README.md** — Project overview with file index, quick start guide, and core insights +- **PARALLEL-AGENTS.md** — Guide for running multiple agents simultaneously on independent tasks, covering parallelization strategies, work splitting, result merging, and conflict resolution + +### Features + +#### The Core Loop Pattern +- Stateless iteration model: each agent starts fresh with clean context +- Orient phase: agent reads spec, plan, and git history +- Single-task focus: agents complete ONE task per iteration +- Mandatory verification: build and test must pass before commit +- Promise-based signaling: `PLANNED|DONE|STUCK|ERROR` + +#### Interview Protocol +- Five-phase structured interview for spec creation +- Domain knowledge extraction techniques +- Technical discovery patterns +- Constraint mapping (MUST/MUST NOT/PREFER) +- Spec quality checklist + +#### Plan Management Patterns +- Scaffold-first pattern +- Vertical slice pattern +- Test-first pattern +- Dependency chain pattern +- Human intervention mechanisms (notes, task splitting, reprioritization) + +#### Cost Optimization Strategies +- Request-based optimization (batch tasks, compound requests) +- Token-based optimization (fresh sub-agents, minimal context) +- Model selection by task complexity +- Hybrid strategy using multiple subscriptions +- Usage monitoring and budget allocation + +#### OpenClaw Integration +- Manual orchestration via sessions_spawn +- Cron-based automation for overnight work +- Shell script orchestration +- Model selection per iteration +- Sub-agent monitoring and session history + +#### Troubleshooting Framework +- Stuck loop detection and resolution +- Architecture drift prevention with ADRs +- Overengineering constraints +- Test quality validation +- Context overflow mitigation + +### Documentation Quality Standards +- Comprehensive examples with real code +- Anti-pattern documentation +- Copy-pasteable templates +- Concrete acceptance criteria +- Decision record patterns + +### Supported Agents +- Claude CLI (via ralph-loop.sh) +- OpenAI Codex CLI (via ralph-loop.sh) +- OpenClaw sessions_spawn (any model) +- Extensible to other agent frameworks + +### Supported Workflows +- CLI loop (ralph-loop.sh) +- OpenClaw manual orchestration +- OpenClaw cron automation +- Hybrid approaches + +--- + +## [Unreleased] + +### Planned +- Additional language-specific examples (Python, Go, Rust) +- Integration templates for common CI/CD systems +- Cost calculator tool (estimate iterations × model cost) +- Spec validator (check completeness before starting) +- Template variations for different project types (API, CLI, library, web app) + +--- + +## Version History Summary + +- **1.0.0** (2024-03-18) — Initial release with complete harness system: core templates, process guides, examples, and multi-platform support + +--- + +## Contributing + +This harness is a living system. If you: +- Discover new failure modes +- Develop better patterns +- Find gaps in the guides +- Create examples for other project types + +Please document them and contribute back. The harness improves as we learn what works. + +--- + +## License + +This project is released into the public domain. Use it, modify it, share it. No attribution required. 
+
+---
+
+_The harness hit 1.0 because it worked. It's 2.0 because Fintrove taught us how to use it better._

diff --git a/COST-OPTIMIZATION.md b/COST-OPTIMIZATION.md
new file mode 100644
index 0000000..940535d
--- /dev/null
+++ b/COST-OPTIMIZATION.md
@@ -0,0 +1,356 @@
+# Cost Optimization — Getting More Work Per Dollar
+
+> AI model subscriptions have fundamentally different billing models.
+> Understanding them is the difference between $5 and $50 for the same output.
+> This guide teaches you to match your work patterns to your billing model.
+
+---
+
+## The Two Billing Models
+
+### Request-Based (GitHub Copilot Pro)
+
+- **What counts:** Each API call = 1 request
+- **Multipliers:** Advanced models cost more per request (e.g., Opus 4.6 = 3x)
+- **Key insight:** A 2-second request costs the same as a 10-minute request
+- **Budget:** Fixed monthly request pool (e.g., 300 premium requests/month)
+
+**What's expensive:** Many small requests
+**What's cheap:** Few large requests that do lots of work
+
+### Token-Based (Anthropic Claude Pro)
+
+- **What counts:** Input tokens + output tokens consumed
+- **Windows:** Per-session (5-hour) and weekly token budgets
+- **Key insight:** Context grows with every turn — turn 50 includes ALL previous turns as input
+- **Danger zone:** Long conversations burn tokens exponentially
+
+**What's expensive:** Large context windows, long conversations
+**What's cheap:** Fresh sessions with minimal context
+
+---
+
+## How Context Affects Cost
+
+### The Context Growth Problem (Token-Based)
+
+Every message in a conversation gets re-sent as context:
+
+```
+Turn 1:  Input = system prompt (2K tokens)            → Total: 2K
+Turn 5:  Input = system prompt + 4 prior turns (8K)   → Total: 8K
+Turn 10: Input = system prompt + 9 prior turns (20K)  → Total: 20K
+Turn 20: Input = system prompt + 19 prior turns (50K) → Total: 50K
+Turn 30: Input = system prompt + 29 prior turns (90K) → Total: 90K
+```
+
+By turn 30, you're burning 90K input tokens PER TURN just for context. The actual
+new content might be 500 tokens, but you're paying for the full history every time.
+
+**This is why sub-agents are token-efficient on Anthropic:**
+Each sub-agent starts with ~2K tokens of context (just the task prompt), regardless
+of how long your main conversation has been running.
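+
+The cumulative arithmetic is worth making concrete. A back-of-envelope model (the 2K system prompt and linear 3K-per-turn growth are illustrative assumptions — real histories grow unevenly):
+
+```typescript
+// Rough model: cumulative input tokens, one long chat vs fresh sub-agents.
+const SYSTEM = 2_000;   // assumed system prompt size
+const PER_TURN = 3_000; // assumed average growth per prior turn
+
+function longChatInputTokens(turns: number): number {
+  let total = 0;
+  for (let t = 1; t <= turns; t++) {
+    total += SYSTEM + (t - 1) * PER_TURN; // full history re-sent every turn
+  }
+  return total;
+}
+
+function freshSpawnInputTokens(tasks: number): number {
+  return tasks * SYSTEM; // each sub-agent sees only its own task prompt
+}
+
+console.log(longChatInputTokens(30));   // 1,365,000 tokens over 30 turns
+console.log(freshSpawnInputTokens(30)); //    60,000 tokens over 30 spawns
+```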
+
+### Context Doesn't Matter (Request-Based)
+
+On Copilot, a request with 2K context costs the same as a request with 100K context.
+It's still 1 request. So for request-based billing:
+- Let context accumulate — it's free
+- Pack more work into each request
+- Don't spawn sub-agents unnecessarily (each spawn = new request)
+
+---
+
+## Optimal Strategies by Subscription
+
+### GitHub Copilot Pro — Batch Everything
+
+**Goal:** Maximize work per request
+
+**Pattern: Fat Sub-Agents**
+```
+# BAD: 5 requests × 3 multiplier = 15 premium requests
+sessions_spawn("Do task 1") → 1 request
+sessions_spawn("Do task 2") → 1 request
+sessions_spawn("Do task 3") → 1 request
+sessions_spawn("Do task 4") → 1 request
+sessions_spawn("Do task 5") → 1 request
+
+# GOOD: 1 request × 3 multiplier = 3 premium requests
+sessions_spawn("Do tasks 1-5 sequentially. For each:
+  implement, test, commit, then move to the next.")
+```
+
+**Same work, 80% cheaper.**
+
+**Pattern: Compound Tasks**
+```
+# BAD: 3 separate requests
+"Review the code" → 1 request
+"Fix the issues you found" → 1 request
+"Update the docs" → 1 request
+
+# GOOD: 1 compound request
+"Review the code, fix any issues you find, and update the
+docs to reflect the changes. Commit each fix separately."
+```
+
+**Pattern: Agent Harness with Multi-Task Iterations**
+```
+# Modified AGENT-INSTRUCTIONS.md for Copilot:
+### 3. Pick Tasks
+- Find the NEXT 3-5 unchecked tasks in IMPLEMENTATION_PLAN.md
+- Complete ALL of them in this iteration
+- Commit after each task (for clean git history)
+- Then exit for a fresh context restart
+```
+
+**When to use Copilot models:**
+- Long autonomous coding sessions (lots of tool calls = still 1 request)
+- Complex multi-step tasks (planning + implementation + testing)
+- Agent harness iterations (pack 3-5 tasks per iteration)
+- Overnight batch work
+
+**When NOT to use Copilot models:**
+- Quick questions ("What time is it in Tokyo?")
+- Simple file reads or lookups
+- Anything you could do with a cheaper/free model
+
+### Anthropic Claude Pro — Stay Lean
+
+**Goal:** Minimize token consumption per interaction
+
+**Pattern: Fresh Sub-Agents**
+```
+# GOOD on Anthropic: Each sub-agent starts with clean context
+sessions_spawn("Do task 1") → ~5K tokens (fresh context)
+sessions_spawn("Do task 2") → ~5K tokens (fresh context)
+sessions_spawn("Do task 3") → ~5K tokens (fresh context)
+Total: ~15K tokens
+
+# BAD on Anthropic: One long conversation
+Main session turn 1: "Do task 1" → 3K input tokens
+Main session turn 5: "Do task 2" → 15K input tokens
+Main session turn 10: "Do task 3" → 35K input tokens
+Total: ~53K tokens (3.5x more!)
+```
+
+**Pattern: Minimal Context Agent Instructions**
+```
+# BAD: Agent reads entire spec every iteration
+"Read PROJECT-SPEC.md (5000 tokens), IMPLEMENTATION_PLAN.md (2000 tokens),
+ DECISIONS.md (1500 tokens), and the last 20 git commits..."
+
+# GOOD: Agent reads only what it needs
+"Read IMPLEMENTATION_PLAN.md. Find the first unchecked task.
+ Read ONLY the relevant section of PROJECT-SPEC.md for that task.
+ Implement, test, commit."
+```
+
+**Pattern: Offload to Cheaper Models**
+```
+# Use Sonnet (cheaper) for routine work
+sessions_spawn("Implement the CRUD endpoints", model: "sonnet")
+
+# Use Opus (expensive) only for complex reasoning
+sessions_spawn("Design the Monte Carlo simulation algorithm", model: "opus")
+```
+
+**When to use Anthropic models:**
+- Quick interactions in main session (small context = few tokens)
+- Tasks requiring strong reasoning (Opus quality)
+- Sub-agent swarms (fresh context each time)
+
+**When NOT to use Anthropic models:**
+- Long main-session conversations (context grows = token burn)
+- Low-complexity tasks (use a cheaper model)
+- Repetitive iterations (context grows even with similar content)
+
+---
+
+## Model Selection Guide
+
+### By Task Complexity
+
+| Task | Recommended | Why |
+|------|-------------|-----|
+| Planning & decomposition | Opus (either provider) | Needs strong reasoning |
+| Scaffolding & config | Sonnet or GPT-4.1 | Simple, deterministic |
+| Feature implementation | Sonnet | Good balance |
+| Complex algorithms | Opus | Deep reasoning needed |
+| Bug diagnosis | Opus | Pattern recognition |
+| Bug fixing | Sonnet | Usually straightforward once diagnosed |
+| Documentation | Sonnet or GPT-4.1 | Writing, not complex reasoning |
+| Code review | Opus | Needs to spot subtle issues |
+| Test writing | Sonnet | Follows patterns from spec |
+
+### By Provider Optimization
+
+| Scenario | Best Provider | Reasoning |
+|----------|--------------|-----------|
+| 5 tasks in agent harness | Copilot (batch 5 tasks = 1 request) | Request efficiency |
+| Quick "what's the status?" | Anthropic (small context) | Token efficiency |
+| Overnight autonomous loop | Copilot (fewer requests total) | Request efficiency |
+| Sub-agent swarm (10 agents) | Anthropic (fresh context each) | Token efficiency |
+| Long planning conversation | Copilot (context growth is free) | Request efficiency |
+| One-shot code generation | Either (1 request, small context) | Similar cost |
+
+---
+
+## The Hybrid Strategy
+
+Use both subscriptions strategically:
+
+```
+Morning check-in (main session):     Anthropic Sonnet (small context, quick)
+Planning conversation:               Copilot Opus (context growth is free)
+Agent harness iterations:            Copilot Sonnet (batch tasks, 1 request each)
+Complex debugging:                   Copilot Opus (1 request, deep reasoning)
+Quick questions throughout the day:  Anthropic Sonnet (minimal tokens)
+Overnight autonomous work:           Copilot Sonnet (batch tasks, few requests)
+```
+
+### Budget Allocation Example
+
+Monthly budget:
+- Copilot Pro: 300 premium requests (Opus = 3x, Sonnet = 1x)
+- Anthropic Pro: Weekly token budget (resets Sundays)
+
+**Agent harness project (20 tasks):**
+```
+Copilot approach:
+  Planning: 1 Opus request = 3 premium → 3
+  20 tasks ÷ 5 per batched iteration = 4 Sonnet requests → 4
+  Code review: 2 Opus requests = 6 premium → 6
+  Total: 13 premium requests
+
+Anthropic approach:
+  Planning: 1 session = ~10K tokens
+  20 tasks × 1 per session = 20 sessions × ~8K → 160K tokens
+  Code review: 2 sessions × ~15K → 30K tokens
+  Total: ~200K tokens (could eat a chunk of weekly budget)
+```
+
+For an agent harness project, **Copilot is usually cheaper** because you can batch.
+
+For daily conversational use, **Anthropic is usually cheaper** because most interactions are short.
+
+---
+
+## Anti-Patterns to Avoid
+
+### 1. The Chatty Agent (Anthropic killer)
+```
+Turn 1: "What should I work on?"       ← Wastes a turn
+Turn 2: "I'll start with the parser"   ← Wastes a turn
+Turn 3: "Here's my plan..."
← Wastes a turn +Turn 4: *actually starts working* + +# Fix: Give clear instructions upfront so the agent works immediately +``` + +### 2. The Spawn Happy Pattern (Copilot killer) +``` +sessions_spawn("Read the plan") ← 1 request for reading?! +sessions_spawn("Pick the next task") ← 1 request for picking?! +sessions_spawn("Implement the task") ← Finally useful +sessions_spawn("Run the tests") ← 1 request for one command?! + +# Fix: One spawn that does all four steps +``` + +### 3. The "Let Me Check" Loop (Both killers) +``` +"Check if the build passes" → agent runs build, reports back +"OK now run the tests" → agent runs tests, reports back +"OK now check the linter" → agent runs linter, reports back + +# Fix: "Run build, tests, and linter. Report all results." +``` + +### 4. Using Opus for Everything +``` +# Opus is 3x on Copilot, token-heavy on Anthropic +# Most tasks don't need it + +# Fix: Default to Sonnet. Upgrade to Opus only for: +# - Planning and decomposition +# - Complex algorithm design +# - Subtle bug diagnosis +# - Architecture decisions +``` + +### 5. Ignoring Context Size (Anthropic killer) +``` +Main session at turn 50: "Hey can you also check the weather?" +# That weather check just cost 90K input tokens because of context + +# Fix: Use a sub-agent for unrelated tasks +# Or start a new session for new topics +``` + +--- + +## Monitoring Your Usage + +### GitHub Copilot +- Check premium request usage at: github.com/settings/copilot +- Track requests per task in your daily memory notes +- Set alerts at 80% of monthly budget + +### Anthropic Claude Pro +- Check usage at: claude.ai/settings/usage (subscription) +- API usage at: console.anthropic.com (if using API directly) +- Monitor "Current session X% used" — stop at 90% +- Weekly reset: Sundays at 11 AM ET + +### OpenClaw Session Status +``` +/status → Shows current model, session %, premium request % +``` + +### Logging Strategy +Track in your daily memory notes: +```markdown +## Model Usage — 2026-03-18 +- Copilot premium requests: 12 used today (45/300 monthly) +- Anthropic session: 35% used (resets in 3 days) +- Tasks completed: 8 +- Cost per task: ~1.5 premium requests average +``` + +--- + +## Quick Reference Card + +``` +┌─────────────────────────────────────────────────┐ +│ COST OPTIMIZATION CHEAT SHEET │ +├─────────────────────────────────────────────────┤ +│ │ +│ COPILOT PRO (request-based): │ +│ ✅ Batch tasks into one request │ +│ ✅ Let context grow (it's free) │ +│ ✅ Long sessions with many tool calls │ +│ ❌ Don't spawn many small sub-agents │ +│ ❌ Don't use Opus for simple tasks (3x!) │ +│ │ +│ ANTHROPIC PRO (token-based): │ +│ ✅ Fresh sub-agents (clean context) │ +│ ✅ Short, focused interactions │ +│ ✅ Use Sonnet for most work │ +│ ❌ Don't let main session context grow │ +│ ❌ Don't have long planning conversations │ +│ │ +│ GENERAL: │ +│ • Sonnet for building, Opus for thinking │ +│ • Batch related work, split unrelated work │ +│ • Monitor usage daily, adjust weekly │ +│ • When in doubt, use the cheaper model first │ +│ │ +└─────────────────────────────────────────────────┘ +``` + +--- + +_The cheapest token is the one you don't spend. The cheapest request is the one that does five things._ diff --git a/DECISIONS.md b/DECISIONS.md new file mode 100644 index 0000000..c2eca9d --- /dev/null +++ b/DECISIONS.md @@ -0,0 +1,301 @@ +# Architecture Decision Records (ADR) + +> Copy this file into your project root as `DECISIONS.md`. +> Use it to document non-obvious architecture choices so agents don't undo them. 
+> This is your defense against drift and "helpful improvements."
+
+---
+
+## Why Architecture Decision Records?
+
+The agent harness spawns fresh agents each iteration. Each one starts with zero memory of:
+- **Why** you chose approach X over Y
+- **What** you tried that didn't work
+- **Which** patterns are intentional vs accidental
+
+Without ADRs, iteration 10's agent "improves" iteration 3's code in ways that break things. ADRs create continuity across fresh contexts.
+
+---
+
+## When to Write an ADR
+
+Write a decision record when:
+- You choose an unusual approach (curl instead of fetch)
+- You explicitly avoid a "better" solution (no ORM, raw SQL instead)
+- You make a tradeoff (simple+slow over complex+fast)
+- You discover a gotcha with a tool/API (SpringCM headers, DocuSign auth quirks)
+- An agent keeps trying to "fix" something that's working
+
+**Not every decision needs an ADR.** Only the non-obvious ones. If the code is self-explanatory, skip the record.
+
+---
+
+## ADR Template
+
+```markdown
+### ADR-NNN: [Short Title]
+**Date:** YYYY-MM-DD
+**Status:** [Proposed | Accepted | Deprecated | Superseded by ADR-XXX]
+
+**Context:**
+What's the situation? Why does this decision matter?
+
+**Decision:**
+What did you decide? Be specific and actionable.
+
+**Consequences:**
+What does this enable? What does it prevent? What are the tradeoffs?
+
+**Alternatives Considered:**
+What else did you try or think about? Why didn't you choose them?
+```
+
+---
+
+## Example ADRs
+
+### ADR-001: Use curl over Node.js fetch for HTTP calls
+**Date:** 2024-03-15
+**Status:** Accepted
+
+**Context:**
+The DocuSign SpringCM API returns 500 errors when called from Node.js using the native `fetch()` function. The errors are inconsistent — same request works in Postman but fails in Node.
+
+Investigation showed that Node's fetch sends extra headers (`Connection: keep-alive`, `Accept-Encoding: gzip, deflate`) that SpringCM's proxy doesn't handle correctly.
+
+**Decision:**
+All HTTP calls to SpringCM (and potentially other DocuSign APIs) will use `child_process.execSync` with `curl` instead of fetch or axios.
+
+```typescript
+// CORRECT
+const result = execSync(`curl -X GET "${url}" -H "Authorization: Bearer ${token}"`,
+  { encoding: 'utf-8' });
+
+// DO NOT USE
+const result = await fetch(url, { headers: { Authorization: `Bearer ${token}` }});
+```
+
+**Consequences:**
+✅ API calls work reliably
+✅ Easier to debug (can copy curl command to terminal)
+❌ Slightly less "Node-native" (using shell commands)
+❌ Must escape shell arguments carefully
+
+**Alternatives Considered:**
+- **Axios with minimal headers:** Still sent headers that broke SpringCM
+- **Got library:** Same issue as axios
+- **Manual request crafting:** Too complex, curl is simpler
+
+**Notes:**
+If this decision becomes annoying (shell escaping hell), consider writing a thin wrapper:
+```typescript
+function curlGet(url: string, token: string): string {
+  const safeUrl = shellEscape(url);
+  return execSync(`curl -X GET ${safeUrl} -H "Authorization: Bearer ${token}"`,
+    { encoding: 'utf-8' });
+}
+```
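+
+The `shellEscape` in that wrapper is referenced but never defined in the ADR. A minimal POSIX single-quoting version might look like this (hypothetical helper — an audited escaping library is the safer choice):
+
+```typescript
+// Hypothetical shellEscape: single-quote the argument, escaping embedded quotes.
+function shellEscape(arg: string): string {
+  return `'${arg.replace(/'/g, `'\\''`)}'`;
+}
+
+// shellEscape("a'b") → 'a'\''b' — safe to splice into the curl command line
+```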
+
+---
+
+### ADR-002: Shared package for cross-cutting utilities
+**Date:** 2024-03-17
+**Status:** Accepted
+
+**Context:**
+Seven packages (`clm-direct`, `docgen-direct`, `maestro-direct`, `template-direct`, `formbuilder-direct`, `springcm-direct`, `powerforms-direct`) had duplicated code for:
+- JWT authentication
+- Environment variable loading
+- API error handling
+- Retry logic
+
+Changes required updating 7 files. Tests were inconsistent across packages.
+
+**Decision:**
+Extract shared utilities to `packages/shared/` and import as `docusign-direct-shared`.
+
+```typescript
+// Before (duplicated in each package)
+const token = process.env.DOCUSIGN_TOKEN;
+if (!token) throw new Error("Missing token");
+
+// After (shared utility)
+import { requireEnv } from 'docusign-direct-shared';
+const token = requireEnv('DOCUSIGN_TOKEN');
+```
+
+**Consequences:**
+✅ Single source of truth for auth logic
+✅ Consistent error messages across packages
+✅ Tests only need to cover shared code once
+❌ Adds a build-time dependency (shared must build first)
+❌ Breaking changes in shared affect all packages
+
+**Alternatives Considered:**
+- **Copy-paste approach:** Current state, unacceptable for long-term maintenance
+- **Monolithic package:** All functionality in one big package — loses clarity of API-specific packages
+- **External npm package:** Overkill for this codebase, adds publish/version complexity
+
+**Migration:**
+New packages MUST use shared utilities. Existing packages should migrate opportunistically (when touching auth code anyway, switch to shared).
+
+---
+
+### ADR-003: No ORM — Raw SQL with better-sqlite3
+**Date:** 2024-04-02
+**Status:** Accepted
+
+**Context:**
+Early iterations suggested using Prisma or TypeORM for database access. The schema is simple (4 tables: users, transactions, categories, rules). Most queries are straightforward CRUD or aggregations.
+
+ORMs add:
+- Build complexity (Prisma requires codegen)
+- Migration complexity (Prisma's migration format vs raw SQL)
+- Learning curve (Prisma's query API vs SQL)
+- Bundle size (~50KB for Prisma client)
+
+**Decision:**
+Use `better-sqlite3` with raw parameterized SQL queries. No ORM.
+
+```typescript
+// Query example
+const transactions = db.prepare(`
+  SELECT * FROM transactions
+  WHERE account_id = ? AND date >= ? AND date <= ?
+  ORDER BY date DESC
+`).all(accountId, startDate, endDate);
+```
+
+```sql
+-- Migration example (migrations/001-initial-schema.sql)
+CREATE TABLE transactions (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  account_id INTEGER NOT NULL,
+  date TEXT NOT NULL,
+  amount REAL NOT NULL,
+  ...
+);
+```
+
+**Consequences:**
+✅ Simple: SQL is explicit, no query builder magic
+✅ Fast: better-sqlite3 is one of the fastest SQLite bindings
+✅ No build step for database code
+✅ Migrations are just SQL files (easy to review and version)
+❌ No automatic type generation from schema
+❌ Must manually write types (acceptable for 4 tables)
+
+**Alternatives Considered:**
+- **Prisma:** Overkill for this schema size, adds complexity
+- **TypeORM:** Active Record pattern feels wrong for this use case
+- **Knex:** Query builder without ORM — middle ground, but still adds abstraction
+
+**Review after:**
+If the schema grows to 15+ tables with complex relationships, revisit this decision. For now, raw SQL is the right fit.
+
+---
+
+### ADR-004: Monte Carlo in main thread, not Web Workers
+**Date:** 2024-04-10
+**Status:** Accepted
+
+**Context:**
+Monte Carlo simulation runs 1,000+ iterations. Each iteration:
+- Samples random market returns
+- Calculates portfolio balance year by year (30+ years)
+- Tracks success/failure
+
+The initial implementation ran in a Web Worker for a non-blocking UI.
However: +- Simulation completes in < 2 seconds on modern hardware +- Complexity of Worker setup (separate build, message passing) adds 100 LOC +- User expectation: click "Run Simulation" → see results immediately (not async) + +**Decision:** +Run Monte Carlo in the main thread. Show a loading spinner during execution. + +```typescript +// Main thread, synchronous +function runMonteCarlo(profile: RetirementProfile, runs: number): SimulationResult { + const results = []; + for (let i = 0; i < runs; i++) { + results.push(runSingleSimulation(profile)); + } + return aggregateResults(results); +} + +// UI +button.onclick = () => { + showSpinner(); + const result = runMonteCarlo(profile, 1000); + hideSpinner(); + renderResults(result); +}; +``` + +**Consequences:** +✅ Simpler code (no Worker setup, no message passing) +✅ Easier to debug (single execution context) +✅ Fast enough (<2s) that blocking isn't a problem +❌ UI freezes for ~1.5 seconds during simulation +❌ Can't run multiple simulations in parallel + +**Alternatives Considered:** +- **Web Worker:** Complexity not justified for 1.5s task +- **WebAssembly:** Overkill, JS is fast enough +- **Async/await with chunking:** Adds complexity, still blocks event loop + +**Review after:** +If simulation time grows to >5 seconds (e.g., 10,000 runs, more complex models), reconsider Web Workers. + +--- + +## Tips for Writing Good ADRs + +### 1. Write them when the decision is fresh +Don't wait until the project is done. Write the ADR immediately after you make the choice. You'll remember the reasoning. + +### 2. Include what DIDN'T work +"We tried fetch — it sent headers that broke SpringCM" is more valuable than "We use curl." Future agents need to know the trap. + +### 3. Be specific with code examples +Show the DO and DON'T patterns. Code is clearer than prose. + +### 4. Include review triggers +"Review this after the schema grows to 15+ tables" gives future-you permission to change course. + +### 5. Status field matters +- **Proposed:** Still discussing +- **Accepted:** This is the way +- **Deprecated:** Don't use this anymore (but still in codebase) +- **Superseded by ADR-XXX:** This approach was replaced + +### 6. Keep them short +An ADR should fit on one screen. If it's a 5-page essay, you're over-explaining. Extract detail to separate docs and link them. + +--- + +## Maintaining ADRs + +### Add to AGENT.md Orient Phase +```markdown +## Orient +- Read PROJECT-SPEC.md +- Read IMPLEMENTATION_PLAN.md +- Read DECISIONS.md ← Add this +- Check git log --oneline -5 +``` + +### Reference in Constraints +```markdown +## Constraints +- MUST follow architecture decisions documented in DECISIONS.md +- MUST NOT change approaches listed as "Accepted" without human approval +``` + +### Review Periodically +Every few weeks, scan DECISIONS.md: +- Are the decisions still valid? +- Should any be deprecated? +- Are agents following them? + +--- + +_Decisions don't drift if they're written down._ diff --git a/EXAMPLES.md b/EXAMPLES.md new file mode 100644 index 0000000..50d21f1 --- /dev/null +++ b/EXAMPLES.md @@ -0,0 +1,364 @@ +# Agent Harness — Worked Examples + +## How Much Context Does an Agent Need? 
+ +The key insight from both the Nate Jones approach and the Ralph Wiggum loop: + +> **The agent needs enough context to work autonomously for ONE task, +> but the system needs enough structure to coordinate across MANY tasks.** + +This means two layers of documentation: + +### Layer 1: The Spec (written by you, read-only for agents) +- What you're building and why +- Technical constraints and decisions +- Acceptance criteria for every feature +- Data models and API shapes + +### Layer 2: The Plan (created by agent, updated each iteration) +- Task decomposition with checkboxes +- Dependencies between tasks +- Current status + +--- + +## The Three Approaches Compared + +### Ezward's Approach (vibe-basic) +**Style:** Single sequential PRD — numbered steps, each building on the last. + +**Strengths:** +- Very explicit about what to build in what order +- Each step includes "add unit tests" and "make sure it compiles" +- The "Generally" section at the end sets cross-cutting standards +- Language spec provided as a separate reference file + +**Best for:** Well-understood problems where you know the implementation order. + +**Key pattern:** The PRD *is* the implementation plan. Steps 1-14, do them in order. + +### Ralph Wiggum Loop +**Style:** Spec + Plan separation. Agent creates its own plan from the spec. + +**Strengths:** +- Fresh context each iteration (no context window overflow) +- Agent decomposes tasks itself (may find better ordering) +- Git history is the "memory" between iterations +- Simple bash loop — no complex orchestration + +**Best for:** Larger projects where you want the agent to figure out task ordering. + +**Key pattern:** `while :; do cat PROMPT.md | claude -p; done` + +### Nate Jones / Task Decomposition +**Style:** Decompose → parallelize → verify → iterate. + +**Strengths:** +- Multiple agents can work on different tasks simultaneously +- Verification step catches integration issues +- Iteration handles failures gracefully + +**Best for:** Large projects with independent components that can be parallelized. + +**Key pattern:** Orchestrator agent spawns worker agents for each task. + +--- + +## Example: Personal Finance App (Fintrove-style) + +Here's what a complete spec would look like for a Fintrove-like personal finance application. This is the document you'd give to a team of agents. + +### PROJECT-SPEC.md + +```markdown +# Project Specification: FinPlan — Personal Finance Dashboard + +## 1. Project Overview + +### What are we building? +A privacy-first personal finance dashboard that helps a retiree manage +their money. It imports transaction data, categorizes spending, projects +retirement income against expenses, and runs Monte Carlo simulations to +stress-test withdrawal strategies. + +### Why does it matter? +Existing tools (Mint, YNAB) are cloud-based and sell your data. Quicken +is stagnating. We want a local-first tool that's actually useful for +retirement planning with Canadian tax rules (RRSP meltdown, CPP/OAS +optimization, pension integration). + +### Success criteria +- [ ] Import Quicken QFX/CSV exports and categorize transactions +- [ ] Dashboard shows monthly spending by category (current month + trends) +- [ ] Retirement projection shows income vs expenses for 30 years +- [ ] Monte Carlo simulation with 1000+ runs using historical market data +- [ ] All data stays local (SQLite, no cloud) +- [ ] Runs in browser via local server + +## 2. 
Technical Foundation + +### Tech stack +- **Language:** TypeScript (Node.js backend, browser frontend) +- **Framework:** Express.js (API), vanilla HTML/CSS/JS (frontend) +- **Database:** SQLite via better-sqlite3 +- **Build system:** esbuild for frontend bundling +- **Test framework:** Node.js built-in test runner +- **Package manager:** npm + +### Project structure +project/ +├── packages/ +│ ├── server/ # Express API + SQLite +│ ├── client/ # Browser frontend +│ └── shared/ # Types, constants, utils +├── data/ # Sample data for testing +├── docs/ # Design docs +├── PROJECT-SPEC.md +├── IMPLEMENTATION_PLAN.md +└── AGENT.md + +### Build & test commands +npm install +npm run build +npm test +npm run lint + +### Coding standards +- TypeScript strict mode +- No `any` types except in test fixtures +- All public functions documented with JSDoc +- Error messages must be user-friendly (no stack traces in UI) +- SQL queries use parameterized statements (no string concatenation) + +## 3. Requirements + +### FR-001: Transaction Import +**Description:** Import financial transactions from QFX (OFX) and CSV files. +**Acceptance criteria:** +- [ ] Parse QFX files and extract: date, amount, payee, memo, type +- [ ] Parse CSV files with configurable column mapping +- [ ] Deduplicate transactions by date + amount + payee +- [ ] Store in SQLite with account association +- [ ] CLI command: `npm run import -- --file data/transactions.qfx` + +### FR-002: Auto-Categorization +**Description:** Automatically categorize transactions based on payee patterns. +**Acceptance criteria:** +- [ ] Rule-based matching: payee contains "COSTCO" → Groceries +- [ ] Rules stored in SQLite, editable via API +- [ ] Uncategorized transactions flagged for manual review +- [ ] Bulk categorization: apply rule retroactively to past transactions +- [ ] At least 20 default rules for common Canadian merchants + +### FR-003: Spending Dashboard +**Description:** Web dashboard showing spending breakdown and trends. +**Acceptance criteria:** +- [ ] Monthly spending by category (bar chart) +- [ ] 12-month trend line per category +- [ ] Total income vs total expenses per month +- [ ] Filter by date range and account +- [ ] Loads in < 500ms for 10,000 transactions + +### FR-004: Retirement Projection +**Description:** Project income and expenses over a 30-year retirement. +**Acceptance criteria:** +- [ ] Input: current age, retirement age, life expectancy +- [ ] Income sources: pension (fixed), CPP (age-dependent), OAS (age-dependent) +- [ ] RRSP meltdown strategy: withdraw X/year for Y years before age 65 +- [ ] Inflation adjustment (configurable rate, default 2.5%) +- [ ] Output: year-by-year table of income, expenses, portfolio balance + +### FR-005: Monte Carlo Simulation +**Description:** Stress-test retirement plan against historical market returns. 
+**Acceptance criteria:** +- [ ] Use S&P 500 historical annual returns (1928-present) +- [ ] Run 1,000+ simulations with random return sequences +- [ ] Output: success rate (% of runs where money lasts) +- [ ] Visualization: fan chart showing percentile bands +- [ ] Compare strategies: 4% rule vs dynamic withdrawal + +### NFR-001: Privacy +- [ ] All data stored locally in SQLite +- [ ] No network requests except to localhost +- [ ] No analytics, telemetry, or tracking + +### NFR-002: Performance +- [ ] Dashboard loads in < 1 second +- [ ] Monte Carlo (1000 runs) completes in < 5 seconds +- [ ] Import 10,000 transactions in < 10 seconds + +### NFR-003: Testing +- [ ] 80%+ code coverage +- [ ] Integration tests for API endpoints +- [ ] Unit tests for calculation functions +- [ ] Sample data fixtures for reproducible tests + +## 4. Data Model + +### Entities + +Entity: Account + - id: INTEGER (primary key, auto-increment) + - name: TEXT (required, e.g. "RRSP", "TFSA", "Chequing") + - type: TEXT (checking | savings | investment | credit) + - institution: TEXT (optional) + +Entity: Transaction + - id: INTEGER (primary key, auto-increment) + - account_id: INTEGER (foreign key → Account) + - date: TEXT (ISO 8601 date) + - amount: REAL (positive = income, negative = expense) + - payee: TEXT + - memo: TEXT (optional) + - category_id: INTEGER (foreign key → Category, nullable) + - import_hash: TEXT (unique, for deduplication) + +Entity: Category + - id: INTEGER (primary key, auto-increment) + - name: TEXT (unique, e.g. "Groceries", "Utilities") + - type: TEXT (expense | income | transfer) + - budget: REAL (optional monthly budget) + +Entity: CategoryRule + - id: INTEGER (primary key, auto-increment) + - pattern: TEXT (substring match on payee) + - category_id: INTEGER (foreign key → Category) + - priority: INTEGER (higher = matched first) + +Entity: RetirementProfile + - id: INTEGER (primary key, auto-increment) + - name: TEXT + - current_age: INTEGER + - retirement_age: INTEGER + - life_expectancy: INTEGER + - annual_expenses: REAL + - cpp_start_age: INTEGER (default 70) + - oas_start_age: INTEGER (default 70) + - pension_annual: REAL + - rrsp_balance: REAL + - tfsa_balance: REAL + - non_reg_balance: REAL + +## 5. API Design + +### REST Endpoints + +GET /api/accounts +POST /api/accounts +GET /api/transactions?from=&to=&account=&category= +POST /api/import (multipart file upload) +GET /api/categories +POST /api/categories +GET /api/categories/rules +POST /api/categories/rules +GET /api/spending/monthly?from=&to= +GET /api/spending/trends?months=12 +GET /api/retirement/projection/:profileId +POST /api/retirement/monte-carlo/:profileId + +## 6. Architecture Decisions + +### Constraints +- MUST: Use SQLite (no PostgreSQL, no cloud DB) +- MUST: Run entirely on localhost +- MUST: Work offline +- MUST NOT: Make any external network requests +- MUST NOT: Use React/Vue/Angular (vanilla JS + HTML templates) +- PREFER: Native ES modules over bundling where possible + +### Known Challenges +- QFX/OFX parsing is XML-based with quirky formatting +- Canadian CPP/OAS calculations have complex age-dependent rules +- Monte Carlo needs to be fast — consider Web Workers for UI + +## 7. 
Phasing
+
+### Phase 1: Data Foundation (Tasks 1-5)
+- [ ] Project scaffolding (monorepo, build, test)
+- [ ] SQLite schema + migrations
+- [ ] QFX/CSV import
+- [ ] Category rules engine
+- [ ] REST API for CRUD
+
+### Phase 2: Dashboard (Tasks 6-8)
+- [ ] Spending by category (API + chart)
+- [ ] Trend lines
+- [ ] Date/account filters
+
+### Phase 3: Retirement Engine (Tasks 9-12)
+- [ ] Income projection calculator
+- [ ] RRSP meltdown logic
+- [ ] CPP/OAS optimization
+- [ ] Monte Carlo simulation
+
+### Phase 4: Polish (Tasks 13-15)
+- [ ] Error handling + user messages
+- [ ] Performance optimization
+- [ ] Documentation
+
+## 8. Reference Materials
+
+### External docs
+- QFX/OFX spec: https://www.ofx.net/
+- CPP benefits: https://www.canada.ca/en/services/benefits/publicpensions/cpp.html
+- OAS benefits: https://www.canada.ca/en/services/benefits/publicpensions/old-age-security.html
+- S&P 500 historical returns: included in data/sp500-returns.csv
+
+### Anti-patterns
+- Don't use localStorage for data — SQLite is the source of truth
+- Don't try to parse bank-specific CSV formats — use configurable column mapping
+- Don't calculate CPP/OAS inline — extract to a dedicated module with unit tests
+```
+
+---
+
+## What Makes a Good Spec?
+
+Looking at what works across Ezward's PRD, Ralph Wiggum, and Nate Jones:
+
+### 1. Be Specific About Acceptance Criteria
+Bad: "Import transactions"
+Good: "Parse QFX files and extract: date, amount, payee, memo, type. Deduplicate by date + amount + payee. Store in SQLite."
+
+### 2. Define the Tech Stack — Don't Let the Agent Choose
+Bad: "Use a modern framework"
+Good: "TypeScript, Express.js, SQLite via better-sqlite3, vanilla HTML/CSS/JS frontend"
+
+### 3. Include Data Models
+Agents that know the data model write better code. Define entities, relationships, and constraints explicitly.
+
+### 4. Provide Build/Test Commands
+The agent needs to verify its own work. If it can't run `npm test`, it can't iterate.
+
+### 5. List Anti-Patterns
+Tell the agent what NOT to do. This prevents it from going down rabbit holes you've already explored.
+
+### 6. Phase the Work
+Large projects need phases. Each phase should be independently deployable. The agent can complete Phase 1 before touching Phase 2.
+
+### 7. Include Sample Data
+Agents test better when they have example inputs and expected outputs.
+
+---
+
+## Running with OpenClaw
+
+You can use OpenClaw's `sessions_spawn` to run the Ralph Wiggum pattern:
+
+```bash
+# Planning phase
+sessions_spawn --task "Read PROJECT-SPEC.md in /path/to/project.
+  Decompose into tasks. Write IMPLEMENTATION_PLAN.md." \
+  --model opus
+
+# Build iterations (spawn one at a time, or use cron)
+sessions_spawn --task "Read AGENT.md in /path/to/project.
+  Follow the core loop. Pick ONE task, implement, test, commit." \
+  --model sonnet
+```
+
+Or use the bash loop directly with Claude Code:
+```bash
+cd /path/to/project
+./ralph-loop.sh --agent claude --max 30
+```

diff --git a/EXECUTION-BOARD-TEMPLATE.md b/EXECUTION-BOARD-TEMPLATE.md
new file mode 100644
index 0000000..93ee858
--- /dev/null
+++ b/EXECUTION-BOARD-TEMPLATE.md
@@ -0,0 +1,135 @@
+# Execution Board Template
+
+> **The execution board is the contract for a stream.**
+> Copy this file into `.harness/<stream>/execution-board.md`.
+> **The entire board must be written BEFORE any code is committed.**
+> Plan-then-implement is non-negotiable.
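+
+One way to make that rule mechanical is a small guard run before implementation starts (hypothetical — the harness doesn't ship one; the path convention comes from this template):
+
+```typescript
+// Hypothetical guard: refuse to start implementation until the stream's
+// execution board exists and defines at least one packet.
+import { existsSync, readFileSync } from 'node:fs';
+
+const stream = process.argv[2]; // e.g. "wave2-projections"
+const board = `.harness/${stream}/execution-board.md`;
+
+if (!existsSync(board)) {
+  console.error(`No execution board at ${board} — write the plan first.`);
+  process.exit(1);
+}
+if (!/### Packet/.test(readFileSync(board, 'utf-8'))) {
+  console.error('Board defines no packets — plan before implementing.');
+  process.exit(1);
+}
+console.log('Board present — implementation may begin.');
+```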
+ +--- + +# Execution Board — [STREAM NAME] +**Feature:** [One-line description of what this stream builds] +**Created:** YYYY-MM-DD +**Branch:** `feat/<stream-name>` +**IMPLEMENTATION_PLAN tasks:** [e.g., 5–8] +**Status:** 🔴 Not started | 🟡 Planning | 🟠 In progress | ✅ Complete +**Design reference:** [path to design doc, or N/A] + +--- + +## 🎯 Goal + +[2–4 sentences. What does this stream accomplish? What user-facing outcome does it produce? +How does it fit the larger product vision?] + +--- + +## ⚠️ Dependencies + +[Other streams or tasks that must be complete before this one can start. +If none: "None — can start immediately."] + +--- + +## 📦 Packets + + + +### Packet [XX-01] — [Name] +**IMPLEMENTATION_PLAN task:** [N] +**Status:** ⬜ Not started | 🔄 In progress | ✅ Done +**Est. effort:** [N sessions] +**Depends on:** [XX-00 or "none"] + +**Goal:** [One sentence] + +**Steps:** +1. [Concrete step] +2. [...] + +**Files created/modified:** +- `src/...` — [description] +- `src/.../__tests__/...` — [test file] + +**Known-answer tests (mandatory for calculation modules):** +``` +test('[what is being verified]', () => { + // Source: [official reference — URL, standard, specification] + expect(fn(input)).toBeCloseTo(expected, precision); +}); +``` + +**Acceptance criteria:** +- [ ] [Programmatically verifiable criterion] +- [ ] Known-answer test passes +- [ ] Full test suite green (count ≥ baseline) +- [ ] TypeScript: clean (`npx tsc --noEmit` outputs nothing) + +**Validation evidence:** `.harness/<stream>/validation/<packet>-validation.md` + +--- + +### Packet [XX-02] — [Name] +[Repeat above structure for each packet] + +--- + +## 🔢 Dependency Order + +``` +[XX-01] → [XX-02] → [XX-04 (UI/integration)] +[XX-01] → [XX-03] → [XX-04 (UI/integration)] +``` + +[Which packets can run in parallel? Which must be sequential?] + +--- + +## 🏁 Stream Completion Criteria + +- [ ] All packets complete with validation evidence written +- [ ] All known-answer tests pass (list them here explicitly) +- [ ] Full test suite green +- [ ] TypeScript: clean +- [ ] Regression baseline saved: `.harness/regression-baselines/<stream>-baseline.json` +- [ ] Branch merged to main via `--no-ff` merge commit +- [ ] Process eval written: `.harness/<stream>/process-eval.md` +- [ ] IMPLEMENTATION_PLAN tasks marked `[x]` +- [ ] EXECUTION_MASTER.md (or project equivalent) updated + +--- + +## 📋 Mandatory Commit Trailer Format + +Every implementation commit in this stream: + +``` +feat(<scope>): <summary> + +Agent: <model-name> +Tests: <total passing> +Tests-Added: <count> +TypeScript: clean +``` + +--- + +## 🔍 Pre-Coding Checklist + +Before writing any implementation code: + +- [ ] This execution board is fully written (all packets defined) +- [ ] Branch created from latest main +- [ ] Baseline test count verified +- [ ] No open schema migrations from other active streams (if relevant) +- [ ] Design reference doc has been read diff --git a/OPENCLAW-INTEGRATION.md b/OPENCLAW-INTEGRATION.md new file mode 100644 index 0000000..9ea4273 --- /dev/null +++ b/OPENCLAW-INTEGRATION.md @@ -0,0 +1,321 @@ +# OpenClaw Integration — Running the Harness in OpenClaw + +> The ralph-loop.sh runs locally via CLI tools (claude, codex). +> OpenClaw provides a richer runtime: sub-agents, cron scheduling, cross-session +> messaging, and persistent workspace. This guide shows how to use them.
+ +--- + +## Architecture: CLI Loop vs OpenClaw + +### CLI Approach (ralph-loop.sh) +``` +bash loop → spawn CLI agent → agent reads spec → works → exits → loop restarts +``` +- Runs on your terminal +- You watch the output in real-time +- Agent uses CLI tool (claude -p, codex) +- Loop manages restarts + +### OpenClaw Approach +``` +main session → sessions_spawn → sub-agent reads spec → works → exits → result delivered +``` +- Runs in the background +- Results delivered to your chat (Telegram, Discord, etc.) +- Sub-agent gets full workspace access +- You orchestrate via main session or cron jobs + +### When to Use Which + +| Scenario | Use | +|----------|-----| +| Sitting at your terminal, watching progress | ralph-loop.sh | +| Overnight autonomous work | OpenClaw sessions_spawn | +| Scheduled recurring tasks | OpenClaw cron | +| Quick one-off agent tasks | OpenClaw sessions_spawn | +| Need to monitor from your phone | OpenClaw (results come to Telegram) | + +--- + +## Method 1: Manual Orchestration (Main Session) + +You drive each iteration from your chat with your agent (Cleo, etc.): + +``` +You: "Run the next iteration of the agent harness on /path/to/project" +Cleo: *spawns sub-agent* → *sub-agent does one task* → *reports back* +You: "Looks good, run the next one" +Cleo: *spawns another sub-agent* → *reports back* +``` + +**Pros:** Full control, you review between iterations +**Cons:** Requires your attention, not autonomous + +### How Your Agent Orchestrates + +Your main agent (e.g., Cleo) uses `sessions_spawn` internally: + +``` +sessions_spawn( + task: "Read AGENT.md in /home/user/project/. Follow the core loop: + orient, pick one task from IMPLEMENTATION_PLAN.md, implement, + verify (build + test), commit, exit. Report what you completed.", + model: "sonnet", + timeoutSeconds: 600 +) +``` + +The sub-agent: +1. Reads AGENT.md and PROJECT-SPEC.md +2. Reads IMPLEMENTATION_PLAN.md +3. Picks the first unchecked task +4. Implements and tests it +5. Commits +6. Updates the plan +7. Reports back to your main session + +--- + +## Method 2: Cron-Based Automation + +For true overnight autonomy, use OpenClaw cron jobs: + +### The Iteration Job + +```json +{ + "name": "agent-harness-iteration", + "schedule": { "kind": "every", "everyMs": 900000 }, + "sessionTarget": "isolated", + "payload": { + "kind": "agentTurn", + "message": "Read AGENT.md in /home/paulh/.openclaw/workspace/projects/my-project/. Follow the core loop: orient → pick ONE task → implement → verify → commit → update plan → exit. If all tasks are done, say DONE. If stuck, say STUCK with details.", + "model": "sonnet", + "timeoutSeconds": 600 + }, + "delivery": { "mode": "announce" } +} +``` + +This spawns a fresh agent every 15 minutes. Each one picks the next task, does it, and reports to your chat. + +### The Completion Watcher + +Add logic to stop the loop when done: + +```json +{ + "name": "agent-harness-completion-check", + "schedule": { "kind": "every", "everyMs": 3600000 }, + "sessionTarget": "isolated", + "payload": { + "kind": "agentTurn", + "message": "Read IMPLEMENTATION_PLAN.md in /home/paulh/.openclaw/workspace/projects/my-project/. Count completed vs total tasks. If all tasks are checked, disable the cron job named 'agent-harness-iteration' and announce 'All tasks complete!' 
If less than 80% done and no progress in the last 3 git commits, announce a warning.", + "model": "sonnet", + "timeoutSeconds": 120 + }, + "delivery": { "mode": "announce" } +} +``` + +### Starting and Stopping + +``` +You: "Start the agent harness loop for my-project" +Cleo: *creates cron job* → iterations begin every 15 min + +You: "Pause the harness" +Cleo: *disables cron job* + +You: "Resume" +Cleo: *re-enables cron job* +``` + +--- + +## Method 3: Shell Script Orchestration + +Create a script in your workspace that uses OpenClaw's CLI: + +```bash +#!/usr/bin/env bash +# openclaw-harness.sh — Run agent harness via OpenClaw sub-agents +# +# Usage: +#   ./openclaw-harness.sh /path/to/project [max-iterations] + +set -euo pipefail + +PROJECT_DIR="${1:?Usage: $0 <project-dir> [max-iterations]}" +MAX="${2:-20}" +PLAN="$PROJECT_DIR/IMPLEMENTATION_PLAN.md" + +echo "🔄 Starting OpenClaw agent harness" +echo "   Project: $PROJECT_DIR" +echo "   Max iterations: $MAX" + +for i in $(seq 1 "$MAX"); do +  echo "" +  echo "━━━ Iteration $i/$MAX ━━━" + +  # Check if plan exists and all tasks are done +  if [[ -f "$PLAN" ]] && ! grep -q '^\- \[ \]' "$PLAN" 2>/dev/null; then +    echo "✅ All tasks complete!" +    exit 0 +  fi + +  # Spawn sub-agent via OpenClaw +  # Note: This is conceptual — actual invocation depends on OpenClaw CLI +  openclaw sessions spawn \ +    --task "Read AGENT.md in $PROJECT_DIR. Follow the core loop: orient, pick ONE task, implement, verify, commit, exit." \ +    --model sonnet \ +    --timeout 600 + +  echo "   Iteration $i complete. Waiting 30s..." +  sleep 30 +done + +echo "⚠️ Reached max iterations ($MAX)" +``` + +**Note:** The exact `openclaw` CLI syntax may vary. Check `openclaw help` for current commands. The `sessions_spawn` tool within an agent session is the most reliable method. + +--- + +## Monitoring Sub-Agent Work + +### Real-Time: Session List +``` +You: "What sub-agents are running?" +Cleo: *calls sessions_list* → shows active sessions +``` + +### After Completion: Session History +``` +You: "What did the last sub-agent do?" +Cleo: *calls sessions_history(sessionKey)* → shows full transcript +``` + +The transcript includes: +- Every tool call the agent made (file reads, writes, exec commands) +- The agent's reasoning +- Build/test output +- Git commits made +- Final summary + +### Git Log: The Source of Truth +```bash +cd /path/to/project +git log --oneline -10 +``` + +Each iteration should produce exactly one commit.
If you see: +- **0 commits** after an iteration → agent was stuck +- **1 commit** → working correctly +- **2+ commits** → task was large or agent fixed its own mistakes + +### Plan File: Progress at a Glance +```bash +# Count completed vs total +grep -c '^\- \[x\]' IMPLEMENTATION_PLAN.md  # Done +grep -c '^\- \[ \]' IMPLEMENTATION_PLAN.md  # Remaining +``` + +--- + +## Model Selection Strategy + +Different tasks need different models: + +| Task Type | Recommended Model | Why | +|-----------|------------------|-----| +| Planning (decomposition) | opus | Needs strong reasoning for task ordering | +| Scaffolding (config, structure) | sonnet | Simple, deterministic work | +| Feature implementation | sonnet | Good balance of speed and quality | +| Complex algorithms | opus | Needs deeper reasoning | +| Bug fixing | sonnet or opus | Depends on bug complexity | +| Documentation | sonnet | Writing, not complex reasoning | + +### Cost Optimization + +``` +Planning:  1 iteration × opus   = $$ (one-time cost) +Building:  15 iterations × sonnet = $ per iteration +Complex:   3 iterations × opus  = $$ per iteration +Total:     19 iterations = $$$ (manageable) +``` + +vs. + +``` +Everything: 19 iterations × opus = $$$$$$ (expensive!) +``` + +Use the `model` parameter in sessions_spawn to control this per-iteration. + +--- + +## OpenClaw-Specific Agent Instructions + +When running in OpenClaw, add these to your AGENT.md: + +```markdown +## OpenClaw Environment Notes + +### File Access +- Working directory: /home/paulh/.openclaw/workspace/projects/[project] +- Use absolute paths in all exec commands +- The `~` shorthand does NOT work in exec — use full paths + +### Build Commands +- Use `cd /absolute/path && command` pattern (cwd parameter is unreliable) +- Example: `cd /home/paulh/.openclaw/workspace/projects/my-project && npm test` + +### Git +- Commit after each completed task +- Use descriptive commit messages: `feat: implement QFX parser with field extraction` +- Don't push — let the human decide when to push + +### Communication +- Your output is delivered to the human's chat (Telegram/Discord) +- Keep summaries brief — one paragraph max +- Include: what task you completed, what test results look like, what's next + +### Signals +- If all tasks done: include "ALL_TASKS_COMPLETE" in your response +- If stuck: include "STUCK:" followed by a description of the blocker +- If error: include "ERROR:" followed by the error details +``` + +--- + +## Troubleshooting OpenClaw Integration + +### Sub-agent can't find files +- Verify the project path is absolute +- Check that the workspace is accessible (not in a different user's home) + +### Sub-agent uses wrong model +- Specify `model` explicitly in sessions_spawn +- Check session_status to verify which model was used +- Known issue: isolated sessions may not respect model parameter (use main session sub-agents instead) + +### Sub-agent times out +- Default timeout may be too short for complex tasks +- Set `timeoutSeconds: 600` (10 minutes) for implementation tasks +- Set `timeoutSeconds: 900` (15 minutes) for complex tasks +- Set `timeoutSeconds: 120` (2 minutes) for simple checks + +### Cron job fires but nothing happens +- Check cron job status: `cron list` +- Review recent runs: `cron runs --jobId <job-id>` +- Verify the project path in the cron job message + +### Results not delivered to chat +- Check `delivery.mode` is set to `"announce"` in cron job config +- For sessions_spawn from main session, results auto-deliver + +--- + +_OpenClaw turns the agent harness from a
terminal-bound tool into an always-available autonomous workforce._ diff --git a/PARALLEL-AGENTS.md b/PARALLEL-AGENTS.md new file mode 100644 index 0000000..6fe6c19 --- /dev/null +++ b/PARALLEL-AGENTS.md @@ -0,0 +1,718 @@ +# Parallel Agents — Running Multiple Agents Simultaneously + +> The agent harness defaults to sequential work: one task, one iteration, one commit. +> But many projects have independent modules that can be built in parallel. +> This guide teaches you when to parallelize, how to coordinate, and how to merge results. + +--- + +## When to Parallelize + +### Good Candidates for Parallel Work + +✅ **Independent packages in a monorepo** +``` +packages/ +├── auth/ ← Agent A +├── payment/ ← Agent B +├── notification/ ← Agent C +└── shared/ ← Build first, then parallelize +``` + +Each package has its own spec, plan, and tests. No shared mutable state. + +✅ **Separate features with no shared code** +``` +# Agent A +- [ ] Import QFX files +- [ ] Parse QFX format +- [ ] Test QFX parser + +# Agent B +- [ ] Import CSV files +- [ ] Parse CSV with column mapping +- [ ] Test CSV parser +``` + +Both implement file import, but use different parsers. No conflicts. + +✅ **Documentation and code in parallel** +``` +# Agent A: Builds features +- [ ] Implement transaction API + +# Agent B: Writes docs +- [ ] Document transaction API endpoints +- [ ] Add usage examples +- [ ] Write integration guide +``` + +Code and docs touch different files. + +✅ **Testing different components** +``` +# Agent A: Unit tests +- [ ] Test parser functions + +# Agent B: Integration tests +- [ ] Test API endpoints +``` + +Different test suites, no overlap. + +--- + +### Bad Candidates for Parallel Work + +❌ **Dependent tasks** +``` +# These MUST be sequential +- [ ] Create database schema ← Must finish first +- [ ] Implement data access layer ← Depends on schema +- [ ] Build REST API ← Depends on data layer +``` + +Can't parallelize a dependency chain. + +❌ **Shared mutable files** +``` +# Both agents editing package.json simultaneously +Agent A: Adding dependency "express" +Agent B: Adding dependency "commander" +→ Merge conflict in package.json +``` + +Single shared configuration file = bottleneck. + +❌ **Overlapping code areas** +``` +# Both working on the same module +Agent A: Refactoring src/parser.ts +Agent B: Adding feature to src/parser.ts +→ Guaranteed conflict +``` + +--- + +## Parallelization Strategies + +### Strategy 1: Independent Sub-Projects + +**Pattern:** Divide your project into complete, independent sub-projects. Each gets its own agent. + +**Example: Microservices** +``` +services/ +├── auth-service/ +│ ├── PROJECT-SPEC.md +│ ├── IMPLEMENTATION_PLAN.md +│ └── src/ +├── api-gateway/ +│ ├── PROJECT-SPEC.md +│ ├── IMPLEMENTATION_PLAN.md +│ └── src/ +└── notification-service/ + ├── PROJECT-SPEC.md + ├── IMPLEMENTATION_PLAN.md + └── src/ +``` + +**Orchestration:** +```bash +# Spawn three agents, each with their own directory +sessions_spawn "Read AGENT.md in /path/to/auth-service. Build it." & +sessions_spawn "Read AGENT.md in /path/to/api-gateway. Build it." & +sessions_spawn "Read AGENT.md in /path/to/notification-service. Build it." & +wait +``` + +**Coordination:** None needed — they don't interact. + +**Merge:** Each commits to its own directory. No conflicts. + +--- + +### Strategy 2: Feature-Parallel + +**Pattern:** Same codebase, different features. Split the plan into independent feature sets. 
+ +**Example: CLI Tool with Multiple Commands** +``` +# Agent A: PLAN-A.md +- [ ] Implement `auth` command +- [ ] Test `auth` command + +# Agent B: PLAN-B.md +- [ ] Implement `list` command +- [ ] Test `list` command + +# Agent C: PLAN-C.md +- [ ] Implement `upload` command +- [ ] Test `upload` command +``` + +**Orchestration:** +``` +# Each agent reads the same AGENT.md but different plans +sessions_spawn "Read AGENT.md, use PLAN-A.md as implementation plan" +sessions_spawn "Read AGENT.md, use PLAN-B.md as implementation plan" +sessions_spawn "Read AGENT.md, use PLAN-C.md as implementation plan" +``` + +**Coordination:** Each feature is isolated in its own source file: +``` +src/ +├── commands/ +│ ├── auth.ts ← Agent A +│ ├── list.ts ← Agent B +│ └── upload.ts ← Agent C +├── cli.ts ← Shared, agents don't touch +``` + +**Merge:** Standard git merge. Conflicts unlikely if directories are separate. + +--- + +### Strategy 3: Test-Parallel + +**Pattern:** One agent builds, others write tests. + +**Example:** +``` +# Agent A (builder): Implements features sequentially +- [ ] Implement parser +- [ ] Implement checker +- [ ] Implement reporter + +# Agent B (tester): Writes tests as features complete +- [ ] Write parser tests (wait for Agent A to commit parser) +- [ ] Write checker tests (wait for Agent A to commit checker) +- [ ] Write reporter tests (wait for Agent A to commit reporter) +``` + +**Orchestration:** +```python +# Pseudo-code for coordinated spawning +agent_a = spawn_agent(plan="PLAN-BUILD.md") +wait_for_commit(agent_a, pattern="feat: implement parser") + +agent_b = spawn_agent(plan="PLAN-TEST.md", task="Write parser tests") +# Agent B reads the code Agent A wrote, writes tests +``` + +**Coordination:** Builder commits first, tester follows immediately. + +**Merge:** Sequential commits, no conflicts. + +--- + +### Strategy 4: Layer-Parallel + +**Pattern:** Assign agents to different architectural layers. + +**Example:** +``` +# Agent A: Data layer +- [ ] Database schema +- [ ] Migration scripts +- [ ] Data access functions + +# Agent B: Business logic layer +- [ ] (Waits for Agent A to finish schema) +- [ ] Business logic using data layer +- [ ] Validation and rules + +# Agent C: API layer +- [ ] (Waits for Agent B to finish business logic) +- [ ] REST endpoints +- [ ] Request/response handling +``` + +**This is mostly sequential with brief parallel windows.** + +**Orchestration:** +```bash +# Phase 1: Only Agent A +agent_a_session=$(sessions_spawn "Build data layer") +wait_until_done $agent_a_session + +# Phase 2: Agent B (uses Agent A's output) +agent_b_session=$(sessions_spawn "Build business logic") +wait_until_done $agent_b_session + +# Phase 3: Agent C (uses Agent B's output) +agent_c_session=$(sessions_spawn "Build API layer") +``` + +**Coordination:** Explicit phases with dependencies. + +**Merge:** Sequential, each agent builds on the previous layer. + +--- + +## OpenClaw Patterns for Parallel Work + +### Pattern A: Manual Spawn with Tracking + +```markdown +You: "Spawn three agents to work on auth, payment, and notifications packages." + +Cleo: *spawns 3 sub-agents* + - Agent alpha-1 → auth package + - Agent alpha-2 → payment package + - Agent alpha-3 → notifications package + +You: "What's the status of those agents?" + +Cleo: *checks sessions_list* + - alpha-1: Completed 3/5 tasks (auth) + - alpha-2: Completed 4/5 tasks (payment) + - alpha-3: Stuck on task 2 (notifications) — reported STUCK + +You: "What's alpha-3 stuck on?" 
+ +Cleo: *reads session transcript* + "Can't connect to notification service API. Missing NOTIFICATION_KEY in .env" + +You: "Add NOTIFICATION_KEY to .env, then resume alpha-3." + +Cleo: *updates .env* → *spawns new agent for notifications* → completes +``` + +**Pros:** Full visibility and control +**Cons:** You're the orchestrator (manual coordination) + +--- + +### Pattern B: Cron-Based Parallel Iterations + +```json +{ + "jobs": [ + { + "name": "agent-auth-iteration", + "schedule": { "kind": "every", "everyMs": 900000 }, + "payload": { + "kind": "agentTurn", + "message": "Read AGENT.md in /path/to/auth/. Follow the loop.", + "model": "sonnet" + } + }, + { + "name": "agent-payment-iteration", + "schedule": { "kind": "every", "everyMs": 900000 }, + "payload": { + "kind": "agentTurn", + "message": "Read AGENT.md in /path/to/payment/. Follow the loop.", + "model": "sonnet" + } + }, + { + "name": "agent-notification-iteration", + "schedule": { "kind": "every", "everyMs": 900000 }, + "payload": { + "kind": "agentTurn", + "message": "Read AGENT.md in /path/to/notification/. Follow the loop.", + "model": "sonnet" + } + } + ] +} +``` + +**Effect:** Three agents iterate every 15 minutes, independently, in parallel. + +**Pros:** Fully autonomous +**Cons:** Merge conflicts require human intervention + +--- + +### Pattern C: Coordinated Phases with Gates + +```markdown +You: "Build the project in three phases. Phase 1: data layer. + Phase 2: business logic (3 agents in parallel). + Phase 3: API layer." + +Cleo orchestrates: + Phase 1: Sequential agent for data layer + → Waits for DONE signal + Phase 2: Spawns 3 agents for business logic modules + → Waits for all 3 to finish + Phase 3: Sequential agent for API layer + → Waits for DONE signal + +Cleo: "All phases complete. Review?" +``` + +**Pros:** Maximizes parallelism while respecting dependencies +**Cons:** Requires sophisticated orchestration logic + +--- + +## Conflict Resolution + +### Preventing Conflicts + +**1. Namespace isolation** +``` +# Bad (shared file) +src/utils.ts ← Multiple agents editing this = conflict + +# Good (separate namespaces) +src/auth/utils.ts ← Agent A +src/payment/utils.ts ← Agent B +src/notification/utils.ts ← Agent C +``` + +**2. Explicit ownership in the plan** +```markdown +# PLAN-A.md (Agent A owns these) +- [ ] src/commands/auth.ts +- [ ] tests/auth.test.ts + +# PLAN-B.md (Agent B owns these) +- [ ] src/commands/list.ts +- [ ] tests/list.test.ts +``` + +**3. 
Shared files = sequential phases** +```markdown +# Phase 1: One agent sets up shared infrastructure +- [ ] package.json with all dependencies +- [ ] tsconfig.json +- [ ] .eslintrc + +# Phase 2: Multiple agents build features (don't touch shared files) +``` + +--- + +### Handling Conflicts When They Happen + +**Detection:** +```bash +# Agent A commits first +git log --oneline +# a1b2c3d feat: implement auth command + +# Agent B tries to commit +git pull +# CONFLICT (content): Merge conflict in src/cli.ts +``` + +**Resolution strategies:** + +**Strategy 1: Last-In Rebases** +```bash +# Agent B's iteration pauses, human resolves conflict +git checkout agent-b-branch +git rebase main +# Resolve conflicts in src/cli.ts +git add src/cli.ts +git rebase --continue +git push --force + +# Resume Agent B +``` + +**Strategy 2: Accept Both Changes** +```bash +# Both agents added a command to src/cli.ts +# Conflict: +<<<<<<< HEAD +program.command('auth') +======= +program.command('list') +>>>>>>> agent-b-branch + +# Resolution: Both are valid, keep both +program.command('auth') +program.command('list') +``` + +**Strategy 3: Abort and Retry Sequentially** +```bash +# If conflict is complex, give up on parallelism +git merge --abort + +# Run Agent B AFTER Agent A finishes +# Sequential is slower but conflict-free +``` + +--- + +## Merging Results + +### Sequential Integration + +**Pattern:** Each agent commits to its branch, you merge one at a time. + +```bash +# Agent A finishes (auth feature) +git checkout -b agent-a +# ... Agent A's commits ... +git checkout main +git merge agent-a # ✅ Clean merge + +# Agent B finishes (payment feature) +git checkout -b agent-b +# ... Agent B's commits ... +git checkout main +git merge agent-b # ✅ Clean merge (no overlap with agent-a) + +# Agent C finishes (notification feature) +git checkout -b agent-c +# ... Agent C's commits ... +git checkout main +git merge agent-c # ✅ Clean merge +``` + +**When it works:** Features are truly independent. + +--- + +### Simultaneous Integration with Review + +**Pattern:** All agents finish, then you review and integrate. + +```bash +# All three agents worked overnight +# Morning review: + +git log --oneline agent-a +# 5 commits, auth feature complete + +git log --oneline agent-b +# 7 commits, payment feature complete + +git log --oneline agent-c +# 4 commits, notification feature complete + +# Integration strategy: +git checkout main +git merge agent-a # Merge first +npm test # ✅ All tests pass + +git merge agent-b # Merge second +npm test # ✅ All tests pass + +git merge agent-c # Merge third +npm test # ❌ Test failure! Notification imports auth, but different version? + +# Debug: +git show agent-c:src/notification/handler.ts +# Agent C imported from 'auth' v1, but Agent A built v2 + +# Fix: Update Agent C's branch +git checkout agent-c +# ... update imports ... +git checkout main +git merge agent-c # ✅ Now works +``` + +**Lesson:** Integration testing catches issues that unit tests miss. + +--- + +### Squash and Clean + +**Pattern:** Parallel work was messy (trial and error), squash before merging. 
+ +```bash +# Agent A made 12 commits with several reverts +git log --oneline agent-a +# 12 commits, but really just 1 feature + +# Squash into one clean commit +git checkout agent-a +git rebase -i main +# Mark all but first commit as "squash" +# Edit commit message: "feat: implement auth command" + +git checkout main +git merge agent-a --ff-only +# Now main has one clean commit instead of 12 messy ones +``` + +**When to use:** Agent made progress but with messy intermediate states. + +--- + +## Monitoring Parallel Agents + +### Status Dashboard (Manual) + +```bash +# Check all branches for progress +for branch in agent-a agent-b agent-c; do +  echo "=== $branch ===" +  git checkout $branch +  echo "Commits: $(git log --oneline main..$branch | wc -l)" +  echo "Last: $(git log -1 --oneline)" +  echo "Tests: $(npm test 2>&1 | grep -E 'passing|failing')" +done +``` + +--- + +### OpenClaw Session Monitoring + +```markdown +You: "List all active agent sessions" +Cleo: *calls sessions_list* +  Active sessions: +  - agent-auth (alpha-1): Running for 5 minutes +  - agent-payment (alpha-2): Running for 3 minutes +  - agent-notification (alpha-3): Completed + +You: "What did agent-notification accomplish?" +Cleo: *reads session history for alpha-3* +  Completed tasks: +  - Set up notification package structure +  - Implemented email notification handler +  - Wrote tests for email sender + +  Summary: 3 commits, all tests passing +``` + +--- + +### Git Log Summary + +```bash +# See all parallel work at once +git log --all --graph --oneline + +# Output: +# * a1b2c3d (agent-a) feat: add auth command +# | * d4e5f6g (agent-b) feat: add payment processing +# |/ +# | * h8i9j0k (agent-c) feat: add notification handler +# |/ +# * l0m1n2o (main) chore: project setup +``` + +Visual representation of parallel branches. + +--- + +## Cost Considerations + +### Request-Based Billing (Copilot) + +**Parallel agents = More requests** +``` +# Sequential: 10 tasks, one agent = 10 requests +One agent works through the tasks, one billed request per iteration + +# Parallel: three looping agents = up to 30 requests +Every iteration of every loop bills as a separate request, including +orientation overhead and no-op polls, so three loops can cost roughly 3x +``` + +**BUT: Wall-clock time is ~1/3** +- Sequential: 10 iterations × 2 min = 20 minutes +- Parallel: 10 tasks ÷ 3 agents ≈ 3–4 iterations each ≈ 7 minutes + +**Tradeoff:** Pay more (up to 3x requests), finish faster (~1/3 the time). + +--- + +### Token-Based Billing (Anthropic) + +**Parallel agents = Better token efficiency** + +Each agent starts with fresh context (~2K tokens). If you ran one long session: +- Turn 30 = 90K input tokens (context growth) + +Three parallel agents with fresh context: +- Agent A: 10 turns × 5K avg = 50K tokens +- Agent B: 10 turns × 5K avg = 50K tokens +- Agent C: 10 turns × 5K avg = 50K tokens +- Total: 150K tokens + +vs. one long session: +- 30 turns with context growth = 300K+ tokens + +**Parallel is cheaper on token-based systems.** + +--- + +## Anti-Patterns + +### 1. Parallel-by-Default +``` +# Bad: Everything in parallel! +spawn("Do task 1") & spawn("Do task 2") & spawn("Do task 3") +# → Tasks 2 and 3 depend on task 1 +# → Failures, conflicts, wasted work +``` + +**Fix:** Default to sequential. Parallelize only when independence is certain. + +--- + +### 2. Ignoring Dependencies +``` +# Bad: Parallel agents building layered architecture +Agent A: Building API (needs business logic) +Agent B: Building business logic (needs data layer) +Agent C: Building data layer + +# All start at once → A and B fail (missing dependencies) +``` + +**Fix:** Build dependencies first, then parallelize.
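+
+A minimal sketch of that ordering, assuming a hypothetical `spawnAgent` wrapper. The `openclaw sessions spawn` invocation it shells out to is conceptual (see the note in OPENCLAW-INTEGRATION.md); substitute ralph-loop.sh or whatever spawn mechanism you actually use:
+
+```typescript
+// Run as an ES module (uses top-level await).
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+
+const run = promisify(execFile);
+
+// Hypothetical wrapper: spawns one agent and resolves when it exits.
+async function spawnAgent(task: string): Promise<void> {
+  await run('openclaw', ['sessions', 'spawn', '--task', task, '--model', 'sonnet']);
+}
+
+// Phase 1: the shared dependency runs alone, to completion.
+await spawnAgent('Read AGENT.md in /path/to/shared. Build the data layer.');
+
+// Phase 2: only now do the truly independent modules run in parallel.
+await Promise.all([
+  spawnAgent('Read AGENT.md in /path/to/auth. Build it.'),
+  spawnAgent('Read AGENT.md in /path/to/payment. Build it.'),
+  spawnAgent('Read AGENT.md in /path/to/notification. Build it.'),
+]);
+```
+
+The first `await` is the gate: nothing in Phase 2 spawns until the shared layer is done, while `Promise.all` lets the independent agents overlap freely.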
+ +--- + +### 3. No Integration Testing +``` +# Each agent's tests pass +Agent A tests: ✅ +Agent B tests: ✅ +Agent C tests: ✅ + +# But together they fail +Integration: ❌ (A calls B with wrong interface) +``` + +**Fix:** Always run integration tests after merging parallel work. + +--- + +### 4. Over-Parallelization +``` +# 10 agents working on 10 tiny tasks +# More time coordinating than building +``` + +**Fix:** Parallelize only when tasks are substantial (30+ minutes each). + +--- + +## Decision Framework + +Use this to decide: sequential or parallel? + +``` +Is the work truly independent? + ├─ NO → Sequential (don't even try) + └─ YES → Continue + +Will conflicts be easy to resolve? + ├─ NO → Sequential (not worth the pain) + └─ YES → Continue + +Is each task substantial (30+ min)? + ├─ NO → Sequential (overhead not worth it) + └─ YES → Continue + +Do you have time to monitor and merge? + ├─ NO → Sequential (safer unattended) + └─ YES → Parallelize! +``` + +--- + +_Parallelism is a force multiplier, but only if wielded carefully. Default to sequential. Parallelize when it's obviously safe._ diff --git a/PLAN-MANAGEMENT.md b/PLAN-MANAGEMENT.md new file mode 100644 index 0000000..c0df4f8 --- /dev/null +++ b/PLAN-MANAGEMENT.md @@ -0,0 +1,283 @@ +# Plan Management — The Living Document + +> The IMPLEMENTATION_PLAN.md is the agent's "working memory" between iterations. +> It's the only file the agent both reads AND writes. +> Managing it well is the difference between a smooth build and a chaotic mess. + +--- + +## What Is the Plan? + +The plan is a **task decomposition** of the PROJECT-SPEC.md, created by the agent during the planning phase. It's a checklist of discrete, testable tasks ordered by dependency. + +```markdown +# Implementation Plan + +## Phase 1: Foundation +- [x] Project scaffolding (monorepo, tsconfig, build scripts) +- [x] SQLite schema and migration system +- [ ] QFX file parser with field extraction ← Agent picks this next +- [ ] CSV import with configurable column mapping +- [ ] Category rules engine + +## Phase 2: API Layer +- [ ] REST endpoints for CRUD operations +- [ ] Transaction query with filters +- [ ] Spending aggregation endpoints +``` + +**Key properties:** +- **Checkboxes** track completion (`[x]` = done, `[ ]` = pending) +- **Order matters** — tasks are listed in dependency order +- **One task per iteration** — the agent picks the first unchecked item +- **Agent updates it** after completing each task + +--- + +## The Plan Lifecycle + +### 1. Creation (Planning Phase) + +The agent reads PROJECT-SPEC.md and decomposes it: + +``` +Spec → Agent Analysis → IMPLEMENTATION_PLAN.md +``` + +**What makes a good decomposition:** +- Each task is **independently testable** (has a clear "done" state) +- Tasks are **small enough** to complete in one iteration (30-60 minutes of agent work) +- Dependencies are respected (can't build API before schema exists) +- Each task maps to one or more acceptance criteria from the spec + +**What makes a bad decomposition:** +- Tasks too large ("Build the entire backend") +- Tasks too small ("Create the src directory") +- Missing dependencies (trying to write tests before the test framework is set up) +- Vague tasks ("Improve error handling" — which errors? where?) + +### 2. Execution (Build Iterations) + +Each iteration, the agent: +1. Reads the plan +2. Finds the first `[ ]` task +3. Implements it +4. Marks it `[x]` +5. Commits the updated plan +6. 
Exits + +**The commit includes the plan update:** +```bash +git add IMPLEMENTATION_PLAN.md src/parser.ts tests/parser.test.ts +git commit -m "feat: implement QFX file parser with field extraction" +``` + +### 3. Human Review (Periodic Check-ins) + +You should review the plan at these moments: + +| When | What to check | +|------|---------------| +| After planning phase | Does the decomposition make sense? Are tasks the right size? | +| After every 3-5 tasks | Is progress on track? Any tasks taking multiple iterations? | +| When agent signals STUCK | What's blocking it? Can you clarify the spec? | +| At phase boundaries | Is the phase complete? Ready for the next phase? | + +--- + +## Intervening in the Plan + +Sometimes you need to change the plan mid-flight. Here's how: + +### Adding Tasks + +New requirement discovered? Add it to the plan: +```markdown +- [ ] 🆕 Handle expired token during API calls (auto-refresh) +``` + +Use the 🆕 emoji to mark human-added tasks so you can track what changed. + +### Removing Tasks + +Scope cut? Mark it explicitly: +```markdown +- [~] ~~Monte Carlo visualization~~ (deferred to Phase 4) +``` + +Don't just delete — the agent might re-derive it from the spec. + +### Reprioritizing + +Move tasks up or down. Add a note so the agent understands: +```markdown +## Phase 1: Foundation +- [x] Project scaffolding +- [ ] REST API framework ← Moved up: needed for integration testing +- [ ] SQLite schema +``` + +### Splitting Tasks + +If a task keeps failing (agent can't complete it in one iteration), split it: +```markdown +# Before +- [ ] Transaction import with deduplication + +# After +- [ ] QFX file parser (parse file → array of transaction objects) +- [ ] CSV file parser (configurable column mapping) +- [ ] Transaction dedup engine (match on date + amount + payee hash) +- [ ] Import orchestrator (parse → dedup → insert) +``` + +### Adding Notes + +Leave notes for the agent in the plan: +```markdown +- [ ] CLM API integration + > NOTE: The DTC CLM Demo account returns 401. Try CSA CLM Demo or + > DocuSign CLM Demo accounts instead. See ACCOUNTS.md for IDs. +``` + +--- + +## Plan Patterns + +### The Scaffold-First Pattern +```markdown +- [ ] Project scaffolding (dirs, configs, package.json) +- [ ] Build pipeline (TypeScript compilation, scripts) +- [ ] Test infrastructure (framework, first passing test) +- [ ] CI configuration (lint + build + test) +``` + +Always start with infrastructure. An agent that can't build and test will produce garbage. + +### The Vertical Slice Pattern +```markdown +- [ ] Single endpoint: GET /api/health (route + handler + test) +- [ ] Single entity CRUD: categories (model + routes + tests) +- [ ] Second entity: transactions (model + routes + tests + import) +``` + +Build one complete vertical slice first. It establishes patterns for everything else. + +### The Test-First Pattern +```markdown +- [ ] Write failing tests for QFX parser +- [ ] Implement QFX parser to pass tests +- [ ] Write failing tests for CSV parser +- [ ] Implement CSV parser to pass tests +``` + +Pairs well with agents — the failing test IS the acceptance criterion. 
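+
+For example, the "write failing tests" task might commit something like this (vitest, matching the test-framework examples elsewhere in this harness). The `parseQFX` module and its field names are hypothetical stand-ins for whatever your task specifies:
+
+```typescript
+import { test, expect } from 'vitest';
+// Hypothetical module: this import (and the test) fails until the
+// implementation task lands.
+import { parseQFX } from '../src/qfx-parser';
+
+test('parseQFX extracts date, amount, and payee from one record', () => {
+  const sample =
+    '<STMTTRN><DTPOSTED>20240115<TRNAMT>-42.50<NAME>COSTCO WHOLESALE</STMTTRN>';
+  const [txn] = parseQFX(sample);
+  expect(txn.date).toBe('2024-01-15');
+  expect(txn.amount).toBeCloseTo(-42.5, 2);
+  expect(txn.payee).toBe('COSTCO WHOLESALE');
+});
+```
+
+The next iteration's only job is to turn that red test green, with no judgment call about when the task is "done."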
+ +### The Dependency Chain Pattern +```markdown +## Layer 0: Shared +- [ ] Type definitions and interfaces + +## Layer 1: Data +- [ ] Database schema and migrations +- [ ] Data access layer + +## Layer 2: Business Logic +- [ ] Calculation engines (depends on Layer 1) +- [ ] Rule processors (depends on Layer 1) + +## Layer 3: Interface +- [ ] CLI commands (depends on Layer 2) +- [ ] API endpoints (depends on Layer 2) +``` + +Makes dependencies explicit. Agent can't skip ahead. + +--- + +## Tracking Progress + +### Git Log as Progress Report + +The git log tells the story: +```bash +git log --oneline +# a1b2c3d feat: implement Monte Carlo simulation engine +# d4e5f6g feat: add retirement projection calculator +# h7i8j9k feat: spending aggregation API endpoints +# l0m1n2o feat: transaction import with deduplication +# p3q4r5s chore: project scaffolding and build pipeline +``` + +Each commit = one completed task. If you see multiple commits for one task, the task was too big. + +### Velocity Tracking + +Count completed tasks per iteration: +- **1 task/iteration** = ideal (the system is working) +- **0 tasks/iteration** = agent is stuck (intervene) +- **2+ tasks/iteration** = tasks are too small (merge them) + +### Stuck Detection + +Signs the agent is stuck: +- Same task attempted 3+ iterations in a row +- Agent outputs `STUCK` +- Build or tests failing repeatedly +- Agent is modifying unrelated files + +**When stuck, ask:** +1. Is the task too large? → Split it +2. Is the spec unclear? → Clarify acceptance criteria +3. Is there a technical blocker? → Investigate and add notes +4. Is the agent fighting its own previous work? → Reset to last good commit + +--- + +## Plan Anti-Patterns + +### 1. The God Task +```markdown +# Bad +- [ ] Build the backend +# Good +- [ ] Database schema for users and transactions +- [ ] CRUD API for users +- [ ] CRUD API for transactions +- [ ] Transaction query with date/category filters +``` + +### 2. The Missing Dependency +```markdown +# Bad (schema doesn't exist yet!) +- [ ] REST API for transactions +- [ ] Database schema + +# Good +- [ ] Database schema +- [ ] REST API for transactions +``` + +### 3. The Implicit Task +```markdown +# Bad (who sets up the test framework?) +- [ ] Write tests for parser + +# Good +- [ ] Set up test framework (vitest config, first passing test) +- [ ] Write tests for parser +``` + +### 4. The Never-Done Task +```markdown +# Bad (when is "improve" done?) +- [ ] Improve error handling + +# Good +- [ ] Add user-friendly error messages for auth failures (expired token, missing key, network error) +- [ ] Add try/catch with contextual messages to all API endpoints +``` + +--- + +_The plan is your control surface. Master it, and you master the agent loop._ diff --git a/PROCESS-EVAL-TEMPLATE.md b/PROCESS-EVAL-TEMPLATE.md new file mode 100644 index 0000000..c7a69c8 --- /dev/null +++ b/PROCESS-EVAL-TEMPLATE.md @@ -0,0 +1,78 @@ +# Process Eval Template + +> Write this file after the stream is fully merged. +> File location: `.harness/<stream>/process-eval.md` +> Be honest — this is a retrospective, not a press release. +> Future agents and sessions will read this to understand what worked. + +--- + +# Process Eval — [STREAM NAME] +**Completed:** YYYY-MM-DD +**Agent:** [model name] +**Packets:** [XX-01, XX-02, ...] +**Tests added:** NN total +**Final test count:** NNNN +**Wall-clock duration:** [estimated] + +--- + +## Packet Summary + +| Packet | Est. Effort | Actual | On Time?
| Tests Added | +|--------|-------------|--------|----------|-------------| +| XX-01 | N sessions | N sessions | ✅/❌ | NN | +| XX-02 | ... | ... | ... | ... | + +--- + +## Known-Answer Test Results + +| Test | Expected | Actual | Pass? | +|------|----------|--------|-------| +| [Description] | [value] | [value] | ✅/❌ | + +--- + +## Process Quality Dimensions + +### Task Sizing +- Estimate accuracy: [XX%] +- Packets that overran: [list or "none"] +- Root cause of overruns: [...] + +### Test-First Discipline +- Tests committed same commit as implementation: [XX/NN packets] +- Patches needed after initial commit: [list or "none"] + +### Acceptance Criteria Quality +- Programmatically verifiable criteria: [XX/NN] +- Criteria that required human judgment: [list or "none"] + +### Known-Answer Coverage +- New calculation modules: N +- Modules with ≥1 known-answer test: N/N +- Any gaps: [list or "none"] + +### Architecture Integrity +- Cross-module import violations: [N] +- New shared utilities created: [list] + +### Regression Protection +- Regression baseline saved: [yes/no — path if yes] + +--- + +## What Went Well +- [Honest list] + +## What Was Hard +- [Honest list — useful for planning the next stream] + +## What To Do Differently +- [Actionable changes for next time] + +## Model Attribution +- Model: [model name] +- Strengths observed: [...] +- Weaknesses observed: [...] diff --git a/PROJECT-SPEC.md b/PROJECT-SPEC.md new file mode 100644 index 0000000..4654674 --- /dev/null +++ b/PROJECT-SPEC.md @@ -0,0 +1,246 @@ +# Project Specification Template + +> Copy this file into your project root as `PROJECT-SPEC.md`. +> Fill out each section. The more specific you are, the better the agent performs. +> This is the single source of truth — the agent reads it every iteration. + +--- + +## 1. Project Overview + +### What are we building? + + +### Why does it matter? + + +### Success criteria + +- [ ] Criterion 1 +- [ ] Criterion 2 +- [ ] Criterion 3 + +--- + +## 2. Technical Foundation + +### Tech stack + +- **Language:** +- **Framework:** +- **Database:** +- **Build system:** +- **Test framework:** +- **Package manager:** + +### Project structure + +``` +project/ +├── src/ +├── tests/ +├── docs/ +└── ... +``` + +### Build & test commands + +```bash +# Build +npm run build + +# Test +npm test + +# Lint +npm run lint +``` + +### Coding standards + +- +- +- + +--- + +## 3. Requirements + +### Functional Requirements + +> List every feature the system must have. Each should be independently testable. +> Use the format: FR-NNN: Description — Acceptance criteria + +#### FR-001: [Feature Name] +**Description:** What it does. +**Acceptance criteria:** +- [ ] Given X, when Y, then Z +- [ ] Edge case: ... +- [ ] Error case: ... + +#### FR-002: [Feature Name] +**Description:** What it does. +**Acceptance criteria:** +- [ ] ... + + + +### Non-Functional Requirements + +#### NFR-001: Performance +- [ ] Response time < X ms for Y operation +- [ ] Handles Z concurrent users + +#### NFR-002: Security +- [ ] No secrets in source code +- [ ] Input validation on all user inputs + +#### NFR-003: Testing +- [ ] Minimum 80% code coverage +- [ ] All happy paths tested +- [ ] All error paths tested + +--- + +## 4. Data Model + +### Entities + + +``` +Entity: User + - id: UUID (primary key) + - name: string (required, max 100 chars) + - email: string (required, unique, valid email) + - created_at: timestamp (auto-set) +``` + +### Relationships + + +### Sample Data + + +--- + +## 5. 
API / Interface Design + +### Endpoints / Commands / UI Screens + + +### Input/Output Examples + + +--- + +## 6. Architecture Decisions + +### Constraints + + +> Four categories of constraints. Fill all four — even if one is just "none in this category." +> The ESCALATE category is critical for autonomous agents: it defines when to stop and ask rather than decide. + +- **MUST:** ... (non-negotiable requirements) +- **MUST NOT:** ... (explicit prohibitions — prevents entire failure categories) +- **PREFER:** ... (soft preferences when multiple valid approaches exist) +- **ESCALATE:** ... (conditions where the agent must stop and ask the human rather than decide autonomously) + +**Example ESCALATE entries:** +- If a task requires deleting production data → ask first +- If acceptance criteria are ambiguous and the spec doesn't resolve it → ask before implementing +- If a dependency needs to be added that wasn't in the tech stack → confirm before installing +- If the task conflicts with another requirement or constraint → flag the conflict and wait +- If the agent discovers a requirement gap not covered by any FR-NNN → don't fill the gap silently + +### Dependencies + + +### Known Challenges + + +--- + +## 7. Phasing (Optional) + +> If the project is large, break it into phases. Each phase should be +> independently deployable and testable. + +### Phase 1: Foundation +- [ ] Task A +- [ ] Task B + +### Phase 2: Core Features +- [ ] Task C +- [ ] Task D + +### Phase 3: Polish +- [ ] Task E +- [ ] Task F + +--- + +## 8. Reference Materials + +### External docs + + +### Existing code to learn from + + +### Anti-patterns + + +--- + +## 9. Evaluation Design + +> How do you know the agent's output is good? Not "does it look reasonable?" +> but "can you prove it's correct, measurably and consistently?" +> +> Evaluation design is the only thing standing between AI output you can use +> as-is and AI output that requires 40 minutes of cleanup. Build evals before +> you need them — especially before model updates that could silently change behavior. + +### Test cases + + +#### TC-001: [Test Name] +**Input:** What you feed to the agent +**Expected output:** What you should get back (specific, verifiable) +**Verification:** How to check programmatically (command, assertion, manual checklist) + +#### TC-002: [Test Name] +**Input:** ... +**Expected output:** ... +**Verification:** ... + +### Verification steps + +```bash +# Example: verify the build passes +npm test + +# Example: check that specific files exist +ls -la src/new-feature/ + +# Example: verify no regressions +npm run lint +``` + +### Run-after-update checklist + +- [ ] Run all test cases (TC-001 through TC-00N) +- [ ] Compare outputs to expected baselines +- [ ] Check for regression in failure modes (see anti-patterns) +- [ ] Verify ESCALATE triggers still work correctly + +### Regression baselines + + +```json +{ + "tc_001_baseline": "...", + "tc_002_baseline": "..." +} +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..c1f3dd0 --- /dev/null +++ b/README.md @@ -0,0 +1,149 @@ +# Agent Harness Templates + +A complete system for running autonomous AI coding agents on complex projects. + +## Files + +### Core Templates (copy into your project) +| File | Purpose | +|------|---------| +| `AGENT-INSTRUCTIONS.md` | The agent's "system prompt" — reads this every iteration. 
Defines the core loop, mandatory pre-commit checklist (tests + TypeScript), commit attribution format, Tests-Added rule, and known anti-patterns | +| `PROJECT-SPEC.md` | Template for defining your problem. Sections for: overview, tech stack, requirements with acceptance criteria, data model, API design, constraints, phasing, anti-patterns | +| `DECISIONS.md` | Architecture Decision Record (ADR) template for documenting non-obvious technical choices. Prevents agent drift by creating continuity across fresh contexts | +| `EXECUTION-BOARD-TEMPLATE.md` | **⭐ New.** Pre-implementation planning artifact for a stream. Defines ALL packets, known-answer tests, and acceptance criteria BEFORE any code is written. The core of the plan-then-implement discipline. | +| `VALIDATION-TEMPLATE.md` | **⭐ New.** Per-packet evidence file written after each packet completes. Records test counts, known-answer results, and acceptance criteria tick-off. | +| `PROCESS-EVAL-TEMPLATE.md` | **⭐ New.** Stream retrospective written after merge. Honest assessment of task sizing, test-first compliance, and model quality. | +| `ralph-loop.sh` | The Ralph Wiggum bash loop — spawns fresh agent instances, checks for completion signals, restarts until done. Supports Claude, Codex, Aider, Gemini, and custom agents | +| `model-report.ts` | Parses git log `Agent:` trailers to generate per-model quality table (commits, tests added, TypeScript errors). Copy to `scripts/model-report.ts`, add `"model-report": "ts-node scripts/model-report.ts"` to package.json | + +### Process Guides (read before you start) +| File | Purpose | +|------|---------| +| `SPEC-CREATION-GUIDE.md` | **Start here.** How to create a great spec through structured interview. The interview protocol, domain knowledge extraction, and spec quality checklist | +| `TUTORIAL.md` | **Best way to learn.** Complete 30-minute walkthrough building a markdown link checker CLI tool from zero. Concrete, copy-pasteable example of the entire workflow | +| `WAVE-BASED-MANAGEMENT.md` | **⭐ New.** How to structure larger projects into waves, streams, and packets. The plan-then-implement discipline, execution boards, known-answer tests, and wave gates. Essential for projects with 10+ tasks. | +| `PLAN-MANAGEMENT.md` | How the IMPLEMENTATION_PLAN.md works — the living document agents update. Task decomposition patterns, intervention strategies, progress tracking | +| `REVIEW-AND-QA.md` | How to evaluate agent output. When to review, what to look for, how to course-correct. Review checklist template including model attribution and TypeScript hygiene checks | +| `COST-OPTIMIZATION.md` | Getting more work per dollar. Request-based vs token-based billing, optimal strategies per provider, model selection guide, the hybrid strategy, anti-patterns | +| `OPENCLAW-INTEGRATION.md` | Running the harness in OpenClaw with sessions_spawn, cron jobs, and shell scripts. Model selection, monitoring, cost optimization | +| `TROUBLESHOOTING.md` | When things go wrong. The five failure modes (stuck loop, drift, overengineering, test theater, context overflow) and how to fix each | +| `PARALLEL-AGENTS.md` | Running multiple agents simultaneously on independent tasks. 
When to parallelize, how to split work, how to merge results, conflict resolution, OpenClaw patterns | + +### Examples & Reference +| File | Purpose | +|------|---------| +| `EXAMPLES.md` | Worked example: Fintrove-style finance app spec + comparison of three approaches (Ezward, Ralph Wiggum, Nate Jones) | +| `CHANGELOG.md` | Version history and evolution of the agent harness project itself | + +## Quick Start + +### New to the Harness? (Start Here) +1. **Read** `TUTORIAL.md` — 30-minute hands-on walkthrough building a real CLI tool +2. **Read** `SPEC-CREATION-GUIDE.md` — learn the interview protocol +3. **Try it** — build your own project using the workflow + +### Ready to Build? (Simple project, <10 tasks) +1. **Read** `COST-OPTIMIZATION.md` — understand your billing model before you start burning budget +2. **Interview** — work with your agent to create the spec (or do it solo) +3. **Fill out** `PROJECT-SPEC.md` with your problem definition +4. **Copy** `PROJECT-SPEC.md`, `AGENT-INSTRUCTIONS.md`, and `DECISIONS.md` into your project root +5. **Run** `./ralph-loop.sh` (CLI) or use OpenClaw sessions_spawn (see `OPENCLAW-INTEGRATION.md`) +6. **Review** at phase boundaries using `REVIEW-AND-QA.md` checklist +7. **Troubleshoot** failures using `TROUBLESHOOTING.md` + +### Building Something Larger? (10+ tasks, multiple features) +1. **Read** `WAVE-BASED-MANAGEMENT.md` — the plan-then-implement discipline +2. **Create** your `IMPLEMENTATION_PLAN.md` with all tasks grouped into waves +3. **Create** `.harness/EXECUTION_MASTER.md` — your wave/stream dashboard +4. **For each stream:** copy `EXECUTION-BOARD-TEMPLATE.md`, fill ALL packets before coding any +5. **After each packet:** copy `VALIDATION-TEMPLATE.md` and fill it in +6. **After each stream:** copy `PROCESS-EVAL-TEMPLATE.md` and write the retrospective +7. **At each wave boundary:** run the wave gate checklist before starting the next wave + +## The Core Insight + +All successful agent approaches share the same loop: + +``` +Orient (read spec + plan) → Pick ONE task → Build → Test → Commit → Exit → Restart fresh +``` + +The spec defines WHAT. The plan tracks WHERE we are. Fresh context each iteration prevents drift. The human reviews and course-corrects. + +See each file for detailed instructions. + +## When to Use Which Guide + +``` +┌─────────────────────────────────────────────────┐ +│ "Which guide do I need?" │ +├─────────────────────────────────────────────────┤ +│ │ +│ Just starting? │ +│ → TUTORIAL.md (hands-on learning) │ +│ │ +│ Creating a spec? │ +│ → SPEC-CREATION-GUIDE.md (interview) │ +│ │ +│ Agent is stuck? │ +│ → TROUBLESHOOTING.md (failure modes) │ +│ │ +│ Reviewing agent output? │ +│ → REVIEW-AND-QA.md (what to check) │ +│ │ +│ Worried about cost? │ +│ → COST-OPTIMIZATION.md (billing models) │ +│ │ +│ Multiple independent features? │ +│ → PARALLEL-AGENTS.md (coordination) │ +│ │ +│ Using OpenClaw? │ +│ → OPENCLAW-INTEGRATION.md (sessions_spawn) │ +│ │ +│ Agent keeps changing past decisions? │ +│ → DECISIONS.md (ADR template) │ +│ │ +│ Want to see it in action? │ +│ → EXAMPLES.md (real project example) │ +│ │ +└─────────────────────────────────────────────────┘ +``` + +## Philosophy + +### Fresh Context > Long Context +Each iteration starts with a fresh agent. No accumulated confusion, no stale reasoning. The git history and plan file provide continuity. + +### One Task > Many Tasks +Agents that try to do everything in one session produce spaghetti. Agents that focus on ONE task produce clean commits. 
+ +### Spec Quality > Agent Quality +A great spec with a mediocre agent beats a vague spec with a great agent. The spec is your leverage point. + +### Review > Repair +It's easier to review and guide than to debug and fix. Catch drift early through periodic reviews. + +### Explicit > Implicit +Agents can't read your mind. Write down constraints, anti-patterns, and decisions. What's obvious to you is invisible to the agent. + +## Contributing + +This harness is a living system. If you: +- Discover new failure modes +- Develop better patterns +- Find gaps in the guides +- Create examples for other project types + +Document them and contribute back. The harness improves as we learn what works. + +## Version + +Current version: **2.0.0** (see `CHANGELOG.md` for history) + +## License + +Public domain. Use it, modify it, share it. No attribution required. + +--- + +_The harness doesn't write code. It creates conditions where agents can write code reliably._ diff --git a/REVIEW-AND-QA.md b/REVIEW-AND-QA.md new file mode 100644 index 0000000..628ccba --- /dev/null +++ b/REVIEW-AND-QA.md @@ -0,0 +1,327 @@ +# Review & QA — Evaluating Agent Output + +> Agents are fast but not infallible. Your role shifts from "writer" to "reviewer." +> This guide teaches you when to check in, what to look for, and how to course-correct. + +--- + +## The New Developer Workflow + +Traditional development: +``` +Think → Write Code → Test → Debug → Commit +``` + +Agent-assisted development: +``` +Spec → Agent Builds → You Review → Course-Correct → Agent Continues +``` + +Your job is no longer writing code. It's: +1. **Defining** what to build (the spec) +2. **Reviewing** what the agent built +3. **Course-correcting** when it drifts + +This is a fundamentally different skill. Most developers struggle with it at first because reviewing feels passive. It's not — it's the highest-leverage activity in the loop. + +--- + +## When to Review + +### Mandatory Review Points + +| Moment | What to check | +|--------|--------------| +| **After planning** | Task decomposition makes sense, dependencies are correct | +| **After Phase 1 (scaffold)** | Build works, tests pass, project structure is sane | +| **At every phase boundary** | Phase deliverables match spec, no drift from requirements | +| **When agent signals STUCK** | Understand the blocker, clarify or unblock | +| **Before merging to main** | Final quality pass on the complete work | + +### Optional Review Points + +| Moment | Why | +|--------|-----| +| **Every 3-5 iterations** | Spot drift early before it compounds | +| **After complex tasks** | Tasks involving algorithms, business logic, or external APIs | +| **When you're curious** | Honestly, watching agents work is fascinating and educational | + +### When NOT to Review + +- Every single iteration (you'll burn out and slow the process) +- Simple scaffolding tasks (directory creation, config files) +- Tasks where the test suite provides sufficient verification + +--- + +## What to Look For + +### 1. Correctness — Does it do what the spec says? + +**Check against acceptance criteria:** +```markdown +# Spec says: +FR-003: Parse QFX files and extract: date, amount, payee, memo, type + +# Review: +- Does the parser handle all five fields? ✅ +- Does it handle missing memo fields? (spec says optional) ✅ +- Does it handle negative amounts correctly? 
🤔 (not specified — add to spec) +``` + +**Run the code yourself:** +```bash +# Don't just read the code — actually run it +node build/cli.js templates list +node build/cli.js auth +npm test +``` + +Agent code that passes its own tests might still not do what YOU expected. + +### 2. Architecture — Does the structure make sense? + +**Check for:** +- Are files in the right places? (following the spec's project structure) +- Are dependencies flowing the right direction? (no circular imports) +- Is shared code actually shared? (no copy-paste between packages) +- Are abstractions appropriate? (not too many layers, not too few) + +**Red flags:** +- A 500-line function (should be decomposed) +- Utility functions scattered across packages (should be in shared/) +- Direct database access from API handlers (should go through a service layer) +- Hardcoded values that should be configurable + +### 3. Quality — Is it well-written? + +**Check for:** +- Error handling (what happens when things fail?) +- Edge cases (empty input, null values, large datasets) +- Type safety (any `any` types sneaking in?) +- Naming (do function/variable names make sense?) +- Comments (only where code ISN'T self-explanatory) + +**Agents tend to:** +- Over-comment obvious code (`// increment counter` above `counter++`) +- Under-handle errors (happy path works, sad path crashes) +- Use generic variable names in complex logic (`data`, `result`, `item`) +- Create unnecessary abstractions to seem "architecturally sound" + +### 4. Tests — Are they actually testing anything? + +**The worst agent habit:** Tests that always pass because they don't test anything meaningful. + +```typescript +// BAD — this tests nothing +test('parser works', () => { +  const result = parse(sampleData); +  expect(result).toBeDefined(); // Always passes if parse returns anything +}); + +// GOOD — this tests behavior +test('parser extracts all QFX fields', () => { +  const result = parse(sampleQFX); +  expect(result[0].date).toBe('2024-01-15'); +  expect(result[0].amount).toBe(-42.50); +  expect(result[0].payee).toBe('COSTCO WHOLESALE'); +  expect(result[0].memo).toBe('Purchase'); +  expect(result[0].type).toBe('debit'); +}); +``` + +**Check for:** +- Tests with meaningful assertions (not just `toBeDefined()`) +- Edge case tests (empty input, invalid data, boundary values) +- Error path tests (what happens with bad input?) +- Integration tests (do components work together?) +- Test data that's realistic (not `"test"`, `"foo"`, `"bar"`) +- **Net test count increased** — if the agent added a feature and `Tests-Added: 0`, that's a failure + +### 4a. TypeScript Hygiene — Did it actually compile? + +Runtime tests and TypeScript compilation test different things. An agent can pass all tests while introducing type errors that only surface at build time or when another component uses the broken type. + +**Always verify:** +```bash +# Backend +npx tsc --noEmit + +# Frontend (if exists) +cd frontend && npx tsc --noEmit +``` + +**Red flags in the diff:** +- New fields used in code but not added to TypeScript interfaces +- `as unknown as Record<string, unknown>` casts to bypass type checking +- `any` type appearing in new code +- Interface definitions missing fields that the API/DB actually returns + +### 5. Drift — Is it still aligned with the spec?
+
+Over many iterations, agents can drift:
+- Adding features not in the spec (scope creep)
+- Changing patterns established in earlier iterations
+- Ignoring constraints listed in the spec
+- "Improving" things that were working fine
+
+**Drift detection:**
+```bash
+# Check what changed
+git diff main..agent-branch --stat
+
+# Look for unexpected file changes
+# If the agent modified files outside the current task scope, investigate
+```
+
+---
+
+## How to Course-Correct
+
+### Light Touch: Notes in the Plan
+
+For minor issues, add notes to IMPLEMENTATION_PLAN.md:
+```markdown
+- [ ] Implement transaction query filters
+  > AGENT NOTE: Use parameterized SQL queries. Previous iteration used
+  > string concatenation — revert that pattern. See FR-003 acceptance criteria.
+```
+
+### Medium Touch: Spec Clarification
+
+If the agent is misunderstanding a requirement, clarify the spec:
+```markdown
+# Before (ambiguous)
+FR-005: Handle errors gracefully
+
+# After (clear)
+FR-005: Error Handling
+- [ ] Expired token → auto-refresh and retry (no user action needed)
+- [ ] Network failure → show "Cannot reach DocuSign API. Check your connection."
+- [ ] Invalid input → show specific field that failed and why
+- [ ] Never show raw stack traces to the user
+```
+
+### Heavy Touch: Reset and Redirect
+
+If the agent has gone significantly off-track:
+```bash
+# Find the last good state
+git log --oneline

+# Reset to it
+git reset --hard <commit-hash>
+
+# Update the plan to reflect the reset
+# Add notes explaining what went wrong
+```
+
+Then update the spec or plan to prevent the same drift.
+
+### Nuclear Option: Rewrite the Task
+
+Sometimes a task is fundamentally misconceived:
+```markdown
+# Remove the bad task
+- [~] ~~Build real-time WebSocket dashboard~~ (REMOVED: overkill for MVP)
+
+# Replace with something appropriate
+- [ ] Build static HTML dashboard that refreshes on page load
+```
+
+---
+
+## Review Checklist Template
+
+Use this for phase boundary reviews:
+
+```markdown
+# Review: Phase [N] — [Name]
+**Date:** YYYY-MM-DD
+**Iterations completed:** X
+**Tasks completed:** Y/Z
+
+## Spec Alignment
+- [ ] All phase acceptance criteria met
+- [ ] No features added beyond spec
+- [ ] Constraints respected (MUST/MUST NOT)
+
+## Code Quality
+- [ ] Build passes with zero errors
+- [ ] All tests pass
+- [ ] TypeScript compiles clean — `npx tsc --noEmit` (backend AND frontend)
+- [ ] New code has new tests (Tests-Added > 0 for all feature commits)
+- [ ] No obvious code smells
+- [ ] Error handling present for failure paths
+- [ ] No hardcoded secrets or credentials
+- [ ] Shared logic is extracted — no duplication across components
+
+## Architecture
+- [ ] File structure matches spec layout
+- [ ] Shared code is actually shared (not duplicated)
+- [ ] Dependencies flow in one direction
+- [ ] No unnecessary abstractions
+- [ ] TypeScript interfaces match actual API/DB fields
+
+## Tests
+- [ ] Tests assert meaningful behavior (not just "it exists")
+- [ ] Edge cases covered
+- [ ] Error paths tested
+- [ ] Test data is realistic
+- [ ] Net test count increased from previous phase
+
+## Model Attribution
+- [ ] All commits include `Agent:` trailer
+- [ ] All commits include `Tests:` and `Tests-Added:` trailers
+- [ ] All commits include `TypeScript:` trailer
+- [ ] Run `npm run model-report` (if available) and review output
+
+## Performance
+- [ ] Meets NFR thresholds (if specified)
+- [ ] No obvious performance issues (N+1 queries, unnecessary loops)
+
+## Issues Found
+1.
[Issue description] → [Resolution: fix now / add to plan / defer] +2. ... + +## Verdict +- [ ] Approved for next phase +- [ ] Needs fixes (list above, re-review after) +- [ ] Needs significant rework (reset to [commit]) +``` + +--- + +## The Learning Loop + +Every review teaches you something: + +1. **About the agent:** What patterns it follows well, where it struggles +2. **About your spec:** What was clear enough, what caused confusion +3. **About your process:** What review cadence works, what to check + +Capture these lessons: +```markdown +# Lessons Learned + +## Spec Writing +- Agents interpret "handle errors" as "catch and log." + Must specify WHAT to show the user. +- Including example I/O prevents 90% of format misunderstandings. + +## Agent Behavior +- Tends to over-abstract on iteration 1 (creates interfaces before implementations) +- Reliable at following MUST/MUST NOT constraints +- Struggles with tasks that require external API knowledge (add docs links) + +## Process +- Reviewing every 5 iterations is about right for this project size +- Phase boundary reviews catch 80% of drift +- Running the code myself (not just reading tests) catches the other 20% +``` + +Feed these lessons back into your spec templates and interview protocol. + +--- + +_Review is not overhead. Review is the product. The code is just the artifact._ diff --git a/SPEC-CREATION-GUIDE.md b/SPEC-CREATION-GUIDE.md new file mode 100644 index 0000000..a64e635 --- /dev/null +++ b/SPEC-CREATION-GUIDE.md @@ -0,0 +1,321 @@ +# Spec Creation Guide — The Interview Protocol + +> The spec is the most important document in the entire harness. +> A bad spec produces bad code, no matter how good the agent is. +> This guide teaches you how to create a great spec through structured conversation. + +--- + +## Why Interview-Based Spec Creation? + +Most agent harness guides say "write a good spec" and move on. That's like telling someone "just write good code." The spec requires **two kinds of knowledge**: + +1. **Domain knowledge** — what the human knows (goals, constraints, edge cases, things they've tried) +2. **Technical knowledge** — what the agent/engineer knows (architecture patterns, tooling, testing strategies) + +Neither side has the full picture. The interview process brings them together. + +### The Anti-Pattern: Agent-Written Specs + +If you ask an agent to "analyze this codebase and write a spec," you get a description of **what exists**, not a plan for **what should exist**. The agent can't know: +- Why you're building this +- What you've tried that didn't work +- What tradeoffs you're willing to make +- What "done" looks like to you + +### The Anti-Pattern: Human-Only Specs + +If the human writes the spec alone, you get: +- Vague acceptance criteria ("it should be fast") +- Missing technical details (no build commands, no test strategy) +- Implied knowledge that the agent can't access +- Gaps where the human assumed things were obvious + +--- + +## The Interview Protocol + +### Phase 1: Vision & Context (5-10 minutes) + +Start broad. Understand the "why" before the "what." + +**Questions to ask:** + +1. **"What are we building, in one sentence?"** + - Forces clarity. If they can't say it in one sentence, the scope isn't clear yet. + - Good: "A CLI toolkit for interacting with DocuSign APIs without a proxy server." + - Bad: "Something to help with DocuSign stuff." + +2. **"Who is this for?"** + - The user? Other developers? An automated system? + - This shapes API design, error messages, documentation needs. + +3. 
**"Why now? What's the trigger?"** + - Understanding urgency and motivation reveals hidden requirements. + - "I'm tired of copying tokens manually" → auto-refresh is a core requirement, not a nice-to-have. + +4. **"What does 'done' look like? How will you know it's working?"** + - Push for measurable criteria, not feelings. + - "It works" → "I can run `cli auth` and get a valid token without opening a browser." + +5. **"What have you tried before? What didn't work?"** + - This is GOLD. Anti-patterns save agents hours of wasted effort. + - "Node.js fetch sends headers that break SpringCM" → use curl instead. + +**What you're listening for:** +- Unstated assumptions ("obviously it needs to...") +- Emotional language (frustration = high-priority requirement) +- Scope creep indicators ("and eventually it could also...") + +--- + +### Phase 2: Requirements Extraction (10-15 minutes) + +Now go feature by feature. For each feature: + +**The requirement loop:** + +1. **"Walk me through how you'd use this feature."** + - Get the happy path first. Concrete scenario, not abstract description. + - "I'd run `cli templates list` and see my 20 most recent templates with names and IDs." + +2. **"What could go wrong?"** + - Error cases, edge cases, permissions issues. + - "The token could be expired." → auto-refresh requirement. + - "The account might not have that API enabled." → graceful error message. + +3. **"What's the input? What's the output?"** + - Be specific about formats, fields, defaults. + - "Input: template ID. Output: JSON with name, ID, folder, page count, created date." + +4. **"How would you test this?"** + - If they can describe a test, you have an acceptance criterion. + - "I'd run it and check that I get at least one template back with a valid ID." + +5. **"Is this a must-have or nice-to-have?"** + - Prioritization prevents scope explosion. + - Phase 1 = must-haves. Phase 2+ = nice-to-haves. + +**Pro tip:** Number requirements as you go (FR-001, FR-002...). It creates shared language for the rest of the project. + +--- + +### Phase 3: Technical Discovery (10-15 minutes) + +This is where the engineer/agent fills in what the human might not think to specify. + +**Questions to explore together:** + +1. **Tech stack confirmation** + - "You're using TypeScript with npm workspaces — should we keep that pattern?" + - Don't assume. The human might want to change direction. + +2. **Existing code patterns** + - Read the codebase. Identify patterns already in use. + - "I see you're using Commander.js for CLI parsing — should all packages follow that?" + - "Your auth module uses JWT with RSA keys — should new packages share that?" + +3. **Build and test infrastructure** + - "What are the build commands? What test framework? What's the CI/CD setup?" + - If there's no test framework, that's a Phase 0 task. + +4. **Data model and persistence** + - "Where does data live? Files? Database? Environment variables?" + - "How do packages share configuration?" (e.g., monorepo root `.env`) + +5. **Deployment and environment** + - "Is this demo-only or does it need production support?" + - "What environments exist?" (demo, staging, production) + +6. **Dependencies and external services** + - "What APIs are involved? What are their quirks?" + - "Any rate limits, authentication requirements, or known issues?" 
+ +**What the engineer contributes:** +- Suggest architecture patterns the human hasn't considered +- Identify missing infrastructure (test framework, linting, CI) +- Spot potential issues early (circular dependencies, shared state) +- Propose phasing based on technical dependencies + +--- + +### Phase 4: Constraint Mapping (5 minutes) + +Explicitly capture the guardrails. + +**Three categories:** + +1. **MUST** — Non-negotiable requirements + - "MUST use curl for HTTP calls (fetch breaks SpringCM)" + - "MUST store tokens in .env file" + +2. **MUST NOT** — Explicit prohibitions + - "MUST NOT commit secrets to git" + - "MUST NOT use React for the frontend" + +3. **PREFER** — Soft preferences + - "PREFER ES modules over CommonJS" + - "PREFER shared utilities over code duplication" + +**Why this matters:** Agents follow explicit constraints better than implied ones. A MUST NOT prevents entire categories of mistakes. + +--- + +### Phase 5: Spec Assembly (Agent's job) + +After the interview, the agent assembles the spec: + +1. **Fill the PROJECT-SPEC.md template** with interview answers +2. **Add technical details** discovered from code review +3. **Write acceptance criteria** from the requirement conversations +4. **Propose phasing** based on dependencies +5. **Include anti-patterns** from "what didn't work" answers +6. **Present to human for review** + +**The review conversation:** +- Read through each section together +- Human corrects misunderstandings +- Agent asks clarifying questions on gaps +- Iterate until the human says "yes, that's what I want" + +--- + +### Phase 6: Self-Containment Test (5 minutes) + +> **The critical test:** Can the spec be solved without the agent needing +> to fetch information not included in it? +> +> This is Toby Lütke's insight: *Can you state a problem with enough +> context that the task is plausibly solvable without the agent going +> out and getting more information?* + +**The test — rewrite the spec as if:** +1. The reader has never seen your project before +2. The reader doesn't know your coding conventions or style +3. The reader has no access to information you don't include +4. The reader will stop and do nothing if anything is ambiguous + +**The checklist:** +- [ ] Every acronym is defined on first use +- [ ] File paths referenced actually exist and are correct +- [ ] External dependencies have versions pinned or install instructions included +- [ ] Domain-specific terms are explained (not everyone knows what "JWT" or "FTS" means) +- [ ] The agent can find all referenced files without searching +- [ ] If removing any sentence would cause the agent to make mistakes, the spec isn't self-contained yet + +**The failure mode this catches:** +Agents fill gaps with statistical plausibility — they guess in ways that are +often subtly wrong. A spec that relies on shared context (even 5 minutes of +prior conversation) will produce outputs that look right but aren't. + +**If the spec fails the test:** Add the missing context. If you can't add it +(too much to document), add an ESCALATE constraint: "If you encounter +information not covered by this spec, do not assume — ask the human." 
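+
+Two of these checks can even be scripted. A minimal, hypothetical sketch (the file name, the backticked-path pattern, and the acronym heuristic are illustrative assumptions, not part of the harness):
+
+```typescript
+// spec-selfcheck.ts — hypothetical helper; run from the repo root.
+import { readFileSync, existsSync } from 'node:fs';
+
+const spec = readFileSync('PROJECT-SPEC.md', 'utf8');
+
+// 1. Every backticked relative path referenced in the spec should exist on disk.
+for (const [, p] of spec.matchAll(/`((?:src|tests|packages|docs)\/[^`\s]+)`/g)) {
+  if (!existsSync(p)) console.warn(`Referenced path not found: ${p}`);
+}
+
+// 2. Crude heuristic: flag 3+-letter acronyms never followed by a "(definition)".
+// Expect false positives (MUST, JSON) — it's a prompt for review, not a gate.
+const acronyms = new Set([...spec.matchAll(/\b[A-Z]{3,}\b/g)].map((m) => m[0]));
+for (const a of acronyms) {
+  if (!spec.includes(`${a} (`)) console.warn(`Possibly undefined acronym: ${a}`);
+}
+```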
+ +--- + +## Spec Quality Checklist + +Before handing a spec to agents, verify: + +### Completeness +- [ ] Every feature has numbered acceptance criteria (FR-NNN) +- [ ] Data model is defined with types and constraints +- [ ] Build and test commands are specified and work +- [ ] Anti-patterns section exists with real examples +- [ ] Phasing is defined with dependencies noted +- [ ] All four constraint categories are filled (MUST / MUST NOT / PREFER / ESCALATE) +- [ ] Evaluation design section exists with test cases and verification steps + +### Clarity +- [ ] A stranger could read this and understand what to build +- [ ] No ambiguous words ("fast", "nice", "good") — use numbers +- [ ] Input/output examples for key operations +- [ ] Error cases are explicitly described + +### Testability +- [ ] Every acceptance criterion can be verified by running code +- [ ] Sample data or fixtures are provided +- [ ] Performance criteria have specific thresholds +- [ ] "Done" is objectively measurable + +### Feasibility +- [ ] Tech stack is proven for this type of project +- [ ] External dependencies are accessible (API keys, permissions) +- [ ] Scope fits the timeline (phasing handles overflow) +- [ ] Known challenges are documented with mitigation strategies + +### Self-Containment +- [ ] A stranger could solve this without asking follow-up questions +- [ ] No domain-specific terms used without definition +- [ ] All file paths, commands, and references are correct +- [ ] ESCALATE constraints cover situations where spec is ambiguous + +--- + +## Common Interview Mistakes + +### 1. Leading the witness +**Bad:** "You probably want auto-refresh, right?" +**Good:** "What happens when the token expires mid-session?" + +### 2. Accepting vague answers +**Bad:** Human: "It should handle errors well." Agent: "Got it." +**Good:** "Can you give me an example of an error? What should the user see?" + +### 3. Skipping the 'why' +**Bad:** Jumping straight to features. +**Good:** Understanding context first — it changes how you interpret every requirement. + +### 4. Over-engineering the spec +**Bad:** 50-page spec with UML diagrams for a CLI tool. +**Good:** Enough detail for an agent to work autonomously, no more. + +### 5. Forgetting anti-patterns +**Bad:** Only describing what TO do. +**Good:** Explicitly listing what NOT to do — saves agents from repeating your mistakes. + +--- + +## Template: Interview Notes + +Use this to capture notes during the interview before assembling the spec: + +```markdown +# Interview Notes — [Project Name] +**Date:** YYYY-MM-DD +**Participants:** [Human], [Agent] + +## Vision +- One-liner: +- Target user: +- Trigger/motivation: +- Success criteria: + +## Features (raw notes) +1. Feature name — description — happy path — error cases — priority +2. ... + +## Technical Context +- Existing stack: +- Patterns to follow: +- Patterns to avoid: +- Build/test commands: + +## Constraints +- MUST: +- MUST NOT: +- PREFER: + +## Anti-patterns (things that didn't work) +1. +2. + +## Open Questions +1. +2. +``` + +--- + +_This guide is a living document. Update it as you learn what works._ diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..4b4ac8a --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,333 @@ +# Troubleshooting — When Things Go Wrong + +> Agents are remarkably capable, but they fail in predictable ways. +> This guide catalogs the common failure modes and how to fix each one. 
+ +--- + +## Failure Taxonomy + +Agent failures fall into five categories: + +| Category | Symptom | Severity | Fix Effort | +|----------|---------|----------|------------| +| **Stuck Loop** | Same task attempted repeatedly | 🟡 Medium | Clarify spec or split task | +| **Drift** | Code diverges from spec | 🟠 High | Review + reset + constrain | +| **Overengineering** | Too much abstraction, unnecessary complexity | 🟡 Medium | Simplify constraints in spec | +| **Test Theater** | Tests pass but don't test anything real | 🔴 Critical | Rewrite tests, add examples | +| **Context Overflow** | Agent loses track mid-iteration | 🟡 Medium | Reduce task size | + +--- + +## Problem 1: The Stuck Loop + +### Symptom +The agent attempts the same task 3+ iterations in a row. Git log shows repeated attempts and reverts, or no commits at all. + +### Root Causes + +**1a. Task is too large** +```markdown +# Bad +- [ ] Implement the entire authentication system + +# Fix: Split into smaller pieces +- [ ] Create JWT token generation function +- [ ] Create token refresh function +- [ ] Create auth middleware +- [ ] Wire auth into CLI commands +``` + +**1b. Spec is ambiguous** +The agent interprets the requirement differently each iteration, never matching what you expect. +```markdown +# Bad +- [ ] Handle errors properly + +# Fix: Be explicit +- [ ] Return HTTP 400 with { error: "message" } for validation failures +- [ ] Return HTTP 401 with { error: "Token expired" } for auth failures +- [ ] Log errors to stderr, never expose stack traces to users +``` + +**1c. External dependency is broken** +The agent can't complete the task because an API, service, or tool isn't working. +```markdown +# Detection: Agent's commit messages mention the same error repeatedly +# Fix: Add a note to the plan +- [ ] CLM API integration + > BLOCKED: CLM API returns 401 for all accounts. Skip this task. + > Will revisit when account provisioning is resolved. +``` + +**1d. Tests are impossible to pass** +Previous iteration wrote tests with wrong expectations, now the agent can't make them pass. +```bash +# Fix: Reset tests to last known-good state +git checkout HEAD~3 -- tests/ +# Then update the plan with clarification +``` + +### Recovery Steps +1. Check git log — is the agent making progress at all? +2. Read the agent's output from the last 2-3 iterations (session transcripts) +3. Identify which root cause matches +4. Apply the appropriate fix +5. Add a note to the plan explaining the resolution + +--- + +## Problem 2: Architecture Drift + +### Symptom +Over many iterations, the codebase structure diverges from what the spec defines. Files appear in wrong directories, patterns change between iterations, or the agent introduces frameworks/tools not in the spec. + +### Root Causes + +**2a. Spec doesn't specify architecture strongly enough** +```markdown +# Bad (too vague) +### Project structure +src/ +tests/ + +# Fix (explicit) +### Project structure +packages/ +├── server/ +│ ├── src/ +│ │ ├── routes/ # Express route handlers +│ │ ├── services/ # Business logic (no HTTP awareness) +│ │ ├── models/ # Database access +│ │ └── index.ts # Server entry point +│ └── tests/ +│ ├── routes/ # Integration tests (HTTP) +│ └── services/ # Unit tests (pure functions) +``` + +**2b. Agent "improves" existing patterns** +Iteration 8's agent thinks iteration 3's pattern is bad and refactors it, breaking the consistency. 
+ +```markdown +# Fix: Add to spec constraints +### Constraints +- MUST NOT refactor code from previous iterations unless the current task requires it +- MUST follow existing patterns (look at how similar features are already implemented) +- MUST NOT introduce new dependencies without explicit approval +``` + +**2c. Fresh context means no memory of decisions** +Each iteration starts fresh. The agent in iteration 10 doesn't know WHY iteration 3 chose a particular approach. + +```markdown +# Fix: Document decisions in a DECISIONS.md file +## Architecture Decisions + +### ADR-001: curl over fetch for HTTP calls +**Context:** Node.js fetch sends extra headers that cause SpringCM 500 errors. +**Decision:** Use child_process.exec with curl for all API calls. +**Status:** Accepted. DO NOT CHANGE. + +### ADR-002: Shared package for cross-cutting utilities +**Context:** 7 packages had duplicated auth, env, and API code. +**Decision:** Extract to packages/shared/, import as docusign-direct-shared. +**Status:** Accepted. All new packages must use shared utilities. +``` + +Add `DECISIONS.md` to the files the agent reads during the Orient phase. + +### Recovery Steps +1. `git diff` the current state against the spec's project structure +2. Identify what drifted and when (git log + git blame) +3. Reset if severe, or add corrective tasks to the plan +4. Strengthen the spec's architecture section +5. Add DECISIONS.md for non-obvious choices + +--- + +## Problem 3: Overengineering + +### Symptom +The agent creates elaborate abstractions, design patterns, and infrastructure that the spec doesn't call for. Factory factories. Abstract base classes for things with one implementation. Configuration systems for things with one value. + +### Root Causes + +**3a. Agent defaults to "enterprise" patterns** +LLMs are trained on a lot of enterprise code. They gravitate toward abstraction. + +```markdown +# Fix: Add to spec constraints +### Constraints +- PREFER simple functions over classes +- PREFER direct implementation over abstraction layers +- MUST NOT create an interface unless there are 2+ implementations +- MUST NOT add configuration for things that have one value +- Follow YAGNI: You Aren't Gonna Need It +``` + +**3b. Task is too vague, agent fills the gap with architecture** +```markdown +# Bad +- [ ] Create the data layer + +# Fix +- [ ] Create SQLite database with schema: users(id, name, email), + transactions(id, user_id, amount, date). Use better-sqlite3. + One file: src/db.ts. No ORM. +``` + +### Recovery Steps +1. Identify the unnecessary abstraction +2. Add explicit simplicity constraints to the spec +3. If the code works despite being over-engineered, leave it (unless it impedes future tasks) +4. If it's blocking progress, simplify and recommit + +--- + +## Problem 4: Test Theater + +### Symptom +All tests pass, but when you actually USE the software, it doesn't work correctly. Tests are checking for existence (`toBeDefined`), not behavior. + +### This is the most dangerous failure mode because it's invisible. + +### Root Causes + +**4a. No example I/O in the spec** +The agent doesn't know what correct output looks like, so it tests for "something came back." + +```markdown +# Fix: Add to spec +### Input/Output Examples + +**QFX Import:** +Input file: (see data/sample.qfx) +Expected output: +[ + { date: "2024-01-15", amount: -42.50, payee: "COSTCO", memo: "Purchase", type: "debit" }, + { date: "2024-01-16", amount: 2500.00, payee: "EMPLOYER INC", memo: "Payroll", type: "credit" } +] +``` + +**4b. 
No test quality standards in the spec** +```markdown +# Fix: Add to spec +### Testing Standards +- Every test must assert SPECIFIC values, not just "defined" or "truthy" +- Tests must include at least one edge case (empty input, null values) +- Tests must include at least one error case (invalid input, missing data) +- Use realistic test data, not "foo" and "bar" +- Test the PUBLIC behavior, not internal implementation details +``` + +**4c. Agent writes tests after implementation (confirmation bias)** +The agent sees what the code does and writes tests that confirm it — even if the code is wrong. + +```markdown +# Fix: Use the test-first pattern in the plan +- [ ] Write failing tests for QFX parser (based on spec examples) +- [ ] Implement QFX parser to pass tests +``` + +### Recovery Steps +1. Run the code yourself — does it actually work? +2. Read the test assertions — are they testing behavior or existence? +3. Add example I/O to the spec +4. Add a "rewrite tests" task to the plan with explicit expected values +5. Consider adding a test-first constraint to the spec + +--- + +## Problem 5: Context Overflow + +### Symptom +Agent starts strong but degrades mid-iteration. Later changes contradict earlier ones. The agent "forgets" what it was doing partway through a task. + +### Root Causes + +**5a. Task requires reading too many files** +The agent fills its context window with file contents and loses track of the goal. + +```markdown +# Fix: Make tasks more focused +# Bad +- [ ] Refactor all 7 packages to use shared utilities + +# Good +- [ ] Refactor clm-direct to use shared utilities +- [ ] Refactor docgen-direct to use shared utilities +- [ ] Refactor maestro-direct to use shared utilities +# ... (one per package) +``` + +**5b. Spec is too long** +If PROJECT-SPEC.md is 50 pages, the agent uses half its context just reading it. + +```markdown +# Fix: Section the spec so agents only read what they need +# In AGENT.md: +### Orient +- Read PROJECT-SPEC.md sections 1-2 (overview and tech stack) +- Read the acceptance criteria ONLY for the current task +- Read IMPLEMENTATION_PLAN.md +- Do NOT read sections you don't need this iteration +``` + +**5c. Agent reads too many files during orient** +```markdown +# Fix: Limit the orient phase +### Orient +- Read IMPLEMENTATION_PLAN.md (always) +- Read PROJECT-SPEC.md section for current task (not the whole thing) +- Run git log --oneline -5 (not -50) +- Check build status: npm run build 2>&1 | tail -5 +``` + +### Recovery Steps +1. Check if the task requires touching many files — if so, split it +2. Trim the spec (move detailed examples to separate reference files) +3. Adjust AGENT.md to limit the orient phase +4. Reduce iteration timeout to force smaller tasks + +--- + +## The Meta-Problem: When to Give Up + +Sometimes a task genuinely can't be done by an agent. Signs: + +- **Requires external knowledge** the agent can't access (undocumented API behavior) +- **Requires human judgment** that can't be specified (design aesthetics, UX decisions) +- **Requires real-time interaction** with a service (OAuth browser flows, 2FA) +- **Requires physical access** (hardware testing, network configuration) + +In these cases: +1. Mark the task as `HUMAN` in the plan +2. Do it yourself +3. Commit the result +4. Let the agent continue with the next task + +```markdown +- [x] Set up OAuth application in DocuSign admin panel (HUMAN) +- [ ] Implement JWT auth flow using the credentials from .env +``` + +There's no shame in doing parts yourself. 
The harness is a tool, not a religion. + +--- + +## Quick Reference: Failure → Fix + +| Failure | Quick Fix | +|---------|-----------| +| Agent repeats same task | Split the task into smaller pieces | +| Wrong architecture | Add explicit project structure to spec | +| Too much abstraction | Add YAGNI constraint to spec | +| Tests don't test anything | Add example I/O to spec | +| Agent adds unrequested features | Add "MUST NOT add features not in spec" constraint | +| Agent changes existing patterns | Add "MUST follow existing patterns" constraint | +| Agent uses wrong tool/framework | Be explicit about tech stack (MUST use X, MUST NOT use Y) | +| Progress stalls completely | Read last 3 transcripts, identify the blocker, unblock manually | + +--- + +_Every failure teaches you to write a better spec. That's the real loop._ diff --git a/TUTORIAL.md b/TUTORIAL.md new file mode 100644 index 0000000..b1fa8b8 --- /dev/null +++ b/TUTORIAL.md @@ -0,0 +1,864 @@ +# Tutorial: Build a CLI Tool in 30 Minutes with the Harness + +> This is a complete walkthrough from zero to working software. +> Example project: A markdown link checker that finds broken links in .md files. +> Follow along exactly — copy-paste commands, see what works. + +--- + +## What You'll Build + +**mdlinkcheck** — A CLI tool that: +- Scans markdown files for links +- Tests each HTTP/HTTPS link (returns 200?) +- Identifies broken links (404, timeout, etc.) +- Outputs a report with line numbers + +**Tech stack:** TypeScript, Node.js, Commander.js for CLI parsing + +**Time investment:** +- Interview & spec: 10 minutes +- Agent planning: 2 minutes +- Agent building: 15-20 minutes (4-6 iterations) +- Your review: 5 minutes + +--- + +## Prerequisites + +- Node.js 18+ installed +- Claude CLI (`npm install -g @anthropic-ai/claude-cli`) or OpenClaw access +- Agent harness templates (this repo) +- 30 minutes of your time + +--- + +## Phase 0: Setup (2 minutes) + +### 1. Create the project directory +```bash +mkdir ~/projects/mdlinkcheck +cd ~/projects/mdlinkcheck +git init +``` + +### 2. Copy templates +```bash +cp /path/to/agent-harness/AGENT-INSTRUCTIONS.md ./AGENT.md +cp /path/to/agent-harness/PROJECT-SPEC.md ./PROJECT-SPEC.md +cp /path/to/agent-harness/DECISIONS.md ./DECISIONS.md +cp /path/to/agent-harness/ralph-loop.sh ./ralph-loop.sh +chmod +x ralph-loop.sh +``` + +### 3. Initial commit +```bash +git add . +git commit -m "Initial setup: agent harness templates" +``` + +You now have an empty project with the harness scaffolding. + +--- + +## Phase 1: The Interview (10 minutes) + +Open a chat with your agent (Claude, Cleo, etc.) and run through the interview protocol from SPEC-CREATION-GUIDE.md. + +### Your opening message: +``` +I want to build a CLI tool using the agent harness. Let's create +the spec together using the interview protocol. + +Project: A markdown link checker that finds broken links. +``` + +### Expected Interview Flow + +**Agent:** "What are we building, in one sentence?" + +**You:** +``` +A CLI tool that scans markdown files for HTTP/HTTPS links and +reports which ones are broken (404, timeout, etc.). +``` + +**Agent:** "Who is this for?" + +**You:** +``` +Developers who maintain documentation. They run it before +committing to catch broken links. +``` + +**Agent:** "Why now? What's the trigger?" + +**You:** +``` +I maintain several docs repos and keep finding broken links after +they're published. I want to catch them earlier. +``` + +**Agent:** "What does 'done' look like?" 
+ +**You:** +``` +I can run `mdlinkcheck docs/**/*.md` and get a report like: +docs/api.md:15 - BROKEN - https://example.com/missing (404) +docs/guide.md:42 - BROKEN - https://oldsite.com (timeout) + +Found 2 broken links out of 47 total. + +Exit code 0 if all links work, 1 if any are broken (for CI). +``` + +**Agent:** "What have you tried before? What didn't work?" + +**You:** +``` +Tried markdown-link-check (npm package) but it's slow and has +too many dependencies. I want something simple and fast. +``` + +**Agent:** "Walk me through how you'd use this feature." + +**You:** +``` +1. Install: npm install -g mdlinkcheck +2. Run: mdlinkcheck README.md +3. See output: list of broken links or "All links OK!" +4. Check exit code: 0 = success, 1 = broken links found +``` + +**Agent:** "What could go wrong?" + +**You:** +``` +- No markdown files found (should show helpful message) +- Network is down (should timeout gracefully, not hang) +- Huge file with 1000+ links (should show progress) +- Link requires auth (should report as "unreachable" not "broken") +``` + +**Agent:** "What's the input? What's the output?" + +**You:** +``` +Input: File paths or glob patterns (e.g., docs/**/*.md) +Output: + - For each broken link: filename:lineNumber - STATUS - URL (error) + - Summary: "Found X broken links out of Y total" + - Exit code: 0 (all good) or 1 (broken links) +``` + +**Agent:** "How would you test this?" + +**You:** +``` +Create a test.md with known good and bad links: +- https://google.com (should work) +- https://thisurldoesnotexist123456.com (should fail) + +Run the tool, verify it reports the broken one. +``` + +**Agent:** "Is this a must-have or nice-to-have?" + +**You:** +``` +Must-haves: +- Parse markdown, extract links +- Test HTTP/HTTPS links +- Report broken ones with filename and line number +- Proper exit codes for CI + +Nice-to-haves (defer to v2): +- Check relative links (./other-doc.md) +- Parallel checking for speed +- Retry on timeout +- Config file for ignored URLs +``` + +--- + +## Phase 2: Spec Assembly (Agent does this) + +**You:** +``` +Great! Now fill out PROJECT-SPEC.md based on our interview. +Use the template in the repo. Be specific about: +- Tech stack: TypeScript, Node.js, Commander.js +- Build command: npm run build (compile TS to dist/) +- Test command: npm test (use Node.js test runner) +- Data model: not needed (stateless tool) +- One requirement per feature with acceptance criteria +``` + +**Agent generates spec (2-3 minutes):** + +The agent reads PROJECT-SPEC.md template and fills it out: + +```markdown +# Project Specification: mdlinkcheck + +## 1. Project Overview + +### What are we building? +A command-line tool that scans markdown files for HTTP/HTTPS links +and reports broken ones (404, timeout, connection errors). + +### Why does it matter? +Developers maintaining documentation need to catch broken links +before publishing. Existing tools are slow or over-complicated. + +### Success criteria +- [ ] Scan .md files and extract all HTTP/HTTPS links +- [ ] Test each link (HEAD request, follow redirects) +- [ ] Report broken links with filename:line number +- [ ] Exit code 0 (all OK) or 1 (broken links found) +- [ ] Works on CLI: `mdlinkcheck docs/**/*.md` + +## 2. 
Technical Foundation
+
+### Tech stack
+- Language: TypeScript
+- Runtime: Node.js 18+
+- CLI framework: Commander.js
+- HTTP client: Native fetch (Node 18+)
+- Build: tsc (TypeScript compiler)
+- Test: Node.js test runner (node:test)
+
+### Project structure
+mdlinkcheck/
+├── src/
+│   ├── cli.ts        # Commander.js entry point
+│   ├── parser.ts     # Extract links from markdown
+│   ├── checker.ts    # Test links via HTTP
+│   └── reporter.ts   # Format output
+├── tests/
+│   ├── parser.test.ts
+│   ├── checker.test.ts
+│   └── fixtures/     # Test markdown files
+├── dist/             # Compiled JS (gitignored)
+├── package.json
+├── tsconfig.json
+└── PROJECT-SPEC.md
+
+### Build & test commands
+npm install
+npm run build   # tsc
+npm test        # node --test tests/**/*.test.ts
+npm run lint    # tsc --noEmit (type check only)
+
+## 3. Requirements
+
+### FR-001: Extract Links from Markdown
+**Description:** Parse markdown files and extract all HTTP/HTTPS URLs.
+**Acceptance criteria:**
+- [ ] Find links in standard format: [text](https://url)
+- [ ] Find links in reference format: [text][ref] ... [ref]: https://url
+- [ ] Find bare URLs: https://example.com
+- [ ] Return array of { url, line } objects
+- [ ] Skip relative links (./file.md, /path/file.md)
+
+### FR-002: Test HTTP/HTTPS Links
+**Description:** Send HTTP request to each URL and determine status.
+**Acceptance criteria:**
+- [ ] Use HEAD request (faster than GET)
+- [ ] Follow redirects (up to 3)
+- [ ] Timeout after 10 seconds
+- [ ] Return status: OK | BROKEN | TIMEOUT | UNREACHABLE
+- [ ] Handle network errors gracefully (don't crash)
+
+### FR-003: CLI Interface
+**Description:** Command-line tool that accepts file paths or globs.
+**Acceptance criteria:**
+- [ ] Command: `mdlinkcheck <files...>`
+- [ ] Expand glob patterns (docs/**/*.md)
+- [ ] Show usage help with --help
+- [ ] Show version with --version
+- [ ] Exit code 0 if all links OK, 1 if any broken
+
+### FR-004: Reporting
+**Description:** Output broken links in readable format.
+**Acceptance criteria:**
+- [ ] Format: `filename:line - STATUS - URL (error message)`
+- [ ] Summary: "Found X broken links out of Y total"
+- [ ] Show "All links OK!" if none broken
+- [ ] Color output: red for broken, green for OK (optional)
+
+## 4. Anti-patterns
+- Don't use regex for markdown parsing — use a markdown parser
+- Don't use GET requests — HEAD is faster and sufficient
+- Don't hang on timeout — set explicit timeout
+- Don't crash on network errors — catch and report
+
+## 5. Phasing
+
+### Phase 1: Core (must-have)
+- [ ] Project setup (package.json, tsconfig, build)
+- [ ] Link extraction from markdown
+- [ ] HTTP link checking
+- [ ] CLI interface
+- [ ] Basic reporting
+
+### Phase 2: Polish (nice-to-have, defer)
+- [ ] Parallel checking
+- [ ] Progress bar
+- [ ] Config file for ignored URLs
+- [ ] Check relative links
+```
+
+**You review the spec:**
+```
+Looks good! One clarification: use the `marked` library for parsing
+markdown (don't write a regex-based parser). Update FR-001.
+```
+
+**Agent updates spec**, commits:
+```bash
+git add PROJECT-SPEC.md
+git commit -m "Add project specification for mdlinkcheck"
+```
+
+---
+
+## Phase 3: Planning (Agent does this)
+
+**You:**
+```
+Read PROJECT-SPEC.md. Create IMPLEMENTATION_PLAN.md by decomposing
+the project into discrete tasks ordered by dependency. Output
+PLANNED when done.
+```
+
+**Agent thinks and writes plan:**
+
+```markdown
+# Implementation Plan: mdlinkcheck
+
+## Phase 1: Foundation
+- [ ] Project scaffolding (package.json, tsconfig, src/, tests/)
+- [ ] CLI entry point with Commander.js (--help, --version)
+- [ ] Markdown link extraction (using marked lexer)
+- [ ] HTTP link checker (fetch with timeout)
+- [ ] Reporter (format output)
+- [ ] Wire together: CLI → parser → checker → reporter
+- [ ] Integration test with fixtures
+
+## Phase 2: Polish (Deferred)
+- [ ] Parallel checking
+- [ ] Progress indicator
+```
+
+**Agent outputs:**
+```
+PLANNED
+
+Plan created with 7 tasks. Review IMPLEMENTATION_PLAN.md.
+```
+
+**You review:**
+```
+Good! Let's start building.
+```
+
+---
+
+## Phase 4: Build Iterations (Agent does this, you watch)
+
+### Method A: Using ralph-loop.sh
+```bash
+./ralph-loop.sh --agent claude --max 10
+```
+
+### Method B: Using OpenClaw sessions_spawn
+```
+You: "Run the first agent harness iteration for mdlinkcheck"
+Cleo: *spawns sub-agent* → reads AGENT.md → picks first task → builds
+```
+
+### Method C: Manual (for learning)
+```
+You: "Read AGENT.md in /path/to/mdlinkcheck. Follow the core loop:
+     orient, pick the first unchecked task in IMPLEMENTATION_PLAN.md,
+     implement it, verify (npm run build && npm test), commit, exit."
+```
+
+---
+
+### Iteration 1: Project scaffolding
+
+**Agent output:**
+```
+Reading PROJECT-SPEC.md and IMPLEMENTATION_PLAN.md...
+First task: Project scaffolding
+
+Creating package.json...
+Creating tsconfig.json...
+Creating src/, tests/, tests/fixtures/...
+Installing dependencies: commander, marked, @types/node...
+Running build: ✅ compiles successfully
+Running tests: ✅ (no tests yet, but runner works)
+
+Committing...
+[main a1b2c3d] chore: project scaffolding with TypeScript and Commander
+Task complete. Exiting for fresh context.
+```
+
+**Git log:**
+```
+a1b2c3d chore: project scaffolding with TypeScript and Commander
+```
+
+**IMPLEMENTATION_PLAN.md updated:**
+```markdown
+- [x] Project scaffolding (package.json, tsconfig, src/, tests/)
+- [ ] CLI entry point with Commander.js (--help, --version)
+...
+```
+
+---
+
+### Iteration 2: CLI entry point
+
+**Agent output:**
+```
+Reading IMPLEMENTATION_PLAN.md...
+Next task: CLI entry point with Commander.js
+
+Creating src/cli.ts...
+Wiring up Commander: program.name('mdlinkcheck').description('...')
+Adding --help and --version...
+Adding bin entry to package.json...
+Testing: npm link (local install)
+Running: mdlinkcheck --help ✅
+Running: mdlinkcheck --version ✅
+
+Committing...
+[main b2c3d4e] feat: CLI entry point with Commander.js
+```
+
+**You can now run:**
+```bash
+npm link
+mdlinkcheck --help
+# mdlinkcheck - Find broken links in markdown files
+# Usage: mdlinkcheck <files...>
+```
+
+---
+
+### Iteration 3: Markdown link extraction
+
+**Agent output:**
+```
+Next task: Markdown link extraction
+
+Creating src/parser.ts...
+Using marked.lexer to parse markdown into tokens...
+Extracting links from link and image tokens...
+Tracking line numbers...
+Writing tests with fixtures/test.md...
+
+Test results:
+✅ Extracts standard markdown links
+✅ Extracts reference-style links
+✅ Skips relative links
+✅ Returns line numbers
+
+Committing...
+[main c3d4e5f] feat: markdown link extraction with line numbers +``` + +**Files created:** +- `src/parser.ts` +- `tests/parser.test.ts` +- `tests/fixtures/test.md` (sample markdown with various link formats) + +--- + +### Iteration 4: HTTP link checker + +**Agent output:** +``` +Next task: HTTP link checker + +Creating src/checker.ts... +Using fetch with HEAD method... +Following redirects (up to 3)... +10-second timeout via AbortController... +Error handling for network failures... + +Test results: +✅ Returns OK for 200 status +✅ Returns BROKEN for 404 +✅ Returns TIMEOUT for hung requests +✅ Returns UNREACHABLE for network errors + +Committing... +[main d4e5f6g] feat: HTTP link checker with timeout and error handling +``` + +--- + +### Iteration 5: Reporter + +**Agent output:** +``` +Next task: Reporter + +Creating src/reporter.ts... +Format: filename:line - STATUS - URL (error)... +Summary line with counts... +Color output using ANSI codes (red for broken, green for OK)... + +Test results: +✅ Formats broken link correctly +✅ Shows summary +✅ Handles no broken links case + +Committing... +[main e5f6g7h] feat: reporter with colored output and summary +``` + +--- + +### Iteration 6: Wire together + +**Agent output:** +``` +Next task: Wire together CLI → parser → checker → reporter + +Updating src/cli.ts... +For each file argument: + 1. Read file + 2. Extract links (parser) + 3. Check each link (checker) + 4. Collect broken links +Report results (reporter) +Set exit code (0 or 1) + +Build: ✅ +Test: ✅ + +Manual test with real URLs: +Created tests/fixtures/real-links.md: +- https://google.com +- https://thisurldoesnotexist123456.com + +Running: mdlinkcheck tests/fixtures/real-links.md +Output: + tests/fixtures/real-links.md:2 - BROKEN - https://thisurldoesnotexist123456.com (404) + Found 1 broken link out of 2 total. + +Exit code: 1 ✅ + +Committing... +[main f6g7h8i] feat: wire CLI components together, end-to-end working +``` + +--- + +### Iteration 7: Integration test + +**Agent output:** +``` +Next task: Integration test with fixtures + +Creating tests/integration.test.ts... +Test fixture with known good and bad links... +Run CLI programmatically... +Assert output contains expected broken links... +Assert exit code is 1... + +Test results: +✅ Detects broken links in fixture +✅ Reports correct filename and line number +✅ Exit code is 1 when links are broken +✅ Exit code is 0 when all links are OK + +All tasks complete! +DONE + +Committing... +[main g7h8i9j] test: add integration tests with fixtures +``` + +--- + +## Phase 5: Review (You do this) + +### 1. Check the git log +```bash +git log --oneline +``` + +Output: +``` +g7h8i9j test: add integration tests with fixtures +f6g7h8i feat: wire CLI components together, end-to-end working +e5f6g7h feat: reporter with colored output and summary +d4e5f6g feat: HTTP link checker with timeout and error handling +c3d4e5f feat: markdown link extraction with line numbers +b2c3d4e feat: CLI entry point with Commander.js +a1b2c3d chore: project scaffolding with TypeScript and Commander +``` + +✅ **7 iterations, 7 commits** — clean history + +### 2. Run the build and tests +```bash +npm run build +npm test +``` + +Output: +``` +> build +tsc + +> test +✅ parser extracts links correctly (4 tests) +✅ checker tests URLs correctly (4 tests) +✅ reporter formats output (3 tests) +✅ integration end-to-end (2 tests) + +13 tests passed +``` + +✅ **All tests pass** + +### 3. Try it for real +```bash +npm link +mdlinkcheck README.md +``` + +Output: +``` +All links OK! 
✅
+Found 0 broken links out of 5 total.
+```
+
+**Try with a broken link:**
+```bash
+echo "[broken](https://thisurldoesnotexist123456.com)" > test.md
+mdlinkcheck test.md
+```
+
+Output:
+```
+test.md:1 - BROKEN - https://thisurldoesnotexist123456.com (404)
+Found 1 broken link out of 1 total.
+```
+
+✅ **Works as specified**
+
+### 4. Review the code quality
+
+**Check src/parser.ts:**
+```typescript
+import { marked } from 'marked';
+
+interface LinkInfo {
+  url: string;
+  line: number;
+}
+
+export function extractLinks(markdown: string): LinkInfo[] {
+  const tokens = marked.lexer(markdown);
+  const links: LinkInfo[] = [];
+
+  function walkTokens(token: any, lineOffset = 0) {
+    if (token.type === 'link' || token.type === 'image') {
+      if (token.href.startsWith('http://') || token.href.startsWith('https://')) {
+        links.push({
+          url: token.href,
+          line: lineOffset + (token.line || 1)
+        });
+      }
+    }
+    if (token.tokens) {
+      token.tokens.forEach((t: any) => walkTokens(t, lineOffset));
+    }
+  }
+
+  tokens.forEach(token => walkTokens(token));
+  return links;
+}
+```
+
+✅ **Clean, readable, uses marked instead of regex**
+
+**Check src/checker.ts:**
+```typescript
+export async function checkLink(url: string): Promise<'OK' | 'BROKEN' | 'TIMEOUT' | 'UNREACHABLE'> {
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), 10000);
+
+  try {
+    const response = await fetch(url, {
+      method: 'HEAD',
+      signal: controller.signal,
+      redirect: 'follow'
+    });
+
+    clearTimeout(timeout);
+    return response.ok ? 'OK' : 'BROKEN';
+  } catch (error) {
+    clearTimeout(timeout);
+    if (error.name === 'AbortError') return 'TIMEOUT';
+    return 'UNREACHABLE';
+  }
+}
+```
+
+✅ **Proper timeout handling, follows redirects, good error handling**
+
+### 5. Review against spec
+
+| Requirement | Status |
+|-------------|--------|
+| FR-001: Extract links from markdown | ✅ Implemented with marked |
+| FR-002: Test HTTP/HTTPS links | ✅ HEAD requests with timeout |
+| FR-003: CLI interface | ✅ Commander.js with --help |
+| FR-004: Reporting | ✅ Colored output with summary |
+| Exit codes | ✅ 0 for success, 1 for broken |
+| Tests | ✅ 13 tests, all passing |
+
+---
+
+## Phase 6: Polish (Optional)
+
+**You:**
+```
+This looks great! Let's add one enhancement: show a progress
+indicator when checking many links. Add this as a task to the plan.
+```
+
+**You update IMPLEMENTATION_PLAN.md:**
+```markdown
+## Phase 2: Polish
+- [ ] Progress indicator (show "Checking... 5/47" while running)
+```
+
+**You:**
+```
+Run one more iteration to implement the progress indicator.
+```
+
+**Agent (iteration 8):**
+```
+Reading plan...
+Next task: Progress indicator
+
+Adding progress output to src/cli.ts...
+Show "Checking link 5/47..." as each link is tested...
+Clear line and show summary when done...
+
+Build: ✅
+Test: ✅
+Manual test: Shows progress ✅
+
+Committing...
+[main h8i9j0k] feat: add progress indicator for link checking
+DONE
+```
+
+**Now when you run it:**
+```bash
+mdlinkcheck docs/**/*.md
+```
+
+Output:
+```
+Checking link 1/47...
+Checking link 2/47...
+...
+Checking link 47/47...
+
+All links OK! ✅
+Found 0 broken links out of 47 total.
+```
+
+---
+
+## What You Just Did
+
+In **30 minutes**, you:
+
+1. ✅ Interviewed with an agent to create a clear spec
+2. ✅ Let the agent plan the task decomposition
+3. ✅ Ran 7 autonomous iterations (each: implement → test → commit)
+4. ✅ Reviewed the output (5 minutes)
+5. ✅ Added a polish enhancement (1 more iteration)
+6.
✅ Shipped a working CLI tool with tests
+
+**Your effort:**
+- Interview: 10 min (you answered questions)
+- Review: 5 min (you ran the code and checked quality)
+- Total: 15 minutes of YOUR time
+
+**Agent's effort:**
+- Planning: 2 min
+- Building: 18 min (8 iterations × ~2 min each)
+- Total: 20 minutes of autonomous work
+
+**You wrote zero lines of code.** You defined WHAT to build, and the agent figured out HOW.
+
+---
+
+## What to Try Next
+
+### Package it for npm
+```markdown
+- [ ] Add README.md with usage examples
+- [ ] Add LICENSE (MIT)
+- [ ] Publish to npm: `npm publish`
+```
+
+### Add features
+```markdown
+- [ ] Check relative links (./other-doc.md)
+- [ ] Config file to ignore certain URLs
+- [ ] Parallel checking (Promise.all for speed)
+- [ ] JSON output mode for CI integration
+```
+
+### Use it in CI
+```yaml
+# .github/workflows/docs.yml
+name: Check Links
+on: [push]
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: npm install -g mdlinkcheck
+      - run: mdlinkcheck docs/**/*.md
+```
+
+---
+
+## Key Lessons from This Tutorial
+
+### 1. The spec is everything
+The 10-minute interview created a spec that guided 8 flawless iterations. Bad spec = bad output.
+
+### 2. Fresh context prevents drift
+Each iteration started fresh. No context overflow, no confusion from stale reasoning.
+
+### 3. Tests validate autonomy
+Every iteration ran `npm test`. Failing tests forced the agent to fix before proceeding. No test theater.
+
+### 4. Git history tells the story
+One commit per task. Clean, reviewable, revertable.
+
+### 5. You shift from writer to reviewer
+Your job: define the goal, review the output, course-correct. The agent writes the code.
+
+---
+
+_Now go build something real. Interview, spec, run the loop, review. Repeat._
diff --git a/VALIDATION-TEMPLATE.md b/VALIDATION-TEMPLATE.md
new file mode 100644
index 0000000..db8e814
--- /dev/null
+++ b/VALIDATION-TEMPLATE.md
@@ -0,0 +1,42 @@
+# Validation Evidence Template
+
+> Write this file after completing each packet.
+> File location: `.harness/<stream-name>/validation/<packet-id>-validation.md`
+> Commit it in the same commit as the packet code.
+
+---
+
+# [XX-NN] Validation Evidence
+**Date:** YYYY-MM-DD
+**Agent:** [model name, e.g. claude-sonnet-4.6]
+**Stream:** [stream name]
+**Packet:** [XX-NN — Packet Name]
+
+## Test Counts
+| Metric | Value |
+|--------|-------|
+| Tests before packet | NNNN |
+| Tests after packet | NNNN |
+| New tests added | NN |
+| TypeScript errors in new files | 0 |
+
+## Known-Answer Tests
+
+- [x] [Description] (Source: [URL]): PASS
+- [x] [Description] (Source: [URL]): PASS
+
+## Acceptance Criteria
+
+- [x] [Criterion]: ✅
+- [x] Full test suite green: ✅ (NNNN passing)
+- [x] TypeScript clean: ✅
+
+## Files Created
+- `src/...` — [N lines, brief description]
+- `src/.../__tests__/...` — [N lines, N tests]
+
+## Commit
+`[hash]` — [commit message first line]
+
+## Notes
+[Implementation decisions, deviations from plan, lessons learned, gotchas]
diff --git a/WAVE-BASED-MANAGEMENT.md b/WAVE-BASED-MANAGEMENT.md
new file mode 100644
index 0000000..1ecbe6e
--- /dev/null
+++ b/WAVE-BASED-MANAGEMENT.md
@@ -0,0 +1,268 @@
+# Wave-Based Project Management
+
+> The biggest gap in most agentic projects: **planning only one task at a time.**
+> This guide captures the wave-based approach — planning a full stream's worth of work
+> before writing a single line of implementation code.
+>
+> Proven in practice: 44 tasks across 4 waves, 1,254 → 1,597 tests, zero regressions.
+
+---
+
+## The Core Insight: Plan the Stream, Not the Task
+
+The basic harness has you plan one task at a time. This works for small projects.
+For larger projects, it creates problems:
+
+- **Scope drift:** Agent picks up the next task without understanding how it fits the stream
+- **Missing dependencies:** Packet 3 turns out to need something Packet 1 should have built
+- **Unknown-answer tests discovered too late:** Financial formulas validated by feel, not by known CRA/ESDC figures
+- **No clear "done":** What does stream completion actually mean?
+
+The solution: **write the entire execution board for a stream before implementing any of it.**
+
+```
+❌ Old approach:
+   Plan task → Implement task → Plan next task → Implement → ...
+
+✅ Wave approach:
+   Plan ENTIRE stream → Review plan → Implement packet-by-packet → Close stream
+```
+
+---
+
+## The Four Levels of Structure
+
+```
+Project
+└── Waves (groups of streams, sequenced by dependency)
+    └── Streams (a feature or module — has its own branch)
+        └── Packets (atomic unit of work — one commit per packet)
+            └── Tasks (sub-steps within a packet)
+```
+
+### Waves
+A wave is a set of streams that logically belong together and can be started in parallel (or have light dependencies between them). Waves are gated — Wave N+1 doesn't start until Wave N is fully merged and green.
+
+**Example:**
+- Wave 1: Core data models + calculation engines (everything else depends on this)
+- Wave 2: Advisory layer + specialized tools (uses Wave 1 outputs)
+- Wave 3: Infrastructure + integrations (can be parallel with Wave 2)
+- Wave 4: Future vision / stretch goals
+
+### Streams
+A stream is a feature branch with a defined scope. It has:
+- One `execution-board.md` (written before any code)
+- 2–6 packets
+- One `process-eval.md` (written after merge)
+- Validation evidence per packet
+
+### Packets
+A packet is the atomic unit — one focused chunk of work that produces a commit. It has:
+- A clear goal (one sentence)
+- Explicit steps
+- Known-answer tests (mandatory for calculation work)
+- Programmatically verifiable acceptance criteria
+- One validation evidence file
+
+---
+
+## The Execution Board: Your Planning Artifact
+
+The execution board lives at `.harness/<stream-name>/execution-board.md`.
+Copy `EXECUTION-BOARD-TEMPLATE.md` and fill it in.
+
+**The rule:** The board must be complete before you write a single line of implementation.
+
+### What "complete" means:
+- Every packet is defined with goal, steps, files, and acceptance criteria
+- Known-answer tests are written out (not "TBD") for any calculation
+- Dependency order between packets is explicit
+- Stream completion criteria are listed
+
+### What happens if you skip it:
+- You discover mid-stream that Packet 3 needs something Packet 1 didn't build
+- You commit calculation code with no ground-truth validation
+- You have no clear definition of "done" for the stream
+- The next agent session doesn't know what state the stream is in
+
+---
+
+## Known-Answer Tests: The Most Important Rule
+
+For any stream that touches domain-specific calculations (financial math, scientific formulas, regulatory thresholds, physical constants), every calculation module **must** include at least one known-answer test citing an official source.
+ +```typescript +// ✅ Correct: cites official source, tests exact value +test('CPP at 70 is exactly 42% more than at 65', () => { + // Source: ESDC https://www.canada.ca/en/services/benefits/publicpensions/cpp/benefit-amount.html + // Formula: +0.7% per month after 65 × 60 months = +42% + expect(calculateCPPBenefitAtAge(1000, 70) / calculateCPPBenefitAtAge(1000, 65)).toBeCloseTo(1.42, 5); +}); + +// ❌ Wrong: no source, tests implementation against itself +test('CPP at 70 returns more than at 65', () => { + expect(calculateCPPBenefitAtAge(1000, 70)).toBeGreaterThan(calculateCPPBenefitAtAge(1000, 65)); +}); +``` + +**Why this matters:** An agent can write a plausible-looking formula that's subtly wrong. Without a known-answer test from an authoritative source, you won't catch it until someone gets incorrect results in production. With known-answer tests, errors are caught immediately. + +### What qualifies as a "known-answer source": +- Government publications (CRA, ESDC, IRS, HMRC, etc.) +- Official standards documents (ISO, RFC, IEEE) +- Published academic results +- Regulatory filings with specific numerical requirements +- Product specifications with exact values + +### The financial accuracy eval pattern +For financial software, create a separate calibration test suite that lives outside the normal unit tests: + +``` +evals/ +└── code-quality/ + └── financial-accuracy.test.ts ← Run with: npm run eval:financial-accuracy +``` + +This suite contains ONLY known-answer tests from official sources. It grows over time as you add calculation modules. Run it independently to verify the app's financial accuracy hasn't drifted. + +--- + +## EXECUTION_MASTER.md: The Project Dashboard + +Every project using wave-based management should have a single coordination file — typically `EXECUTION_MASTER.md` or equivalent — that shows: + +```markdown +# Project Execution Master + +## Wave Status +| Wave | Description | Status | +|------|-------------|--------| +| Wave 1 | Core foundations | ✅ Complete | +| Wave 2 | Advisory layer | 🟡 In progress | +| Wave 3 | Infrastructure | ⏸️ Not started | + +## Active Streams +| Stream | Branch | Status | Blocker | +|--------|--------|--------|---------| +| cpp-optimizer | feat/cpp-optimizer | ✅ Merged | — | +| rrsp-meltdown | feat/rrsp-meltdown | 🟠 In progress | — | +| estate-planning | feat/estate-planning | ⏸️ Planned | Needs rrsp-meltdown | + +## Parallelism Rules +1. Max 2 active streams simultaneously +2. Shared schema changes are always sequential +3. Integration gate before any merge: full test suite must stay green +``` + +**Every agent session starts by reading this file.** It immediately knows: +- What wave is active +- Which streams are running +- What's blocked and why +- What can run in parallel + +--- + +## The Wave Gate + +Before starting Wave N+1, verify: + +``` +[ ] All streams in Wave N merged to main +[ ] Full test suite green (count ≥ baseline) +[ ] Domain-specific accuracy suite passing (if applicable) +[ ] All regression baselines saved +[ ] Process evals written for all Wave N streams +[ ] process-eval-history.json updated +[ ] IMPLEMENTATION_PLAN: all Wave N tasks marked [x] +[ ] EXECUTION_MASTER: Wave N status updated to ✅ +[ ] Human sign-off: outputs are producing correct/plausible results +``` + +The gate exists because Wave N+1 often builds on Wave N's outputs. If Wave N has silent bugs, they compound in Wave N+1. Catch them at the gate. 
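+
+Parts of this gate can be scripted. A minimal sketch (hypothetical — the baseline file path and the "N passing" output format are assumptions; adapt both to your test runner):
+
+```typescript
+// wave-gate.ts — hypothetical check for one gate item: test count ≥ baseline.
+import { execSync } from 'node:child_process';
+import { readFileSync } from 'node:fs';
+
+// e.g. { "tests": 1254 }, snapshotted under regression-baselines/ at the last gate
+const baseline = JSON.parse(
+  readFileSync('.harness/regression-baselines/test-count.json', 'utf8'),
+);
+
+// execSync throws if the suite itself fails — that also fails the gate.
+const out = execSync('npm test', { encoding: 'utf8', stdio: 'pipe' });
+const passing = Number(out.match(/(\d+) passing/)?.[1] ?? 0);
+
+if (passing < baseline.tests) {
+  console.error(`Wave gate FAILED: ${passing} passing < baseline ${baseline.tests}`);
+  process.exit(1);
+}
+console.log(`Wave gate: ${passing} passing (baseline ${baseline.tests}) — OK`);
+```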
+
+---
+
+## File Organization
+
+```
+<project-root>/
+├── AGENT.md                 ← Agent instructions (adapted from AGENT-INSTRUCTIONS.md)
+├── IMPLEMENTATION_PLAN.md   ← Master backlog (tasks 1-N, all waves)
+├── PROJECT-SPEC.md          ← What to build (never changes)
+├── DECISIONS.md             ← Architecture Decision Records
+└── .harness/
+    ├── EXECUTION_MASTER.md          ← Wave/stream dashboard
+    ├── EXECUTION-BOARD-TEMPLATE.md  ← Copy this for new streams
+    ├── VALIDATION-TEMPLATE.md       ← Copy this for packet evidence
+    ├── PROCESS-EVAL-TEMPLATE.md     ← Copy this for stream retrospectives
+    ├── regression-baselines/        ← Deterministic output snapshots
+    ├── <stream-name>/
+    │   ├── execution-board.md       ← Written BEFORE implementation
+    │   ├── process-eval.md          ← Written AFTER merge
+    │   └── validation/
+    │       ├── <packet-1>-validation.md
+    │       └── <packet-2>-validation.md
+    └── <next-stream>/
+        └── ...
+```
+
+---
+
+## Adapting for Your Project
+
+### Projects WITHOUT domain-specific calculations
+Skip the known-answer tests and financial accuracy eval. Keep everything else.
+
+### Projects with a small scope (< 10 tasks)
+Skip waves entirely — just use streams. One execution board per logical feature group.
+
+### Projects with a single developer (no parallelism)
+Streams are still valuable for planning discipline even if run sequentially.
+
+### Non-TypeScript / non-test projects
+Adapt the commit trailers (see the example commit message after the wave gate section above). The key trailers to keep are:
+- **What model did the work** (for attribution and quality tracking)
+- **Test counts** or equivalent quality metric
+- **Build / type check status**
+
+---
+
+## Quick Reference: The Discipline in One Page
+
+```
+BEFORE CODING:
+  ✅ Write execution board for the entire stream
+  ✅ Define known-answer tests for ALL calculation modules
+  ✅ Make acceptance criteria programmatically verifiable
+
+PER PACKET:
+  ✅ Code + tests in same commit
+  ✅ Full suite green before moving on
+  ✅ Write validation evidence immediately after
+  ✅ Commit trailer: Agent / Tests / Tests-Added / TypeScript
+
+PER STREAM:
+  ✅ Write process eval honestly
+  ✅ Merge with --no-ff
+  ✅ Update EXECUTION_MASTER
+
+PER WAVE:
+  ✅ Run wave gate checklist before starting next wave
+  ✅ Human sign-off on outputs
+```
+
+---
+
+## Why This Works
+
+The wave-based approach solves three failure modes common in agent projects:
+
+**1. Scope drift** — The execution board defines the stream's boundaries upfront. Agents can't drift into unrelated work because the plan is explicit.
+
+**2. Hidden inaccuracies** — Known-answer tests with official citations are written in the planning phase, before any implementation. This forces precision in the spec, which translates directly into correct implementations.
+
+**3. No definition of done** — The stream completion criteria (in the execution board) tell every agent, every session: "the stream is done when these boxes are checked." No ambiguity.
+
+---
+
+*This pattern was developed through practice on the Fintrove project (2026-03-31 → 2026-04-01): 4 waves, 11 streams, 44 tasks, 1,254 → 1,597 tests, zero regressions.*
diff --git a/archive/Agent-Harness-Project-spec-example.md b/archive/Agent-Harness-Project-spec-example.md
new file mode 100644
index 0000000..b83ffb6
--- /dev/null
+++ b/archive/Agent-Harness-Project-spec-example.md
@@ -0,0 +1,467 @@
+# Project Specification: Agent Harness System
+
+## 1. Project Overview
+
+### What are we building?
+The Agent Harness System is a collection of templates, scripts, and best practices for running autonomous AI-powered coding agents on complex software projects.
It provides a structured framework to decompose large projects into manageable tasks, execute them iteratively with fresh agent contexts, and maintain high-quality code through mandatory testing and verification.
+
+### Why does it matter?
+Traditional AI coding assistants struggle with large, multi-step projects due to context window limitations and the need for iterative refinement. The Agent Harness addresses this by providing a "Ralph Wiggum Loop" mechanism that spawns fresh agents for each task iteration, preventing context drift while maintaining project coherence through structured documentation and git-based memory.
+
+### Success criteria
+- [ ] Agents can autonomously decompose complex project specs into testable tasks
+- [ ] Fresh agent iterations prevent context overflow and stale reasoning
+- [ ] Mandatory build/test cycles ensure code quality
+- [ ] Git history serves as reliable inter-iteration memory
+- [ ] System works with multiple AI agents (Claude, Codex, etc.)
+- [ ] Clear signals for completion, stuck states, and errors
+- [ ] Comprehensive documentation enables easy adoption
+
+---
+
+## 2. Technical Foundation
+
+### Tech stack
+- **Language:** Bash (for the loop script), Markdown (for templates)
+- **Tools:** Git, shell commands, AI agent CLIs (claude, codex)
+- **Build system:** N/A (templates for various project types)
+- **Test framework:** Project-specific (agents run their own tests)
+- **Package manager:** N/A
+
+### Project structure
+```
+docs/agent-harness/
+├── README.md              # Quick overview and file purposes
+├── AGENT-INSTRUCTIONS.md  # Template for agent system prompts
+├── PROJECT-SPEC.md        # Template for project specifications
+├── ralph-loop.sh          # The loop execution script
+└── EXAMPLES.md            # Worked examples and best practices
+```
+
+### Build & test commands
+The harness itself doesn't have build/test commands, but agents using it must define them in their PROJECT-SPEC.md.
+
+### Coding standards
+- Markdown files use consistent formatting with headers, lists, code blocks
+- Bash scripts use `set -euo pipefail` for error handling
+- Templates include clear placeholders and examples
+- Documentation focuses on actionable, specific guidance
+
+---
+
+## 3. Requirements
+
+### Functional Requirements
+
+#### FR-001: Project Specification Template
+**Description:** A comprehensive template that captures all necessary project details for autonomous agent work.
+**Acceptance criteria:**
+- [ ] Covers project overview, technical foundation, requirements, data models
+- [ ] Includes phasing for large projects
+- [ ] Provides reference materials and anti-patterns
+- [ ] Enables agents to work without human intervention
+
+#### FR-002: Agent Instructions Template
+**Description:** System prompt template that defines agent behavior, the core loop, and rules.
+**Acceptance criteria:**
+- [ ] Defines senior engineer role with full codebase access
+- [ ] Specifies exact sequence: orient → plan → pick task → implement → verify → commit → exit
+- [ ] Includes output signals for loop control (PLANNED/DONE/STUCK/ERROR tags)
+- [ ] Enforces one-task-per-iteration rule
+
+#### FR-003: Ralph Wiggum Loop Script
+**Description:** Bash script that orchestrates agent iterations with fresh contexts.
+**Acceptance criteria:** +- [ ] Spawns fresh agent processes each iteration +- [ ] Supports planning mode and build mode +- [ ] Monitors output signals for completion/stuck/error states +- [ ] Logs all iterations for debugging +- [ ] Configurable max iterations and agent type + +#### FR-004: Implementation Plan Management +**Description:** Dynamic task decomposition and tracking system. +**Acceptance criteria:** +- [ ] Agents create IMPLEMENTATION_PLAN.md from project spec +- [ ] Tasks ordered by dependency with checkboxes +- [ ] Plan updated after each completed task +- [ ] Git commits preserve plan history + +#### FR-005: Quality Assurance Integration +**Description:** Mandatory build and test verification in each iteration. +**Acceptance criteria:** +- [ ] Agents run project-specific build commands +- [ ] All tests must pass before committing +- [ ] Build failures prevent progression +- [ ] Linting enforced if configured + +### Non-Functional Requirements + +#### NFR-001: Simplicity +- [ ] No complex dependencies or frameworks +- [ ] Works with standard shell and git +- [ ] Easy to copy templates into any project +- [ ] Minimal setup required + +#### NFR-002: Reliability +- [ ] Fresh contexts prevent reasoning drift +- [ ] Git history provides audit trail +- [ ] Clear error signals for human intervention +- [ ] Handles agent failures gracefully + +#### NFR-003: Flexibility +- [ ] Supports multiple AI agents (Claude, Codex, etc.) +- [ ] Works with various project types and tech stacks +- [ ] Configurable iteration limits and modes +- [ ] Extensible for custom workflows + +--- + +## 4. Data Model + +The Agent Harness is documentation-focused, not data-focused. The "data" is the project files themselves. + +### Entities + +Entity: Project Spec +- Overview: what/why/success criteria +- Technical foundation: stack, structure, commands +- Requirements: functional/non-functional +- Data model: project-specific entities +- Architecture: constraints, decisions +- Phasing: optional breakdown +- References: docs, examples, anti-patterns + +Entity: Implementation Plan +- Tasks: discrete, testable, dependency-ordered +- Status: checkbox per task +- Notes: agent comments on stuck tasks +- History: git commits track plan evolution + +Entity: Agent Iteration +- Context: fresh read of spec + plan + git log +- Task: one unchecked item from plan +- Changes: code modifications + tests +- Verification: build + test results +- Commit: descriptive message + plan update + +### Relationships +- Project Spec → Implementation Plan (agent creates from spec) +- Implementation Plan → Agent Iterations (one task per iteration) +- Agent Iterations → Git Commits (each iteration commits changes) + +--- + +## 5. API / Interface Design + +The harness provides command-line interfaces: + +### ralph-loop.sh Commands + +```bash +./ralph-loop.sh # Build mode (default) +./ralph-loop.sh plan # Planning mode +./ralph-loop.sh --max 20 # Limit iterations +./ralph-loop.sh --agent claude # Specify agent +``` + +### Template Files +- PROJECT-SPEC.md: Fill with project details +- AGENT.md: Copy from AGENT-INSTRUCTIONS.md +- IMPLEMENTATION_PLAN.md: Generated by agent + +### Output Signals +Agents output special tags that the loop monitors: +- `PLANNED`: Plan created +- `DONE`: All tasks complete +- `STUCK`: Needs human help +- `ERROR`: Unrecoverable error + +--- + +## 6. 
Architecture Decisions
+
+### Constraints
+- MUST: Use fresh agent contexts each iteration
+- MUST: One task per agent iteration
+- MUST: Mandatory build/test verification
+- MUST NOT: Allow context compaction or memory accumulation
+- PREFER: Git as the coordination mechanism
+- PREFER: Simple bash orchestration over complex frameworks
+
+### Dependencies
+- Git (version control)
+- AI agent CLI (claude, codex, etc.)
+- Shell environment (bash)
+- Project-specific build tools (npm, etc.)
+
+### Known Challenges
+- Context window limitations of AI agents
+- Maintaining coherence across iterations
+- Handling agent failures or stuck states
+- Balancing specificity vs. flexibility in templates
+
+---
+
+## 7. Phasing (Optional)
+
+The harness itself is complete in one phase, but projects using it should phase their work.
+
+### Phase 1: Foundation
+- [ ] Copy templates into project
+- [ ] Fill PROJECT-SPEC.md
+- [ ] Run planning mode to create IMPLEMENTATION_PLAN.md
+
+### Phase 2: Execution
+- [ ] Run build iterations until completion
+- [ ] Monitor for stuck/error signals
+- [ ] Intervene as needed
+
+### Phase 3: Refinement
+- [ ] Review final codebase
+- [ ] Update templates based on lessons learned
+- [ ] Document improvements for future use
+
+---
+
+## 8. Reference Materials
+
+### External docs
+- Geoffrey Huntley's Ralph Wiggum approach
+- Nate Jones task decomposition method
+- Ezward's sequential PRD style
+- OpenClaw sessions_spawn documentation
+
+### Existing code to learn from
+- ralph-loop.sh: Clean bash scripting with error handling
+- Templates: Structured markdown with clear sections
+- Examples: Real-world project specifications
+
+### Anti-patterns
+- Don't try to pass context between iterations
+- Don't let agents work on multiple tasks simultaneously
+- Don't skip build/test verification
+- Don't use complex orchestration when bash loop suffices
+- Don't make templates too rigid — they should be adapted per project
+
+---
+
+## All Template Files and Their Roles
+
+### AGENT-INSTRUCTIONS.md
+**Role:** System prompt template for the AI agent. Defines the senior engineer role, core workflow loop, strict rules, and output signals. Agents read this each iteration to understand their behavior.
+
+**Key Sections:**
+- Role definition and capabilities
+- Core loop: orient → plan/pick → implement → verify → commit → exit
+- Rules: one task per iteration, mandatory testing, no over-engineering
+- Output signals: PLANNED/DONE/STUCK/ERROR tags for loop control
+- Context management: fresh starts with git as memory
+
+### PROJECT-SPEC.md
+**Role:** Comprehensive project definition template. The single source of truth that agents read every iteration. Captures all requirements, constraints, and context needed for autonomous work.
+
+**Key Sections:**
+- Project overview (what, why, success criteria)
+- Technical foundation (stack, structure, commands)
+- Detailed requirements (functional + non-functional)
+- Data models and API design
+- Architecture decisions and constraints
+- Phasing and reference materials
+
+### ralph-loop.sh
+**Role:** Bash script implementing the Ralph Wiggum Loop mechanism. Orchestrates agent iterations, monitors completion signals, handles errors, and maintains logs.
+
+**Key Features:**
+- Fresh agent spawning each iteration
+- Planning mode vs. build mode
+- Signal monitoring (PLANNED/DONE/STUCK/ERROR tags)
+- Configurable agents and iteration limits
+- Comprehensive logging
+
+### EXAMPLES.md
+**Role:** Worked examples, comparisons of approaches, and best practices.
Shows how to write good specs, compares different methodologies, and provides integration examples. + +**Key Content:** +- Comparison of Ezward/Ralph/Nate approaches +- Complete FinPlan project spec example +- Best practices for spec writing +- OpenClaw integration examples + +## The Ralph Wiggum Loop Mechanism + +The Ralph Wiggum Loop is named after the Simpsons character known for forgetting everything immediately, forcing fresh starts. This is the core innovation: + +### How It Works +1. **Fresh Context Each Time:** Every iteration spawns a completely new agent process with no accumulated context from previous runs. + +2. **Read-Only Memory:** Agents rely on: + - PROJECT-SPEC.md (static requirements) + - IMPLEMENTATION_PLAN.md (current task status) + - Git log (recent changes) + - Codebase state + - Test results + +3. **One Task Per Iteration:** Agents pick exactly one unchecked task, implement it completely, verify with build/tests, commit, and exit. + +4. **Signal-Based Control:** Agents output tags that the bash loop monitors to determine next action. + +5. **Git as Coordination:** Each iteration's changes are committed, creating an audit trail and allowing the next agent to see what was done. + +### Benefits +- Prevents context window overflow +- Eliminates stale reasoning problems +- Enables indefinite project scaling +- Provides clear intervention points +- Maintains code quality through iteration + +### Flow Diagram +``` +Start Loop +├── Read PROJECT-SPEC.md +├── Run Agent with Fresh Context +├── Agent: Orient (read plan, git log) +├── Agent: Pick ONE Task +├── Agent: Implement + Verify +├── Agent: Commit + Mark Done +├── Check Output Signals +├── If DONE: Exit Success +├── If STUCK/ERROR: Exit with Warning +└── Else: Loop Again +``` + +## How to Use for Autonomous Coding Workflows + +### Quick Start +1. Copy templates into your project root +2. Fill out PROJECT-SPEC.md with complete project details +3. Run `./ralph-loop.sh plan` to generate IMPLEMENTATION_PLAN.md +4. Run `./ralph-loop.sh` to start autonomous building +5. Monitor progress; intervene if agent gets stuck + +### Detailed Workflow +1. **Preparation:** + - Choose project directory + - Copy all 4 template files + - Customize PROJECT-SPEC.md with your requirements + - Ensure build/test commands work + +2. **Planning Phase:** + - Run `./ralph-loop.sh plan` + - Agent reads spec and creates task decomposition + - Review IMPLEMENTATION_PLAN.md for completeness + +3. **Build Iterations:** + - Run `./ralph-loop.sh --max 50` (or your preferred limit) + - Each iteration: fresh agent → one task → verify → commit + - Loop continues until DONE or max iterations + +4. **Monitoring:** + - Check `.ralph-logs/` for iteration details + - Look for STUCK/ERROR signals requiring intervention + - Review git log for progress + +5. 
**Intervention:**
+   - If stuck: update IMPLEMENTATION_PLAN.md with notes
+   - If error: fix the issue and restart loop
+   - If plan needs changes: edit and restart
+
+### Configuration Options
+- `--max N`: Limit iterations (default 50)
+- `--agent claude|codex`: Choose AI agent
+- `plan` mode: Just create implementation plan
+
+## Examples and Use Cases
+
+### Personal Finance App (FinPlan)
+Complete example in EXAMPLES.md showing:
+- Privacy-first local finance dashboard
+- Transaction import, categorization, projections
+- Monte Carlo retirement simulations
+- Tech stack: TypeScript, Express, SQLite, vanilla JS
+- 15+ features decomposed into phases
+
+### Key Patterns from Examples
+- **Be Specific:** Acceptance criteria like "Parse QFX files and extract: date, amount, payee, memo, type"
+- **Define Tech Stack:** Don't let agents choose — specify "TypeScript, Express.js, SQLite"
+- **Include Data Models:** Explicit entity definitions with constraints
+- **Phase Large Projects:** Independent deployable phases
+- **Anti-Patterns:** "Don't use localStorage — SQLite is source of truth"
+
+### Use Cases
+- **Complex Web Apps:** Multi-feature applications with databases
+- **Libraries/Frameworks:** API design and implementation
+- **Data Processing:** ETL pipelines, analysis tools
+- **CLI Tools:** Command-line utilities with multiple commands
+- **Prototypes to Production:** Start with working prototype, iterate to full product
+
+## Integration with OpenClaw sessions_spawn
+
+OpenClaw provides `sessions_spawn` for agent orchestration, offering an alternative to the bash loop.
+
+### Basic Usage
+```bash
+# Planning phase
+sessions_spawn --task "Read PROJECT-SPEC.md. Decompose into tasks. Write IMPLEMENTATION_PLAN.md." --model opus
+
+# Build iterations
+sessions_spawn --task "Read AGENT.md. Follow core loop: pick one task, implement, test, commit." --model sonnet
+```
+
+### Advanced Integration
+- **Parallel Tasks:** Spawn multiple agents for independent tasks
+- **Different Models:** Use opus for planning, sonnet for coding
+- **Cron Scheduling:** Automate iterations with cron jobs
+- **Channel Output:** Direct results to specific channels
+
+### Benefits Over Bash Loop
+- Model selection per task type
+- Parallel execution for independent work
+- Integration with OpenClaw's session management
+- Richer output formatting and notifications
+
+### When to Use Each
+- **Ralph Loop:** Simple sequential projects, bash environments
+- **OpenClaw:** Complex projects, parallel work, advanced features
+
+## Best Practices for Agent-Driven Development
+
+### Writing Project Specs
+1. **Be Exhaustively Specific:** Include exact acceptance criteria, not vague requirements
+2. **Define Everything:** Tech stack, directory structure, build commands, coding standards
+3. **Provide Examples:** Sample data, API responses, UI mockups
+4. **Phase Appropriately:** Break large projects into independent phases
+5. **Document Constraints:** What it MUST/MUST NOT do, plus preferences
+6. **Include Anti-Patterns:** Lessons from previous attempts
+
+### Agent Instructions
+1. **Role Definition:** Clear capabilities and limitations
+2. **Strict Rules:** One task per iteration, mandatory testing, no refactoring unrelated code
+3. **Clear Signals:** Use PLANNED/DONE/STUCK/ERROR tags for loop control
+4. **Context Boundaries:** Fresh start each time, rely on files/git
+
+### Loop Management
+1. **Monitor Logs:** Check `.ralph-logs/` for issues
+2. **Set Reasonable Limits:** `--max 20-50` iterations depending on project size
+3.
**Plan Reviews:** Always review IMPLEMENTATION_PLAN.md after planning phase +4. **Intervention Ready:** Be prepared to help when agents get stuck + +### Quality Assurance +1. **Test Everything:** Unit, integration, end-to-end tests +2. **Build Verification:** Every iteration must pass build +3. **Code Standards:** Lint, format, document consistently +4. **Manual Reviews:** Spot-check critical functionality + +### Scaling Up +1. **Phase Work:** Complete foundations before features +2. **Parallel Execution:** Use OpenClaw for independent tasks +3. **Iterative Refinement:** Start with working prototype, enhance gradually +4. **Documentation Updates:** Improve templates based on lessons learned + +### Common Pitfalls +- **Vague Specs:** Leads to agent confusion and poor decomposition +- **Missing Build/Test:** Code quality suffers without verification +- **Context Sharing:** Don't try to pass state between iterations +- **Over-Parallelization:** Dependencies must be respected +- **Ignoring Signals:** STUCK/ERROR states need attention + +This system transforms AI coding assistants from helpful sidekicks into autonomous development partners capable of delivering complete, tested software projects. \ No newline at end of file diff --git a/archive/AutoGen.md b/archive/AutoGen.md new file mode 100644 index 0000000..1bc3510 --- /dev/null +++ b/archive/AutoGen.md @@ -0,0 +1,34 @@ +# AutoGen Framework + +## Overview +AutoGen is an open-source agentic toolkit for configuring, chaining, and managing LLM-driven agents. AutoGen supports customizable agent roles and sophisticated interaction patterns. + +## Typical Workflow +1. Define agents and their capabilities. +2. Specify tasks, triggers, and communication flows. +3. Run agent conversations (chain, parallel, sequential). +4. Monitor and adapt via logs and manual tuning. + +## Strengths +- Highly customizable agent interaction workflows. +- Supports advanced chaining and conversation flows. +- Integration-friendly with multiple LLMs. + +## Weaknesses +- Adaptation is mostly manual; lacks feedback loops. +- Requires careful orchestration for complex setups. +- Can be verbose and require boilerplate. + +## Example Use Cases +- LLM agent research +- Custom workflow prototyping +- R&D sandboxes + +## Table: AutoGen vs Others +| Feature | AutoGen | BMAD | CrewAI | ChatDev | +|---------------|------------|------------|-------------|-------------| +| Agent Roles | Configurable| Explicit | Flexible | Scripted | +| Lifecycle | Partial | Full | Partial | Simple | +| Adaptation | Limited | Yes | Manual | None | +| Teaming | Moderate | Strong | Strong | Weak | +| Use Cases | R&D | Prod/Res | Orchestration| Dev Sims | diff --git a/archive/BMAD.md b/archive/BMAD.md new file mode 100644 index 0000000..2a20fda --- /dev/null +++ b/archive/BMAD.md @@ -0,0 +1,34 @@ +# BMAD Framework + +## Overview +BMAD (Build, Manage, Adapt, Deploy) is an agentic engineering framework for composable, modular agent systems. BMAD emphasizes lifecycle management, clear boundaries between agent roles, and iterative improvement. + +## Typical Workflow +1. **Build**: Define agent roles and capabilities. +2. **Manage**: Orchestrate agents, monitor state, provide feedback loops. +3. **Adapt**: Adjust agent configurations based on outcomes & feedback. +4. **Deploy**: Package, integrate, and run agent teams in production or simulation environments. + +## Strengths +- Modular structure simplifies lifecycle and iteration. +- Good for heterogeneous teams with evolving requirements. 
+- Clear separation of concerns aids maintainability. + +## Weaknesses +- Requires robust management layer (can be complex). +- Adaptation step needs reliable metrics and evaluation. +- Not plug-and-play; demands thoughtful composition. + +## Example Use Cases +- Multi-agent research projects +- Continually improving workflows +- Complex, production agent teams + +## Table: BMAD vs Others +| Feature | BMAD | CrewAI | AutoGen | ChatDev | +|---------------|-----------|-------------|-------------|-------------| +| Agent Roles | Explicit | Flexible | Configurable | Scripted | +| Lifecycle | Full | Partial | Partial | Simple | +| Adaptation | Yes | Manual | Limited | None | +| Teaming | Strong | Strong | Moderate | Weak | +| Use Cases | Prod/Res | Orchestration| R&D | Dev Sims | diff --git a/archive/ChatDev.md b/archive/ChatDev.md new file mode 100644 index 0000000..0acfdbd --- /dev/null +++ b/archive/ChatDev.md @@ -0,0 +1,34 @@ +# ChatDev Framework + +## Overview +ChatDev is a playful, simulated software company using chained LLM agents to mimic collaborative development. Each agent acts as a role (e.g., CEO, CTO, coder), passing instructions down a pipeline. + +## Typical Workflow +1. Define dev roles (CEO, CTO, etc.). +2. Input a project or prompt. +3. Run the chain—each agent acts in turn. +4. Output a simulated dev artifact or code. + +## Strengths +- Great for education, demos, and prototyping. +- Simple chained workflow, easy to use. +- Highly scripted interactions. + +## Weaknesses +- Limited flexibility, not modular. +- No ongoing adaptation or feedback. +- Weak real-world deployment/teaming. + +## Example Use Cases +- Edtech +- Demoing LLM teamwork +- Collaborative coding simulations + +## Table: ChatDev vs Others +| Feature | ChatDev | BMAD | CrewAI | AutoGen | +|---------------|------------|------------|-------------|-------------| +| Agent Roles | Scripted | Explicit | Flexible | Configurable | +| Lifecycle | Simple | Full | Partial | Partial | +| Adaptation | None | Yes | Manual | Limited | +| Teaming | Weak | Strong | Strong | Moderate | +| Use Cases | Dev Sims | Prod/Res | Orchestration| R&D | diff --git a/archive/CrewAI.md b/archive/CrewAI.md new file mode 100644 index 0000000..e4a538f --- /dev/null +++ b/archive/CrewAI.md @@ -0,0 +1,34 @@ +# CrewAI Framework + +## Overview +CrewAI is a Python package for orchestrating multiple AI agents (“crew”), often with role assignment, task definition, and parallel task execution. + +## Typical Workflow +1. Define agents and assign roles/tasks. +2. Group agents into a crew. +3. Run the crew to execute tasks in parallel or sequence. +4. Review outputs; adjust roles and tasks as needed. + +## Strengths +- Easy orchestration of multi-agent tasks. +- Role assignment is flexible and prompt-based. +- Simple integration with LLMs and tools. + +## Weaknesses +- Rudimentary adaptation; manual configuration required. +- Lacks deep lifecycle or performance management. +- Limited underlying metrics for team optimization. 
+ +## Example Use Cases +- Research orchestration +- Content generation teams +- Parallelized task bots + +## Table: CrewAI vs Others +| Feature | CrewAI | BMAD | AutoGen | ChatDev | +|---------------|------------|------------|-------------|-------------| +| Agent Roles | Flexible | Explicit | Configurable | Scripted | +| Lifecycle | Partial | Full | Partial | Simple | +| Adaptation | Manual | Yes | Limited | None | +| Teaming | Strong | Strong | Moderate | Weak | +| Use Cases | Orchestration| Prod/Res | R&D | Dev Sims | diff --git a/archive/MODAL.md b/archive/MODAL.md new file mode 100644 index 0000000..82d65f1 --- /dev/null +++ b/archive/MODAL.md @@ -0,0 +1,40 @@ +# Paul's Custom Agentic Framework: MODAL + +## Overview +MODAL (Modular Orchestration, Dynamic Adaptation, Lifecycle) is designed for Paul's needs: robust agent team management, continuous adaptation, and a practical development-to-deployment workflow. It integrates lessons from BMAD, CrewAI, AutoGen, and ChatDev. + +## Typical Workflow +1. **Modular Orchestration** + - Define agent roles and group them by project/task. + - Mix human and AI agents as needed. +2. **Dynamic Adaptation** + - Agents adjust configuration based on outcome metrics, feedback, and error reports (automatic + manual input). + - Allow plug-in adaptation modules for custom evaluation. +3. **Lifecycle Management** + - Agents move through: development → simulated run → feedback review → production deployment → continuous improvement. + - Integrated logs, metrics, and snapshotting for easy rollback. + +## Strengths +- Combines modular role definition with strong lifecycle and adaptation. +- Teaming is flexible—allows for both scripted and dynamic agent teams. +- Both manual and automatic adaptation supported. +- Designed for production, research, and prototyping. + +## Weaknesses +- Complexity increases with adaptation modules. +- Requires metric design for feedback loops. +- Demands careful orchestration for large teams. + +## Example Use Cases +- Multi-agent production systems +- Iterative project development +- Custom agent research + +## Comparison Table +| Feature | MODAL | BMAD | CrewAI | AutoGen | ChatDev | +|---------------|------------|------------|-------------|-------------|------------| +| Agent Roles | Modular | Explicit | Flexible | Configurable | Scripted | +| Lifecycle | Full | Full | Partial | Partial | Simple | +| Adaptation | Auto+Manual| Yes | Manual | Limited | None | +| Teaming | Strong | Strong | Strong | Moderate | Weak | +| Use Cases | Prod/Res/R&D| Prod/Res | Orchestration| R&D | Dev Sims | diff --git a/archive/Opus-Workflow-Constraints.md b/archive/Opus-Workflow-Constraints.md new file mode 100644 index 0000000..89ffd3c --- /dev/null +++ b/archive/Opus-Workflow-Constraints.md @@ -0,0 +1,45 @@ +# Opus Workflow Constraints + +**Purpose:** Strict, non-negotiable constraints for Anthropic Claude Opus sessions to prevent assumptions, ensure exact adherence, and maintain cost control. This is the HARNESS for Opus work. + +## Core Constraints + +### 1. Sequential Execution Only +- **Rule:** Spawn and run ONE sub-agent at a time. Wait for full completion and summary before starting the next. +- **No Variations:** No parallel spawning, no assumptions about "efficient" batching. +- **Verification:** Check sessions_list after each completion to confirm no active extras. + +### 2. 
Model and Usage Verification
+- **Rule:** Before starting ANY Opus work, check browser relay (targetId BB9DB072F0FDFBF5093323CC75DE8099) for current session % and weekly %.
+- **Halt Thresholds:** Stop all Opus work if session >90% or weekly >80%. Notify Paul immediately.
+- **Confirmation:** Log the check in memory with timestamp and values.
+- **Model:** Must be Claude Opus; no other models (e.g., Sonnet 4.5) allowed for these sessions.
+
+### 3. Token Monitoring
+- **Rule:** Monitor token usage in real-time during sessions. Log to memory/anthropic-usage-log.md after each major task.
+- **Alerts:** Notify Paul at 80% session/weekly usage.
+- **Burn Strategy:** Maximize weekly budget within per-session limits. Never exceed session cap (avoid extra charges).
+
+### 4. Spawning Limits
+- **Rule:** Max 1 active sub-agent per session. Embed constraints in each sub-agent's task prompt to prevent them from spawning further agents.
+- **Task Embedding:** Include "Follow Opus Workflow Constraints exactly" in every sub-agent prompt.
+
+### 5. Exact Adherence to Instructions
+- **Rule:** Follow the reminder/task list verbatim. No additions, assumptions, or optimizations unless explicitly approved.
+- **Logging:** Update memory/YYYY-MM-DD.md with every action, including checks and decisions.
+- **Post-Completion:** Send summary only after all tasks in the list are done.
+
+### 6. Error Handling
+- **Rule:** If any constraint is violated, halt and notify Paul for approval before continuing.
+- **Fallback:** If browser relay fails, use cached last check but re-verify within 1 hour.
+
+## Enforcement
+- **Self-Monitoring:** Main agent must verify compliance before, during, and after each step.
+- **Audit Trail:** All actions logged to memory for review.
+- **Paul Override:** Paul can modify this document anytime; agent must read and follow updated version.
+
+## Reference
+- Original Workflow: Reminder from cron (Opus session 5, sequential spawn, etc.)
+- Integration: Referenced in AGENTS.md for Opus sessions.
+
+**Last Updated:** 2026-03-13
\ No newline at end of file
diff --git a/model-report.ts b/model-report.ts
new file mode 100644
index 0000000..7e44e8c
--- /dev/null
+++ b/model-report.ts
@@ -0,0 +1,245 @@
+#!/usr/bin/env ts-node
+/**
+ * scripts/model-report.ts
+ *
+ * Parses git log trailers to generate a model performance table.
+ * Reads: Agent, Tests, Tests-Added, TypeScript trailers from commit messages.
+ *
+ * Usage:
+ *   npx ts-node scripts/model-report.ts
+ *   npx ts-node scripts/model-report.ts --since=2026-03-01
+ *   npx ts-node scripts/model-report.ts --json
+ */
+
+import { execSync } from 'child_process';
+
+interface CommitRecord {
+  hash: string;
+  date: string;
+  subject: string;
+  agent: string;
+  tests: string;       // raw e.g. "129/129 passing"
+  testsAdded: string;  // raw e.g. "+32"
+  typescript: string;  // raw e.g. "clean" or "2 errors"
+  body: string;
+}
+
+interface ModelStats {
+  model: string;
+  commits: number;
+  testsAdded: number;
+  typeScriptErrors: number;  // errors introduced
+  typeScriptClean: number;   // commits that were clean
+  hasAttribution: number;    // commits with Agent: trailer
+  commitsByType: Record<string, number>;
+  firstSeen: string;
+  lastSeen: string;
+  subjects: string[];
+}
+
+function parseArgs() {
+  const args = process.argv.slice(2);
+  const since = args.find(a => a.startsWith('--since='))?.split('=')[1];
+  const asJson = args.includes('--json');
+  return { since, asJson };
+}
+
+function getCommits(since?: string): CommitRecord[] {
+  const sinceFlag = since ?
`--since="${since}"` : '';
+  // Use %x00 as field separator, %x01 as record separator
+  const format = '%H%x00%ad%x00%s%x00%b%x01';
+  const cmd = `git log ${sinceFlag} --format="${format}" --date=short`;
+
+  let raw: string;
+  try {
+    raw = execSync(cmd, { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 });
+  } catch {
+    console.error('Failed to run git log');
+    process.exit(1);
+  }
+
+  const records = raw.split('\x01').filter(r => r.trim());
+  return records.map(record => {
+    const [hash, date, subject, body = ''] = record.split('\x00');
+
+    const trailer = (key: string) => {
+      const match = body.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
+      return match ? match[1].trim() : '';
+    };
+
+    return {
+      hash: (hash || '').trim().slice(0, 7),
+      date: (date || '').trim(),
+      subject: (subject || '').trim(),
+      agent: trailer('Agent'),
+      tests: trailer('Tests'),
+      testsAdded: trailer('Tests-Added'),
+      typescript: trailer('TypeScript'),
+      body: body.trim(),
+    };
+  }).filter(r => r.hash);
+}
+
+function parseTestsAdded(raw: string): number {
+  if (!raw) return -1;  // -1 = unknown (no trailer)
+  const match = raw.match(/[+-]?(\d+)/);
+  return match ? parseInt(match[1], 10) : 0;
+}
+
+function parseTsErrors(raw: string): number {
+  if (!raw) return -1;  // -1 = unknown
+  if (raw.toLowerCase().includes('clean')) return 0;
+  const match = raw.match(/(\d+)/);
+  return match ? parseInt(match[1], 10) : -1;
+}
+
+function commitType(subject: string): string {
+  const match = subject.match(/^(feat|fix|refactor|test|docs|chore|build|ci|perf)/);
+  return match ? match[1] : 'other';
+}
+
+function buildStats(commits: CommitRecord[]): Map<string, ModelStats> {
+  const stats = new Map<string, ModelStats>();
+
+  for (const c of commits) {
+    const model = c.agent || 'unknown';
+
+    if (!stats.has(model)) {
+      stats.set(model, {
+        model,
+        commits: 0,
+        testsAdded: 0,
+        typeScriptErrors: 0,
+        typeScriptClean: 0,
+        hasAttribution: 0,
+        commitsByType: {},
+        firstSeen: c.date,
+        lastSeen: c.date,
+        subjects: [],
+      });
+    }
+
+    const s = stats.get(model)!;
+    s.commits++;
+
+    if (c.agent) s.hasAttribution++;
+
+    const added = parseTestsAdded(c.testsAdded);
+    if (added >= 0) s.testsAdded += added;
+
+    const tsErrors = parseTsErrors(c.typescript);
+    if (tsErrors > 0) s.typeScriptErrors += tsErrors;
+    if (tsErrors === 0) s.typeScriptClean++;
+
+    const type = commitType(c.subject);
+    s.commitsByType[type] = (s.commitsByType[type] || 0) + 1;
+
+    if (c.date < s.firstSeen) s.firstSeen = c.date;
+    if (c.date > s.lastSeen) s.lastSeen = c.date;
+
+    s.subjects.push(`${c.hash} ${c.subject}`);
+  }
+
+  return stats;
+}
+
+function printTable(stats: Map<string, ModelStats>, commits: CommitRecord[]) {
+  const total = commits.length;
+  const withAttribution = commits.filter(c => c.agent).length;
+
+  console.log('\n📊 Model Performance Report');
+  console.log(`   Generated: ${new Date().toISOString()}`);
+  console.log(`   Total commits: ${total} | With attribution: ${withAttribution} | Unknown: ${total - withAttribution}\n`);
+
+  // Sort by commit count desc
+  const sorted = [...stats.values()].sort((a, b) => b.commits - a.commits);
+
+  const col = (s: string, w: number) => s.slice(0, w).padEnd(w);
+
+  const header = [
+    col('Model', 36),
+    col('Commits', 8),
+    col('Tests+', 8),
+    col('TS-clean', 9),
+    col('TS-errors', 10),
+    col('Last seen', 12),
+  ].join(' │ ');
+
+  const divider = '─'.repeat(header.length);
+  console.log(divider);
+  console.log(header);
+  console.log(divider);
+
+  for (const s of sorted) {
+    const tsClean = s.typeScriptClean > 0
+      ?
`${s.typeScriptClean}/${s.commits}` + : '?'; + const tsErrors = s.typeScriptErrors > 0 + ? `⚠️ ${s.typeScriptErrors}` + : s.typeScriptErrors === 0 ? '✅ 0' : '?'; + + console.log([ + col(s.model, 36), + col(String(s.commits), 8), + col(s.testsAdded > 0 ? `+${s.testsAdded}` : '?', 8), + col(tsClean, 9), + col(tsErrors, 10), + col(s.lastSeen, 12), + ].join(' │ ')); + } + + console.log(divider); + + // Commits without attribution + const unknown = commits.filter(c => !c.agent); + if (unknown.length > 0) { + console.log(`\n⚠️ ${unknown.length} commits without Agent: trailer:`); + for (const c of unknown.slice(0, 10)) { + console.log(` ${c.hash} ${c.date} ${c.subject}`); + } + if (unknown.length > 10) console.log(` ... and ${unknown.length - 10} more`); + } + + // Commits with TS errors + const tsErrorCommits = commits.filter(c => parseTsErrors(c.typescript) > 0); + if (tsErrorCommits.length > 0) { + console.log(`\n🔴 Commits with TypeScript errors:`); + for (const c of tsErrorCommits) { + console.log(` ${c.hash} ${c.date} [${c.agent || 'unknown'}] ${c.subject}`); + } + } + + // Zero tests added on feat commits + const featNoTests = commits.filter(c => + commitType(c.subject) === 'feat' && + c.testsAdded !== '' && + parseTestsAdded(c.testsAdded) === 0 + ); + if (featNoTests.length > 0) { + console.log(`\n🟡 feat commits with Tests-Added: 0 (no new tests):`); + for (const c of featNoTests) { + console.log(` ${c.hash} ${c.date} [${c.agent || 'unknown'}] ${c.subject}`); + } + } + + console.log(''); +} + +function main() { + const { since, asJson } = parseArgs(); + const commits = getCommits(since); + const stats = buildStats(commits); + + if (asJson) { + console.log(JSON.stringify({ + generatedAt: new Date().toISOString(), + totalCommits: commits.length, + models: [...stats.values()], + commits: commits, + }, null, 2)); + } else { + printTable(stats, commits); + } +} + +main(); diff --git a/ralph-loop.sh b/ralph-loop.sh new file mode 100755 index 0000000..3ac1de7 --- /dev/null +++ b/ralph-loop.sh @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +# +# Ralph Wiggum Loop — Autonomous agent iteration +# +# Based on Geoffrey Huntley's approach: +# - Each iteration spawns a FRESH agent with clean context +# - Agent reads the plan, picks ONE task, implements, tests, commits, exits +# - Loop restarts until all tasks are done +# +# No context compaction. No stale reasoning. Just fresh starts. +# +# Usage: +# ./ralph-loop.sh # Build mode (default) +# ./ralph-loop.sh plan # Planning mode (create IMPLEMENTATION_PLAN.md) +# ./ralph-loop.sh --max 20 # Limit to 20 iterations +# ./ralph-loop.sh --agent claude # Use claude (default) +# ./ralph-loop.sh --agent codex # Use OpenAI Codex CLI +# ./ralph-loop.sh --agent aider # Use Aider +# ./ralph-loop.sh --agent gemini # Use Gemini CLI +# ./ralph-loop.sh --agent custom # Use custom agent (see below) +# +# Extensibility: +# To add support for other AI coding agents (aider, cursor, windsurf, etc.): +# 1. Add a new case in the run_agent() function's agent selection block +# 2. Format the prompt appropriately for that agent's CLI interface +# 3. 
Ensure the agent outputs to the logfile for promise detection
+#
+# Example for Aider:
+#   aider)
+#     aider --message "$prompt" --yes 2>&1 | tee "$logfile"
+#     ;;
+#
+# Example for custom script:
+#   custom)
+#     ./my-agent-wrapper.sh "$prompt" 2>&1 | tee "$logfile"
+#     ;;
+#
+set -euo pipefail

+# First positional argument (if it is not a flag) selects the mode: build (default) or plan
+MODE="build"
+if [[ $# -gt 0 && "$1" != -* ]]; then
+  MODE="$1"
+  shift
+fi
+
+MAX_ITERATIONS=50
+AGENT="claude"
+PLAN_FILE="IMPLEMENTATION_PLAN.md"
+SPEC_FILE="PROJECT-SPEC.md"
+AGENT_FILE="AGENT.md"
+LOG_DIR=".ralph-logs"
+
+# Parse remaining arguments
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --max)   MAX_ITERATIONS="$2"; shift 2 ;;
+    --agent) AGENT="$2"; shift 2 ;;
+    *) echo "Unknown option: $1"; exit 1 ;;
+  esac
+done
+
+mkdir -p "$LOG_DIR"
+
+# Colors
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+log()     { echo -e "${BLUE}[ralph]${NC} $1"; }
+success() { echo -e "${GREEN}[ralph]${NC} $1"; }
+warn()    { echo -e "${YELLOW}[ralph]${NC} $1"; }
+error()   { echo -e "${RED}[ralph]${NC} $1"; }
+
+# Check prerequisites
+if [[ ! -f "$SPEC_FILE" ]]; then
+  error "Missing $SPEC_FILE — create your project spec first."
+  exit 1
+fi
+
+if [[ ! -f "$AGENT_FILE" ]]; then
+  warn "No $AGENT_FILE found. Using default agent instructions."
+fi
+
+run_agent() {
+  local iteration=$1
+  local mode=$2
+  local logfile="$LOG_DIR/iteration-${iteration}.log"
+  local prompt=""
+
+  if [[ "$mode" == "plan" ]]; then
+    prompt="Read PROJECT-SPEC.md. Decompose the project into discrete, testable tasks ordered by dependency. Write the plan to IMPLEMENTATION_PLAN.md with checkboxes. Output PLANNED when done."
+  else
+    prompt="Read AGENT.md (if it exists) for your instructions. Follow the core loop: orient, pick one task, implement, verify, commit, exit."
+  fi
+
+  log "Iteration $iteration ($mode mode) — starting fresh agent..."
+
+  # Agent selection block
+  # Extend this case statement to support additional agents
+  case "$AGENT" in
+    claude)
+      echo "$prompt" | claude -p --output-format text 2>&1 | tee "$logfile"
+      ;;
+    codex)
+      echo "$prompt" | codex 2>&1 | tee "$logfile"
+      ;;
+    aider)
+      # Aider: AI pair programming in your terminal
+      # https://aider.chat
+      aider --message "$prompt" --yes 2>&1 | tee "$logfile"
+      ;;
+    gemini)
+      # Google Gemini CLI (if available)
+      # Adjust command based on actual Gemini CLI interface
+      echo "$prompt" | gemini-cli 2>&1 | tee "$logfile"
+      ;;
+    custom)
+      # Custom agent integration
+      # Replace this with your own agent wrapper script
+      # The script should:
+      #   1. Accept prompt as first argument or via stdin
+      #   2. Perform the requested work (read files, write code, run tests, commit)
+      #   3. Output promise signals: PLANNED|DONE|STUCK|ERROR
+      #   4. Exit with appropriate code
+      if [[ -x "./custom-agent.sh" ]]; then
+        ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile"
+      else
+        error "Custom agent selected but ./custom-agent.sh not found or not executable"
+        exit 1
+      fi
+      ;;
+    *)
+      error "Unknown agent: $AGENT"
+      error "Supported agents: claude, codex, aider, gemini, custom"
+      error "To add support for other agents, edit the run_agent() function in this script"
+      exit 1
+      ;;
+  esac
+
+  return 0
+}
+
+check_output() {
+  local logfile="$1"
+
+  if grep -q 'DONE' "$logfile" 2>/dev/null; then
+    return 0  # Done
+  elif grep -q 'STUCK' "$logfile" 2>/dev/null; then
+    return 2  # Stuck
+  elif grep -q 'ERROR' "$logfile" 2>/dev/null; then
+    return 3  # Error
+  else
+    return 1  # Continue
+  fi
+}
+
+# Main loop
+if [[ "$MODE" == "plan" ]]; then
+  log "Planning mode — creating implementation plan..."
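+  # Iteration 0 is the planning pass; its transcript lands in .ralph-logs/iteration-0.log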
+ run_agent 0 plan + success "Plan created. Review $PLAN_FILE, then run: ./ralph-loop.sh" + exit 0 +fi + +log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)" +log "Agent: $AGENT" +log "Spec: $SPEC_FILE" +log "Plan: $PLAN_FILE" +echo "" + +for i in $(seq 1 "$MAX_ITERATIONS"); do + run_agent "$i" build + logfile="$LOG_DIR/iteration-${i}.log" + + check_output "$logfile" + status=$? + + case $status in + 0) + success "🎉 ALL TASKS COMPLETE after $i iterations!" + exit 0 + ;; + 2) + warn "Agent is stuck. Review $logfile and intervene." + exit 1 + ;; + 3) + error "Agent encountered an error. Review $logfile." + exit 1 + ;; + 1) + log "Iteration $i complete. Restarting with fresh context..." + echo "" + sleep 2 + ;; + esac +done + +warn "Reached max iterations ($MAX_ITERATIONS). Review progress in $PLAN_FILE." +exit 1