diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ce2946a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+# Node / JS
+node_modules/
+dist/
+build/
+coverage/
+
+# Python
+.venv/
+__pycache__/
+.pytest_cache/
+
+# Env / local
+.env
+.env.*
+
+# OS / editor
+.DS_Store
+.vscode/
+.idea/
+agent/models/**/*.zip
diff --git a/.harness/EXECUTION_MASTER.md b/.harness/EXECUTION_MASTER.md
new file mode 100644
index 0000000..f80b512
--- /dev/null
+++ b/.harness/EXECUTION_MASTER.md
@@ -0,0 +1,37 @@
+# Execution Master — DonkeyCar RL Autoresearch
+
+## Wave Status
+| Wave | Description | Status |
+|------|-------------|--------|
+| Wave 1 | Real Training Foundation | 🟠 In progress |
+| Wave 2 | Multi-Track Generalization | ⏸️ Not started |
+| Wave 3 | Racing / Speed Optimization | ⏸️ Not started |
+
+## Active Streams
+| Stream | Branch | Status | Blocker |
+|--------|--------|--------|---------|
+| 1A: Core Runner Rebuild | main | 🟠 In progress | None |
+| 1B: Tests | main | ⏸️ Planned | 1A-01 must complete first |
+| 1C: First Real Autoresearch | main | ⏸️ Planned | 1A + 1B complete, sim running |
+
+## Wave 1 Gate Criteria
+Before starting Wave 2, ALL must be true:
+- [ ] All 1A, 1B, 1C tasks checked off in IMPLEMENTATION_PLAN.md
+- [ ] `pytest tests/ -v` — all tests green
+- [ ] Champion model exists at `agent/models/champion/model.zip`
+- [ ] Champion mean_reward > 100 on training track
+- [ ] `champion_manifest.json` exists and is valid
+- [ ] Regression baseline saved
+- [ ] Wave 1 process eval written: `.harness/wave1/process-eval.md`
+- [ ] All results pushed to Gitea
+
+## Parallelism Rules
+1. 1A and 1B can run in parallel (1B mocks the env)
+2. 1C cannot start until 1A AND 1B are complete
+3. Wave 2 cannot start until Wave 1 gate passes
+4. Only one stream touches the sim at a time (1C has exclusive sim access)
+
+## Current Agent Context
+- **Active task:** 1A-01 — Rebuild donkeycar_sb3_runner.py with real PPO training
+- **Read:** PROJECT-SPEC.md, DECISIONS.md, then pick from IMPLEMENTATION_PLAN.md
+- **Important:** The existing autoresearch_results.jsonl contains RANDOM POLICY data — do not mix it with Phase 1 real training results. New results go to `autoresearch_results_phase1.jsonl`.
diff --git a/.harness/templates/EXECUTION-BOARD-TEMPLATE.md b/.harness/templates/EXECUTION-BOARD-TEMPLATE.md
new file mode 100644
index 0000000..2fc9040
--- /dev/null
+++ b/.harness/templates/EXECUTION-BOARD-TEMPLATE.md
@@ -0,0 +1,137 @@
+# Execution Board Template
+
+> **The execution board is the contract for a stream.**
+> Copy this file into `.harness/<stream>/execution-board.md`.
+> **The entire board must be written BEFORE any code is committed.**
+> Plan-then-implement is non-negotiable.
+
+---
+
+# Execution Board — [STREAM NAME]
+**Feature:** [One-line description of what this stream builds]
+**Created:** YYYY-MM-DD
+**Branch:** `feat/<stream-name>`
+**IMPLEMENTATION_PLAN tasks:** [e.g., 5–8]
+**Status:** 🔴 Not started | 🟡 Planning | 🟠 In progress | ✅ Complete
+**Design reference:** [path to design doc, or N/A]
+
+---
+
+## 🎯 Goal
+
+[2–4 sentences. What does this stream accomplish? What user-facing outcome does it produce?
+How does it fit the larger product vision?]
+
+---
+
+## ⚠️ Dependencies
+
+[Other streams or tasks that must be complete before this one can start.
+If none: "None — can start immediately."]
+
+---
+
+## 📦 Packets
+
+### Packet [XX-01] — [Name]
+**IMPLEMENTATION_PLAN task:** [N]
+**Status:** ⬜ Not started | 🔄 In progress | ✅ Done
+**Est. effort:** [N sessions]
+**Depends on:** [XX-00 or "none"]
+
+**Goal:** [One sentence]
+
+**Steps:**
+1. [Concrete step]
+2. [...]
+
+**Files created/modified:**
+- `src/...` — [description]
+- `src/.../__tests__/...` — [test file]
+
+**Known-answer tests (mandatory for calculation modules):**
+```
+test('[what is being verified]', () => {
+  // Source: [official reference — URL, standard, specification]
+  expect(fn(input)).toBeCloseTo(expected, precision);
+});
+```
+
+**Acceptance criteria:**
+- [ ] [Programmatically verifiable criterion]
+- [ ] Known-answer test passes
+- [ ] Full test suite green (count ≥ baseline)
+- [ ] TypeScript: clean (`npx tsc --noEmit` outputs nothing)
+- [ ] **Documentation updated** (if user-facing): user guide, feature overview, or design doc reflects this change
+
+**Validation evidence:** `.harness/<stream>/validation/<packet>-validation.md`
+
+---
+
+### Packet [XX-02] — [Name]
+[Repeat above structure for each packet]
+
+---
+
+## 🔒 Dependency Order
+
+```
+[XX-01] → [XX-02] → [XX-04 (UI/integration)]
+[XX-01] → [XX-03] → [XX-04 (UI/integration)]
+```
+
+[Which packets can run in parallel? Which must be sequential?]
+
+---
+
+## 🏁 Stream Completion Criteria
+
+- [ ] All packets complete with validation evidence written
+- [ ] All known-answer tests pass (list them here explicitly)
+- [ ] Full test suite green
+- [ ] TypeScript: clean
+- [ ] Regression baseline saved: `.harness/regression-baselines/<stream>-baseline.json`
+- [ ] Branch merged to main via `--no-ff` merge commit
+- [ ] Process eval written: `.harness/<stream>/process-eval.md`
+- [ ] IMPLEMENTATION_PLAN tasks marked `[x]`
+- [ ] EXECUTION_MASTER.md (or project equivalent) updated
+- [ ] **Documentation updated**: any user-facing feature changes are reflected in the relevant user guide, features overview, database schema doc, or design doc (whichever applies)
+
+---
+
+## 📋 Mandatory Commit Trailer Format
+
+Every implementation commit in this stream:
+
+```
+feat(<scope>): <packet-id> — <description>
+
+Agent: <model>
+Tests: <before> → <after>
+Tests-Added: <+N>
+TypeScript: clean
+```
+
+---
+
+## 🔍 Pre-Coding Checklist
+
+Before writing any implementation code:
+
+- [ ] This execution board is fully written (all packets defined)
+- [ ] Branch created from latest main
+- [ ] Baseline test count verified
+- [ ] No open schema migrations from other active streams (if relevant)
+- [ ] Design reference doc has been read
diff --git a/.harness/templates/PROCESS-EVAL-TEMPLATE.md b/.harness/templates/PROCESS-EVAL-TEMPLATE.md
new file mode 100644
index 0000000..d0c1da1
--- /dev/null
+++ b/.harness/templates/PROCESS-EVAL-TEMPLATE.md
@@ -0,0 +1,82 @@
+# Process Eval Template
+
+> Write this file after the stream is fully merged.
+> File location: `.harness/<stream>/process-eval.md`
+> Be honest — this is a retrospective, not a press release.
+> Future agents and sessions will read this to understand what worked.
+
+---
+
+# Process Eval — [STREAM NAME]
+**Completed:** YYYY-MM-DD
+**Agent:** [model name]
+**Packets:** [XX-01, XX-02, ...]
+**Tests added:** NN total
+**Final test count:** NNNN
+**Wall-clock duration:** [estimated]
+
+---
+
+## Packet Summary
+
+| Packet | Est. Effort | Actual | On Time? | Tests Added |
+|--------|-------------|--------|----------|-------------|
+| XX-01 | N sessions | N sessions | ✅/❌ | NN |
+| XX-02 | ... | ... | ... | ... |
+
+---
+
| +|------|----------|--------|-------| +| [Description] | [value] | [value] | βœ…/❌ | + +--- + +## Process Quality Dimensions + +### Task Sizing +- Estimate accuracy: [XX%] +- Packets that overran: [list or "none"] +- Root cause of overruns: [...] + +### Test-First Discipline +- Tests committed same commit as implementation: [XX/NN packets] +- Patches needed after initial commit: [list or "none"] + +### Acceptance Criteria Quality +- Programmatically verifiable criteria: [XX/NN] +- Criteria that required human judgment: [list or "none"] + +### Known-Answer Coverage +- New calculation modules: N +- Modules with β‰₯1 known-answer test: N/N +- Any gaps: [list or "none"] + +### Architecture Integrity +- Cross-module import violations: [N] +- New shared utilities created: [list] + +### Regression Protection +- Regression baseline saved: [yes/no β€” path if yes] + +--- + +## What Went Well +- [Honest list] + +## What Was Hard +- [Honest list β€” useful for planning the next stream] + +## What To Do Differently +- [Actionable changes for next time] + +## Rejected Approaches Captured +- [Approach] β€” rejected because [...] β€” captured in [spec / ADR / validation / harness docs] +- [Approach] β€” rejected because [...] β€” captured in [...] + +## Model Attribution +- Model: [model name] +- Strengths observed: [...] +- Weaknesses observed: [...] diff --git a/.harness/templates/VALIDATION-TEMPLATE.md b/.harness/templates/VALIDATION-TEMPLATE.md new file mode 100644 index 0000000..e3dc334 --- /dev/null +++ b/.harness/templates/VALIDATION-TEMPLATE.md @@ -0,0 +1,54 @@ +# Validation Evidence Template + +> Write this file after completing each packet. +> File location: `.harness//validation/-validation.md` +> Commit it in the same commit as the packet code. + +--- + +# [XX-NN] Validation Evidence +**Date:** YYYY-MM-DD +**Agent:** [model name, e.g. claude-sonnet-4.6] +**Stream:** [stream name] +**Packet:** [XX-NN β€” Packet Name] + +## Test Counts +| Metric | Value | +|--------|-------| +| Tests before packet | NNNN | +| Tests after packet | NNNN | +| New tests added | NN | +| TypeScript errors in new files | 0 | + +## Known-Answer Tests + +- [x] [Description] (Source: [URL]): PASS +- [x] [Description] (Source: [URL]): PASS + +## Acceptance Criteria + +- [x] [Criterion]: βœ… +- [x] Full test suite green: βœ… (NNNN passing) +- [x] TypeScript clean: βœ… + +## Documentation Check + +- [ ] Is this change user-facing? (new UI, new tab, new workflow, changed behavior) + - If YES: which doc was updated? 
+
+**Doc files updated** (list any changed):
+- `docs/product/...` — [description]
+- `packages/fintrove-app/docs/...` — [description]
+
+If no docs needed: briefly explain why: ___________________________
+
+## Files Created
+- `src/...` — [N lines, brief description]
+- `src/.../__tests__/...` — [N lines, N tests]
+
+## Commit
+`[hash]` — [commit message first line]
+
+## Notes
+[Implementation decisions, deviations from plan, lessons learned, gotchas]
diff --git a/.harness/wave1-runner/execution-board.md b/.harness/wave1-runner/execution-board.md
new file mode 100644
index 0000000..74278f9
--- /dev/null
+++ b/.harness/wave1-runner/execution-board.md
@@ -0,0 +1,202 @@
+# Execution Board — Stream 1A: Core Runner Rebuild
+
+**Feature:** Replace random-action inner loop with real PPO/DQN training, model save, and evaluated policy rewards
+**Created:** 2026-04-13
+**Branch:** main
+**IMPLEMENTATION_PLAN tasks:** 1A-01, 1A-02, 1A-03, 1A-04
+**Status:** 🟠 In progress
+
+---
+
+## 🎯 Goal
+
+Rebuild `donkeycar_sb3_runner.py` so that every trial:
+1. Trains a real PPO (or DQN) model using `model.learn(total_timesteps=N)`
+2. Evaluates the trained model with `evaluate_policy()` (learned policy, NOT random)
+3. Saves the model to disk
+4. Tracks the champion model across all trials
+5. Supports speed-aware reward shaping
+
+---
+
+## ⚠️ Dependencies
+
+None — can start immediately.
+
+---
+
+## 📦 Packets
+
+### Packet 1A-01 — Rebuild Runner with Real Training
+
+**Status:** ⬜ Not started
+**Est. effort:** 1 session
+**Depends on:** none
+
+**Goal:** Replace the random `env.action_space.sample()` loop with real `PPO.learn()` + `evaluate_policy()`.
+
+**Steps:**
+1. Remove all legacy random-action loop code
+2. Add `model = PPO('CnnPolicy', env, learning_rate=lr, verbose=1)` initialization
+3. Add `model.learn(total_timesteps=timesteps)` training call
+4. Add `mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=eval_episodes)`
+5. Add `model.save(save_dir)` — save after every successful training run
+6. Print per-trial summary: timesteps, mean_reward, std_reward, save path
+7. Keep the `env.close()` + `time.sleep(2)` teardown (non-negotiable per ADR-006)
+8. Add `--learning-rate` and `--save-dir` CLI args
+9. Add DQN path: if `--agent dqn`, use DQN with DiscretizedActionWrapper
+
+**Files created/modified:**
+- `agent/donkeycar_sb3_runner.py` — complete rebuild
+
+**Known-answer tests:**
+- PPO with 100 timesteps on a mocked env should produce a non-None model object
+- Model saved to `save_dir/model.zip` should be loadable with `PPO.load()`
+
+**Acceptance criteria:**
+- [ ] Running `python3 donkeycar_sb3_runner.py --agent ppo --timesteps 100 --save-dir /tmp/test-model` with a live sim produces `/tmp/test-model/model.zip`
+- [ ] `mean_reward` in output comes from `evaluate_policy()`, not random episodes
+- [ ] Script exits with code 0 and calls `env.close()`
+- [ ] `--learning-rate` flag is respected (check SB3 verbose output)
+- [ ] No `NameError: name 'model' is not defined` possible (model always defined before save)
+
+**Validation evidence:** `.harness/wave1-runner/validation/1A-01-validation.md`
+
+---
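+
+**Reference sketch (illustrative, not the final runner):** the trial flow this packet targets, assuming the env is already constructed and that the `[SB3 Runner]` output lines match what 1A-04 parses; the helper name and structure are placeholders.
+
+```python
+import time
+from stable_baselines3 import PPO
+from stable_baselines3.common.evaluation import evaluate_policy
+
+def run_trial(env, learning_rate, timesteps, eval_episodes, save_dir):
+    """Sketch of the 1A-01 flow: train, evaluate the learned policy, save, tear down."""
+    try:
+        model = PPO('CnnPolicy', env, learning_rate=learning_rate, verbose=1)
+        model.learn(total_timesteps=timesteps)   # real training, not random actions
+        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=eval_episodes)
+        model.save(f'{save_dir}/model')          # SB3 appends .zip
+        print(f'[SB3 Runner] mean_reward={mean_reward}', flush=True)
+        print(f'[SB3 Runner] std_reward={std_reward}', flush=True)
+    finally:
+        env.close()                              # ADR-006: always close the env
+        time.sleep(2)                            # ADR-006: 2-second cooldown
+```
+
+---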
+
+### Packet 1A-02 — Speed-Aware Reward Wrapper
+
+**Status:** ⬜ Not started
+**Est. effort:** 1 session
+**Depends on:** 1A-01
+
+**Goal:** Add a `SpeedRewardWrapper` that replaces the default CTE-only reward with `speed * (1 - abs(cte)/max_cte)`.
+
+**Steps:**
+1. Create `agent/reward_wrapper.py` with `SpeedRewardWrapper(gym.Wrapper)`
+2. In `step()`, extract `speed` and `cte` from the `info` dict (DonkeyCar provides these)
+3. Compute shaped reward: `speed * (1.0 - min(abs(cte)/max_cte, 1.0))` minus a penalty on crash
+4. Add a `--reward-shaping` boolean flag to the runner CLI
+5. Apply the wrapper in the runner if the flag is set: `env = SpeedRewardWrapper(env, max_cte=8.0)`
+6. Log which reward mode is active at startup
+
+**Files created/modified:**
+- `agent/reward_wrapper.py` — new file
+- `agent/donkeycar_sb3_runner.py` — add `--reward-shaping` flag and wrapper application
+
+**Acceptance criteria:**
+- [ ] `SpeedRewardWrapper` replaces the reward when `--reward-shaping` is set
+- [ ] Default behavior unchanged when the flag is not set
+- [ ] Wrapper handles missing `speed` or `cte` in info gracefully (falls back to the original reward)
+- [ ] Unit test passes without the simulator (mocked info dict)
+
+**Validation evidence:** `.harness/wave1-runner/validation/1A-02-validation.md`
+
+---
+
+### Packet 1A-03 — Champion Model Tracking
+
+**Status:** ⬜ Not started
+**Est. effort:** 0.5 sessions
+**Depends on:** 1A-01
+
+**Goal:** Track the best model across all trials; maintain `agent/models/champion/` with the current best.
+
+**Steps:**
+1. After each trial, read `agent/models/champion/manifest.json` (if it exists) to get the current best reward
+2. If new `mean_reward > current_best_reward`, copy the model to `agent/models/champion/model.zip`
+3. Write updated `manifest.json`: `{trial, timestamp, params, mean_reward, model_path}`
+4. Log `[CHAMPION] New best: mean_reward=X params=Y` to console and autoresearch log
+5. Add a `champion` boolean field to the JSONL result record
+
+**Files created/modified:**
+- `agent/autoresearch_controller.py` — add champion tracking logic
+- `agent/models/champion/` — directory for champion model + manifest
+
+**Known-answer tests:**
+```python
+# Rewards [50, 80, 60, 90, 70] → champion updates at trials 1, 2, 4 (1-indexed)
+rewards = [50, 80, 60, 90, 70]
+tracker = ChampionTracker('/tmp/test-champion')
+champions = []
+for i, r in enumerate(rewards):
+    if tracker.update_if_better(r, params={}, model_path=f'trial-{i}'):
+        champions.append(i)
+assert champions == [0, 1, 3]  # 0-indexed
+```
+
+**Acceptance criteria:**
+- [ ] Champion manifest updated whenever a new best reward is found
+- [ ] `agent/models/champion/model.zip` always contains the best model seen
+- [ ] `champion` field in JSONL is `true` for the best trial, `false` otherwise
+- [ ] Known-answer champion tracking test passes
+
+**Validation evidence:** `.harness/wave1-runner/validation/1A-03-validation.md`
+
+---
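+
+**Reference sketch (illustrative):** the tracker API the known-answer test above assumes. The real 1A-03 manifest also records `trial` and `timestamp` (step 3); they are omitted here for brevity.
+
+```python
+import json
+import shutil
+from pathlib import Path
+
+class ChampionTracker:
+    """Keeps the best model seen so far in <dir>/model.zip plus manifest.json."""
+
+    def __init__(self, champion_dir):
+        self.dir = Path(champion_dir)
+        self.dir.mkdir(parents=True, exist_ok=True)
+        self.manifest = self.dir / 'manifest.json'
+
+    def _best_reward(self):
+        if self.manifest.exists():
+            return json.loads(self.manifest.read_text()).get('mean_reward', float('-inf'))
+        return float('-inf')
+
+    def update_if_better(self, mean_reward, params, model_path):
+        if mean_reward <= self._best_reward():
+            return False
+        src = Path(model_path)
+        if src.exists():                      # unit tests may pass a fake path
+            shutil.copy(src, self.dir / 'model.zip')
+        self.manifest.write_text(json.dumps({
+            'params': params,
+            'mean_reward': mean_reward,
+            'model_path': str(self.dir / 'model.zip'),
+        }))
+        return True
+```
+
+---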
+
+### Packet 1A-04 — Autoresearch Controller Wiring
+
+**Status:** ⬜ Not started
+**Est. effort:** 0.5 sessions
+**Depends on:** 1A-01, 1A-03
+
+**Goal:** Update `autoresearch_controller.py` to pass all required args to the rebuilt runner, use a separate Phase 1 results file, and add timesteps to the search space.
+
+**Steps:**
+1. Add `timesteps` to the GP search space: `{'type': 'int', 'min': 5000, 'max': 30000}`
+2. Pass `--learning-rate`, `--save-dir`, `--reward-shaping` to the runner subprocess
+3. Save new results to `autoresearch_results_phase1.jsonl` (do NOT mix with random-policy data)
+4. Parse `mean_reward` from the `[SB3 Runner] mean_reward=X` output line
+5. Parse `std_reward` from the `[SB3 Runner] std_reward=X` output line (add to runner output)
+6. Add a `--push-every N` flag: git add + commit + push every N trials
+7. Add `--min-trials-before-gp 3` (default): use random sampling for the first N trials
+
+**Files created/modified:**
+- `agent/autoresearch_controller.py` — wire up new args, new results file, push support
+
+**Acceptance criteria:**
+- [ ] Phase 1 results go to `autoresearch_results_phase1.jsonl` only
+- [ ] `learning_rate` arg is passed to and used by the runner
+- [ ] `save_dir` is a trial-specific path: `agent/models/trial-{trial_number:04d}`
+- [ ] Git push happens every N trials if `--push-every N` is set
+- [ ] Random proposal used for the first `min_trials_before_gp` trials
+
+**Validation evidence:** `.harness/wave1-runner/validation/1A-04-validation.md`
+
+---
+
+## 🔒 Dependency Order
+
+```
+1A-01 → 1A-02 (reward wrapper)
+1A-01 → 1A-03 (champion tracking)
+1A-01 + 1A-03 → 1A-04 (controller wiring)
+```
+
+1A-02 and 1A-03 can run in parallel after 1A-01.
+
+---
+
+## 🏁 Stream Completion Criteria
+
+- [ ] All 4 packets complete with validation evidence written
+- [ ] `python3 donkeycar_sb3_runner.py --agent ppo --timesteps 5000 --save-dir /tmp/t` produces a saved model
+- [ ] `pytest tests/ -v` — stream 1A tests pass (once 1B is done)
+- [ ] No `NameError: name 'model' is not defined` possible in any code path
+- [ ] Champion tracking works: manifest.json updated correctly
+- [ ] IMPLEMENTATION_PLAN tasks 1A-01 through 1A-04 marked `[x]`
+- [ ] EXECUTION_MASTER updated
+
+---
+
+## 📋 Mandatory Commit Trailer Format
+
+```
+feat(runner): 1A-NN — <description>
+
+Agent: pi/claude-sonnet
+Tests: N/A (sim required) | N/N passing
+Tests-Added: +N
+TypeScript: N/A
+```
diff --git a/AGENT.md b/AGENT.md
new file mode 100644
index 0000000..d36543e
--- /dev/null
+++ b/AGENT.md
@@ -0,0 +1,288 @@
+# AGENT.md Template
+
+> Copy this file into your project root as `AGENT.md`.
+> The agent reads this at the start of every iteration.
+> This file is canonical — do not keep older duplicated instruction blocks beneath it.
+
+---
+
+## Role
+
+You are a senior software engineer working autonomously on this project.
+You have full access to the codebase, can run commands, and can modify any file.
+
+---
+
+## Core Loop
+
+Every time you start, follow this exact sequence:
+
+### 1. Orient
+- Read `PROJECT-SPEC.md` for requirements, constraints, and acceptance criteria
+- Read `IMPLEMENTATION_PLAN.md` for the current task list and status
+- Read recent git log (`git log --oneline -10`) to understand what's been done
+- Check for any failing tests or build errors
+- If this project uses wave-based execution, also read `.harness/EXECUTION_MASTER.md` and the active stream execution board
+- If a task spec exists for the current unit of work, read it and treat it as binding
+
+### 2. Plan (if no plan exists)
+If `IMPLEMENTATION_PLAN.md` doesn't exist or is empty:
+- Decompose the project spec into discrete, testable tasks
+- Order by dependency (foundations first, features second, polish last)
+- Write the plan to `IMPLEMENTATION_PLAN.md` with checkboxes
+- Output `PLANNED` and exit
+
+### 3. Pick ONE Unit of Work
+- For simple projects: find the first unchecked task in `IMPLEMENTATION_PLAN.md`
+- For wave-based projects: pick the next packet defined in the active execution board
+- If all tasks/packets are complete, output `DONE` and exit
+- Focus ONLY on this one unit of work — do not work on anything else
+
+### 3b. Load the Task Spec When Present
+
+If the project provides a task spec for the selected unit of work:
+- read it before implementation starts
+- use its acceptance criteria as the immediate success contract
+- follow its MUST / MUST NOT / PREFER / ESCALATE constraints
+- treat its proof artifact requirement as part of done
+
+If the task spec conflicts with the project spec or execution board:
+- escalate instead of guessing
+
+### 4. Implement
+- Write the code for this unit of work
+- Follow the project's coding standards and patterns
+- Keep changes minimal and focused
+- If adding a new utility or helper used in multiple places: extract it to a shared location, do not duplicate it
+
+### 5. Verify (BLOCKING — required before commit)
+
+Run ALL relevant verification for the current unit of work. At minimum:
+
+```bash
+# 1. Tests
+npm test
+
+# 2. TypeScript / compile verification where applicable
+npx tsc --noEmit
+
+# 3. Build verification where applicable
+npm run build
+```
+
+Also run any project-specific commands required by the spec or execution board, such as:
+- frontend type-check
+- lint
+- known-answer eval suite
+- regression comparison scripts
+
+**Do not commit if any required verification step fails.**
+Do not disable or skip failing tests.
+
+### 5b. Documentation Check (required for user-facing changes)
+
+After verification, ask: **Is this change user-facing?**
+
+A change is user-facing if it:
+- adds, removes, or renames a UI element, tab, page, or feature
+- changes a user-visible workflow
+- changes how a domain calculation produces its visible result
+- adds a new CLI command or configuration option
+
+If YES:
+1. Identify the relevant documentation file(s)
+2. Update them in the same unit of work
+3. Record which docs were updated in validation evidence
+
+If NO:
+- Record "No user-facing change — docs not required" in validation evidence
+
+### 5c. Validation Evidence (required for stream/packet-based work)
+
+If the project uses execution boards or packet discipline:
+- write/update the packet validation evidence file immediately after verification
+- tick off acceptance criteria explicitly
+- record test deltas, doc status, and any important deviations from plan
+
+Proof is part of done.
+
+### 5d. Post-Run Validation Mindset
+
+Do not treat your own completion signal as proof.
+
+Before you consider the unit complete, confirm that:
+- the intended output files actually exist or changed
+- the relevant tracker moved correctly
+- the required proof artifact exists
+- the claimed completion agrees with the actual remaining work
+
+Your job is not just to do work.
+Your job is to leave behind evidence that a runtime or reviewer can trust.
+
+### 6. Commit & Mark Done
+
+Commit with the mandatory attribution format:
+
+```text
+<type>(<scope>): <subject>
+
+<body>
+
+Agent: <model>
+Tests: <N/N passing | N/A (docs-only)>
+Tests-Added: <+N | 0>
+TypeScript: clean | <N errors> | N/A (docs-only)
+```
+
+Then:
+- mark the task done in `IMPLEMENTATION_PLAN.md`, or
+- update packet/stream status in the execution board and related tracking docs
+
+Commit the tracking-file update in the same focused change where practical.
+
+### 7. Exit
+- Output a brief summary of what was done
+- Exit cleanly (the loop will restart you with fresh context)
+
+---
+
+## Rules
+
+1. **One unit of work per iteration.** Never work on multiple tasks/packets. Fresh context each time.
+2. **Tests are mandatory.** Every feature needs new tests. Every bug fix needs a regression test. Existing tests passing is not sufficient for new logic.
+3. **TypeScript must compile when applicable.** Never assume runtime tests replace type checks.
+4. **Build must pass when applicable.** Never commit code that does not build.
+5. **Follow the spec, not your imagination.** Implement what the spec asks for, nothing more.
+6. **Do not refactor unrelated code.** Stay focused on the current unit of work.
+7. **Extract shared logic.** If two components need the same logic, create a shared utility. Do not duplicate.
+8. **Types first.** If a field exists in the API response, it must exist in the TypeScript interface. Do not use `any` or unsafe casts to bypass missing types.
+9. **Documentation is part of done.** If user-facing behavior changed, docs must change too.
+10. **Validation is part of done.** If the project uses packet/stream discipline, proof artifacts must be written immediately.
+11. **Task specs are binding when present.** Do not override them casually.
+12. **Completion signals are not proof.** Leave behind evidence a runtime can validate.
+13. **If stuck, document it and escalate.** Do not silently invent missing requirements.
+
+---
+
+## The Tests-Added Rule
+
+> "Tests pass" is not the same as "the new work is tested."
+
+| What you added | Minimum new tests required |
+|----------------|---------------------------|
+| New utility/helper function | ≥ 3 (happy path + edge case + null/empty) |
+| New service method | ≥ 2 unit tests |
+| New API endpoint | ≥ 2 integration tests (success + error) |
+| New React component | ≥ 1 render test |
+| Bug fix | ≥ 1 regression test proving the bug is fixed |
+| Refactor with no new logic | 0 acceptable — but all existing must still pass |
+| Docs-only packet | 0 acceptable |
+
+If a feature commit says `Tests-Added: 0`, assume that is a red flag until proven otherwise.
+
+---
+
+## Rejection Capture Rule
+
+When you reject an implementation path, design option, or workaround that looked plausible but was not chosen, capture it in the most relevant project artifact.
+
+Examples worth capturing:
+- a dependency was considered and rejected
+- a more complex architecture was rejected as overkill
+- a shortcut was rejected because it violated a MUST NOT constraint
+- a plausible implementation was rejected because it failed a known-answer test
+
+Capture three things:
+1. **What was rejected**
+2. **Why it was rejected**
+3. **Where the lesson belongs** — local project note, decision record, validation evidence, or process eval
+
+This prevents future sessions from retrying the same bad idea.
+
+---
+
+## Commit Attribution Trailers
+
+All commits must include these trailers.
+
+| Trailer | How to get the value |
+|---------|---------------------|
+| `Agent:` | Your model ID, e.g. `github-copilot/claude-sonnet-4.6` |
+| `Tests:` | Run the test command — record as `N/N passing`, or `N/A (docs-only)` |
+| `Tests-Added:` | Count your new test cases — use `+N` format |
+| `TypeScript:` | Run `npx tsc --noEmit` — `clean` if silent, otherwise count errors, or `N/A (docs-only)` |
+
+---
+
+## Known Anti-Patterns (Do Not Repeat)
+
+❌ Duplicating logic across components instead of extracting shared utilities
+❌ Using unsafe casts to access fields missing from real TypeScript interfaces
+❌ Adding API fields in code without adding them to the type definitions
+❌ Committing with failing tests, failing type-check, or "in-progress" messages
+❌ Large blast commits spanning unrelated concerns
❌ Zero meaningful tests on feature work
+❌ Treating "looks plausible" as proof in regulated or calculation-heavy domains
+❌ Keeping superseded instruction blocks instead of maintaining one canonical version
+
+---
+
+## Escalation Protocol
+
+> When the spec does not cover a decision, STOP and escalate.
+> Do not fill gaps with assumptions.
+
+You MUST escalate when:
+
+1. **Requirement gap** — the spec has no FR-NNN covering the work
+2. **Constraint conflict** — two constraints contradict each other
+3. **Ambiguous acceptance criteria** — key expected behavior is undefined
+4. **Missing tech stack decision** — the task requires a tool/framework not specified
+5. **Destructive action** — deleting data, removing files, or changing risky configuration
+6. **New dependency needed** — a library/tool must be added beyond the approved stack
+7. **ESCALATE constraint triggered** — any explicit project-level escalate condition applies
+
+### How to escalate
+1. Stop work on the current unit immediately
+2. Add this block at the top of `IMPLEMENTATION_PLAN.md` (or equivalent active tracker):
+
+```markdown
+## ESCALATION REQUIRED
+- **Task:** [current task name]
+- **Issue:** [what's ambiguous/missing/conflicting]
+- **What I need:** [specific question or decision]
+- **What I'd do if I had to guess:** [best guess]
+```
+
+3. Output `STUCK` and exit
+
+---
+
+## Output Signals
+
+The loop script or orchestrator watches for these signals:
+
+- `PLANNED` — Plan created, ready for build iterations
+- `DONE` — All tasks complete, project finished
+- `STUCK` — Cannot proceed without human intervention
+- `ERROR` — Unrecoverable error encountered
+
+These signals are coordination hints, not proof by themselves.
+The surrounding runtime or reviewer may validate the result against plans, boards, files, and proof artifacts.
+
+---
+
+## Context Management
+
+You start fresh each iteration. Your working memory is the project artifact set:
+- `PROJECT-SPEC.md` — what to build
+- `IMPLEMENTATION_PLAN.md` — what is done and what is next
+- `.harness/EXECUTION_MASTER.md` — wave/stream dashboard for larger projects
+- `.harness/<stream>/execution-board.md` — stream contract
+- task spec files — the current delegated unit-of-work contract when present
+- validation evidence and process evals — proof/history of prior work
+- git log — execution history
+- the codebase itself — current state
+- test/eval results — whether the work holds up
+
+This is intentional. Fresh context prevents stale reasoning and makes the artifacts the source of truth.
diff --git a/DECISIONS.md b/DECISIONS.md
new file mode 100644
index 0000000..b523348
--- /dev/null
+++ b/DECISIONS.md
@@ -0,0 +1,192 @@
+# Architecture Decision Records — DonkeyCar RL Autoresearch
+
+> One ADR per major non-obvious technical choice.
+> Agents read this to avoid re-opening settled decisions.
+
+---
+
+## ADR-001: PPO over DQN as Primary Agent
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** DonkeyCar driving is a continuous control problem (steer ∈ [-1, 1], throttle ∈ [0, 1]). DQN requires discrete action spaces; we worked around this with DiscretizedActionWrapper. PPO supports continuous action spaces natively.
+
+**Decision:** Use PPO as the primary agent. Keep DQN support for discrete action experiments.
+
+**Consequences:**
+- PPO trains faster on continuous driving tasks (no discretization artifacts)
+- No need for DiscretizedActionWrapper with PPO (but keep it for DQN experiments)
+- PPO with CnnPolicy handles raw image observations natively
+
+**Rejected alternatives:**
+- DQN only — requires discretization; loses steering resolution
+- SAC — valid alternative, but PPO is simpler and well-tested on DonkeyCar
+
+---
+
+## ADR-002: Pure Numpy GP (TinyGP) over sklearn
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** We need a Gaussian Process surrogate model for the autoresearch controller. sklearn.gaussian_process exists but has had compatibility issues with our numpy version.
+
+**Decision:** Use TinyGP — a pure numpy RBF-kernel GP implemented in autoresearch_controller.py.
+
+**Consequences:**
+- No sklearn dependency
+- Full control over kernel and noise parameters
+- Slightly less optimized than sklearn but sufficient for < 1000 data points
+
+**Rejected alternatives:**
+- sklearn GaussianProcessRegressor — dependency issues
+- GPyTorch — overkill, adds a PyTorch dependency
+- BoTorch — same objection
+
+---
+
+## ADR-003: JSONL Append-Only Results
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** Results from 300+ trials must be persistent, recoverable, and never lost.
+
+**Decision:** All results are appended to JSONL files. Results files are never truncated or overwritten.
+
+**Consequences:**
+- System can be interrupted and resumed at any point
+- Historical data is preserved even if a later trial fails
+- Easy to parse with `json.loads(line)` per line
+
+**Rejected alternatives:**
+- SQLite — adds a dependency, overkill for this volume
+- CSV — loses type information, harder to extend
+
+---
+
+## ADR-004: GP+UCB Bayesian Optimization for Hyperparameter Search
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** We need an intelligent hyperparameter search strategy. Grid search was the starting point but misses non-grid-aligned optimal regions (proven: n_steer=8 was NOT in the original grid of [3,5,7]).
+
+**Decision:** Gaussian Process + Upper Confidence Bound (UCB) acquisition. The GP models the reward landscape; UCB balances exploration vs exploitation. The default **kappa=2.0** is a reasonable balance and can be increased for more exploration.
+
+**Consequences:**
+- Finds optimal regions with fewer trials than grid search
+- Naturally handles continuous parameter spaces (learning_rate ∈ [0.00005, 0.005])
+- Requires at least 2 data points before the GP can be fit (random sampling for the first 2 trials)
+
+**Rejected alternatives:**
+- Random search — better than grid but no learning
+- Tree Parzen Estimator (TPE/Optuna) — valid alternative, adds a dependency
+- CMA-ES — better for high-dimensional spaces; our space is 3D, GP is sufficient
+- Population-Based Training (PBT) — requires parallel sim instances (we only have 1)
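+
+For intuition, here is a compact numpy sketch of the GP posterior and UCB proposal described above. Kernel hyperparameters, the candidate count, and the helper names are illustrative; the real TinyGP lives in `autoresearch_controller.py`.
+
+```python
+import numpy as np
+
+def rbf(A, B, length_scale=0.3):
+    # k(x, x') = exp(-||x - x'||^2 / (2 * length_scale^2)), unit variance
+    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
+    return np.exp(-0.5 * d2 / length_scale ** 2)
+
+def gp_posterior(X, y, Xc, noise=1e-3):
+    # Standard GP regression posterior evaluated at candidate points Xc
+    K = rbf(X, X) + noise * np.eye(len(X))
+    Ks = rbf(X, Xc)                              # shape [N, C]
+    alpha = np.linalg.solve(K, y)
+    mu = Ks.T @ alpha
+    v = np.linalg.solve(K, Ks)
+    var = np.clip(1.0 - np.sum(Ks * v, axis=0), 1e-12, None)
+    return mu, np.sqrt(var)
+
+def propose_next(X, y, kappa=2.0, n_candidates=2000, seed=0):
+    # UCB acquisition: mean + kappa * std, maximized over random candidates
+    rng = np.random.default_rng(seed)
+    Xc = rng.uniform(0.0, 1.0, size=(n_candidates, X.shape[1]))
+    mu, sigma = gp_posterior(X, y, Xc)
+    return Xc[np.argmax(mu + kappa * sigma)]
+```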
+
+---
+
+## ADR-005: No Model Saving Before Model is Defined
+
+**Date:** 2026-04-13
+**Status:** Accepted (bug fix — never repeat)
+
+**Context:** The original donkeycar_sb3_runner.py called `model.save(save_path)` after the model training code had been removed. This caused `NameError: name 'model' is not defined` on every single run for 300 trials.
+
+**Decision:** Never call `model.save()` without first verifying `model` is defined. Training and saving must be atomic — if training fails, no save attempt.
+
+**Pattern:**
+```python
+try:
+    model = PPO('CnnPolicy', env, ...)
+    model.learn(total_timesteps=timesteps)
+    model.save(save_path)
+except Exception as e:
+    log(f'Training failed: {e}')
+    sys.exit(102)
+```
+
+**Rejected alternatives:**
+- Checking `if 'model' in locals()` before save — fragile, hides bugs
+
+---
+
+## ADR-006: env.close() + 2-Second Cooldown is Non-Negotiable
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** Early in the project, not calling env.close() between runs caused simulator zombie processes that locked up the entire system. 20+ consecutive runs work reliably with this pattern.
+
+**Decision:** Every runner process MUST:
+1. Call `env.close()` in a try/except before exit
+2. Sleep 2 seconds after close
+3. Then exit
+
+This applies even if training or evaluation fails.
+
+**Rejected alternatives:**
+- Relying on Python garbage collection for env cleanup — proven to cause hangs
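+
+One way to encode the rule as a helper (illustrative; the runner may inline this instead):
+
+```python
+import sys
+import time
+
+def safe_teardown(env, exit_code=0):
+    # ADR-006: close the env and cool down even when training/eval failed
+    try:
+        env.close()
+    except Exception as e:
+        print(f'env.close() failed: {e}', flush=True)
+    time.sleep(2)
+    sys.exit(exit_code)
+```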
+
+---
+
+## ADR-007: PPO with CnnPolicy for Image Observations
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** DonkeyCar provides 120x160x3 RGB camera images as observations. The policy must process images.
+
+**Decision:** Use `PPO('CnnPolicy', env, ...)` from SB3. CnnPolicy automatically handles image preprocessing with a CNN feature extractor.
+
+**Consequences:**
+- Larger model than MlpPolicy (image processing overhead)
+- Requires the VecTransposeImage wrapper (SB3 handles this internally)
+- Training is slower per step but produces better driving behavior
+
+**Rejected alternatives:**
+- MlpPolicy — cannot handle raw image inputs
+- Custom CNN — unnecessary complexity given SB3's built-in CnnPolicy
+
+---
+
+## ADR-008: All Phases Planned, Phase 1 Executed First
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** User asked whether to implement Phase 1 only or all phases. Three phases were identified:
+1. Real Training Foundation
+2. Multi-Track Generalization
+3. Racing / Speed Optimization
+
+**Decision:** Plan all phases in full documentation, execute Phase 1 first. Do not start Phase 2 until Phase 1 produces a genuine champion model (mean_reward > 100 on the training track). This creates a wave gate between Phase 1 and Phase 2.
+
+**Rationale:** Phases 2 and 3 depend on having a real trained model. Without Phase 1 complete, there is nothing to generalize or optimize for speed.
+
+---
+
+## ADR-009: Tests Must Not Require Live Simulator
+
+**Date:** 2026-04-13
+**Status:** Accepted
+
+**Context:** The DonkeyCar simulator must be running on port 9091 for live training. Tests cannot depend on this.
+
+**Decision:** All pytest tests mock the gym environment. Integration tests use a MagicMock gym env that returns fake observations, rewards, and done signals. Only manual/acceptance tests require the live simulator.
+
+**Pattern:**
+```python
+import gymnasium as gym
+import numpy as np
+from unittest.mock import MagicMock, patch
+
+@patch('gymnasium.make')
+def test_runner_exits_cleanly(mock_make):
+    mock_env = MagicMock()
+    mock_env.reset.return_value = (np.zeros((120, 160, 3)), {})
+    mock_env.step.return_value = (np.zeros((120, 160, 3)), 1.0, True, False, {})
+    mock_env.action_space = gym.spaces.Box(...)
+    mock_make.return_value = mock_env
+    # ... test runner
+```
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..e859381
--- /dev/null
+++ b/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,77 @@
+# Implementation Plan — DonkeyCar RL Autoresearch
+
+> Agent: read this at the start of every iteration.
+> Pick the first unchecked task in the current active wave.
+> Mark done immediately after commit.
+
+---
+
+## Wave 1: Real Training Foundation
+**Goal:** Make the inner loop actually train and save models. Produce a real champion model.
+**Gate:** champion model achieves mean_reward > 100 on the training track.
+**Status:** 🟠 In progress
+
+### Stream 1A: Core Runner Rebuild
+
+- [ ] **1A-01** — Rebuild `donkeycar_sb3_runner.py` with real PPO training (`model.learn()`), model save, and proper evaluation (`evaluate_policy()`)
+- [ ] **1A-02** — Add `SpeedRewardWrapper` — reward = `speed * (1 - abs(cte)/max_cte)`; add `--reward-shaping` flag
+- [ ] **1A-03** — Add champion model tracking — write `champion_manifest.json` when a new best is found
+- [ ] **1A-04** — Fix autoresearch controller to pass `learning_rate`, `save_dir`, `reward_shaping` args to the runner
+
+### Stream 1B: Tests
+
+- [ ] **1B-01** — Write `tests/test_discretize_action.py` — action encoding, decoding, round-trip
+- [ ] **1B-02** — Write `tests/test_autoresearch_controller.py` — GP fit, UCB computation, param round-trip, champion tracking
+- [ ] **1B-03** — Write `tests/test_runner_integration.py` — mocked sim, training + save + eval cycle
+
+### Stream 1C: First Real Autoresearch Run
+
+- [ ] **1C-01** — Run 50-trial autoresearch with real PPO training; verify models saved
+- [ ] **1C-02** — Save regression baseline: `champion_reward_phase1.txt`
+- [ ] **1C-03** — Push all results and models to Gitea
+- [ ] **1C-04** — Write Wave 1 process eval
+
+---
+
+## Wave 2: Multi-Track Generalization
+**Goal:** Champion model drives any track with mean_reward > 50.
+**Gate:** Wave 1 champion achieves mean_reward > 100. Wave 1 process eval complete.
+**Status:** ⏸️ Not started — blocked on Wave 1
+
+- [ ] **2-01** — Write `evaluate_champion.py` — load the champion model, evaluate on a specified track
+- [ ] **2-02** — Implement multi-track training curriculum (train on 2 tracks alternately)
+- [ ] **2-03** — Add domain randomization wrapper (randomize road width, lighting)
+- [ ] **2-04** — Implement convergence detection in autoresearch (stop when GP sigma collapses)
+- [ ] **2-05** — Add automatic Gitea push every N trials
+- [ ] **2-06** — Evaluate champion on an unseen track; record the generalization gap
+
+---
+
+## Wave 3: Racing / Speed Optimization
+**Goal:** Fastest possible lap times on any track.
+**Gate:** Wave 2 champion generalizes to ≥1 unseen track (mean_reward > 50).
+**Status:** ⏸️ Not started — blocked on Wave 2
+
+- [ ] **3-01** — Implement lap time measurement and logging
+- [ ] **3-02** — Tune reward function for pure speed (aggressive speed weight)
+- [ ] **3-03** — Fine-tuning from champion checkpoint on new tracks
+- [ ] **3-04** — Head-to-head: autoresearch champion vs human-tuned baseline
+- [ ] **3-05** — Research writeup / report
+
+---
+
+## Completion Signals
+
+The agent outputs one of these at the end of each iteration:
+- `PLANNED` — just created/updated the plan, ready to implement
+- `DONE` — all tasks in the current wave complete
+- `STUCK` — needs human input (see ESCALATION REQUIRED block if present)
+- `ERROR` — unrecoverable error
+
+---
+
+## Notes
+
+- **Random policy data (300 trials):** The existing autoresearch_results.jsonl contains rewards from random-action policy runs. These are valid for n_steer/n_throttle discretization insights but NOT for learning_rate optimization. Do not mix them with Phase 1 real training results. Create a separate results file: `autoresearch_results_phase1.jsonl`.
+- **Model storage:** Large CNN models (>100MB) should be excluded from git or use git LFS. Add `agent/models/**/*.zip` to .gitignore if needed, and document the download location.
+- **Simulator requirement:** All live training tasks (1C-*) require the DonkeyCar sim running on port 9091. Tests (1B-*) do NOT require the simulator.
diff --git a/PROJECT-KICKOFF.md b/PROJECT-KICKOFF.md
new file mode 100644
index 0000000..574f045
--- /dev/null
+++ b/PROJECT-KICKOFF.md
@@ -0,0 +1,88 @@
+# Project Kickoff Checklist
+
+> Copy this into a new project root as `PROJECT-KICKOFF.md`.
+> Use it to make sure the project is genuinely ready before agent implementation starts.
+
+---
+
+## Project Identity
+
+- **Project name:**
+- **Created:**
+- **Mode:** simple / large
+- **Primary runtime:** CLI loop / OpenClaw / other
+- **Primary language/runtime:**
+- **Owner / reviewer:**
+
+---
+
+## Kickoff Status
+
+### Project setup
+- [ ] Project folder created
+- [ ] Git initialized
+- [ ] `README.md` exists
+- [ ] `.gitignore` exists
+- [ ] Initial baseline commit created
+
+### Harness files
+- [ ] `AGENT.md` exists
+- [ ] `PROJECT-SPEC.md` exists
+- [ ] `DECISIONS.md` exists
+- [ ] `IMPLEMENTATION_PLAN.md` exists or is ready to be created from the spec
+- [ ] `ralph-loop.sh` copied if using CLI loop
+
+### Larger-project structure (if needed)
+- [ ] `.harness/EXECUTION_MASTER.md` exists
+- [ ] `.harness/templates/EXECUTION-BOARD-TEMPLATE.md` exists
+- [ ] `.harness/templates/VALIDATION-TEMPLATE.md` exists
+- [ ] `.harness/templates/PROCESS-EVAL-TEMPLATE.md` exists
+- [ ] `.harness/regression-baselines/` exists
+
+### Spec readiness
+- [ ] Project overview filled in
+- [ ] Functional requirements are numbered
+- [ ] Acceptance criteria are testable
+- [ ] `MUST / MUST NOT / PREFER / ESCALATE` are filled in
+- [ ] Evaluation design section is filled in
+- [ ] Rejected approaches captured where useful
+- [ ] Self-containment test passed
+
+### Technical readiness
+- [ ] Build command known
+- [ ] Test command known
+- [ ] Lint/type-check command known if applicable
+- [ ] Directory structure decided
+- [ ] Key dependencies identified
+- [ ] Destructive/risky areas called out in constraints
+
+### Eval readiness
+- [ ] Known-answer test needs identified (if any)
+- [ ] Regression baseline needs identified (if any)
+- [ ] Fixture/sample data needs identified
+- [ ] Review cadence roughly decided
+
+### Execution readiness
+- [ ] Execution mode chosen (simple task loop vs wave/stream)
+- [ ] Runtime mode chosen (CLI loop vs OpenClaw)
+- [ ] First prompt prepared
+- [ ] Human knows what "done" looks like
+
+---
+
+## First Prompt
+
+```text
+Read README.md, PROJECT-SPEC.md, AGENT.md, DECISIONS.md, and PROJECT-KICKOFF.md.
+If PROJECT-SPEC.md is incomplete, help me finish it using the spec interview protocol.
+If the spec is complete and IMPLEMENTATION_PLAN.md does not exist or is still placeholder text,
+create it. Do not implement code yet unless the plan is complete and I explicitly say to start.
+```
+
+---
+
+## Notes / Open Questions
+
+-
+-
+-
diff --git a/PROJECT-SPEC.md b/PROJECT-SPEC.md
new file mode 100644
index 0000000..3b8ae4e
--- /dev/null
+++ b/PROJECT-SPEC.md
@@ -0,0 +1,455 @@
+# Project Specification — DonkeyCar RL Autoresearch
+
+**Version:** 1.0.0
+**Date:** 2026-04-13
+**Owner:** paulh
+**Status:** Active
+
+---
+
+## 1. Project Overview
+
+### What are we building?
+
+An end-to-end autonomous research and training system for DonkeyCar reinforcement learning agents. The system:
+1. Trains DQN/PPO RL agents in the DonkeyCar simulator using Stable-Baselines3
+2. Saves the best-performing models to disk after every training run
+3. Uses a Gaussian Process + UCB Bayesian autoresearch controller to intelligently propose and evaluate new hyperparameter configurations — learning from every run
+4. Produces a champion model capable of driving a DonkeyCar on any track at maximum speed with minimum cross-track error
+
+The project replaces manual hyperparameter tuning and random grid sweeps with a self-directing autoresearch loop that gets smarter with each trial.
+
+### Why does it matter?
+
+Manual hyperparameter search for RL is slow, expensive, and non-systematic. The DonkeyCar task (fast, stable lap driving that generalizes across tracks) requires careful tuning of the action space, reward function, and learning parameters. A Bayesian autoresearch loop:
+- Finds better configurations than grid search with fewer trials
+- Discovers non-obvious parameter regions (e.g., n_steer=8, n_throttle=5 emerged from autoresearch, not from the grid)
+- Creates a reproducible, logged, version-controlled research artifact
+- Enables unattended overnight experimentation with full observability
+
+### Success Criteria
+
+- [ ] Inner loop trains a real PPO/DQN model for a configurable number of timesteps and saves the best model to disk
+- [ ] Autoresearch controller proposes hyperparameters using GP+UCB and evaluates trained models (not a random policy)
+- [ ] Champion model (highest eval reward across all trials) is saved separately and can be loaded for demonstration
+- [ ] Champion model can complete at least one lap on the training track with mean_reward > 100
+- [ ] Champion model generalizes to at least one unseen track (mean_reward > 50 on the eval track)
+- [ ] All results are logged, versioned, and pushed to Gitea automatically
+- [ ] System can run unattended overnight with zero hangs or zombie processes
+- [ ] Full documentation exists: PRD, architecture, decisions, implementation plan, evals
+
+---
+
+## 2. Technical Foundation
+
+### Tech stack
+
+- **Language:** Python 3.10
+- **RL Framework:** Stable-Baselines3 (SB3) — PPO and DQN
+- **Simulator:** DonkeyCar Gym (gym_donkeycar) running locally on port 9091
+- **Gym Interface:** Gymnasium (gymnasium)
+- **Surrogate Model:** Pure numpy Gaussian Process (TinyGP — no sklearn required)
+- **Action Wrapper:** Custom DiscretizedActionWrapper (discretize_action.py)
+- **Version Control:** Git + Gitea (https://paje.ca/git/paulh/donkeycar-rl-autoresearch)
+- **Test Framework:** pytest
+- **Logging:** JSON Lines (JSONL) + human-readable log files
+
+### Project Structure
+
+```
+donkeycar-rl-autoresearch/
+├── AGENT.md                       ← Agent instructions (this harness)
+├── PROJECT-SPEC.md                ← This file
+├── DECISIONS.md                   ← Architecture Decision Records
+├── IMPLEMENTATION_PLAN.md         ← Master task backlog
+├── README.md                      ← Project overview
+├── .gitignore
+├── .harness/
+│   ├── EXECUTION_MASTER.md        ← Wave/stream dashboard
+│   ├── templates/                 ← Harness templates
+│   ├── regression-baselines/      ← Saved eval baselines
+│   └── <stream>/
+│       ├── execution-board.md
+│       ├── process-eval.md
+│       └── validation/
+├── agent/
+│   ├── autoresearch_controller.py ← GP+UCB autoresearch loop
+│   ├── donkeycar_sb3_runner.py    ← Inner loop: real training + model save
+│   ├── donkeycar_outer_loop.py    ← Grid sweep (legacy baseline)
+│   ├── discretize_action.py       ← Action space wrapper
+│   ├── outerloop-results/
+│   │   ├── clean_sweep_results.jsonl   ← Base sweep data (18 records)
+│   │   ├── autoresearch_results.jsonl  ← Autoresearch trial results
+│   │   └── autoresearch_log.txt        ← Human-readable autoresearch log
+│   └── models/
+│       ├── champion/              ← Best model across all trials
+│       └── trial-<NNNN>/          ← Per-trial saved models
+└── tests/
+    ├── test_discretize_action.py
+    ├── test_autoresearch_controller.py
+    └── test_runner_integration.py
+```
+### Build & Test Commands
+
+```bash
+# Run all tests
+cd /home/paulh/projects/donkeycar-rl-autoresearch
+python3 -m pytest tests/ -v
+
+# Run autoresearch controller (requires sim running on port 9091)
+cd agent && python3 autoresearch_controller.py --trials 50
+
+# Run single training trial manually
+cd agent && python3 donkeycar_sb3_runner.py --agent ppo --timesteps 10000 --eval-episodes 5
+
+# Check Gitea push
+cd /home/paulh/projects/donkeycar-rl-autoresearch && git push
+```
+
+### Coding Standards
+
+- All output uses `flush=True` for real-time log visibility
+- Every process must call `env.close()` and `time.sleep(2)` before exit (proven zombie prevention)
+- All results are appended to JSONL files — never overwritten
+- Model saves use `model.save(path)` from the SB3 standard API
+- Champion model tracking: autoresearch writes `champion_model_path` to the results JSONL
+- No `model.save()` calls on undefined variables — always check the model exists before saving
+- Python only — no TypeScript, no Node
+
+---
+
+## 3. Requirements
+
+### Functional Requirements
+
+#### FR-001: Real RL Training in Inner Loop
+**Description:** The inner RL runner (`donkeycar_sb3_runner.py`) must actually train a PPO or DQN model using `model.learn(total_timesteps=N)`, not run random actions.
+**Acceptance criteria:**
+- [ ] Given `--agent ppo --timesteps 10000`, the runner trains a PPO model for 10000 steps
+- [ ] Training uses the `learning_rate` argument passed from the autoresearch controller
+- [ ] Training uses the discretized action space (n_steer, n_throttle) when DQN is used
+- [ ] PPO runs with continuous actions (no discretization needed)
+- [ ] Training completes without hanging and exits with code 0
+
+#### FR-002: Model Saving
+**Description:** After each training run, the trained model is saved to disk.
+**Acceptance criteria:**
+- [ ] Model saved to `agent/models/trial-<NNNN>/model.zip` after every successful run
+- [ ] If the eval reward is the best seen so far, the model is also copied to `agent/models/champion/model.zip`
+- [ ] Save path is logged to the JSONL results file
+- [ ] Model can be loaded with `PPO.load()` or `DQN.load()` for subsequent evaluation
+
+#### FR-003: Real Policy Evaluation
+**Description:** After training, the model is evaluated using the learned policy (not random actions).
+**Acceptance criteria:**
+- [ ] `evaluate_policy(model, env, n_eval_episodes=N)` is used for evaluation
+- [ ] Mean reward and std reward are both recorded
+- [ ] Evaluation uses the same action wrapper as training
+- [ ] Per-episode rewards are printed for full observability
+
+#### FR-004: Autoresearch GP+UCB Controller
+**Description:** The autoresearch controller proposes hyperparameters using Gaussian Process + UCB acquisition, learning from prior results.
+**Acceptance criteria:**
+- [ ] Controller loads ALL prior results (base sweep + autoresearch history) at startup
+- [ ] GP is fit on encoded (normalized) parameter vectors and corresponding eval rewards
+- [ ] UCB acquisition = GP mean + kappa * GP std
+- [ ] Next trial parameters maximize UCB over N_CANDIDATES random samples
+- [ ] Controller logs the top-5 UCB candidates before each trial
+- [ ] Controller correctly handles the first 2 trials (insufficient data for the GP — uses random sampling)
+
+#### FR-005: Champion Model Tracking
+**Description:** The system maintains a single "champion" model — the best-performing model across all trials.
+**Acceptance criteria:**
+- [ ] After each trial, if `mean_reward > current_best`, the model is saved as champion
+- [ ] Champion metadata (params, reward, trial number, timestamp) saved to `champion_manifest.json`
+- [ ] Champion model path is stable: `agent/models/champion/model.zip`
+- [ ] Champion can be loaded and demonstrated without retraining
+
+#### FR-006: Speed-Aware Reward Shaping
+**Description:** The reward function incentivizes speed, not just staying on track.
+**Acceptance criteria:**
+- [ ] Custom reward wrapper computes: `reward = speed * (1 - abs(cte) / max_cte)`
+- [ ] Speed and CTE values are accessible from the DonkeyCar info dict
+- [ ] Reward wrapper is optional (enabled via `--reward-shaping` flag)
+- [ ] Without the flag, the default DonkeyCar reward is used unchanged
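+
+A sketch of the wrapper this requirement describes (see also packet 1A-02). The `hit` key is an assumption about the gym_donkeycar info dict, and the fallback branch covers envs that do not report `speed`/`cte`:
+
+```python
+import gymnasium as gym
+
+class SpeedRewardWrapper(gym.Wrapper):
+    """Illustrative FR-006 reward: speed * (1 - abs(cte)/max_cte), penalty on crash."""
+
+    def __init__(self, env, max_cte=8.0, crash_penalty=10.0):
+        super().__init__(env)
+        self.max_cte = max_cte
+        self.crash_penalty = crash_penalty
+
+    def step(self, action):
+        obs, reward, terminated, truncated, info = self.env.step(action)
+        speed, cte = info.get('speed'), info.get('cte')
+        if speed is None or cte is None:
+            return obs, reward, terminated, truncated, info  # graceful fallback
+        shaped = speed * (1.0 - min(abs(cte) / self.max_cte, 1.0))
+        if info.get('hit', 'none') != 'none':                # assumed collision flag
+            shaped -= self.crash_penalty
+        return obs, shaped, terminated, truncated, info
+```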
+
+#### FR-007: Multi-Track Generalization Evaluation
+**Description:** The champion model is evaluated on at least one track it was NOT trained on.
+**Acceptance criteria:**
+- [ ] Evaluation script accepts a `--track` argument to specify the evaluation track
+- [ ] Champion model is loaded and evaluated for N episodes on the specified track
+- [ ] Results (mean_reward, per-episode rewards) are logged
+- [ ] Generalization gap (train_reward - eval_reward) is reported
+
+#### FR-008: Autoresearch Results Logging
+**Description:** Every trial produces a complete, structured result record.
+**Acceptance criteria:**
+- [ ] JSONL record includes: trial_id, timestamp, params, mean_reward, std_reward, model_path, champion_flag, elapsed_sec, run_status
+- [ ] Autoresearch log (human-readable) is updated after every trial
+- [ ] Results file is never truncated — only appended
+- [ ] Results are pushed to Gitea after every N trials (configurable, default 10)
+
+#### FR-009: Unattended Overnight Operation
+**Description:** The system runs for 100+ trials without hanging, zombie processes, or data loss.
+**Acceptance criteria:**
+- [ ] Every job calls `env.close()` before exit
+- [ ] 2-second cooldown between jobs prevents race conditions
+- [ ] Stale process kill (`pkill -9 -f donkeycar_sb3_runner.py`) before each new job
+- [ ] 6-minute timeout per job — killed and logged if exceeded
+- [ ] System auto-resumes from existing results if restarted mid-sweep
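+
+A sketch of the per-job supervision this requirement implies; the function name and return convention are illustrative:
+
+```python
+import subprocess
+
+def run_job_supervised(cmd, timeout_sec=360):
+    # FR-009: kill stale runners first, then enforce a hard per-job timeout
+    subprocess.run(['pkill', '-9', '-f', 'donkeycar_sb3_runner.py'], check=False)
+    try:
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_sec)
+    except subprocess.TimeoutExpired:
+        subprocess.run(['pkill', '-9', '-f', 'donkeycar_sb3_runner.py'], check=False)
+        return None  # caller logs the timeout and moves on to the next trial
+```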
+
+#### FR-010: Test Suite
+**Description:** Core logic is covered by automated tests that don't require the simulator.
+**Acceptance criteria:**
+- [ ] `test_discretize_action.py` — tests action space wrapping correctness
+- [ ] `test_autoresearch_controller.py` — tests GP fitting, UCB computation, param encoding/decoding
+- [ ] `test_runner_integration.py` — mocked simulator test of the training + save + eval cycle
+- [ ] All tests pass with `pytest tests/ -v`
+- [ ] No tests require a running simulator
+
+### Non-Functional Requirements
+
+#### NFR-001: Performance
+- [ ] Each training trial completes in < 6 minutes for 10000 timesteps
+- [ ] GP fitting on 300 data points completes in < 2 seconds
+- [ ] System does not consume > 8GB RAM per trial
+
+#### NFR-002: Robustness
+- [ ] Zero hanging jobs across 100 consecutive trials
+- [ ] All errors are caught, logged, and do not crash the autoresearch loop
+- [ ] System correctly handles sim disconnection and logs the failure
+
+#### NFR-003: Reproducibility
+- [ ] All results are version-controlled in Gitea
+- [ ] Every trial records the exact parameters used
+- [ ] Results are deterministic given the same seed (seed support in runner)
+
+#### NFR-004: Observability
+- [ ] Real-time per-step reward printing during training and evaluation
+- [ ] Per-trial summary logged to both console and file
+- [ ] Running champion summary printed after every trial
+
+---
+
+## 4. Data Model
+
+### Trial Result Record (JSONL)
+
+```json
+{
+  "trial": 42,
+  "timestamp": "2026-04-13T03:14:15.926535",
+  "params": {
+    "agent": "ppo",
+    "n_steer": 7,
+    "n_throttle": 3,
+    "learning_rate": 0.0003,
+    "timesteps": 10000,
+    "eval_episodes": 5,
+    "reward_shaping": false
+  },
+  "mean_reward": 127.45,
+  "std_reward": 18.3,
+  "model_path": "agent/models/trial-042/model.zip",
+  "champion": true,
+  "elapsed_sec": 187.4,
+  "run_status": "ok"
+}
+```
+
+### Champion Manifest (`agent/models/champion/manifest.json`)
+
+```json
+{
+  "trial": 42,
+  "timestamp": "2026-04-13T03:14:15.926535",
+  "params": { "..." },
+  "mean_reward": 127.45,
+  "model_path": "agent/models/champion/model.zip"
+}
+```
+
+### GP State (in-memory, rebuilt each iteration from JSONL)
+
+```
+X: [N, n_params] normalized parameter vectors
+y: [N] normalized mean rewards
+GP: TinyGP fitted to (X, y)
+```
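+
+A sketch of the encode/decode pair behind `X`. The learning-rate bounds come from ADR-004; the integer bounds and the log scaling are assumptions for illustration. The round-trip behavior matches TC-003 in section 9.
+
+```python
+import numpy as np
+
+# Assumed bounds: the lr range is from ADR-004; the integer ranges are illustrative.
+BOUNDS = {'n_steer': (3, 15), 'n_throttle': (2, 9), 'learning_rate': (5e-5, 5e-3)}
+
+def encode(params):
+    lo_s, hi_s = BOUNDS['n_steer']
+    lo_t, hi_t = BOUNDS['n_throttle']
+    lo_l, hi_l = (np.log10(b) for b in BOUNDS['learning_rate'])
+    return np.array([
+        (params['n_steer'] - lo_s) / (hi_s - lo_s),
+        (params['n_throttle'] - lo_t) / (hi_t - lo_t),
+        (np.log10(params['learning_rate']) - lo_l) / (hi_l - lo_l),  # log-scaled
+    ])
+
+def decode(x):
+    lo_s, hi_s = BOUNDS['n_steer']
+    lo_t, hi_t = BOUNDS['n_throttle']
+    lo_l, hi_l = (np.log10(b) for b in BOUNDS['learning_rate'])
+    return {
+        'n_steer': int(round(lo_s + x[0] * (hi_s - lo_s))),
+        'n_throttle': int(round(lo_t + x[1] * (hi_t - lo_t))),
+        'learning_rate': float(10 ** (lo_l + x[2] * (hi_l - lo_l))),
+    }
+```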
Architecture Decisions + +### Constraints + +- **MUST:** Always call `env.close()` before process exit +- **MUST:** Save every trained model β€” never discard +- **MUST:** Use `evaluate_policy()` from SB3 for evaluation β€” not a custom loop +- **MUST:** Append to JSONL results β€” never overwrite +- **MUST:** All tests run without a live simulator +- **MUST NOT:** Use `model.save()` before `model` is defined +- **MUST NOT:** Run random actions in production inner loop (this was the original bug) +- **MUST NOT:** Remove the 2-second cooldown between jobs +- **PREFER:** PPO over DQN for continuous driving tasks (better suited) +- **PREFER:** Pure numpy GP over sklearn to avoid dependency issues +- **PREFER:** Reward shaping enabled by default for speed optimization +- **ESCALATE:** If DonkeyCar gym API changes break env.reset() or env.step() signatures +- **ESCALATE:** If simulator port 9091 is unavailable at test time +- **ESCALATE:** If SB3 model save/load API changes between versions + +### Known Challenges + +1. **Simulator must be running:** All live training requires the DonkeyCar sim on port 9091. Tests must mock this. +2. **Episode length variance:** Episodes end at 100 steps or CTE > 8. Mean reward has high variance across episodes. +3. **Random seed handling:** DonkeyCar gym.reset() signature differs between Gym and Gymnasium versions. +4. **Model size:** PPO models with CNN policy on 120x160x3 images can be large (>100MB). Consider git LFS or exclude from git. + +### Rejected Approaches + +| Rejected option | Why rejected | Scope | +|-----------------|-------------|-------| +| Random action inner loop | Produces meaningless reward signal β€” cannot optimize for trained driving | project | +| sklearn GP | Adds sklearn dependency, compatibility issues found previously | project | +| DQN for continuous actions | DQN requires discretized actions, PPO handles continuous natively | project | +| Grid sweep as primary search | Fixed grid misses best regions; GP+UCB finds n_steer=8, n_throttle=5 which was not in grid | project | +| 100/200 trial arbitrary batches | No principled stopping criterion; should use convergence detection instead | project | +| model.save() from legacy training function | Was undefined β€” caused NameError crash on every run for entire history | project | + +--- + +## 7. Phasing + +### Phase 1: Real Training Foundation (CURRENT β€” implement first) +Core goal: make the inner loop actually train and save models. +- [ ] Rebuild `donkeycar_sb3_runner.py` with real PPO/DQN training + save +- [ ] Add speed-aware reward shaping wrapper +- [ ] Add proper `evaluate_policy()` evaluation +- [ ] Fix autoresearch controller to pass `learning_rate` to runner +- [ ] Add champion model tracking +- [ ] Write tests for all core logic +- [ ] Re-run autoresearch with real training (50 trials minimum) + +### Phase 2: Generalization (after Phase 1 champion exists) +Core goal: the champion model drives ANY track. +- [ ] Multi-track evaluation script +- [ ] Curriculum learning: train on 2+ tracks +- [ ] Domain randomization wrapper +- [ ] Convergence detection in autoresearch (stop when GP uncertainty collapses) +- [ ] Automatic Gitea push every N trials + +### Phase 3: Racing (after Phase 2 β€” generalization proven) +Core goal: fastest possible lap times. 
+- [ ] Lap time measurement and logging +- [ ] Reward function tuned for pure speed (with safety constraints) +- [ ] Fine-tuning from champion checkpoint on new tracks +- [ ] Head-to-head comparison: autoresearch champion vs human-tuned config +- [ ] Research paper / writeup structure + +--- + +## 8. Reference Materials + +### External Docs +- DonkeyCar Gym: https://github.com/tawnkramer/gym-donkeycar +- Stable-Baselines3: https://stable-baselines3.readthedocs.io/ +- Gymnasium migration: https://gymnasium.farama.org/introduction/migration_guide/ + +### Existing Code to Learn From +- `agent/discretize_action.py` β€” action space wrapper (working, tested in production) +- `agent/autoresearch_controller.py` β€” GP+UCB loop (working, needs inner loop fix) +- `agent/outerloop-results/clean_sweep_results.jsonl` β€” 18 records of base data +- `agent/outerloop-results/autoresearch_results.jsonl` β€” 300 trial records (random policy β€” useful for discretization insights, NOT for learning_rate tuning) + +### Anti-patterns (DO NOT REPEAT) +- Calling `model.save()` before `model` is defined β€” crashes with NameError +- Using `env.action_space.sample()` in the "training" loop β€” this is random, not RL +- Ignoring the `learning_rate` argument in the runner (was passed but unused for 300 trials) +- Arbitrary trial count limits β€” use convergence detection instead +- Not calling `env.close()` β€” causes simulator zombie/hang + +--- + +## 9. Evaluation Design + +### RL Eval Approach + +Unlike software unit tests, RL reward is stochastic. Evaluation strategy: +- Run N_EVAL_EPISODES per trial (default 5) +- Record mean Β± std reward +- Champion = highest mean reward across all trials +- Convergence = GP uncertainty (sigma) drops below threshold across all candidates + +### Test Cases (Simulator-Free) + +#### TC-001: Action Space Encoding +**Input:** n_steer=5, n_throttle=3 β†’ action index 7 +**Expected:** Decoded to approximately (steer=0.0, throttle=0.5) +**Verification:** `pytest tests/test_discretize_action.py::test_decode_action` + +#### TC-002: GP Fit and UCB Proposal +**Input:** 18 data points from clean_sweep_results.jsonl +**Expected:** GP proposes params with n_steer ∈ [6,9] and lr ∈ [0.001, 0.004] (the high-reward region identified in 300 trials) +**Verification:** `pytest tests/test_autoresearch_controller.py::test_ucb_proposal_in_high_reward_region` + +#### TC-003: Param Encoding Round-Trip +**Input:** `{'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002}` +**Expected:** encode β†’ decode round-trip reproduces exact values (within int rounding) +**Verification:** `pytest tests/test_autoresearch_controller.py::test_param_roundtrip` + +#### TC-004: Champion Tracking +**Input:** Trial sequence with rewards [50, 80, 60, 90, 70] +**Expected:** Champion is updated at trials 1, 2, 4 (rewards 50, 80, 90) +**Verification:** `pytest tests/test_autoresearch_controller.py::test_champion_tracking` + +#### TC-005: Runner Exits Cleanly +**Input:** Mocked gym environment, 100 timesteps, PPO +**Expected:** Runner completes, calls env.close(), exits with code 0, model.zip exists +**Verification:** `pytest tests/test_runner_integration.py::test_runner_exits_cleanly` + +### Regression Baselines +Saved after Phase 1 completion: +- `best_params_after_300_random_trials.json` β€” discretization insight baseline +- `champion_reward_phase1.txt` β€” first real training champion reward diff --git a/README.md b/README.md new file mode 100644 index 0000000..0d74c72 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# 
donkeycar-rl-autoresearch + +## Purpose + + +## Status +- Scaffolded with the agent harness +- Spec not filled yet + +## Runbook +- Fill PROJECT-SPEC.md +- Create IMPLEMENTATION_PLAN.md from the spec +- Start the implementation loop diff --git a/agent/autoresearch_controller.py b/agent/autoresearch_controller.py index a1fb9a7..42c61e3 100644 --- a/agent/autoresearch_controller.py +++ b/agent/autoresearch_controller.py @@ -1,21 +1,24 @@ """ ============================================================= -DonkeyCar RL Autoresearch Controller -Karpathy-style meta-agent that: - 1. Loads base sweep data - 2. Builds a surrogate model (Gaussian Process) of reward landscape - 3. Uses Upper Confidence Bound (UCB) acquisition to propose next params - 4. Launches RL jobs via robust runner - 5. Records results and iterates autonomously +DonkeyCar RL Autoresearch Controller β€” Phase 1 (Real Training) ============================================================= +Uses Gaussian Process + UCB Bayesian optimization to propose +hyperparameters for REAL PPO/DQN training runs (not random policy). + +Each trial: + 1. GP+UCB proposes next hyperparameters + 2. Launches donkeycar_sb3_runner.py with REAL training + 3. Runner saves a trained model to disk + 4. Controller records result, updates GP, tracks champion + 5. Repeat + +Results go to: outerloop-results/autoresearch_results_phase1.jsonl +Champion: models/champion/model.zip + manifest.json + Usage: - python3 autoresearch_controller.py [--trials N] [--explore K] + python3 autoresearch_controller.py --trials 50 --explore 2.0 --push-every 10 -All results are appended to: - outerloop-results/autoresearch_results.jsonl - outerloop-results/autoresearch_log.txt - -Stop at any time with Ctrl+C. Restart and it picks up from existing data. +Stop at any time with Ctrl+C. Restart and it picks up from existing results. ============================================================= """ @@ -24,72 +27,76 @@ import sys import json import time import subprocess -import itertools import re +import shutil import numpy as np from datetime import datetime -# ---- Paths ---- +# ---- Project Paths ---- PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) RUNNER_SCRIPT = os.path.join(PROJECT_DIR, 'donkeycar_sb3_runner.py') RESULTS_DIR = os.path.join(PROJECT_DIR, 'outerloop-results') +MODELS_DIR = os.path.join(PROJECT_DIR, 'models') +CHAMPION_DIR = os.path.join(MODELS_DIR, 'champion') + +# Phase 1 uses a separate results file β€” do NOT mix with random-policy data +PHASE1_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results_phase1.jsonl') +PHASE1_LOG = os.path.join(RESULTS_DIR, 'autoresearch_phase1_log.txt') + +# Legacy base data (discretization insights, valid for n_steer/n_throttle) BASE_DATA_FILE = os.path.join(RESULTS_DIR, 'clean_sweep_results.jsonl') -AUTORESEARCH_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results.jsonl') -AUTORESEARCH_LOG = os.path.join(RESULTS_DIR, 'autoresearch_log.txt') os.makedirs(RESULTS_DIR, exist_ok=True) +os.makedirs(MODELS_DIR, exist_ok=True) +os.makedirs(CHAMPION_DIR, exist_ok=True) -# ---- Parameter Space Definition ---- -# These define the bounds for the autoresearch to explore. -# Autoresearch can propose any value within these continuous ranges. 
+# ---- Parameter Space ---- +# These are the parameters GP+UCB will optimize PARAM_SPACE = { - 'n_steer': {'type': 'int', 'min': 3, 'max': 9}, - 'n_throttle': {'type': 'int', 'min': 2, 'max': 5}, - 'learning_rate': {'type': 'float', 'min': 0.00005,'max': 0.005}, + 'n_steer': {'type': 'int', 'min': 3, 'max': 9}, + 'n_throttle': {'type': 'int', 'min': 2, 'max': 5}, + 'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005}, + 'timesteps': {'type': 'int', 'min': 5000, 'max': 30000}, } +PARAM_KEYS = list(PARAM_SPACE.keys()) -# Fixed params for all runs +# Fixed params FIXED_PARAMS = { - 'timesteps': 2000, - 'eval_episodes': 3, + 'agent': 'ppo', + 'eval_episodes': 5, + 'reward_shaping': True, } -# How many candidate proposals to sample when searching for next best N_CANDIDATES = 500 - -# UCB exploration constant (higher = more exploration) UCB_KAPPA = 2.0 - -# Job timeout seconds -JOB_TIMEOUT = 360 +MIN_TRIALS_BEFORE_GP = 3 +JOB_TIMEOUT = 600 # 10 minutes per trial (real training takes longer) # ---- Logging ---- def log(msg): ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S') line = f'[{ts}] {msg}' print(line, flush=True) - with open(AUTORESEARCH_LOG, 'a') as f: + with open(PHASE1_LOG, 'a') as f: f.write(line + '\n') -# ---- Parameter Encoding (for surrogate model) ---- -PARAM_KEYS = list(PARAM_SPACE.keys()) - +# ---- Parameter Encoding ---- def encode_params(params): - """Encode a params dict into a normalized numpy vector [0,1] for the GP.""" vec = [] for k in PARAM_KEYS: + if k not in params: + continue spec = PARAM_SPACE[k] v = params[k] norm = (v - spec['min']) / (spec['max'] - spec['min']) - vec.append(norm) + vec.append(np.clip(norm, 0.0, 1.0)) return np.array(vec) def decode_params(vec): - """Decode a normalized numpy vector back to a params dict.""" params = {} for i, k in enumerate(PARAM_KEYS): spec = PARAM_SPACE[k] - v = vec[i] * (spec['max'] - spec['min']) + spec['min'] + v = float(vec[i]) * (spec['max'] - spec['min']) + spec['min'] if spec['type'] == 'int': v = int(round(v)) v = max(spec['min'], min(spec['max'], v)) @@ -100,238 +107,301 @@ def decode_params(vec): return params def random_candidate(): - """Sample a random candidate in the parameter space.""" - vec = np.random.uniform(0, 1, len(PARAM_KEYS)) - return vec + return np.random.uniform(0, 1, len(PARAM_KEYS)) -# ---- Gaussian Process Surrogate Model (pure numpy, no sklearn needed) ---- +# ---- Gaussian Process Surrogate Model ---- class TinyGP: - """ - Minimal Gaussian Process regressor (RBF kernel) for surrogate modelling. - Predicts mean and std of reward for any parameter vector. 
- """ + """Minimal RBF-kernel Gaussian Process for surrogate modelling.""" + def __init__(self, length_scale=0.3, noise=1e-3): self.ls = length_scale self.noise = noise self.X = None - self.y = None + self.alpha = None self.K_inv = None def _rbf(self, X1, X2): - """RBF kernel matrix between X1 and X2.""" diff = X1[:, np.newaxis, :] - X2[np.newaxis, :, :] - sq = np.sum(diff**2, axis=-1) - return np.exp(-sq / (2 * self.ls**2)) + sq = np.sum(diff ** 2, axis=-1) + return np.exp(-sq / (2 * self.ls ** 2)) def fit(self, X, y): self.X = np.array(X) - self.y = np.array(y) n = len(y) K = self._rbf(self.X, self.X) + self.noise * np.eye(n) try: self.K_inv = np.linalg.inv(K) except np.linalg.LinAlgError: self.K_inv = np.linalg.pinv(K) - self.alpha = self.K_inv @ self.y + self.alpha = self.K_inv @ np.array(y) def predict(self, X_new): - """Returns (mean, std) arrays for each row in X_new.""" X_new = np.atleast_2d(X_new) K_s = self._rbf(X_new, self.X) - mean = K_s @ self.alpha - K_ss = np.ones(len(X_new)) + self.noise - var = K_ss - np.sum((K_s @ self.K_inv) * K_s, axis=1) - var = np.maximum(var, 1e-9) - return mean, np.sqrt(var) + mu = K_s @ self.alpha + var = np.maximum( + 1.0 + self.noise - np.sum((K_s @ self.K_inv) * K_s, axis=1), + 1e-9 + ) + return mu, np.sqrt(var) -# ---- Load All Available Data (base sweep + autoresearch results) ---- -def load_all_results(): - """Load all param-reward pairs from base sweep and any autoresearch runs.""" +# ---- Champion Tracker ---- +class ChampionTracker: + def __init__(self, champion_dir): + self.champion_dir = champion_dir + self.manifest_path = os.path.join(champion_dir, 'manifest.json') + os.makedirs(champion_dir, exist_ok=True) + self._best = self._load() + + def _load(self): + if os.path.exists(self.manifest_path): + try: + with open(self.manifest_path) as f: + return json.load(f) + except Exception: + pass + return {'mean_reward': float('-inf'), 'trial': None} + + @property + def best_reward(self): + return self._best.get('mean_reward', float('-inf')) + + def update_if_better(self, mean_reward, params, model_zip_path, trial): + if mean_reward is None or mean_reward <= self.best_reward: + return False + dest = os.path.join(self.champion_dir, 'model.zip') + if model_zip_path and os.path.exists(model_zip_path): + try: + shutil.copy2(model_zip_path, dest) + except Exception as e: + log(f'[Champion] WARNING: Could not copy model: {e}') + dest = model_zip_path + manifest = { + 'trial': trial, + 'timestamp': datetime.now().isoformat(), + 'params': params, + 'mean_reward': mean_reward, + 'model_path': dest, + } + with open(self.manifest_path, 'w') as f: + json.dump(manifest, f, indent=2) + self._best = manifest + log(f'[Champion] πŸ† NEW BEST! Trial {trial}: mean_reward={mean_reward:.4f} params={params}') + return True + + def summary(self): + if self._best['trial'] is None: + return 'No champion yet.' 
+ return f"Champion: trial={self._best['trial']} mean_reward={self._best['mean_reward']:.4f} params={self._best['params']}" + +# ---- Load Results ---- +def load_phase1_results(): + """Load Phase 1 results only β€” no random-policy contamination.""" results = [] - for fpath in [BASE_DATA_FILE, AUTORESEARCH_RESULTS]: - if not os.path.exists(fpath): - continue - with open(fpath) as f: - for line in f: - line = line.strip() - if not line: - continue - try: - rec = json.loads(line) - mr = rec.get('mean_reward') - if mr is not None: - results.append({'params': rec['params'], 'mean_reward': float(mr)}) - except Exception: - pass + if not os.path.exists(PHASE1_RESULTS): + return results + with open(PHASE1_RESULTS) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + rec = json.loads(line) + mr = rec.get('mean_reward') + if mr is not None: + results.append({'params': rec['params'], 'mean_reward': float(mr)}) + except Exception: + pass return results -# ---- UCB Acquisition: Propose Next Best Parameters ---- -def propose_next_params(results, n_candidates=N_CANDIDATES, kappa=UCB_KAPPA): - """ - Fit GP on existing results, then maximize UCB acquisition - over random candidate samples to propose the next params to try. - Returns: proposed params dict - """ - if len(results) < 2: - log('[AutoResearch] Not enough data for GP yet, using random proposal.') +# ---- GP+UCB Proposal ---- +def propose_next_params(results, trial_num, kappa=UCB_KAPPA): + if len(results) < MIN_TRIALS_BEFORE_GP: + log(f'[AutoResearch] Only {len(results)} results β€” using random proposal.') return decode_params(random_candidate()) X = np.array([encode_params(r['params']) for r in results]) y = np.array([r['mean_reward'] for r in results]) - - # Normalize y for numerical stability - y_mean = y.mean() - y_std = y.std() if y.std() > 0 else 1.0 + y_mean, y_std = y.mean(), y.std() if y.std() > 0 else 1.0 y_norm = (y - y_mean) / y_std gp = TinyGP(length_scale=0.3, noise=1e-3) gp.fit(X, y_norm) - # Sample candidates - candidates = np.random.uniform(0, 1, (n_candidates, len(PARAM_KEYS))) - - # Compute UCB acquisition + candidates = np.random.uniform(0, 1, (N_CANDIDATES, len(PARAM_KEYS))) mu, sigma = gp.predict(candidates) ucb = mu + kappa * sigma - best_idx = np.argmax(ucb) - best_vec = candidates[best_idx] - proposed = decode_params(best_vec) - - # Log the GP's top predictions - top5_idx = np.argsort(ucb)[-5:][::-1] + top5 = np.argsort(ucb)[-5:][::-1] log(f'[AutoResearch] GP UCB top-5 candidates:') - for idx in top5_idx: + for idx in top5: p = decode_params(candidates[idx]) log(f' UCB={ucb[idx]:.4f} mu={mu[idx]:.4f} sigma={sigma[idx]:.4f} params={p}') - return proposed + return decode_params(candidates[np.argmax(ucb)]) -# ---- Kill Stale Jobs ---- +# ---- Job Launcher ---- def kill_stale(): subprocess.run(['pkill', '-9', '-f', 'donkeycar_sb3_runner.py'], check=False) time.sleep(2) -# ---- Launch RL Job with Proposed Params ---- -def launch_job(params): - """Launch a single RL runner job and return (mean_reward, output, status).""" +def launch_job(params, trial_num): + save_dir = os.path.join(MODELS_DIR, f'trial-{trial_num:04d}') + os.makedirs(save_dir, exist_ok=True) + cmd = [ 'python3', RUNNER_SCRIPT, - '--agent', 'dqn', + '--agent', params.get('agent', FIXED_PARAMS['agent']), '--env', 'donkey-generated-roads-v0', - '--timesteps', str(params.get('timesteps', FIXED_PARAMS['timesteps'])), - '--eval-episodes', str(params.get('eval_episodes', FIXED_PARAMS['eval_episodes'])), - '--n-steer', str(params['n_steer']), 
- '--n-throttle', str(params['n_throttle']), + '--timesteps', str(int(params.get('timesteps', 10000))), + '--eval-episodes', str(FIXED_PARAMS['eval_episodes']), + '--learning-rate', str(params.get('learning_rate', 0.0003)), + '--n-steer', str(int(params.get('n_steer', 7))), + '--n-throttle', str(int(params.get('n_throttle', 3))), + '--save-dir', save_dir, ] - log(f'[AutoResearch] Launching job: n_steer={params["n_steer"]} n_throttle={params["n_throttle"]} lr={params["learning_rate"]:.6f}') + if FIXED_PARAMS.get('reward_shaping'): + cmd.append('--reward-shaping') + + log(f'[AutoResearch] Launching trial {trial_num}: {params}') start = time.time() try: proc = subprocess.run(cmd, capture_output=True, text=True, timeout=JOB_TIMEOUT) elapsed = time.time() - start output = proc.stdout + '\n' + proc.stderr status = 'ok' if proc.returncode == 0 else 'error' - log(f'[AutoResearch] Job finished in {elapsed:.1f}s, returncode={proc.returncode}') + log(f'[AutoResearch] Trial {trial_num} finished in {elapsed:.1f}s, returncode={proc.returncode}') except subprocess.TimeoutExpired as e: elapsed = time.time() - start output = f'[TIMEOUT after {elapsed:.1f}s]' status = 'timeout' - log(f'[AutoResearch] Job TIMED OUT after {elapsed:.1f}s') + log(f'[AutoResearch] Trial {trial_num} TIMED OUT after {elapsed:.1f}s') - # Parse mean_reward from output + # Print last 2000 chars of output + print('--- Runner Output (tail) ---', flush=True) + print(output[-2000:], flush=True) + print('--- End Runner Output ---', flush=True) + + # Parse results mean_reward = None + std_reward = None m = re.search(r'\[SB3 Runner\]\[TEST\] mean_reward=([\d.]+)', output) if m: mean_reward = float(m.group(1)) - log(f'[AutoResearch] mean_reward={mean_reward}') + m = re.search(r'\[SB3 Runner\]\[TEST\] std_reward=([\d.]+)', output) + if m: + std_reward = float(m.group(1)) - # Print full runner output for transparency - print('--- Runner Output ---') - print(output[-3000:]) # last 3000 chars - print('--- End Runner Output ---') + log(f'[AutoResearch] Trial {trial_num}: mean_reward={mean_reward} std_reward={std_reward}') - return mean_reward, output, status, elapsed + model_zip = os.path.join(save_dir, 'model.zip') + if not os.path.exists(model_zip): + model_zip = None -# ---- Save Result ---- -def save_result(trial, params, mean_reward, status, elapsed): + return mean_reward, std_reward, model_zip, output, status, elapsed, save_dir + +# ---- Result Saving ---- +def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed): rec = { 'trial': trial, 'timestamp': datetime.now().isoformat(), 'params': params, 'mean_reward': mean_reward, + 'std_reward': std_reward, + 'model_path': model_path, + 'champion': champion, 'run_status': status, 'elapsed_sec': elapsed, } - with open(AUTORESEARCH_RESULTS, 'a') as f: + with open(PHASE1_RESULTS, 'a') as f: f.write(json.dumps(rec) + '\n') -# ---- Print Current Best ---- -def print_summary(results, trial): +# ---- Git Push ---- +def git_push(project_root, trial_num): + try: + repo_root = os.path.dirname(PROJECT_DIR) + subprocess.run(['git', '-C', repo_root, 'add', '-A'], check=True, capture_output=True) + subprocess.run([ + 'git', '-C', repo_root, 'commit', '-m', + f'autoresearch: phase1 trial {trial_num} results\n\nAgent: pi\nTests: N/A\nTests-Added: 0\nTypeScript: N/A' + ], check=True, capture_output=True) + subprocess.run(['git', '-C', repo_root, 'push'], check=True, capture_output=True) + log(f'[AutoResearch] Git push complete after trial {trial_num}') + except 
subprocess.CalledProcessError as e: + log(f'[AutoResearch] Git push failed: {e}') + +# ---- Summary ---- +def print_summary(results, champion, trial): if not results: return - best = max(results, key=lambda r: r['mean_reward']) log(f'[AutoResearch] === Trial {trial} Summary ===') - log(f' Total runs in history: {len(results)}') - log(f' Best so far: mean_reward={best["mean_reward"]:.4f} params={best["params"]}') - # Top 5 + log(f' Total Phase 1 runs: {len(results)}') + log(f' {champion.summary()}') sorted_r = sorted(results, key=lambda r: r['mean_reward'], reverse=True) - log(f' Top 5 results:') + log(f' Top 5:') for r in sorted_r[:5]: log(f' mean_reward={r["mean_reward"]:.4f} params={r["params"]}') -# ---- Main Autoresearch Loop ---- -def run_autoresearch(max_trials=100): +# ---- Main Loop ---- +def run_autoresearch(max_trials=50, kappa=UCB_KAPPA, push_every=10): log('=' * 60) - log('[AutoResearch] Starting Karpathy-style autoresearch controller') - log(f'[AutoResearch] Max trials: {max_trials}') - log(f'[AutoResearch] Runner: {RUNNER_SCRIPT}') - log(f'[AutoResearch] Results: {AUTORESEARCH_RESULTS}') + log('[AutoResearch] Phase 1 β€” Real PPO Training + GP+UCB Optimization') + log(f'[AutoResearch] Max trials: {max_trials} | kappa: {kappa} | push every: {push_every}') + log(f'[AutoResearch] Results: {PHASE1_RESULTS}') + log(f'[AutoResearch] Champion: {CHAMPION_DIR}') log('=' * 60) - # Load all existing data (base sweep + prior autoresearch runs) - results = load_all_results() - log(f'[AutoResearch] Loaded {len(results)} existing result(s) from base sweep + history.') - print_summary(results, trial=0) + results = load_phase1_results() + champion = ChampionTracker(CHAMPION_DIR) + log(f'[AutoResearch] Loaded {len(results)} existing Phase 1 results.') + log(f'[AutoResearch] {champion.summary()}') for trial in range(1, max_trials + 1): log(f'\n[AutoResearch] ========== Trial {trial}/{max_trials} ==========') - # 1. Propose next params using GP+UCB - proposed = propose_next_params(results) + # 1. Propose params + proposed = propose_next_params(results, trial, kappa=kappa) full_params = {**proposed, **FIXED_PARAMS} - log(f'[AutoResearch] Proposed params: {full_params}') + log(f'[AutoResearch] Proposed: {full_params}') - # 2. Kill any stale jobs + # 2. Kill stale jobs kill_stale() - # 3. Launch job - mean_reward, output, status, elapsed = launch_job(full_params) + # 3. Launch real training job + mean_reward, std_reward, model_zip, output, status, elapsed, save_dir = launch_job(full_params, trial) - # 4. Save result - save_result(trial, full_params, mean_reward, status, elapsed) + # 4. Update champion + is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial) - # 5. If we got a valid reward, add to results for next GP fit + # 5. Save result + save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed) + + # 6. Add to GP data (only successful runs with valid reward) if mean_reward is not None: results.append({'params': full_params, 'mean_reward': mean_reward}) - else: - log(f'[AutoResearch] WARNING: No valid mean_reward from this trial.') - # 6. Print running summary - print_summary(results, trial) + # 7. Print summary + print_summary(results, champion, trial) + + # 8. Git push periodically + if push_every > 0 and trial % push_every == 0: + git_push(PROJECT_DIR, trial) - # 7. 
Brief pause between trials time.sleep(2) log('[AutoResearch] All trials complete!') - print_summary(results, trial=max_trials) + print_summary(results, champion, trial=max_trials) + + # Final push + git_push(PROJECT_DIR, max_trials) + -# ---- Entry Point ---- if __name__ == '__main__': import argparse - parser = argparse.ArgumentParser(description='Karpathy-style autoresearch controller for DonkeyCar RL.') - parser.add_argument('--trials', type=int, default=100, help='Number of autoresearch trials to run (default: 100)') - parser.add_argument('--explore', type=float, default=2.0, help='UCB exploration constant kappa (default: 2.0, higher=more explore)') + parser = argparse.ArgumentParser(description='Phase 1 Autoresearch: Real PPO training + GP+UCB.') + parser.add_argument('--trials', type=int, default=50, help='Number of trials (default: 50)') + parser.add_argument('--explore', type=float, default=2.0, help='UCB kappa (default: 2.0)') + parser.add_argument('--push-every', type=int, default=10, help='Git push every N trials (0=disabled)') args = parser.parse_args() - UCB_KAPPA = args.explore - run_autoresearch(max_trials=args.trials) + run_autoresearch(max_trials=args.trials, kappa=args.explore, push_every=args.push_every) diff --git a/agent/champion_tracker.py b/agent/champion_tracker.py new file mode 100644 index 0000000..bae6982 --- /dev/null +++ b/agent/champion_tracker.py @@ -0,0 +1,77 @@ +""" +Champion Model Tracker +====================== +Maintains the best-performing model across all autoresearch trials. +Saves champion model + manifest when a new best is found. +""" + +import json +import os +import shutil +import time +from datetime import datetime + + +class ChampionTracker: + """Track and save the best RL model found across all autoresearch trials.""" + + def __init__(self, champion_dir: str): + self.champion_dir = champion_dir + self.manifest_path = os.path.join(champion_dir, 'manifest.json') + os.makedirs(champion_dir, exist_ok=True) + self._current_best = self._load_manifest() + + def _load_manifest(self) -> dict: + """Load existing champion manifest if it exists.""" + if os.path.exists(self.manifest_path): + try: + with open(self.manifest_path) as f: + return json.load(f) + except Exception: + pass + return {'mean_reward': float('-inf'), 'trial': None} + + @property + def best_reward(self) -> float: + return self._current_best.get('mean_reward', float('-inf')) + + def update_if_better(self, mean_reward: float, params: dict, model_path: str, trial: int) -> bool: + """ + If mean_reward > current best, copy model to champion dir and update manifest. + Returns True if champion was updated. + """ + if mean_reward <= self.best_reward: + return False + + # Copy model to champion dir + champion_model_path = os.path.join(self.champion_dir, 'model.zip') + if model_path and os.path.exists(model_path): + try: + shutil.copy2(model_path, champion_model_path) + except Exception as e: + print(f'[Champion] WARNING: Could not copy model: {e}', flush=True) + champion_model_path = model_path # Fall back to original path + + # Update manifest + manifest = { + 'trial': trial, + 'timestamp': datetime.now().isoformat(), + 'params': params, + 'mean_reward': mean_reward, + 'model_path': champion_model_path, + } + with open(self.manifest_path, 'w') as f: + json.dump(manifest, f, indent=2) + + self._current_best = manifest + print(f'[Champion] πŸ† NEW BEST! 
Trial {trial}: mean_reward={mean_reward:.4f} params={params}', flush=True) + return True + + def summary(self) -> str: + if self._current_best['trial'] is None: + return 'No champion yet.' + return ( + f"Champion: trial={self._current_best['trial']} " + f"mean_reward={self._current_best['mean_reward']:.4f} " + f"params={self._current_best['params']}" + ) diff --git a/agent/donkeycar_sb3_runner.py b/agent/donkeycar_sb3_runner.py index 6431de2..2943c4c 100644 --- a/agent/donkeycar_sb3_runner.py +++ b/agent/donkeycar_sb3_runner.py @@ -1,79 +1,189 @@ +""" +DonkeyCar RL Runner β€” Real Training Edition +============================================ +Trains a PPO or DQN model using Stable-Baselines3, evaluates with evaluate_policy(), +saves the model to disk, and exits cleanly. + +Usage: + python3 donkeycar_sb3_runner.py \ + --agent ppo \ + --env donkey-generated-roads-v0 \ + --timesteps 10000 \ + --eval-episodes 5 \ + --learning-rate 0.0003 \ + --save-dir agent/models/trial-0001 \ + --n-steer 7 \ + --n-throttle 3 \ + --reward-shaping \ + --seed 42 + +Exit codes: + 0 β€” success, model saved, evaluation complete + 100 β€” failed to connect to simulator + 101 β€” training failed + 102 β€” evaluation failed +""" + import argparse -import gymnasium as gym -import gym_donkeycar -import argparse -import gymnasium as gym -import gym_donkeycar +import os import sys import time +import numpy as np + +import gymnasium as gym +import gym_donkeycar + +from stable_baselines3 import PPO, DQN +from stable_baselines3.common.evaluation import evaluate_policy + from discretize_action import DiscretizedActionWrapper -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run multi-episode RL test loop for DonkeyCar Gym. No model training/saving.") - parser.add_argument('--agent', type=str, default='dqn', help='RL agent type (only dqn supported in this runner)') - parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym/Gymnasium env ID') - parser.add_argument('--timesteps', type=int, default=5000, help='Unused (for outer loop compatibility)') - parser.add_argument('--eval-episodes', type=int, default=10, help='Episodes for evaluation') - parser.add_argument('--log-dir', type=str, default=None, help='Unused (kept for arg compatibility)') - parser.add_argument('--seed', type=int, default=None, help='Optional seed') - parser.add_argument('--n-steer', type=int, default=3, help='Number of steer bins (DQN only)') - parser.add_argument('--n-throttle', type=int, default=3, help='Number of throttle bins (DQN only)') - args = parser.parse_args() +# Optional reward shaping β€” imported only if available +try: + from reward_wrapper import SpeedRewardWrapper + REWARD_WRAPPER_AVAILABLE = True +except ImportError: + REWARD_WRAPPER_AVAILABLE = False - print('[SB3 Runner] Starting: Connecting to sim…', flush=True) - try: - env = gym.make(args.env) - print(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}', flush=True) - except Exception as e: - print(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {str(e)}', flush=True) - sys.exit(100) - if args.agent == 'dqn': - env = DiscretizedActionWrapper(env, n_steer=args.n_steer, n_throttle=args.n_throttle) - print(f'[SB3 Runner][MONITOR] Action discretization: steer={args.n_steer}, throttle={args.n_throttle}. 
{time.ctime()}', flush=True) - EPISODES = args.eval_episodes - try: - ep_rewards = [] - for episode in range(EPISODES): - ep_reward = 0.0 - if args.seed is not None: - obs = env.reset(seed=args.seed) - else: - obs = env.reset() - print(f'[SB3 Runner][TEST] Episode {episode+1}/{EPISODES} - reset at {time.ctime()}', flush=True) - done = False - t = 0 - while not done: - action = env.action_space.sample() - result = env.step(action) - if len(result) in (4, 5): - if len(result) == 4: - obs, reward, done, info = result - else: - obs, reward, done, truncated, info = result - done = done or truncated - else: - print('[SB3 Runner][MONITOR] UNEXPECTED step() result shape!', flush=True) - break - ep_reward += reward - t += 1 - if t % 10 == 0 or done: - print(f'[SB3 Runner][TEST] Step {t} done={done} reward={reward} {time.ctime()}', flush=True) - if done: - print(f'[SB3 Runner][TEST] Episode {episode+1} ended after {t} steps, total_reward={ep_reward} at {time.ctime()}', flush=True) - break - ep_rewards.append(ep_reward) - print(f'[SB3 Runner][TEST] All episode rewards: {ep_rewards}', flush=True) - if len(ep_rewards) > 0: - print(f'[SB3 Runner][TEST] mean_reward={sum(ep_rewards)/len(ep_rewards):.4f}', flush=True) - except Exception as e: - print(f'[SB3 Runner][MONITOR ALERT] Exception during episodes: {str(e)} {time.ctime()}', flush=True) - sys.exit(102) - print(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}', flush=True) + +def log(msg): + print(msg, flush=True) + + +def make_env(env_id, agent, n_steer, n_throttle, reward_shaping): + """Create and wrap the gym environment.""" + env = gym.make(env_id) + + if agent == 'dqn': + env = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle) + log(f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, throttle={n_throttle}. {time.ctime()}') + + if reward_shaping: + if REWARD_WRAPPER_AVAILABLE: + env = SpeedRewardWrapper(env) + log(f'[SB3 Runner][MONITOR] Speed reward shaping ENABLED. {time.ctime()}') + else: + log(f'[SB3 Runner][MONITOR] WARNING: reward_wrapper.py not found β€” reward shaping disabled. {time.ctime()}') + + return env + + +def train_model(agent, env, learning_rate, timesteps, seed): + """Train a PPO or DQN model and return it.""" + if agent == 'ppo': + model = PPO( + 'CnnPolicy', + env, + learning_rate=learning_rate, + verbose=1, + seed=seed, + ) + elif agent == 'dqn': + model = DQN( + 'CnnPolicy', + env, + learning_rate=learning_rate, + verbose=1, + seed=seed, + ) + else: + raise ValueError(f'Unknown agent: {agent}. Use ppo or dqn.') + + log(f'[SB3 Runner][MONITOR] Starting training: agent={agent} timesteps={timesteps} lr={learning_rate} {time.ctime()}') + start = time.time() + model.learn(total_timesteps=timesteps) + elapsed = time.time() - start + log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}') + return model + + +def evaluate_model(model, env, eval_episodes): + """Evaluate the model using SB3 evaluate_policy and print per-episode detail.""" + log(f'[SB3 Runner][MONITOR] Evaluating model for {eval_episodes} episodes. 
{time.ctime()}') + mean_reward, std_reward = evaluate_policy( + model, + env, + n_eval_episodes=eval_episodes, + return_episode_rewards=False, + deterministic=True, + ) + log(f'[SB3 Runner][TEST] mean_reward={mean_reward:.4f}') + log(f'[SB3 Runner][TEST] std_reward={std_reward:.4f}') + return mean_reward, std_reward + + +def save_model(model, save_dir): + """Save the model to save_dir/model.zip.""" + os.makedirs(save_dir, exist_ok=True) + save_path = os.path.join(save_dir, 'model') + model.save(save_path) + log(f'[SB3 Runner][MONITOR] Model saved to {save_path}.zip {time.ctime()}') + return save_path + '.zip' + + +def teardown(env): + """Close environment cleanly with race avoidance sleep.""" + log(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}') try: env.close() - print(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}', flush=True) + log(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}') except Exception as e: - print(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {str(e)} {time.ctime()}', flush=True) - print(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}', flush=True) + log(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {e} {time.ctime()}') + log(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}') time.sleep(2) - print(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}', flush=True) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train and evaluate an RL agent on DonkeyCar.') + parser.add_argument('--agent', type=str, default='ppo', choices=['ppo', 'dqn'], help='RL agent type') + parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym env ID') + parser.add_argument('--timesteps', type=int, default=10000, help='Training timesteps') + parser.add_argument('--eval-episodes', type=int, default=5, help='Evaluation episodes') + parser.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate') + parser.add_argument('--save-dir', type=str, default=None, help='Directory to save model') + parser.add_argument('--n-steer', type=int, default=7, help='Steer bins (DQN only)') + parser.add_argument('--n-throttle', type=int, default=3, help='Throttle bins (DQN only)') + parser.add_argument('--reward-shaping', action='store_true', help='Enable speed reward shaping') + parser.add_argument('--seed', type=int, default=None, help='Random seed') + args = parser.parse_args() + + log(f'[SB3 Runner] Starting: agent={args.agent} timesteps={args.timesteps} lr={args.learning_rate} {time.ctime()}') + + # --- 1. Connect to simulator --- + env = None + try: + env = make_env(args.env, args.agent, args.n_steer, args.n_throttle, args.reward_shaping) + log(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}') + except Exception as e: + log(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {e}') + sys.exit(100) + + # --- 2. Train model --- + model = None + try: + model = train_model(args.agent, env, args.learning_rate, args.timesteps, args.seed) + except Exception as e: + log(f'[SB3 Runner][MONITOR ALERT] Training failed: {e} {time.ctime()}') + teardown(env) + sys.exit(101) + + # --- 3. Save model --- + save_dir = args.save_dir or f'/tmp/donkeycar-trial-{int(time.time())}' + try: + saved_path = save_model(model, save_dir) + except Exception as e: + log(f'[SB3 Runner][MONITOR ALERT] Model save failed: {e} {time.ctime()}') + teardown(env) + sys.exit(101) + + # --- 4. 
Evaluate trained policy --- + try: + mean_reward, std_reward = evaluate_model(model, env, args.eval_episodes) + except Exception as e: + log(f'[SB3 Runner][MONITOR ALERT] Evaluation failed: {e} {time.ctime()}') + teardown(env) + sys.exit(102) + + # --- 5. Teardown --- + teardown(env) + log(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}') diff --git a/agent/outerloop-results/autoresearch_phase1_log.txt b/agent/outerloop-results/autoresearch_phase1_log.txt new file mode 100644 index 0000000..074f241 --- /dev/null +++ b/agent/outerloop-results/autoresearch_phase1_log.txt @@ -0,0 +1,26 @@ +[2026-04-13 10:00:54] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 10:00:54] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} +[2026-04-13 10:00:54] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} +[2026-04-13 10:00:54] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} +[2026-04-13 10:00:54] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} +[2026-04-13 10:00:54] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 10:00:54] [Champion] πŸ† NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 10:00:54] [AutoResearch] Only 1 results β€” using random proposal. +[2026-04-13 10:02:55] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 10:02:55] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} +[2026-04-13 10:02:55] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} +[2026-04-13 10:02:55] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} +[2026-04-13 10:02:55] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} +[2026-04-13 10:02:55] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 10:02:55] [Champion] πŸ† NEW BEST! 
Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 10:02:55] [AutoResearch] Only 1 results β€” using random proposal. diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py new file mode 100644 index 0000000..27295ea --- /dev/null +++ b/agent/reward_wrapper.py @@ -0,0 +1,71 @@ +""" +Speed-Aware Reward Wrapper for DonkeyCar RL +============================================ +Replaces the default CTE-only reward with: + reward = speed * (1.0 - min(abs(cte) / max_cte, 1.0)) + +Falls back to original reward if speed/cte not available in info dict. +""" + +import gymnasium as gym +import numpy as np + + +class SpeedRewardWrapper(gym.Wrapper): + """ + Replace DonkeyCar's default reward with a speed-aware version. + + Reward = speed * (1 - |cte| / max_cte) + - Maximum when car is fast AND centred on the track + - Zero when car is at max cross-track error + - Negative (crash penalty) preserved from original reward when episode ends with failure + """ + + def __init__(self, env, max_cte: float = 8.0, crash_penalty: float = -10.0): + super().__init__(env) + self.max_cte = max_cte + self.crash_penalty = crash_penalty + + def step(self, action): + result = self.env.step(action) + + # Handle both 4-tuple (old gym) and 5-tuple (gymnasium) APIs + if len(result) == 5: + obs, reward, terminated, truncated, info = result + done = terminated or truncated + elif len(result) == 4: + obs, reward, done, info = result + terminated = done + truncated = False + else: + raise ValueError(f'Unexpected step() result length: {len(result)}') + + # Shape the reward using speed and CTE from info + shaped = self._shape_reward(reward, done, info) + + if len(result) == 5: + return obs, shaped, terminated, truncated, info + else: + return obs, shaped, done, info + + def _shape_reward(self, original_reward: float, done: bool, info: dict) -> float: + """Compute speed-aware reward, falling back to original if info is unavailable.""" + try: + speed = float(info.get('speed', None)) + cte = float(info.get('cte', None)) + + if speed is None or cte is None: + return original_reward + + # Positive driving reward: fast + centred + shaped = speed * (1.0 - min(abs(cte) / self.max_cte, 1.0)) + + # Preserve crash penalty (original reward is -1 on crash in DonkeyCar) + if done and original_reward < 0: + shaped += self.crash_penalty + + return shaped + + except (TypeError, ValueError): + # info dict doesn't have speed/cte β€” fall back gracefully + return original_reward diff --git a/ralph-loop.sh b/ralph-loop.sh new file mode 100755 index 0000000..f8742ce --- /dev/null +++ b/ralph-loop.sh @@ -0,0 +1,570 @@ +#!/usr/bin/env bash +# +# Ralph Wiggum Loop β€” Script-Orchestrated Autonomous Agent Iteration +# +# This runtime is for the "script is the orchestrator" model: +# - The shell loop spawns a fresh agent every iteration +# - The shell loop interprets runtime signals and failures +# - The shell loop decides when to retry, stop, or wait for token reset +# +# This is different from the "agent is the orchestrator" model used in +# OpenClaw/manual orchestration, where a supervising agent evaluates results, +# watches execution boards, and decides what to do next. 
+# +# Usage: +# ./ralph-loop.sh # Build mode (default) +# ./ralph-loop.sh plan # Planning mode +# ./ralph-loop.sh --max 20 # Limit iterations +# ./ralph-loop.sh --agent claude # Use claude (default) +# ./ralph-loop.sh --session-ends 2026-04-09T16:00:00 +# ./ralph-loop.sh --retry-wait 1800 +# ./ralph-loop.sh --board .harness/foo/execution-board.md +# ./ralph-loop.sh --no-require-pro +# +# Token / rate-limit handling: +# Tier 1 β€” Anthropic API probe if ANTHROPIC_API_KEY is available +# Tier 2 β€” Parse "resets 11am (America/New_York)" from agent output +# Tier 3 β€” Use seeded --session-ends time +# Tier 4 β€” Fixed fallback sleep +# +set -uo pipefail + +MODE="build" +MAX_ITERATIONS=50 +AGENT="claude" +PLAN_FILE="IMPLEMENTATION_PLAN.md" +SPEC_FILE="PROJECT-SPEC.md" +AGENT_FILE="AGENT.md" +BOARD_FILE="" +LOG_DIR=".ralph-logs" +SESSION_TS="$(date '+%Y%m%dT%H%M%S')" +RATE_LIMIT_WAIT=1800 +SESSION_ENDS="" +REQUIRE_PRO=1 +AGENT_TIMEOUT_SECONDS="${AGENT_TIMEOUT_SECONDS:-900}" +CLAUDE_BIN="${CLAUDE_BIN:-}" + +while [[ $# -gt 0 ]]; do + case "$1" in + plan) MODE="plan"; shift ;; + build) MODE="build"; shift ;; + --max) MAX_ITERATIONS="$2"; shift 2 ;; + --agent) AGENT="$2"; shift 2 ;; + --retry-wait) RATE_LIMIT_WAIT="$2"; shift 2 ;; + --session-ends) SESSION_ENDS="$2"; shift 2 ;; + --board) BOARD_FILE="$2"; shift 2 ;; + --no-require-pro) REQUIRE_PRO=0; shift ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +mkdir -p "$LOG_DIR" + +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +log() { echo -e "${BLUE}[ralph]${NC} $1"; } +success() { echo -e "${GREEN}[ralph]${NC} $1"; } +warn() { echo -e "${YELLOW}[ralph]${NC} $1"; } +error() { echo -e "${RED}[ralph]${NC} $1"; } +info() { echo -e "${CYAN}[ralph]${NC} $1"; } + +AGENT_EXIT_CODE=0 + +resolve_claude_bin() { + if [[ -n "$CLAUDE_BIN" && -x "$CLAUDE_BIN" ]]; then + return 0 + fi + + CLAUDE_BIN=$(bash -ic 'command -v claude' 2>/dev/null | tail -n 1 || true) + if [[ -z "$CLAUDE_BIN" || ! -x "$CLAUDE_BIN" ]]; then + error "Could not resolve claude binary." + return 1 + fi +} + +get_claude_analysis_auth_json() { + env -u ANTHROPIC_API_KEY bash -ic 'claude auth status' 2>/dev/null | tail -n +1 +} + +verify_claude_pro_auth() { + local auth_json + auth_json=$(get_claude_analysis_auth_json) + if [[ -z "$auth_json" ]]; then + error "Could not determine Claude analysis auth status." 
+ return 1 + fi + + AUTH_JSON="$auth_json" python3 - <<'PY' +import json +import os +import sys + +data = json.loads(os.environ["AUTH_JSON"]) +if data.get("loggedIn") and data.get("subscriptionType") == "pro": + print("ok") + sys.exit(0) + +print(json.dumps(data, ensure_ascii=True)) +sys.exit(1) +PY +} + +log_agent_runtime() { + case "$AGENT" in + claude) + local claude_path claude_version auth_json + resolve_claude_bin || true + claude_path="${CLAUDE_BIN:-}" + claude_version=$("$claude_path" --version 2>/dev/null | tail -n 1 || true) + auth_json=$(get_claude_analysis_auth_json) + log "Claude binary: ${claude_path:-not found}" + log "Claude version: ${claude_version:-unknown}" + if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then + log "Claude auth hint: ANTHROPIC_API_KEY is set (API probe enabled)" + else + log "Claude auth hint: ANTHROPIC_API_KEY is not set" + fi + if [[ -n "$auth_json" ]]; then + log "Claude analysis auth: $(AUTH_JSON="$auth_json" python3 - <<'PY' +import json +import os + +data = json.loads(os.environ["AUTH_JSON"]) +print(f"authMethod={data.get('authMethod')} subscriptionType={data.get('subscriptionType')} apiKeySource={data.get('apiKeySource')}") +PY +)" + fi + ;; + esac +} + +if [[ ! -f "$SPEC_FILE" ]]; then + error "Missing $SPEC_FILE β€” create your project spec first." + exit 1 +fi +if [[ ! -f "$AGENT_FILE" ]]; then + warn "No $AGENT_FILE found. Using default agent instructions." +fi +if [[ -z "$BOARD_FILE" && -f "EXECUTION-BOARD.md" ]]; then + BOARD_FILE="EXECUTION-BOARD.md" +fi + +probe_rate_limit() { + if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then + return 1 + fi + + local headers + headers=$(curl -s -D - -o /dev/null \ + --max-time 10 \ + -X POST "https://api.anthropic.com/v1/messages" \ + -H "x-api-key: $ANTHROPIC_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -H "content-type: application/json" \ + -d '{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{"role":"user","content":"hi"}]}' \ + 2>/dev/null) || return 1 + + local reset_str remaining + reset_str=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-reset:" | awk '{print $2}' | tr -d '\r\n') + remaining=$(echo "$headers" | grep -i "anthropic-ratelimit-output-tokens-remaining:" | awk '{print $2}' | tr -d '\r\n') + + if [[ -z "$reset_str" ]]; then + return 1 + fi + + local reset_epoch + reset_epoch=$(date -d "$reset_str" +%s 2>/dev/null) \ + || reset_epoch=$(python3 -c " +from datetime import datetime, timezone +import sys +s = sys.argv[1].strip() +for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S+00:00', '%Y-%m-%dT%H:%M:%S%z'): + try: + dt = datetime.strptime(s, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + print(int(dt.timestamp())) + break + except Exception: + pass +" "$reset_str" 2>/dev/null) || return 1 + + echo "${reset_epoch}|${remaining:-unknown}" +} + +parse_epoch() { + local ts="$1" + date -d "$ts" +%s 2>/dev/null \ + || python3 -c " +from datetime import datetime, timezone +import sys +s = sys.argv[1] +for fmt in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S', + '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S+00:00'): + try: + dt = datetime.strptime(s, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + print(int(dt.timestamp())) + break + except Exception: + pass +" "$ts" 2>/dev/null || true +} + +format_session_end() { + local epoch="$1" + date -d "@$epoch" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null \ + || date -r "$epoch" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null \ + || echo "" +} + +infer_reset_epoch_from_log() { + local logfile="$1" 
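+  # Scan the log for the most recent "resets 11am (America/New_York)"-style
+  # message and print the epoch of the next such reset (prints nothing if no
+  # parseable match is found).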
+ + python3 - "$logfile" <<'PY' 2>/dev/null || true +from datetime import datetime, timedelta +from pathlib import Path +import re +import sys + +try: + from zoneinfo import ZoneInfo +except Exception: + ZoneInfo = None + +logfile = Path(sys.argv[1]) +if not logfile.exists(): + raise SystemExit(0) + +text = logfile.read_text(encoding="utf-8", errors="ignore") +matches = list(re.finditer(r"resets\s+(\d{1,2})(?::(\d{2}))?\s*(am|pm)\s*\(([^)]+)\)", text, re.IGNORECASE)) +if not matches: + raise SystemExit(0) + +match = matches[-1] +hour = int(match.group(1)) +minute = int(match.group(2) or "0") +ampm = match.group(3).lower() +tz_name = match.group(4).strip() + +if hour == 12: + hour = 0 +if ampm == "pm": + hour += 12 + +if ZoneInfo is None: + raise SystemExit(0) + +tz = ZoneInfo(tz_name) +now = datetime.now(tz) +candidate = now.replace(hour=hour, minute=minute, second=0, microsecond=0) +if candidate <= now: + candidate += timedelta(days=1) + +print(int(candidate.timestamp())) +PY +} + +countdown_sleep() { + local target_epoch=$1 + local label="${2:-token reset}" + local now + while true; do + now=$(date +%s) + local remaining=$(( target_epoch - now )) + if [[ $remaining -le 0 ]]; then + break + fi + local h=$(( remaining / 3600 )) + local m=$(( (remaining % 3600) / 60 )) + local s=$(( remaining % 60 )) + printf "\r${YELLOW}[ralph]${NC} Waiting for %s... %02dh%02dm%02ds remaining " "$label" "$h" "$m" "$s" + sleep 5 + done + echo "" +} + +wait_for_tokens() { + local logfile="${1:-}" + warn "Rate limit / token exhaustion detected." + echo "" + + local wake_epoch="" wake_source="" + + info "Tier 1 β€” probing Anthropic API for exact reset time..." + local probe_result + if probe_result=$(probe_rate_limit); then + local probe_epoch probe_remaining + probe_epoch="${probe_result%%|*}" + probe_remaining="${probe_result##*|}" + local now + now=$(date +%s) + if [[ -n "$probe_epoch" && "$probe_epoch" -gt "$now" ]]; then + wake_epoch=$probe_epoch + wake_source="API probe" + info "Tokens remaining: ${probe_remaining}. Reset at: $(date -d "@$probe_epoch" 2>/dev/null || date -r "$probe_epoch" 2>/dev/null || echo "$probe_epoch")" + else + warn "Probe succeeded but reset time is already past. Falling through to other strategies." + fi + else + warn "Tier 1 unavailable (no ANTHROPIC_API_KEY or probe failed)." + fi + + if [[ -z "$wake_epoch" && -n "$logfile" ]]; then + info "Tier 2 β€” parsing reset time from agent output..." + local log_epoch + log_epoch=$(infer_reset_epoch_from_log "$logfile") || true + if [[ -n "$log_epoch" ]]; then + wake_epoch=$(( log_epoch + 60 )) + wake_source="agent output" + SESSION_ENDS=$(format_session_end "$log_epoch") + info "Detected reset at: $(date -d "@$log_epoch" 2>/dev/null || date -r "$log_epoch" 2>/dev/null || echo "$log_epoch")" + if [[ -n "$SESSION_ENDS" ]]; then + info "Updated --session-ends seed to $SESSION_ENDS" + fi + else + warn "Could not extract a reset time from $logfile." + fi + fi + + if [[ -z "$wake_epoch" && -n "$SESSION_ENDS" ]]; then + info "Tier 3 β€” using --session-ends $SESSION_ENDS..." + local seed_epoch + seed_epoch=$(parse_epoch "$SESSION_ENDS") || true + if [[ -n "$seed_epoch" ]]; then + local now + now=$(date +%s) + if [[ "$seed_epoch" -gt "$now" ]]; then + wake_epoch=$(( seed_epoch + 60 )) + wake_source="session seed (--session-ends)" + info "Will wake at: $(date -d "@$wake_epoch" 2>/dev/null || date -r "$wake_epoch" 2>/dev/null || echo "$wake_epoch") (+60s buffer)" + else + warn "--session-ends is stale (already past). 
Ignoring it for this retry." + fi + else + warn "Could not parse --session-ends value: '$SESSION_ENDS'" + fi + fi + + if [[ -z "$wake_epoch" ]]; then + warn "Tier 4 β€” no reset time available. Sleeping ${RATE_LIMIT_WAIT}s ($(( RATE_LIMIT_WAIT / 60 )) min)." + warn "Tip: set ANTHROPIC_API_KEY or pass --session-ends for a smarter wake-up." + wake_epoch=$(( $(date +%s) + RATE_LIMIT_WAIT )) + wake_source="fixed wait" + fi + + info "Strategy: $wake_source. Press Ctrl+C to cancel." + countdown_sleep "$wake_epoch" "token reset" + log "Wake-up time reached. Retrying..." +} + +run_agent() { + local iteration=$1 + local mode=$2 + local logfile="$LOG_DIR/${SESSION_TS}-iteration-${iteration}.log" + local prompt="" + + if [[ "$mode" == "plan" ]]; then + prompt="Read PROJECT-SPEC.md. Decompose the project into discrete, testable tasks ordered by dependency. Write the plan to IMPLEMENTATION_PLAN.md with checkboxes. Output PLANNED when done." + else + prompt="Read AGENT.md (if it exists) for your instructions. Follow the core loop: orient, pick one task, implement, verify, commit, exit." + fi + + log "Iteration $iteration ($mode mode) β€” starting fresh agent..." + + if [[ "$AGENT" == "claude" && "$REQUIRE_PRO" == "1" ]]; then + resolve_claude_bin || exit 1 + if ! verify_claude_pro_auth >/tmp/ralph-auth-check.out 2>/tmp/ralph-auth-check.err; then + error "Claude analysis auth is not using Pro. Refusing to run." + if [[ -s /tmp/ralph-auth-check.out ]]; then + error "Auth details: $(tail -n 1 /tmp/ralph-auth-check.out)" + fi + if [[ -s /tmp/ralph-auth-check.err ]]; then + error "Auth check stderr: $(tail -n 1 /tmp/ralph-auth-check.err)" + fi + exit 1 + fi + fi + + case "$AGENT" in + claude) + resolve_claude_bin || exit 1 + printf '%s' "$prompt" | timeout --foreground "${AGENT_TIMEOUT_SECONDS}s" env -u ANTHROPIC_API_KEY "$CLAUDE_BIN" -p --dangerously-skip-permissions --output-format text 2>&1 | tee "$logfile" + ;; + codex) + echo "$prompt" | codex 2>&1 | tee "$logfile" + ;; + aider) + aider --message "$prompt" --yes 2>&1 | tee "$logfile" + ;; + gemini) + echo "$prompt" | gemini-cli 2>&1 | tee "$logfile" + ;; + custom) + if [[ -x "./custom-agent.sh" ]]; then + ./custom-agent.sh "$prompt" 2>&1 | tee "$logfile" + else + error "Custom agent selected but ./custom-agent.sh not found or not executable" + exit 1 + fi + ;; + *) + error "Unknown agent: $AGENT. Supported: claude, codex, aider, gemini, custom" + exit 1 + ;; + esac + AGENT_EXIT_CODE=$? + return 0 +} + +check_output() { + local logfile="$1" + + if grep -q 'DONE' "$logfile" 2>/dev/null; then + return 0 + elif grep -q 'STUCK' "$logfile" 2>/dev/null; then + return 2 + elif grep -q 'ERROR' "$logfile" 2>/dev/null; then + return 3 + elif grep -Eqi "rate.limit|rate_limit|too many requests|exceeded.*quota|usage limit|out of tokens|overloaded|you'?ve hit your limit|resets [0-9]{1,2}(:[0-9]{2})?(am|pm)" "$logfile" 2>/dev/null; then + return 4 + else + return 1 + fi +} + +plan_has_remaining_work() { + if [[ ! -f "$PLAN_FILE" ]]; then + return 1 + fi + + if grep -Eq '^- \[ \]' "$PLAN_FILE" 2>/dev/null; then + return 0 + fi + + return 1 +} + +board_has_remaining_work() { + if [[ -z "$BOARD_FILE" || ! 
-f "$BOARD_FILE" ]]; then + return 1 + fi + + if grep -Eq '\| .*⬜ Pending .* \||\| .*πŸ”„ In Progress .* \|' "$BOARD_FILE" 2>/dev/null; then + return 0 + fi + + return 1 +} + +has_remaining_work() { + if board_has_remaining_work; then + return 0 + fi + + if plan_has_remaining_work; then + return 0 + fi + + return 1 +} + +if [[ "$MODE" == "plan" ]]; then + log "Planning mode β€” creating implementation plan..." + run_agent 0 plan + success "Plan created. Review $PLAN_FILE, then run: ./ralph-loop.sh" + exit 0 +fi + +log "Starting Ralph Wiggum loop (max $MAX_ITERATIONS iterations)" +log "Runtime model: script-orchestrated" +log "Agent: $AGENT" +log "Spec: $SPEC_FILE" +log "Plan: $PLAN_FILE" +if [[ -n "$BOARD_FILE" ]]; then + log "Board: $BOARD_FILE" +fi +if [[ -n "$SESSION_ENDS" ]]; then + log "Tier 3 (session seed): $SESSION_ENDS" +fi +if [[ "$AGENT" == "claude" ]]; then + log_agent_runtime + log "Agent timeout: ${AGENT_TIMEOUT_SECONDS}s" + if [[ "$REQUIRE_PRO" == "1" ]]; then + log "Pro guard: enabled" + else + warn "Pro guard: disabled (--no-require-pro)" + fi +fi +echo "" + +for i in $(seq 1 "$MAX_ITERATIONS"); do + run_agent "$i" build + logfile="$LOG_DIR/${SESSION_TS}-iteration-${i}.log" + + if check_output "$logfile"; then + status=0 + else + status=$? + fi + + case $status in + 0) + if has_remaining_work; then + warn "Agent reported DONE, but the tracking artifacts still show work remaining." + warn "Ignoring false DONE and restarting with fresh context." + echo "" + sleep 2 + else + success "All tracked work appears complete after $i iterations." + exit 0 + fi + ;; + 2) + warn "Agent is stuck. Review $logfile and intervene." + exit 1 + ;; + 3) + error "Agent encountered an error. Review $logfile." + exit 1 + ;; + 4) + warn "Token/rate limit hit on iteration $i." + wait_for_tokens "$logfile" + echo "" + ;; + 1) + if [[ $AGENT_EXIT_CODE -ne 0 ]]; then + if [[ $AGENT_EXIT_CODE -eq 124 ]]; then + warn "Agent timed out after ${AGENT_TIMEOUT_SECONDS}s. Restarting fresh." + echo "" + sleep 2 + continue + fi + warn "Agent exited with code $AGENT_EXIT_CODE but did not emit a recognized promise signal." + if has_remaining_work; then + warn "Tracked work remains. Restarting fresh." + echo "" + sleep 2 + else + error "No work remains in tracking artifacts, but agent did not finish cleanly." + error "Review $logfile." + exit 1 + fi + else + log "Iteration $i complete. Restarting with fresh context..." + echo "" + sleep 2 + fi + ;; + esac +done + +warn "Reached max iterations ($MAX_ITERATIONS). Review progress in $PLAN_FILE." +exit 1 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..53466c1 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Package marker for tests.""" diff --git a/tests/test_autoresearch_controller.py b/tests/test_autoresearch_controller.py new file mode 100644 index 0000000..6ce0831 --- /dev/null +++ b/tests/test_autoresearch_controller.py @@ -0,0 +1,198 @@ +""" +Tests for autoresearch_controller.py β€” no simulator required. 
+""" + +import json +import os +import sys +import pytest +import numpy as np +import tempfile + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'agent')) + +# Patch paths before import so the controller doesn't try to read/write real files +import autoresearch_controller as ctrl + + +# ---- Param Encoding Tests ---- + +def test_param_encode_decode_roundtrip(): + """encode β†’ decode should reproduce original values (within int rounding).""" + params = {'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002, 'timesteps': 10000} + vec = ctrl.encode_params(params) + recovered = ctrl.decode_params(vec) + assert recovered['n_steer'] == params['n_steer'] + assert recovered['n_throttle'] == params['n_throttle'] + assert abs(recovered['learning_rate'] - params['learning_rate']) < 1e-6 + assert recovered['timesteps'] == params['timesteps'] + + +def test_param_encode_normalizes_to_unit_cube(): + """Encoded values should all be in [0, 1].""" + params = {'n_steer': 9, 'n_throttle': 5, 'learning_rate': 0.005, 'timesteps': 30000} + vec = ctrl.encode_params(params) + assert all(0.0 <= v <= 1.0 for v in vec), f"Encoded values out of range: {vec}" + + +def test_param_decode_min_values(): + """Zero vector should decode to min values.""" + vec = np.zeros(len(ctrl.PARAM_KEYS)) + params = ctrl.decode_params(vec) + for k in ctrl.PARAM_KEYS: + spec = ctrl.PARAM_SPACE[k] + assert params[k] == spec['min'] or abs(params[k] - spec['min']) < 1e-6, \ + f"{k}: expected {spec['min']}, got {params[k]}" + + +def test_param_decode_max_values(): + """Ones vector should decode to max values.""" + vec = np.ones(len(ctrl.PARAM_KEYS)) + params = ctrl.decode_params(vec) + for k in ctrl.PARAM_KEYS: + spec = ctrl.PARAM_SPACE[k] + assert params[k] == spec['max'] or abs(params[k] - spec['max']) < 1e-6, \ + f"{k}: expected {spec['max']}, got {params[k]}" + + +def test_param_decode_clamps_out_of_range(): + """Values outside [0,1] should be clamped to valid range.""" + vec = np.array([1.5, -0.5, 2.0, 0.5]) + params = ctrl.decode_params(vec) + for k in ctrl.PARAM_KEYS: + spec = ctrl.PARAM_SPACE[k] + assert spec['min'] <= params[k] <= spec['max'], \ + f"{k}: {params[k]} out of [{spec['min']}, {spec['max']}]" + + +# ---- Gaussian Process Tests ---- + +def test_gp_fit_predict_shape(): + """GP predict should return arrays with correct shape.""" + gp = ctrl.TinyGP() + X = np.random.uniform(0, 1, (10, 4)) + y = np.random.uniform(0, 1, 10) + gp.fit(X, y) + X_new = np.random.uniform(0, 1, (5, 4)) + mu, sigma = gp.predict(X_new) + assert mu.shape == (5,) + assert sigma.shape == (5,) + + +def test_gp_sigma_positive(): + """GP uncertainty (sigma) should be strictly positive.""" + gp = ctrl.TinyGP() + X = np.random.uniform(0, 1, (10, 4)) + y = np.random.uniform(0, 1, 10) + gp.fit(X, y) + X_new = np.random.uniform(0, 1, (20, 4)) + mu, sigma = gp.predict(X_new) + assert np.all(sigma > 0), f"Some sigma values non-positive: {sigma.min()}" + + +def test_gp_higher_uncertainty_far_from_data(): + """GP should be more uncertain far from training data than near it.""" + gp = ctrl.TinyGP(length_scale=0.1) + X_train = np.array([[0.1, 0.1, 0.1, 0.1]]) + y_train = np.array([1.0]) + gp.fit(X_train, y_train) + + near = np.array([[0.1, 0.1, 0.1, 0.1]]) + far = np.array([[0.9, 0.9, 0.9, 0.9]]) + _, sigma_near = gp.predict(near) + _, sigma_far = gp.predict(far) + assert sigma_far[0] > sigma_near[0], \ + f"Expected higher uncertainty far from data: near={sigma_near[0]:.4f}, far={sigma_far[0]:.4f}" + + +def 
test_ucb_proposal_prefers_high_reward_region(): + """ + GP+UCB should propose params near the high-reward region. + Known: n_steer=8, n_throttle=5, lr~0.002 β†’ high reward (from 300 trial history) + """ + np.random.seed(42) + # Synthesize training data: high reward at high n_steer + moderate lr + results = [] + for n_steer in [3, 5, 7, 8, 9]: + for lr in [0.0001, 0.001, 0.002, 0.004]: + reward = n_steer * 5.0 + (1.0 - abs(lr - 0.002) / 0.002) * 20.0 + results.append({ + 'params': {'n_steer': n_steer, 'n_throttle': 3, 'learning_rate': lr, 'timesteps': 10000}, + 'mean_reward': reward + }) + + proposed = ctrl.propose_next_params(results, trial_num=20, kappa=2.0) + # Best n_steer is 9 (highest in space), best lr is 0.002 + assert proposed['n_steer'] >= 7, f"Expected high n_steer proposal, got {proposed['n_steer']}" + assert 0.001 <= proposed['learning_rate'] <= 0.004, \ + f"Expected moderate lr proposal, got {proposed['learning_rate']}" + + +# ---- Champion Tracker Tests ---- + +def test_champion_tracker_updates_on_better_reward(): + """Champion should update when a better reward is found.""" + with tempfile.TemporaryDirectory() as tmpdir: + tracker = ctrl.ChampionTracker(tmpdir) + assert tracker.best_reward == float('-inf') + + updated = tracker.update_if_better(50.0, {'n_steer': 5}, None, trial=1) + assert updated is True + assert tracker.best_reward == 50.0 + + +def test_champion_tracker_no_update_on_worse_reward(): + """Champion should NOT update when a worse reward is found.""" + with tempfile.TemporaryDirectory() as tmpdir: + tracker = ctrl.ChampionTracker(tmpdir) + tracker.update_if_better(80.0, {'n_steer': 7}, None, trial=1) + + updated = tracker.update_if_better(60.0, {'n_steer': 5}, None, trial=2) + assert updated is False + assert tracker.best_reward == 80.0 + + +def test_champion_tracker_sequence(): + """Champion sequence: [50, 80, 60, 90, 70] β†’ updates at indices 0, 1, 3.""" + with tempfile.TemporaryDirectory() as tmpdir: + tracker = ctrl.ChampionTracker(tmpdir) + rewards = [50, 80, 60, 90, 70] + champions = [] + for i, r in enumerate(rewards): + if tracker.update_if_better(float(r), {'r': r}, None, trial=i): + champions.append(i) + assert champions == [0, 1, 3], f"Expected [0,1,3], got {champions}" + assert tracker.best_reward == 90.0 + + +def test_champion_tracker_manifest_persists(): + """Champion manifest should persist across tracker instances.""" + with tempfile.TemporaryDirectory() as tmpdir: + tracker1 = ctrl.ChampionTracker(tmpdir) + tracker1.update_if_better(75.0, {'n_steer': 8}, None, trial=5) + + tracker2 = ctrl.ChampionTracker(tmpdir) + assert tracker2.best_reward == 75.0 + + +def test_champion_tracker_handles_none_reward(): + """Champion tracker should handle None reward gracefully (failed trial).""" + with tempfile.TemporaryDirectory() as tmpdir: + tracker = ctrl.ChampionTracker(tmpdir) + updated = tracker.update_if_better(None, {}, None, trial=1) + assert updated is False + assert tracker.best_reward == float('-inf') + + +# ---- Random Proposal Fallback ---- + +def test_random_proposal_when_insufficient_data(): + """With < MIN_TRIALS_BEFORE_GP results, should use random proposal (not crash).""" + results = [ + {'params': {'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.001, 'timesteps': 10000}, + 'mean_reward': 50.0} + ] + # Should not raise even with 1 result + proposed = ctrl.propose_next_params(results, trial_num=1, kappa=2.0) + assert 'n_steer' in proposed + assert 'learning_rate' in proposed diff --git a/tests/test_discretize_action.py 
diff --git a/tests/test_discretize_action.py b/tests/test_discretize_action.py
new file mode 100644
index 0000000..0aaf717
--- /dev/null
+++ b/tests/test_discretize_action.py
@@ -0,0 +1,118 @@
+"""
+Tests for discretize_action.py β€” no simulator required.
+"""
+
+import pytest
+import numpy as np
+import gymnasium as gym
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'agent'))
+
+from discretize_action import DiscretizedActionWrapper
+
+
+class MockEnv(gym.Env):
+    """Minimal mock gymnasium.Env with continuous Box action space."""
+    metadata = {'render_modes': []}
+
+    def __init__(self):
+        super().__init__()
+        self.action_space = gym.spaces.Box(
+            low=np.array([-1.0, 0.0], dtype=np.float32),
+            high=np.array([1.0, 1.0], dtype=np.float32),
+        )
+        self.observation_space = gym.spaces.Box(
+            low=0, high=255, shape=(120, 160, 3), dtype=np.uint8
+        )
+
+    def reset(self, seed=None, **kwargs):
+        obs = np.zeros((120, 160, 3), dtype=np.uint8)
+        return obs, {}
+
+    def step(self, action):
+        obs = np.zeros((120, 160, 3), dtype=np.uint8)
+        return obs, 1.0, False, False, {'cte': 0.1, 'speed': 2.5}
+
+    def close(self):
+        pass
+
+
+# ---- Tests ----
+
+def test_wrapper_creates_discrete_action_space():
+    env = MockEnv()
+    wrapped = DiscretizedActionWrapper(env, n_steer=5, n_throttle=3)
+    assert hasattr(wrapped.action_space, 'n'), "Wrapped env should have discrete action space"
+    assert wrapped.action_space.n == 5 * 3
+
+
+def test_n_steer_n_throttle_product():
+    """Action space size = n_steer Γ— n_throttle."""
+    for n_steer in [3, 5, 7, 9]:
+        for n_throttle in [2, 3, 5]:
+            env = MockEnv()
+            wrapped = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
+            assert wrapped.action_space.n == n_steer * n_throttle
+
+
+def test_action_decode_center_steer():
+    """Middle steer action should decode to steer β‰ˆ 0.0."""
+    env = MockEnv()
+    n_steer, n_throttle = 5, 3
+    wrapped = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
+    # Middle steer index = n_steer // 2 = 2, any throttle
+    center_steer_action = 2 * n_throttle + 0  # steer_idx=2, throttle_idx=0
+    continuous = wrapped.action(center_steer_action)
+    steer = continuous[0]
+    assert abs(steer) < 0.1, f"Center steer should be ~0.0, got {steer}"
+
+
+def test_action_decode_full_left_steer():
+    """First steer index should decode to steer = -1.0."""
+    env = MockEnv()
+    wrapped = DiscretizedActionWrapper(env, n_steer=5, n_throttle=3)
+    continuous = wrapped.action(0)  # steer_idx=0, throttle_idx=0
+    steer = continuous[0]
+    assert steer == pytest.approx(-1.0, abs=0.01), f"Full left steer should be -1.0, got {steer}"
+
+
+def test_action_decode_full_right_steer():
+    """Last steer index should decode to steer = 1.0."""
+    env = MockEnv()
+    n_steer, n_throttle = 5, 3
+    wrapped = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
+    last_steer_action = (n_steer - 1) * n_throttle + 0
+    continuous = wrapped.action(last_steer_action)
+    steer = continuous[0]
+    assert steer == pytest.approx(1.0, abs=0.01), f"Full right steer should be 1.0, got {steer}"
+
+
+def test_action_decode_all_valid():
+    """Every discrete action index should decode to a valid (steer, throttle) pair."""
+    env = MockEnv()
+    n_steer, n_throttle = 7, 3
+    wrapped = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
+    for action in range(n_steer * n_throttle):
+        continuous = wrapped.action(action)
+        steer, throttle = continuous[0], continuous[1]
+        assert -1.0 <= steer <= 1.0, f"Steer {steer} out of range for action {action}"
+        assert 0.0 <= throttle <= 1.0, f"Throttle {throttle} out of range for action {action}"
+
+
+def test_step_passes_through():
+    """Wrapped env.step() should work with integer action."""
+    env = MockEnv()
+    wrapped = DiscretizedActionWrapper(env, n_steer=5, n_throttle=3)
+    wrapped.reset()
+    result = wrapped.step(0)
+    assert len(result) in (4, 5), "step() should return 4 or 5 values"
+
+
+def test_reset_works():
+    """Wrapped env.reset() should work."""
+    env = MockEnv()
+    wrapped = DiscretizedActionWrapper(env, n_steer=5, n_throttle=3)
+    obs = wrapped.reset()
+    assert obs is not None
diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py
new file mode 100644
index 0000000..9fa3c48
--- /dev/null
+++ b/tests/test_reward_wrapper.py
@@ -0,0 +1,135 @@
+"""
+Tests for reward_wrapper.py β€” no simulator required.
+"""
+
+import sys
+import os
+import pytest
+import numpy as np
+import gymnasium as gym
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'agent'))
+
+from reward_wrapper import SpeedRewardWrapper
+
+
+class MockStepEnv(gym.Env):
+    """Mock gymnasium.Env for testing SpeedRewardWrapper."""
+    metadata = {'render_modes': []}
+
+    def __init__(self, speed=2.0, cte=0.5, original_reward=1.0, done=False, use_5tuple=True):
+        super().__init__()
+        self._speed = speed
+        self._cte = cte
+        self._reward = original_reward
+        self._done = done
+        self._use_5tuple = use_5tuple
+        self.action_space = gym.spaces.Discrete(5)
+        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8)
+
+    def reset(self, seed=None, **kwargs):
+        return np.zeros((120, 160, 3), dtype=np.uint8), {}
+
+    def step(self, action):
+        obs = np.zeros((120, 160, 3), dtype=np.uint8)
+        info = {'speed': self._speed, 'cte': self._cte}
+        if self._use_5tuple:
+            return obs, self._reward, self._done, False, info
+        else:
+            return obs, self._reward, self._done, info
+
+    def close(self):
+        pass
+
+
+def test_speed_reward_higher_when_fast_and_centered():
+    """Reward should be higher when car is fast and centered (low CTE)."""
+    env_fast_centered = MockStepEnv(speed=5.0, cte=0.1, original_reward=1.0)
+    env_slow_offset = MockStepEnv(speed=1.0, cte=3.0, original_reward=1.0)
+
+    wrapped_fast = SpeedRewardWrapper(env_fast_centered)
+    wrapped_slow = SpeedRewardWrapper(env_slow_offset)
+
+    _, reward_fast, _, _, _ = wrapped_fast.step(0)
+    _, reward_slow, _, _, _ = wrapped_slow.step(0)
+
+    assert reward_fast > reward_slow, \
+        f"Fast+centered should reward more: {reward_fast:.3f} vs {reward_slow:.3f}"
+
+
+def test_speed_reward_zero_at_max_cte():
+    """Reward should be ~0 when CTE = max_cte (on the edge of the road)."""
+    env = MockStepEnv(speed=5.0, cte=8.0, original_reward=1.0)
+    wrapped = SpeedRewardWrapper(env, max_cte=8.0)
+    _, reward, _, _, _ = wrapped.step(0)
+    assert reward == pytest.approx(0.0, abs=0.01), \
+        f"Reward at max CTE should be ~0, got {reward}"
+
+
+def test_speed_reward_positive_when_on_track():
+    """Reward should be positive when car is on track at any speed > 0."""
+    env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0)
+    wrapped = SpeedRewardWrapper(env, max_cte=8.0)
+    _, reward, _, _, _ = wrapped.step(0)
+    assert reward > 0, f"On-track reward should be positive, got {reward}"
+
+
+def test_crash_penalty_applied_on_done():
+    """Crash penalty should be added when episode ends with negative reward."""
+    env = MockStepEnv(speed=0.0, cte=9.0, original_reward=-1.0, done=True)
+    wrapped = SpeedRewardWrapper(env, max_cte=8.0, crash_penalty=-10.0)
+    _, reward, terminated, truncated, _ = wrapped.step(0)
+    assert reward < -5.0, f"Crash penalty should make reward very negative, got {reward}"
+
+
+def test_fallback_to_original_reward_when_info_missing():
+    """If info doesn't have speed/cte, should fall back to original reward."""
+    class NoInfoEnv(gym.Env):
+        metadata = {'render_modes': []}
+
+        def __init__(self):
+            super().__init__()
+            self.action_space = gym.spaces.Discrete(5)
+            self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8)
+
+        def reset(self, seed=None, **kwargs):
+            return np.zeros((120, 160, 3), dtype=np.uint8), {}
+
+        def step(self, action):
+            return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {}
+
+        def close(self):
+            pass
+
+    wrapped = SpeedRewardWrapper(NoInfoEnv())
+    _, reward, _, _, _ = wrapped.step(0)
+    assert reward == pytest.approx(0.75, abs=1e-6), \
+        f"Should fall back to original reward 0.75, got {reward}"
+
+
+def test_wrapper_preserves_observation():
+    """SpeedRewardWrapper should not modify observations."""
+    obs_data = np.zeros((120, 160, 3), dtype=np.uint8)
+
+    class FixedObsEnv(gym.Env):
+        metadata = {'render_modes': []}
+
+        def __init__(self):
+            super().__init__()
+            self.action_space = gym.spaces.Discrete(5)
+            self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8)
+
+        def reset(self, seed=None, **kwargs):
+            return obs_data.copy(), {}
+
+        def step(self, action):
+            return obs_data.copy(), 1.0, False, False, {'speed': 2.0, 'cte': 0.5}
+
+        def close(self):
+            pass
+
+    wrapped = SpeedRewardWrapper(FixedObsEnv())
+    obs, _, _, _, _ = wrapped.step(0)
+    np.testing.assert_array_almost_equal(obs, obs_data)
+
+
+def test_4tuple_step_compatibility():
+    """Wrapper should handle 4-tuple step() return (old gym API)."""
+    env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0, use_5tuple=False)
+    wrapped = SpeedRewardWrapper(env)
+    result = wrapped.step(0)
+    assert len(result) == 4, f"Expected 4-tuple, got {len(result)}"
+    _, reward, done, info = result
+    assert isinstance(reward, float)
diff --git a/tests/test_runner_integration.py b/tests/test_runner_integration.py
new file mode 100644
index 0000000..4609675
--- /dev/null
+++ b/tests/test_runner_integration.py
@@ -0,0 +1,130 @@
+"""
+Integration tests for donkeycar_sb3_runner.py β€” no live simulator required.
+Uses a mocked gym environment.
+"""
+
+import os
+import sys
+import json
+import tempfile
+import pytest
+import numpy as np
+import gymnasium as gym
+from unittest.mock import patch, MagicMock
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'agent'))
+
+
+class MockGymEnv(gym.Env):
+    """Minimal mock of a DonkeyCar gym environment as a proper gymnasium.Env."""
+    metadata = {'render_modes': []}
+
+    def __init__(self):
+        super().__init__()
+        self.observation_space = gym.spaces.Box(
+            low=0, high=255, shape=(120, 160, 3), dtype=np.uint8
+        )
+        self.action_space = gym.spaces.Box(
+            low=np.array([-1.0, 0.0]),
+            high=np.array([1.0, 1.0]),
+            dtype=np.float32
+        )
+        self._step_count = 0
+
+    def reset(self, seed=None, **kwargs):
+        self._step_count = 0
+        return np.zeros((120, 160, 3), dtype=np.uint8), {}
+
+    def step(self, action):
+        self._step_count += 1
+        obs = np.random.randint(0, 255, (120, 160, 3), dtype=np.uint8)
+        reward = float(np.random.uniform(0, 2))
+        terminated = self._step_count >= 50
+        truncated = False
+        info = {'speed': 2.0, 'cte': 0.5}
+        return obs, reward, terminated, truncated, info
+
+    def close(self):
+        pass
+
+
+def test_make_env_ppo_no_discretization():
+    """PPO should NOT apply DiscretizedActionWrapper."""
+    with patch('gymnasium.make', return_value=MockGymEnv()):
+        from donkeycar_sb3_runner import make_env
+        env = make_env('donkey-generated-roads-v0', 'ppo', n_steer=7, n_throttle=3, reward_shaping=False)
+        # PPO env should have Box action space, not Discrete
+        assert isinstance(env.action_space, gym.spaces.Box), "PPO env should have continuous Box action space"
+
+
+def test_make_env_dqn_discretization():
+    """DQN should apply DiscretizedActionWrapper."""
+    with patch('gymnasium.make', return_value=MockGymEnv()):
+        from donkeycar_sb3_runner import make_env
+        env = make_env('donkey-generated-roads-v0', 'dqn', n_steer=5, n_throttle=3, reward_shaping=False)
+        # DQN env should have Discrete action space
+        assert hasattr(env.action_space, 'n'), "DQN env should have Discrete action space"
+        assert env.action_space.n == 5 * 3
+
+
+def test_save_model_creates_zip():
+    """save_model() should create a .zip file at the specified path."""
+    mock_model = MagicMock()
+    mock_model.save = MagicMock()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        save_dir = os.path.join(tmpdir, 'trial-0001')
+        from donkeycar_sb3_runner import save_model
+        saved_path = save_model(mock_model, save_dir)
+
+        # Verify save was called with correct path
+        expected_path = os.path.join(save_dir, 'model')
+        mock_model.save.assert_called_once_with(expected_path)
+        assert saved_path == expected_path + '.zip'
+        assert os.path.isdir(save_dir), "Save directory should be created"
+
+
+def test_save_model_creates_directory():
+    """save_model() should create save_dir if it doesn't exist."""
+    mock_model = MagicMock()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        save_dir = os.path.join(tmpdir, 'nested', 'path', 'trial-042')
+        assert not os.path.exists(save_dir)
+
+        from donkeycar_sb3_runner import save_model
+        save_model(mock_model, save_dir)
+        assert os.path.isdir(save_dir)
+
+
+def test_teardown_calls_env_close():
+    """teardown() should call env.close() even if it raises."""
+    from donkeycar_sb3_runner import teardown
+    mock_env = MagicMock()
+    mock_env.close.side_effect = RuntimeError("sim disconnected")
+    # Should not raise
+    teardown(mock_env)
+    mock_env.close.assert_called_once()
+
+
+def test_runner_script_has_no_syntax_errors():
+    """The runner script should compile without syntax errors."""
+    spec_path = os.path.join(os.path.dirname(__file__), '..', 'agent', 'donkeycar_sb3_runner.py')
+    with open(spec_path) as f:
+        source = f.read()
+    compile(source, spec_path, 'exec')  # Raises SyntaxError if broken
+
+
+def test_no_model_save_before_definition():
+    """Runner source must not call model.save() before model is defined."""
+    runner_path = os.path.join(os.path.dirname(__file__), '..', 'agent', 'donkeycar_sb3_runner.py')
+    with open(runner_path) as f:
+        source = f.read()
+
+    lines = source.split('\n')
+    model_defined_line = None
+    for i, line in enumerate(lines):
+        if 'model = PPO' in line or 'model = DQN' in line:
+            model_defined_line = i
+        if 'model.save' in line and model_defined_line is None:
+            pytest.fail(f"model.save() called before model is defined at line {i+1}: {line}")
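
The final ordering check above relies on substring matching, which can misfire on comments or strings that happen to contain `model.save`. A minimal sketch of the same check done over the AST instead, so only real assignments and calls are inspected (an editor's illustration, not part of the committed diff; the helper and test names are hypothetical):

```python
# Sketch: AST-based alternative to test_no_model_save_before_definition.
# Parsing the source ignores comments and string literals entirely.
import ast
import os

import pytest


def _model_assign_and_save_lines(source: str):
    """Return (first `model = PPO(...)/DQN(...)` line, first `model.save(...)` line)."""
    tree = ast.parse(source)
    assign_lines, save_lines = [], []
    for node in ast.walk(tree):
        # model = PPO(...) or model = DQN(...)
        if isinstance(node, ast.Assign) and isinstance(node.value, ast.Call):
            func = node.value.func
            name = func.id if isinstance(func, ast.Name) else getattr(func, 'attr', None)
            if name in ('PPO', 'DQN') and any(
                    isinstance(t, ast.Name) and t.id == 'model' for t in node.targets):
                assign_lines.append(node.lineno)
        # model.save(...)
        if (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)
                and node.func.attr == 'save'
                and isinstance(node.func.value, ast.Name)
                and node.func.value.id == 'model'):
            save_lines.append(node.lineno)
    # ast.walk is not in source order, so take the minimum line numbers.
    return min(assign_lines, default=None), min(save_lines, default=None)


def test_no_model_save_before_definition_ast():
    runner_path = os.path.join(os.path.dirname(__file__), '..', 'agent', 'donkeycar_sb3_runner.py')
    with open(runner_path) as f:
        assign_line, save_line = _model_assign_and_save_lines(f.read())
    if save_line is not None and (assign_line is None or save_line < assign_line):
        pytest.fail(f"model.save() at line {save_line} precedes the model assignment")
```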