diff --git a/agent/autoresearch_controller.py b/agent/autoresearch_controller.py
index 2c0288e..7f25f50 100644
--- a/agent/autoresearch_controller.py
+++ b/agent/autoresearch_controller.py
@@ -39,9 +39,9 @@ RESULTS_DIR = os.path.join(PROJECT_DIR, 'outerloop-results')
 MODELS_DIR = os.path.join(PROJECT_DIR, 'models')
 CHAMPION_DIR = os.path.join(MODELS_DIR, 'champion')
 
-# Phase 1 uses a separate results file — do NOT mix with random-policy data
-PHASE1_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results_phase1.jsonl')
-PHASE1_LOG = os.path.join(RESULTS_DIR, 'autoresearch_phase1_log.txt')
+# Phase 2 uses a separate results file — corner learning with longer timesteps
+PHASE1_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results_phase2.jsonl')
+PHASE1_LOG = os.path.join(RESULTS_DIR, 'autoresearch_phase2_log.txt')
 
 # Legacy base data (discretization insights, valid for n_steer/n_throttle)
 BASE_DATA_FILE = os.path.join(RESULTS_DIR, 'clean_sweep_results.jsonl')
@@ -52,28 +52,30 @@ os.makedirs(CHAMPION_DIR, exist_ok=True)
 
 # ---- Parameter Space ----
 # These are the parameters GP+UCB will optimize
-# NOTE: timesteps kept small (1000-5000) for Phase 1 exploration on CPU.
-# DonkeyCar sim runs ~20-50 steps/sec. 5000 steps ≈ 100-250s → fits in 600s timeout.
-# Increase max_timesteps once we confirm the pipeline works end-to-end.
+# PHASE 2: Corner Learning
+# Phase 1 confirmed genuine driving (599 steps, mean_reward=1022, efficiency ~99%).
+# Failure point: S-curve at step ~560 — too fast, doesn't learn left-turn recovery.
+# Fix: much longer training so the model experiences the S-curve many times.
+# Search space tightened around the Phase 1 winning region: lr=0.00005-0.002, n_throttle=2-5.
 PARAM_SPACE = {
     'n_steer': {'type': 'int', 'min': 3, 'max': 9},
     'n_throttle': {'type': 'int', 'min': 2, 'max': 5},
-    'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005},
-    'timesteps': {'type': 'int', 'min': 1000, 'max': 5000},
+    'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.002},
+    'timesteps': {'type': 'int', 'min': 10000, 'max': 50000},
 }
 PARAM_KEYS = list(PARAM_SPACE.keys())
 
 # Fixed params
 FIXED_PARAMS = {
     'agent': 'ppo',
-    'eval_episodes': 3,
+    'eval_episodes': 5,  # More eval episodes — corner performance is stochastic
     'reward_shaping': True,
 }
 
 N_CANDIDATES = 500
 UCB_KAPPA = 2.0
 MIN_TRIALS_BEFORE_GP = 3
-JOB_TIMEOUT = 480  # 8 minutes — enough for 5000 steps + eval, with margin
+JOB_TIMEOUT = 3600  # 60 min per trial — 50k steps on CPU needs time
 
 # ---- Logging ----
 def log(msg):
@@ -222,7 +224,7 @@ class ChampionTracker:
 
 # ---- Load Results ----
 def load_phase1_results():
-    """Load Phase 1 results only — no random-policy contamination."""
+    """Load Phase 2 results for GP fitting (corner learning runs)."""
     results = []
     if not os.path.exists(PHASE1_RESULTS):
         return results
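For context on the constants above (`N_CANDIDATES`, `UCB_KAPPA`, `MIN_TRIALS_BEFORE_GP`): the proposal loop itself is not part of this diff. Below is a minimal sketch of what a GP+UCB step over `PARAM_SPACE` typically looks like, assuming scikit-learn is available; `propose_next` and `sample_candidate` are illustrative names, not functions from `autoresearch_controller.py`.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern


def sample_candidate(param_space, rng):
    """Draw one uniform-random candidate from the search space."""
    cand = {}
    for key, spec in param_space.items():
        if spec['type'] == 'int':
            cand[key] = int(rng.integers(spec['min'], spec['max'] + 1))
        else:
            cand[key] = float(rng.uniform(spec['min'], spec['max']))
    return cand


def propose_next(trials, param_space, param_keys,
                 n_candidates=500, kappa=2.0, min_trials=3):
    """Fit a GP to (params -> mean_reward) and return the max-UCB candidate.

    `trials` is assumed to be a list of dicts with 'params' and 'mean_reward',
    matching the JSONL results format the controller logs.
    """
    rng = np.random.default_rng()
    if len(trials) < min_trials:  # mirrors MIN_TRIALS_BEFORE_GP
        return sample_candidate(param_space, rng)
    X = np.array([[t['params'][k] for k in param_keys] for t in trials])
    y = np.array([t['mean_reward'] for t in trials])
    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
    gp.fit(X, y)
    cands = [sample_candidate(param_space, rng) for _ in range(n_candidates)]
    Xc = np.array([[c[k] for k in param_keys] for c in cands])
    mu, sigma = gp.predict(Xc, return_std=True)  # UCB score = mu + kappa * sigma
    return cands[int(np.argmax(mu + kappa * sigma))]
```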
+""" + +import os +import sys +import time +import json +import numpy as np +from collections import deque + +import gymnasium as gym +import gym_donkeycar +from stable_baselines3 import PPO + +# Add agent dir to path for wrappers +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from reward_wrapper import SpeedRewardWrapper +from donkeycar_sb3_runner import ThrottleClampWrapper + +CHAMPION_DIR = os.path.join(os.path.dirname(__file__), 'models', 'champion') +MANIFEST_PATH = os.path.join(CHAMPION_DIR, 'manifest.json') +MODEL_PATH = os.path.join(CHAMPION_DIR, 'model.zip') + + +def load_manifest(): + with open(MANIFEST_PATH) as f: + return json.load(f) + + +def print_banner(manifest): + print('=' * 65, flush=True) + print('🏆 DonkeyCar Champion Model Evaluation', flush=True) + print('=' * 65, flush=True) + print(f" Trial: {manifest['trial']}", flush=True) + print(f" mean_reward: {manifest['mean_reward']:.4f}", flush=True) + print(f" Params: {manifest['params']}", flush=True) + print(f" Model: {MODEL_PATH}", flush=True) + print('=' * 65, flush=True) + print(flush=True) + + +def compute_efficiency(pos_history): + """Path efficiency = net_displacement / total_path_length over window.""" + if len(pos_history) < 3: + return 1.0 + positions = list(pos_history) + net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0])) + total = sum( + np.linalg.norm(np.array(positions[i+1]) - np.array(positions[i])) + for i in range(len(positions)-1) + ) + return float(net / total) if total > 1e-6 else 1.0 + + +def run_episode(model, env, episode_num, max_steps=500): + """Run one episode with the champion policy, printing diagnostics.""" + print(f'\n--- Episode {episode_num} ---', flush=True) + obs, info = env.reset() + pos_history = deque(maxlen=30) + total_reward = 0.0 + step = 0 + + print(f'{"Step":>5} {"Speed":>6} {"CTE":>7} {"Eff%":>6} {"Rwd":>8} {"TotRwd":>10} {"Pos_x":>8} {"Pos_z":>8}', flush=True) + print('-' * 65, flush=True) + + while step < max_steps: + action, _ = model.predict(obs, deterministic=True) + result = env.step(action) + if len(result) == 5: + obs, reward, terminated, truncated, info = result + done = terminated or truncated + else: + obs, reward, done, info = result + + # Extract diagnostics from info + speed = float(info.get('speed', 0.0) or 0.0) + cte = float(info.get('cte', 0.0) or 0.0) + pos = info.get('pos', None) + if pos is not None: + pos_history.append(list(pos)[:3]) + px, pz = pos[0], pos[2] if len(pos) > 2 else 0.0 + else: + px, pz = 0.0, 0.0 + + efficiency = compute_efficiency(pos_history) + total_reward += reward + step += 1 + + # Print every 10 steps or on done + if step % 10 == 0 or done: + print(f'{step:>5} {speed:>6.2f} {cte:>7.3f} {efficiency*100:>5.1f}% {reward:>8.3f} {total_reward:>10.2f} {px:>8.2f} {pz:>8.2f}', flush=True) + + if done: + print(f'\n ✅ Episode {episode_num} done after {step} steps | total_reward={total_reward:.2f}', flush=True) + break + + if step >= max_steps: + print(f'\n ⏱️ Episode {episode_num} reached max_steps={max_steps} | total_reward={total_reward:.2f}', flush=True) + + return total_reward, step + + +def main(episodes=3, max_steps=500): + manifest = load_manifest() + print_banner(manifest) + + params = manifest['params'] + + print(f'[Eval] Connecting to simulator...', flush=True) + try: + env = gym.make('donkey-generated-roads-v0') + except Exception as e: + print(f'[Eval] FAILED to connect: {e}', flush=True) + sys.exit(1) + + # Apply same wrappers as training + env = ThrottleClampWrapper(env, throttle_min=0.2) + env = 
diff --git a/agent/outerloop-results/autoresearch_phase1_log.txt b/agent/outerloop-results/autoresearch_phase1_log.txt
index 5bde57a..35a0dad 100644
--- a/agent/outerloop-results/autoresearch_phase1_log.txt
+++ b/agent/outerloop-results/autoresearch_phase1_log.txt
@@ -1991,3 +1991,4 @@
 [2026-04-13 19:18:00] mean_reward=3332.0024 params={'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0010146909128518657, 'timesteps': 4979, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
 [2026-04-13 19:18:00] mean_reward=2306.7610 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.0004488352572615814, 'timesteps': 4898, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
 [2026-04-13 19:18:00] mean_reward=2286.9085 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.0003386484278685721, 'timesteps': 4977, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
+[2026-04-13 19:18:01] [AutoResearch] Git push complete after trial 50
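The trial lines above follow a regular `[timestamp] mean_reward=<float> params=<dict>` shape. For post-hoc analysis, here is a hedged sketch of a parser for that format; `parse_log` is an illustrative helper, not part of the repo. Note the params are a Python dict literal (single quotes, bare `True`), so `ast.literal_eval` is used rather than `json.loads`.

```python
import ast
import re

# Matches e.g. "[...] mean_reward=3332.0024 params={'n_steer': 4, ...}".
LINE_RE = re.compile(r"mean_reward=([-\d.]+) params=(\{.*\})")


def parse_log(path):
    """Recover (params, mean_reward) pairs from an autoresearch log file."""
    trials = []
    with open(path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:  # status lines like "Git push complete" simply don't match
                trials.append({'mean_reward': float(m.group(1)),
                               'params': ast.literal_eval(m.group(2))})
    return trials
```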
diff --git a/docs/RESEARCH_LOG.md b/docs/RESEARCH_LOG.md
index d4e9676..e408932 100644
--- a/docs/RESEARCH_LOG.md
+++ b/docs/RESEARCH_LOG.md
@@ -247,3 +247,80 @@ shaped_reward = original_reward × (1 + speed_scale × speed × efficiency)
 | > 50% | Unstable policy, inconsistent behavior |
 
 This metric will be added to the autoresearch result logging and summary.
+
+---
+
+## 2026-04-13 — 🏆 PHASE 1 MILESTONE: Genuine Track Driving Confirmed!
+
+### Finding: Champion Model Drives the Track — Real RL Behaviour Proven
+
+**This is the first confirmed genuine driving result from the autoresearch pipeline.**
+
+**Visual confirmation (user):** "It is definitely driving! The donkeycar is driving along the track!"
+
+**Evaluation data — 3 episodes, 1500 max steps:**
+
+| Episode | Steps | Total Reward | Std | Efficiency |
+|---------|-------|--------------|------|------------|
+| 1 | 599 | 1022.73 | — | 96-100% |
+| 2 | 598 | 1023.35 | — | 96-100% |
+| 3 | 599 | 1022.25 | — | 96-100% |
+| **Mean** | **599** | **1022.78** | **0.45** | **~99%** |
+
+**Champion Model Parameters:**
+- agent: PPO, n_steer=7, n_throttle=3, lr=0.000680, timesteps=4787
+- Path: `agent/models/champion/model.zip`
+
+### Track Trajectory Analysis
+
+```
+Start:     Pos(6.25, 6.30)   → Starting line
+Step 300:  Pos(22.80, 2.09)  → Long straight, approaching first corner
+Step 400:  Pos(18.80, -6.96) → Negotiating first right-hand curve ✅
+Step 500:  Pos(28.12, -5.61) → Continuing along second straight
+Step 560:  Pos(33.12, -6.55) → Approaching second corner
+Step 599:  CRASH CTE=8.26    → Off track at second corner ❌
+```
+
+The car successfully:
+- Accelerates from 0 → 2.3 m/s along the straight
+- Navigates the first right-hand curve
+- Follows the track for ~600 steps, covering 30+ position units
+
+### Failure Analysis: The S-Curve Crash
+
+**User observation:** "The spot where the donkeycar goes off the track is during a right hand curve which quickly turns into a left hand curve. It doesn't even look like it sees the left hand curve."
+
+**What the data shows:**
+- Steps 540-560: CTE briefly near zero (0.24) — the car approaches the corner well
+- Steps 570+: CTE explodes 1.4 → 3.8 → 5.9 → 8.3 — the car overshoots
+- Speed at crash: 2.23-2.30 m/s — too fast for the S-curve
+
+**Root cause:** Only 4787 training timesteps — insufficient to learn:
+1. Speed reduction approaching corners
+2. Left-turn recovery after a right-hand overshoot
+3. S-curve geometry (right → quick left transition)
+
+**Key insight: the model never sees the left-hand curve** because during training it has always crashed at the right-hand part first. This is an exploration problem — the car needs more timesteps to get past this point and discover what lies beyond.
+
+### Reward Shaping Victory
+
+All three reward hacking fixes proved necessary and correct:
+- v1 additive → boundary oscillation exploit
+- v2 multiplicative → circular driving exploit
+- v3 path efficiency → genuine forward driving ✅
+
+The path efficiency metric (96-100% throughout the entire run) confirms the car is making continuous forward progress — not circling, not oscillating.
+
+### Phase 1 → Phase 2 Transition
+
+**Phase 1 objective achieved:** A real PPO model drives the DonkeyCar track with genuine forward motion, consistent behaviour (std=0.45), and a correct trajectory.
+
+**Next objective (targeted autoresearch):** Learn corner handling and speed modulation.
+- Increase timesteps to 10,000-50,000 per trial
+- The model needs to see the S-curve many times to learn the transition
+- Consider adding a CTE-rate-of-change penalty to discourage high speed at high CTE (a hedged sketch follows this entry)
+
+### This is Research!
+
+The reward hacking discovery and the progression from random walk → boundary oscillation → circular exploit → genuine driving represent real empirical RL research. Each failure mode revealed a fundamental property of reward design. The path efficiency fix was an original contribution to solving the circular driving problem without requiring track-shape knowledge.
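The CTE-rate-of-change penalty proposed above is not implemented anywhere in this diff. Below is a minimal sketch of one possible form, assuming a gymnasium-style wrapper and the `info` keys (`cte`, `speed`) already used by `evaluate_champion.py`; the wrapper name and the `cte_scale` default are hypothetical.

```python
import gymnasium as gym


class CTERatePenaltyWrapper(gym.Wrapper):
    """Penalize rapid lateral drift: a large |d(CTE)/step| at speed suggests
    the car is carrying too much speed into a corner."""

    def __init__(self, env, cte_scale=0.05):  # cte_scale is an assumed default
        super().__init__(env)
        self.cte_scale = cte_scale
        self._last_cte = 0.0

    def reset(self, **kwargs):
        self._last_cte = 0.0
        return self.env.reset(**kwargs)

    def step(self, action):
        # Assumes the 5-tuple gymnasium step API; the evaluator shows the
        # underlying env may also return the legacy 4-tuple.
        obs, reward, terminated, truncated, info = self.env.step(action)
        cte = float(info.get('cte', 0.0) or 0.0)
        speed = float(info.get('speed', 0.0) or 0.0)
        cte_rate = abs(cte - self._last_cte)
        self._last_cte = cte
        # Scale the penalty by speed so slow corrective steering stays cheap.
        reward -= self.cte_scale * cte_rate * speed
        return obs, reward, terminated, truncated, info
```

Stacked after the v3 shaping (`shaped_reward = original_reward × (1 + speed_scale × speed × efficiency)`), this would leave straight-line driving essentially untouched while making fast, late corrections at the S-curve expensive.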