diff --git a/agent/multitrack_eval.py b/agent/multitrack_eval.py
new file mode 100644
index 0000000..5bb7cda
--- /dev/null
+++ b/agent/multitrack_eval.py
@@ -0,0 +1,286 @@
+"""
+Multi-Track Generalization Evaluation
+=====================================
+Tests all top Phase 2 models against every available DonkeyCar track.
+Uses automatic track switching (exit_scene → reconnect).
+
+Results saved to: outerloop-results/multitrack_results.jsonl
+Summary table printed at the end.
+
+Usage:
+    python3 multitrack_eval.py [--episodes N] [--steps N]
+"""
+
+import json
+import os
+import sys
+import time
+from collections import deque
+from datetime import datetime
+
+import gymnasium as gym
+import gym_donkeycar
+import numpy as np
+from stable_baselines3 import PPO
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from donkeycar_sb3_runner import ThrottleClampWrapper
+from reward_wrapper import SpeedRewardWrapper
+from track_switcher import switch_track
+
+RESULTS_DIR = os.path.join(os.path.dirname(__file__), 'outerloop-results')
+RESULTS_FILE = os.path.join(RESULTS_DIR, 'multitrack_results.jsonl')
+
+# All available tracks
+ALL_TRACKS = [
+    {'id': 'donkey-generated-roads-v0', 'name': 'Generated Road', 'trained_on': True},
+    {'id': 'donkey-generated-track-v0', 'name': 'Generated Track', 'trained_on': False},
+    {'id': 'donkey-mountain-track-v0', 'name': 'Mountain Track', 'trained_on': False},
+    {'id': 'donkey-warehouse-v0', 'name': 'Warehouse', 'trained_on': False},
+    {'id': 'donkey-avc-sparkfun-v0', 'name': 'AVC Sparkfun', 'trained_on': False},
+    {'id': 'donkey-minimonaco-track-v0', 'name': 'Mini Monaco', 'trained_on': False},
+    {'id': 'donkey-warren-track-v0', 'name': 'Warren', 'trained_on': False},
+    {'id': 'donkey-roboracingleague-track-v0', 'name': 'Robo Racing League', 'trained_on': False},
+    {'id': 'donkey-waveshare-v0', 'name': 'Waveshare', 'trained_on': False},
+    {'id': 'donkey-thunderhill-track-v0', 'name': 'Thunderhill', 'trained_on': False},
+    {'id': 'donkey-circuit-launch-track-v0', 'name': 'Circuit Launch', 'trained_on': False},
+]
+
+TOP3_MODELS = [
+    {'label': 'Trial-20 (n_steer=3 n_thr=5 lr=0.000225 13k)', 'path': 'models/trial-0020/model.zip', 'short': 'T20'},
+    {'label': 'Trial-8 (n_steer=4 n_thr=3 lr=0.00117 34k)', 'path': 'models/trial-0008/model.zip', 'short': 'T08'},
+    {'label': 'Trial-18 (n_steer=3 n_thr=5 lr=0.000288 16k)', 'path': 'models/trial-0018/model.zip', 'short': 'T18'},
+]
+
+
+def compute_efficiency(pos_history):
+    """Return the path efficiency of a position trace.
+
+    Efficiency is the straight-line distance between the first and last
+    position divided by the total distance travelled along the trace, so
+    values near 0 indicate movement without net progress.
+
+    Args:
+        pos_history: Sequence of equal-length position vectors.
+
+    Returns:
+        The ratio as a float; 1.0 when fewer than three positions are
+        given or the total path length does not exceed 1e-6.
+    """
+    if len(pos_history) < 3:
+        return 1.0
+    positions = np.asarray(list(pos_history), dtype=float)
+    net = np.linalg.norm(positions[-1] - positions[0])
+    total = np.linalg.norm(np.diff(positions, axis=0), axis=1).sum()
+    return float(net / total) if total > 1e-6 else 1.0
+
+
+def run_episodes(model, env, episodes, max_steps, track_name):
+    """Run evaluation episodes and return aggregate metrics.
+
+    Args:
+        model: Policy exposing ``predict(obs, deterministic=True)``.
+        env: Environment whose ``step`` returns either a 5-tuple
+            (obs, reward, terminated, truncated, info) or a 4-tuple
+            (obs, reward, done, info).
+        episodes: Number of episodes to run.
+        max_steps: Step cap per episode.
+        track_name: Track display name; unused here, kept for callers.
+
+    Returns:
+        Dict with mean/std episode reward, mean episode length, steering
+        oscillation, mean absolute and signed cross-track error, mean path
+        efficiency and ``drove_far`` (mean length above 200 steps).
+    """
+    all_rewards, all_steps, all_cte = [], [], []
+    all_steer_deltas, all_efficiency = [], []
+
+    for _ in range(episodes):
+        obs, info = env.reset()
+        pos_hist = deque(maxlen=31)
+        total_reward, step = 0.0, 0
+        cte_vals, steer_vals = [], []
+
+        while step < max_steps:
+            action, _state = model.predict(obs, deterministic=True)
+            result = env.step(action)
+            if len(result) == 5:
+                obs, reward, terminated, truncated, info = result
+                done = terminated or truncated
+            else:
+                obs, reward, done, info = result
+
+            cte_vals.append(float(info.get('cte', 0) or 0))
+
+            # 'pos' may be missing, None, or shorter than three components.
+            pos = info.get('pos')
+            if pos is None:
+                pos = (0, 0, 0)
+            px = pos[0] if len(pos) > 0 else 0
+            pz = pos[2] if len(pos) > 2 else 0
+            pos_hist.append(np.array([px, 0., pz]))
+
+            # First action component is taken as steering; np.ravel also
+            # copes with scalar and 0-d array actions.
+            flat_action = np.ravel(action)
+            if flat_action.size:
+                steer_vals.append(float(flat_action[0]))
+
+            total_reward += reward
+            step += 1
+
+            if done:
+                break
+
+        all_rewards.append(total_reward)
+        all_steps.append(step)
+        all_cte.extend(cte_vals)
+        # Deltas are taken per episode so that the jump across a reset is
+        # not counted as steering oscillation.
+        all_steer_deltas.extend(np.abs(np.diff(steer_vals)).tolist())
+        all_efficiency.append(compute_efficiency(pos_hist))
+        time.sleep(1)
+
+    return {
+        'mean_reward': float(np.mean(all_rewards)),
+        'std_reward': float(np.std(all_rewards)),
+        'mean_steps': float(np.mean(all_steps)),
+        'oscillation': float(np.mean(all_steer_deltas)) if all_steer_deltas else 0.0,
+        'mean_abs_cte': float(np.mean([abs(c) for c in all_cte])) if all_cte else 0,
+        'mean_signed_cte': float(np.mean(all_cte)) if all_cte else 0,
+        'mean_efficiency': float(np.mean(all_efficiency)) if all_efficiency else 1.0,
+        'drove_far': float(np.mean(all_steps)) > 200,  # survived more than 200 steps avg
+    }
+
+
+def run_multitrack_eval(episodes=3, max_steps=1000):
+    """Evaluate every model in TOP3_MODELS on every track in ALL_TRACKS.
+
+    One JSON record per track is appended to RESULTS_FILE as soon as the
+    track finishes, and a summary table is printed at the end.
+
+    Args:
+        episodes: Episodes per track per model.
+        max_steps: Step cap per episode.
+
+    Returns:
+        Dict mapping track name to a dict of model short name to either the
+        metrics from run_episodes or ``{'error': message}``.
+    """
+    os.makedirs(RESULTS_DIR, exist_ok=True)
+    print('\n' + '='*70, flush=True)
+    print('🌍 MULTI-TRACK GENERALIZATION EVALUATION', flush=True)
+    print(f' Models: {len(TOP3_MODELS)} | Tracks: {len(ALL_TRACKS)} | Episodes: {episodes} | Max steps: {max_steps}', flush=True)
+    print('='*70, flush=True)
+
+    all_results = {}
+    current_env_id = 'donkey-generated-roads-v0'  # assume starting here
+
+    for track in ALL_TRACKS:
+        track_id = track['id']
+        track_name = track['name']
+        trained = '⭐ TRAINED' if track['trained_on'] else '🆕 UNSEEN'
+        print(f'\n{"─"*70}', flush=True)
+        print(f'📍 Track: {track_name} {trained}', flush=True)
+        print(f' Env: {track_id}', flush=True)
+        print(f'{"─"*70}', flush=True)
+
+        track_results = {}
+
+        for model_info in TOP3_MODELS:
+            print(f'\n 🤖 Model: {model_info["short"]} — {model_info["label"][:50]}', flush=True)
+
+            # Switch to the correct track
+            try:
+                env = switch_track(
+                    target_env_id=track_id,
+                    current_env_id=current_env_id,
+                    verbose=False
+                )
+                current_env_id = track_id
+            except Exception as e:
+                print(f' ❌ Failed to connect to {track_name}: {e}', flush=True)
+                track_results[model_info['short']] = {'error': str(e)}
+                continue
+
+            env = ThrottleClampWrapper(env, throttle_min=0.2)
+            env = SpeedRewardWrapper(env, speed_scale=0.1)
+
+            try:
+                model = PPO.load(model_info['path'], env=env)
+            except Exception as e:
+                print(f' ❌ Failed to load model: {e}', flush=True)
+                # Record the failure so the summary shows ERROR, not "--".
+                track_results[model_info['short']] = {'error': str(e)}
+                env.close()
+                continue
+
+            try:
+                metrics = run_episodes(model, env, episodes, max_steps, track_name)
+                verdict = '✅ DRIVES' if metrics['drove_far'] else '❌ CRASHES'
+                print(f' {verdict} | reward={metrics["mean_reward"]:.0f} | '
+                      f'steps={metrics["mean_steps"]:.0f} | '
+                      f'osc={metrics["oscillation"]:.3f} | '
+                      f'cte={metrics["mean_abs_cte"]:.2f}', flush=True)
+                track_results[model_info['short']] = metrics
+            except Exception as e:
+                print(f' ❌ Evaluation error: {e}', flush=True)
+                track_results[model_info['short']] = {'error': str(e)}
+            finally:
+                env.close()
+                time.sleep(3)
+
+        all_results[track_name] = track_results
+
+        # Save after each track
+        record = {
+            'timestamp': datetime.now().isoformat(),
+            'track': track_name,
+            'track_id': track_id,
+            'trained_on': track['trained_on'],
+            'results': track_results
+        }
+        with open(RESULTS_FILE, 'a', encoding='utf-8') as f:
+            f.write(json.dumps(record) + '\n')
+
+    # Print final summary table
+    shorts = [m['short'] for m in TOP3_MODELS]
+    print('\n\n' + '='*90, flush=True)
+    print('📊 MULTI-TRACK GENERALIZATION RESULTS', flush=True)
+    print('='*90, flush=True)
+    # Columns are derived from TOP3_MODELS so the table follows the model list.
+    header = f'{"Track":<26} {"Trained":^8} | ' + ' | '.join(
+        f'{s + " Steps":>10} {s + " Rwd":>8}' for s in shorts)
+    print(header, flush=True)
+    print('─'*90, flush=True)
+
+    for track in ALL_TRACKS:
+        tname = track['name']
+        trained = '⭐ YES' if track['trained_on'] else 'NO'
+        r = all_results.get(tname, {})
+        row = f'{tname:<26} {trained:^8} |'
+        for short in shorts:
+            m = r.get(short, {})
+            if 'error' in m:
+                row += f' {"ERROR":>10} {"--":>8} |'
+            elif m:
+                steps = m.get('mean_steps', 0)
+                rwd = m.get('mean_reward', 0)
+                flag = '✅' if m.get('drove_far') else '❌'
+                row += f' {flag}{steps:>8.0f} {rwd:>8.0f} |'
+            else:
+                row += f' {"--":>10} {"--":>8} |'
+        print(row, flush=True)
+
+    print('='*90, flush=True)
+    print(f'\nFull results saved to: {RESULTS_FILE}', flush=True)
+    return all_results
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--episodes', type=int, default=3, help='Episodes per track per model')
+    parser.add_argument('--steps', type=int, default=800, help='Max steps per episode')
+    args = parser.parse_args()
+    run_multitrack_eval(episodes=args.episodes, max_steps=args.steps)
diff --git a/docs/RESEARCH_LOG.md b/docs/RESEARCH_LOG.md
index d6acf59..94b1817 100644
--- a/docs/RESEARCH_LOG.md
+++ b/docs/RESEARCH_LOG.md
@@ -414,3 +414,31 @@ Yes! Through targeted reward shaping:
 - Fine-tuning from Phase 2 champion
 
 **Phase 2 Champion:** Trial 20 — n_steer=3, n_throttle=5, lr=0.000225, 13k steps
+
+---
+
+## 2026-04-14 — Track Switching API: exit_scene() Works Automatically
+
+### Finding: Automatic Scene Switching via unwrapped viewer
+
+**Problem:** `gym.make('donkey-generated-track-v0')` ignores the scene name if the simulator already has a scene running — it just uses the current scene.
+
+**Root cause:** The sim only responds to scene selection when it's at the main menu (`scene_selection_ready` state). If a scene is loaded, it sends `need_car_config` instead.
+
+**Fix:** `env.unwrapped.viewer.exit_scene()` sends the exit message through the **established websocket connection**. Raw TCP socket approach failed because the DonkeyCar protocol requires proper framing.
+
+**Working procedure:**
+```python
+temp_env = gym.make(current_scene_env_id)
+temp_env.unwrapped.viewer.exit_scene()  # Sends exit via websocket
+time.sleep(4)  # Wait for sim to reach main menu
+temp_env.unwrapped.viewer.quit()
+env = gym.make(target_env_id)  # Sim now loads correct scene
+```
+
+**Confirmed:** `loading scene generated_road` message appears in logs after switch.
+**Impact:** Fully automated multi-track evaluation and training without user intervention!
+
+---
+
+## 2026-04-14 — PHASE 3 BEGINS: Multi-Track Generalization Evaluation