""" Multi-Track Generalization Evaluation ===================================== Tests all top Phase 2 models against every available DonkeyCar track. Uses automatic track switching (exit_scene → reconnect). Results saved to: outerloop-results/multitrack_results.jsonl Summary table printed at the end. Usage: python3 multitrack_eval.py [--episodes N] [--steps N] """ import os, sys, time, json, numpy as np from datetime import datetime from collections import deque import gymnasium as gym import gym_donkeycar from stable_baselines3 import PPO sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from donkeycar_sb3_runner import ThrottleClampWrapper from reward_wrapper import SpeedRewardWrapper from track_switcher import switch_track RESULTS_DIR = os.path.join(os.path.dirname(__file__), 'outerloop-results') RESULTS_FILE = os.path.join(RESULTS_DIR, 'multitrack_results.jsonl') # All available tracks ALL_TRACKS = [ {'id': 'donkey-generated-roads-v0', 'name': 'Generated Road', 'trained_on': True}, {'id': 'donkey-generated-track-v0', 'name': 'Generated Track', 'trained_on': False}, {'id': 'donkey-mountain-track-v0', 'name': 'Mountain Track', 'trained_on': False}, {'id': 'donkey-warehouse-v0', 'name': 'Warehouse', 'trained_on': False}, {'id': 'donkey-avc-sparkfun-v0', 'name': 'AVC Sparkfun', 'trained_on': False}, {'id': 'donkey-minimonaco-track-v0', 'name': 'Mini Monaco', 'trained_on': False}, {'id': 'donkey-warren-track-v0', 'name': 'Warren', 'trained_on': False}, {'id': 'donkey-roboracingleague-track-v0', 'name': 'Robo Racing League', 'trained_on': False}, {'id': 'donkey-waveshare-v0', 'name': 'Waveshare', 'trained_on': False}, {'id': 'donkey-thunderhill-track-v0', 'name': 'Thunderhill', 'trained_on': False}, {'id': 'donkey-circuit-launch-track-v0', 'name': 'Circuit Launch', 'trained_on': False}, ] TOP3_MODELS = [ {'label': 'Trial-20 (n_steer=3 n_thr=5 lr=0.000225 13k)', 'path': 'models/trial-0020/model.zip', 'short': 'T20'}, {'label': 'Trial-8 (n_steer=4 n_thr=3 lr=0.00117 34k)', 'path': 'models/trial-0008/model.zip', 'short': 'T08'}, {'label': 'Trial-18 (n_steer=3 n_thr=5 lr=0.000288 16k)', 'path': 'models/trial-0018/model.zip', 'short': 'T18'}, ] def compute_efficiency(pos_history): if len(pos_history) < 3: return 1.0 positions = list(pos_history) net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0])) total = sum(np.linalg.norm(np.array(positions[i+1]) - np.array(positions[i])) for i in range(len(positions)-1)) return float(net / total) if total > 1e-6 else 1.0 def run_episodes(model, env, episodes, max_steps, track_name): """Run evaluation episodes and return metrics.""" all_rewards, all_steps, all_cte, all_steer = [], [], [], [] last_action = None for ep in range(1, episodes + 1): obs, info = env.reset() pos_hist = deque(maxlen=31) total_reward, step = 0.0, 0 cte_vals, steer_vals = [], [] while step < max_steps: action, _ = model.predict(obs, deterministic=True) result = env.step(action) if len(result) == 5: obs, reward, terminated, truncated, info = result done = terminated or truncated else: obs, reward, done, info = result cte = float(info.get('cte', 0) or 0) pos = info.get('pos', (0, 0, 0)) px = pos[0] if pos else 0 pz = pos[2] if len(pos) > 2 else 0 pos_hist.append(np.array([px, 0., pz])) try: steer = float(action[0]) if hasattr(action, '__len__') else float(action) steer_vals.append(steer) if last_action is not None: prev = float(last_action[0]) if hasattr(last_action, '__len__') else float(last_action) except Exception: pass last_action = action cte_vals.append(cte) total_reward += reward step += 1 if done: break all_rewards.append(total_reward) all_steps.append(step) all_cte.extend(cte_vals) all_steer.extend(steer_vals) time.sleep(1) # Oscillation score if len(all_steer) > 1: deltas = [abs(all_steer[i] - all_steer[i-1]) for i in range(1, len(all_steer))] osc = float(np.mean(deltas)) else: osc = 0.0 return { 'mean_reward': float(np.mean(all_rewards)), 'std_reward': float(np.std(all_rewards)), 'mean_steps': float(np.mean(all_steps)), 'oscillation': osc, 'mean_abs_cte': float(np.mean([abs(c) for c in all_cte])) if all_cte else 0, 'mean_signed_cte': float(np.mean(all_cte)) if all_cte else 0, 'drove_far': float(np.mean(all_steps)) > 200, # survived more than 200 steps avg } def run_multitrack_eval(episodes=3, max_steps=1000): os.makedirs(RESULTS_DIR, exist_ok=True) print('\n' + '='*70, flush=True) print('🌍 MULTI-TRACK GENERALIZATION EVALUATION', flush=True) print(f' Models: {len(TOP3_MODELS)} | Tracks: {len(ALL_TRACKS)} | Episodes: {episodes} | Max steps: {max_steps}', flush=True) print('='*70, flush=True) all_results = {} current_env_id = 'donkey-generated-roads-v0' # assume starting here for track in ALL_TRACKS: track_id = track['id'] track_name = track['name'] trained = '⭐ TRAINED' if track['trained_on'] else '🆕 UNSEEN' print(f'\n{"─"*70}', flush=True) print(f'📍 Track: {track_name} {trained}', flush=True) print(f' Env: {track_id}', flush=True) print(f'{"─"*70}', flush=True) track_results = {} for model_info in TOP3_MODELS: print(f'\n 🤖 Model: {model_info["short"]} — {model_info["label"][:50]}', flush=True) # Switch to the correct track try: env = switch_track( target_env_id=track_id, current_env_id=current_env_id, verbose=False ) current_env_id = track_id except Exception as e: print(f' ❌ Failed to connect to {track_name}: {e}', flush=True) track_results[model_info['short']] = {'error': str(e)} continue env = ThrottleClampWrapper(env, throttle_min=0.2) env = SpeedRewardWrapper(env, speed_scale=0.1) try: model = PPO.load(model_info['path'], env=env) except Exception as e: print(f' ❌ Failed to load model: {e}', flush=True) env.close() continue try: metrics = run_episodes(model, env, episodes, max_steps, track_name) verdict = '✅ DRIVES' if metrics['drove_far'] else '❌ CRASHES' print(f' {verdict} | reward={metrics["mean_reward"]:.0f} | ' f'steps={metrics["mean_steps"]:.0f} | ' f'osc={metrics["oscillation"]:.3f} | ' f'cte={metrics["mean_abs_cte"]:.2f}', flush=True) track_results[model_info['short']] = metrics except Exception as e: print(f' ❌ Evaluation error: {e}', flush=True) track_results[model_info['short']] = {'error': str(e)} finally: env.close() time.sleep(3) all_results[track_name] = track_results # Save after each track record = { 'timestamp': datetime.now().isoformat(), 'track': track_name, 'track_id': track_id, 'trained_on': track['trained_on'], 'results': track_results } with open(RESULTS_FILE, 'a') as f: f.write(json.dumps(record) + '\n') # Print final summary table print('\n\n' + '='*90, flush=True) print('📊 MULTI-TRACK GENERALIZATION RESULTS', flush=True) print('='*90, flush=True) header = f'{"Track":<26} {"Trained":^8} | {"T20 Steps":>10} {"T20 Rwd":>8} | {"T08 Steps":>10} {"T08 Rwd":>8} | {"T18 Steps":>10} {"T18 Rwd":>8}' print(header, flush=True) print('─'*90, flush=True) for track in ALL_TRACKS: tname = track['name'] trained = '⭐ YES' if track['trained_on'] else 'NO' r = all_results.get(tname, {}) row = f'{tname:<26} {trained:^8} |' for short in ['T20', 'T08', 'T18']: m = r.get(short, {}) if 'error' in m: row += f' {"ERROR":>10} {"--":>8} |' elif m: steps = m.get('mean_steps', 0) rwd = m.get('mean_reward', 0) flag = '✅' if m.get('drove_far') else '❌' row += f' {flag}{steps:>8.0f} {rwd:>8.0f} |' else: row += f' {"--":>10} {"--":>8} |' print(row, flush=True) print('='*90, flush=True) print(f'\nFull results saved to: {RESULTS_FILE}', flush=True) return all_results if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('--episodes', type=int, default=3, help='Episodes per track per model') parser.add_argument('--steps', type=int, default=800, help='Max steps per episode') args = parser.parse_args() run_multitrack_eval(episodes=args.episodes, max_steps=args.steps)