""" Enhanced Champion Evaluator — Phase 3 ====================================== Evaluates a model with full metrics: - Total reward per episode - Lap time (using sim's last_lap_time) - Steering oscillation score (std of steering changes) - Lane position histogram (CTE distribution) - Path efficiency throughout episode - Per-step diagnostics: speed, CTE, efficiency, reward, position Usage: # Evaluate current champion python3 evaluate_champion.py # Evaluate a specific model python3 evaluate_champion.py --model models/trial-0020/model.zip # Long run to see lap completion python3 evaluate_champion.py --episodes 3 --steps 3000 # Compare all top Phase 2 models python3 evaluate_champion.py --compare """ import os import sys import time import json import math import numpy as np from collections import deque from datetime import datetime import gymnasium as gym import gym_donkeycar from stable_baselines3 import PPO sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from donkeycar_sb3_runner import ThrottleClampWrapper from reward_wrapper import SpeedRewardWrapper CHAMPION_DIR = os.path.join(os.path.dirname(__file__), 'models', 'champion') MANIFEST_PATH = os.path.join(CHAMPION_DIR, 'manifest.json') EVAL_SUMMARY = os.path.join(os.path.dirname(__file__), 'outerloop-results', 'eval_summary.jsonl') # Top Phase 2 models for comparison PHASE2_MODELS = [ { 'label': 'Trial-20 Phase2-CHAMPION (n_steer=3 n_throttle=5 lr=0.000225 13k)', 'path': 'models/trial-0020/model.zip', 'style': 'Right lane, stable', }, { 'label': 'Trial-8 Phase2-2nd (n_steer=4 n_throttle=3 lr=0.00117 34k)', 'path': 'models/trial-0008/model.zip', 'style': 'Left/center, oscillating', }, { 'label': 'Trial-18 Phase2-3rd (n_steer=3 n_throttle=5 lr=0.000288 16k)', 'path': 'models/trial-0018/model.zip', 'style': 'Right shoulder, very accurate', }, ] def load_manifest(): if os.path.exists(MANIFEST_PATH): with open(MANIFEST_PATH) as f: return json.load(f) return {} def compute_efficiency(pos_history): if len(pos_history) < 3: return 1.0 positions = list(pos_history) net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0])) total = sum(np.linalg.norm(np.array(positions[i+1]) - np.array(positions[i])) for i in range(len(positions)-1)) return float(net / total) if total > 1e-6 else 1.0 def print_banner(label, path): print(f'\n{"="*68}', flush=True) print(f'šŸ” {label}', flush=True) print(f' {path}', flush=True) print(f'{"="*68}', flush=True) def run_eval(model, env, episodes, max_steps, label=''): """Run evaluation and return full metrics.""" all_rewards = [] all_steps = [] all_lap_times = [] all_osc_scores = [] all_cte_distributions = [] all_completed = [] for ep in range(1, episodes + 1): obs, info = env.reset() pos_hist = deque(maxlen=31) total_reward = 0.0 step = 0 cte_values = [] steering_actions = [] laps_completed = 0 lap_times = [] print(f'\n--- Episode {ep}/{episodes} ---', flush=True) print(f'{"Step":>5} {"Spd":>5} {"CTE":>6} {"Eff%":>5} {"Rwd":>7} {"Tot":>9} {"Laps":>5} {"Px":>7} {"Pz":>7}', flush=True) print('-' * 62, flush=True) while step < max_steps: action, _ = model.predict(obs, deterministic=True) result = env.step(action) if len(result) == 5: obs, reward, terminated, truncated, info = result done = terminated or truncated else: obs, reward, done, info = result speed = float(info.get('speed', 0) or 0) cte = float(info.get('cte', 0) or 0) pos = info.get('pos', (0, 0, 0)) px = pos[0] if pos else 0 pz = pos[2] if len(pos) > 2 else 0 lap_count = int(info.get('lap_count', 0) or 0) last_lap_time = float(info.get('last_lap_time', 0) or 0) # Track new laps if lap_count > laps_completed: laps_completed = lap_count if last_lap_time > 0: lap_times.append(last_lap_time) print(f'\n šŸ LAP {laps_completed} COMPLETE! Time={last_lap_time:.2f}s', flush=True) pos_hist.append(np.array([px, 0., pz])) cte_values.append(cte) # Track steering for oscillation score try: steer = float(action[0]) if hasattr(action, '__len__') else float(action) steering_actions.append(steer) except (TypeError, IndexError): pass total_reward += reward step += 1 eff = compute_efficiency(pos_hist) if step % 50 == 0 or done: print(f'{step:>5} {speed:>5.2f} {cte:>6.2f} {eff*100:>4.0f}% ' f'{reward:>7.3f} {total_reward:>9.1f} {laps_completed:>5} ' f'{px:>7.1f} {pz:>7.1f}', flush=True) if done: print(f'\n Episode {ep} ended after {step} steps | ' f'total={total_reward:.1f} | laps={laps_completed}', flush=True) break if step >= max_steps: print(f'\n Episode {ep} reached max {max_steps} steps | ' f'total={total_reward:.1f} | laps={laps_completed}', flush=True) # Compute oscillation score if len(steering_actions) > 1: deltas = [abs(steering_actions[i] - steering_actions[i-1]) for i in range(1, len(steering_actions))] osc_score = float(np.mean(deltas)) else: osc_score = 0.0 all_rewards.append(total_reward) all_steps.append(step) all_lap_times.extend(lap_times) all_osc_scores.append(osc_score) all_cte_distributions.extend(cte_values) all_completed.append(laps_completed > 0) time.sleep(2) # Summary metrics summary = { 'label': label, 'episodes': episodes, 'mean_reward': float(np.mean(all_rewards)), 'std_reward': float(np.std(all_rewards)), 'mean_steps': float(np.mean(all_steps)), 'laps_completed': sum(1 for r in all_rewards if r > 500), # proxy for completion 'lap_times': all_lap_times, 'mean_lap_time': float(np.mean(all_lap_times)) if all_lap_times else None, 'oscillation_score': float(np.mean(all_osc_scores)), # lower = smoother 'mean_abs_cte': float(np.mean([abs(c) for c in all_cte_distributions])), 'cte_std': float(np.std(all_cte_distributions)), 'mean_cte_signed': float(np.mean(all_cte_distributions)), # + = left, - = right 'timestamp': datetime.now().isoformat(), } return summary, all_rewards def print_summary(summary): print(f'\nšŸ“Š Metrics for: {summary["label"]}', flush=True) print(f' Mean reward: {summary["mean_reward"]:.1f} ± {summary["std_reward"]:.1f}', flush=True) print(f' Mean steps/ep: {summary["mean_steps"]:.0f}', flush=True) print(f' Oscillation score: {summary["oscillation_score"]:.4f} (lower=smoother)', flush=True) print(f' Mean |CTE|: {summary["mean_abs_cte"]:.3f} m from centre', flush=True) print(f' Mean signed CTE: {summary["mean_cte_signed"]:.3f} m (+ =left, - =right)', flush=True) cte_side = 'RIGHT of centre āž”ļø' if summary['mean_cte_signed'] < -0.1 else \ 'LEFT of centre ā¬…ļø' if summary['mean_cte_signed'] > 0.1 else 'CENTRED ā†•ļø' print(f' Lane position: {cte_side}', flush=True) if summary['lap_times']: print(f' Lap times: {[f"{t:.1f}s" for t in summary["lap_times"]]}', flush=True) print(f' Best lap time: {min(summary["lap_times"]):.1f}s', flush=True) print(flush=True) def save_summary(summary): os.makedirs(os.path.dirname(EVAL_SUMMARY), exist_ok=True) with open(EVAL_SUMMARY, 'a') as f: f.write(json.dumps(summary) + '\n') def main(episodes=3, max_steps=3000, model_override=None, compare=False): manifest = load_manifest() models_to_eval = [] if compare: for m in PHASE2_MODELS: models_to_eval.append((m['label'], m['path'])) else: path = model_override or CHAMPION_DIR + '/model.zip' label = model_override or f"Champion (Phase {manifest.get('phase', '?')} Trial {manifest.get('trial', '?')})" models_to_eval.append((label, path)) all_summaries = [] for label, path in models_to_eval: print_banner(label, path) print(f'[Eval] Connecting to simulator...', flush=True) try: env = gym.make('donkey-generated-roads-v0') except Exception as e: print(f'[Eval] FAILED: {e}', flush=True) sys.exit(1) env = ThrottleClampWrapper(env, throttle_min=0.2) env = SpeedRewardWrapper(env, speed_scale=0.1) print(f'[Eval] Loading model: {path}', flush=True) try: model = PPO.load(path, env=env) print(f'[Eval] Model loaded. Running {episodes} episodes Ɨ {max_steps} steps...', flush=True) except Exception as e: print(f'[Eval] FAILED to load: {e}', flush=True) env.close() continue summary, rewards = run_eval(model, env, episodes, max_steps, label) print_summary(summary) save_summary(summary) all_summaries.append(summary) env.close() time.sleep(3) if compare and len(all_summaries) > 1: print('\n' + '=' * 68, flush=True) print('šŸ COMPARISON TABLE', flush=True) print('=' * 68, flush=True) print(f'{"Model":<40} {"Reward":>8} {"Steps":>7} {"Osc":>6} {"CTE":>6} {"Side":>10}', flush=True) print('-' * 68, flush=True) for s in all_summaries: side = 'āž”ļø RIGHT' if s['mean_cte_signed'] < -0.1 else \ 'ā¬…ļø LEFT' if s['mean_cte_signed'] > 0.1 else 'ā†•ļø CENTER' name = s['label'][:40] print(f'{name:<40} {s["mean_reward"]:>8.0f} {s["mean_steps"]:>7.0f} ' f'{s["oscillation_score"]:>6.3f} {s["mean_abs_cte"]:>6.2f} {side:>10}', flush=True) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Evaluate DonkeyCar RL model with full metrics.') parser.add_argument('--episodes', type=int, default=3) parser.add_argument('--steps', type=int, default=3000) parser.add_argument('--model', type=str, default=None, help='Override model path') parser.add_argument('--compare', action='store_true', help='Compare all top Phase 2 models') args = parser.parse_args() main(episodes=args.episodes, max_steps=args.steps, model_override=args.model, compare=args.compare)