donkeycar-rl-autoresearch/agent/evaluate_champion.py

"""
Enhanced Champion Evaluator — Phase 3
======================================
Evaluates a model with full metrics:
- Total reward per episode
- Lap time (using sim's last_lap_time)
- Steering oscillation score (std of steering changes)
- Lane position histogram (CTE distribution)
- Path efficiency throughout episode
- Per-step diagnostics: speed, CTE, efficiency, reward, position
Usage:
# Evaluate current champion
python3 evaluate_champion.py
# Evaluate a specific model
python3 evaluate_champion.py --model models/trial-0020/model.zip
# Long run to see lap completion
python3 evaluate_champion.py --episodes 3 --steps 3000
# Compare all top Phase 2 models
python3 evaluate_champion.py --compare
"""
import os
import sys
import time
import json
import math
import numpy as np
from collections import deque
from datetime import datetime
import gymnasium as gym
import gym_donkeycar  # noqa: F401 -- imported for its side effect of registering the donkey-* envs
from stable_baselines3 import PPO
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # make sibling modules importable when run as a script
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from track_switcher import switch_track, AVAILABLE_TRACKS
CHAMPION_DIR = os.path.join(os.path.dirname(__file__), 'models', 'champion')
MANIFEST_PATH = os.path.join(CHAMPION_DIR, 'manifest.json')
EVAL_SUMMARY = os.path.join(os.path.dirname(__file__), 'outerloop-results', 'eval_summary.jsonl')
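
# save_summary() appends one JSON record per evaluated model to EVAL_SUMMARY.
# Illustrative record shape (field values are placeholders, not real results):
#   {"label": "...", "episodes": 3, "mean_reward": 0.0, "std_reward": 0.0,
#    "mean_steps": 0.0, "laps_completed": 0, "lap_times": [], "mean_lap_time": null,
#    "oscillation_score": 0.0, "mean_abs_cte": 0.0, "cte_std": 0.0,
#    "mean_cte_signed": 0.0, "timestamp": "2025-01-01T00:00:00"}
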
# Top Phase 2 models for comparison
PHASE2_MODELS = [
    {
        'label': 'Trial-20 Phase2-CHAMPION (n_steer=3 n_throttle=5 lr=0.000225 13k)',
        'path': 'models/trial-0020/model.zip',
        'style': 'Right lane, stable',
    },
    {
        'label': 'Trial-8 Phase2-2nd (n_steer=4 n_throttle=3 lr=0.00117 34k)',
        'path': 'models/trial-0008/model.zip',
        'style': 'Left/center, oscillating',
    },
    {
        'label': 'Trial-18 Phase2-3rd (n_steer=3 n_throttle=5 lr=0.000288 16k)',
        'path': 'models/trial-0018/model.zip',
        'style': 'Right shoulder, very accurate',
    },
]

def load_manifest():
    if os.path.exists(MANIFEST_PATH):
        with open(MANIFEST_PATH) as f:
            return json.load(f)
    return {}

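# The manifest is used for labelling only; main() reads at least the 'phase'
# and 'trial' keys, e.g. (illustrative values) {"phase": 2, "trial": 20}.
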
def compute_efficiency(pos_history):
    """Path efficiency: net displacement divided by total distance travelled."""
    if len(pos_history) < 3:
        return 1.0
    positions = list(pos_history)
    net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0]))
    total = sum(np.linalg.norm(np.array(positions[i + 1]) - np.array(positions[i]))
                for i in range(len(positions) - 1))
    return float(net / total) if total > 1e-6 else 1.0

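# Quick sanity check for compute_efficiency (illustrative; the numbers follow
# from the formula above, they are not measured results):
#   straight = deque([np.array([0., 0., 0.]), np.array([1., 0., 0.]), np.array([2., 0., 0.])])
#   compute_efficiency(straight)  # -> 1.0 (net 2.0 / total 2.0: no wasted motion)
#   zigzag = deque([np.array([0., 0., 0.]), np.array([1., 0., 0.]), np.array([0., 0., 0.])])
#   compute_efficiency(zigzag)    # -> 0.0 (net 0.0 / total 2.0: pure back-and-forth)
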
def print_banner(label, path):
    print(f'\n{"="*68}', flush=True)
    print(f'🔍 {label}', flush=True)
    print(f' {path}', flush=True)
    print(f'{"="*68}', flush=True)

def run_eval(model, env, episodes, max_steps, label=''):
    """Run evaluation and return full metrics."""
    all_rewards = []
    all_steps = []
    all_lap_times = []
    all_osc_scores = []
    all_cte_distributions = []
    all_completed = []
    for ep in range(1, episodes + 1):
        obs, info = env.reset()
        pos_hist = deque(maxlen=31)  # sliding window for path efficiency
        total_reward = 0.0
        step = 0
        cte_values = []
        steering_actions = []
        laps_completed = 0
        lap_times = []
        print(f'\n--- Episode {ep}/{episodes} ---', flush=True)
        print(f'{"Step":>5} {"Spd":>5} {"CTE":>6} {"Eff%":>5} {"Rwd":>7} {"Tot":>9} {"Laps":>5} {"Px":>7} {"Pz":>7}', flush=True)
        print('-' * 62, flush=True)
        while step < max_steps:
            action, _ = model.predict(obs, deterministic=True)
            result = env.step(action)
            # Support both the gymnasium 5-tuple and the legacy gym 4-tuple step API
            if len(result) == 5:
                obs, reward, terminated, truncated, info = result
                done = terminated or truncated
            else:
                obs, reward, done, info = result
            speed = float(info.get('speed', 0) or 0)
            cte = float(info.get('cte', 0) or 0)
            pos = info.get('pos', (0, 0, 0))
            px = pos[0] if pos else 0
            pz = pos[2] if len(pos) > 2 else 0
            lap_count = int(info.get('lap_count', 0) or 0)
            last_lap_time = float(info.get('last_lap_time', 0) or 0)
            # Track new laps
            if lap_count > laps_completed:
                laps_completed = lap_count
                if last_lap_time > 0:
                    lap_times.append(last_lap_time)
                print(f'\n 🏁 LAP {laps_completed} COMPLETE! Time={last_lap_time:.2f}s', flush=True)
            pos_hist.append(np.array([px, 0., pz]))
            cte_values.append(cte)
            # Track steering for the oscillation score
            try:
                steer = float(action[0]) if hasattr(action, '__len__') else float(action)
                steering_actions.append(steer)
            except (TypeError, IndexError):
                pass
            total_reward += reward
            step += 1
            eff = compute_efficiency(pos_hist)
            if step % 50 == 0 or done:
                print(f'{step:>5} {speed:>5.2f} {cte:>6.2f} {eff * 100:>4.0f}% '
                      f'{reward:>7.3f} {total_reward:>9.1f} {laps_completed:>5} '
                      f'{px:>7.1f} {pz:>7.1f}', flush=True)
            if done:
                print(f'\n Episode {ep} ended after {step} steps | '
                      f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
                break
        if step >= max_steps:
            print(f'\n Episode {ep} reached max {max_steps} steps | '
                  f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
        # Oscillation score: mean absolute steering change per step (lower = smoother)
        if len(steering_actions) > 1:
            deltas = [abs(steering_actions[i] - steering_actions[i - 1])
                      for i in range(1, len(steering_actions))]
            osc_score = float(np.mean(deltas))
        else:
            osc_score = 0.0
        all_rewards.append(total_reward)
        all_steps.append(step)
        all_lap_times.extend(lap_times)
        all_osc_scores.append(osc_score)
        all_cte_distributions.extend(cte_values)
        all_completed.append(laps_completed > 0)
        time.sleep(2)
    # Summary metrics
    summary = {
        'label': label,
        'episodes': episodes,
        'mean_reward': float(np.mean(all_rewards)),
        'std_reward': float(np.std(all_rewards)),
        'mean_steps': float(np.mean(all_steps)),
        'laps_completed': sum(all_completed),  # episodes that completed at least one lap
        'lap_times': all_lap_times,
        'mean_lap_time': float(np.mean(all_lap_times)) if all_lap_times else None,
        'oscillation_score': float(np.mean(all_osc_scores)),  # lower = smoother
        'mean_abs_cte': float(np.mean([abs(c) for c in all_cte_distributions])),
        'cte_std': float(np.std(all_cte_distributions)),
        'mean_cte_signed': float(np.mean(all_cte_distributions)),  # + = left, - = right
        'timestamp': datetime.now().isoformat(),
    }
    return summary, all_rewards

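# Note on the oscillation score above: it is the mean absolute steering change
# per step. Worked example (illustrative): steering actions [0.1, -0.1, 0.1]
# give deltas [0.2, 0.2], so the score is 0.2; constant steering scores 0.0.
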
def print_summary(summary):
    print(f'\n📊 Metrics for: {summary["label"]}', flush=True)
    print(f' Mean reward: {summary["mean_reward"]:.1f} ± {summary["std_reward"]:.1f}', flush=True)
    print(f' Mean steps/ep: {summary["mean_steps"]:.0f}', flush=True)
    print(f' Oscillation score: {summary["oscillation_score"]:.4f} (lower = smoother)', flush=True)
    print(f' Mean |CTE|: {summary["mean_abs_cte"]:.3f} m from centre', flush=True)
    print(f' Mean signed CTE: {summary["mean_cte_signed"]:.3f} m (+ = left, - = right)', flush=True)
    if summary['mean_cte_signed'] < -0.1:
        cte_side = 'RIGHT of centre ➡️'
    elif summary['mean_cte_signed'] > 0.1:
        cte_side = 'LEFT of centre ⬅️'
    else:
        cte_side = 'CENTRED ↕️'
    print(f' Lane position: {cte_side}', flush=True)
    if summary['lap_times']:
        print(f' Lap times: {[f"{t:.1f}s" for t in summary["lap_times"]]}', flush=True)
        print(f' Best lap time: {min(summary["lap_times"]):.1f}s', flush=True)
    print(flush=True)

def save_summary(summary):
    os.makedirs(os.path.dirname(EVAL_SUMMARY), exist_ok=True)
    with open(EVAL_SUMMARY, 'a') as f:
        f.write(json.dumps(summary) + '\n')

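# Reading results back for offline analysis (a minimal sketch, assuming the
# JSONL file exists and every line is a complete record):
#   with open(EVAL_SUMMARY) as f:
#       runs = [json.loads(line) for line in f]
#   best = max(runs, key=lambda r: r['mean_reward'])
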
def main(episodes=3, max_steps=3000, model_override=None, compare=False, env_id='donkey-generated-roads-v0'):
    manifest = load_manifest()
    models_to_eval = []
    if compare:
        for m in PHASE2_MODELS:
            models_to_eval.append((m['label'], m['path']))
    else:
        path = model_override or os.path.join(CHAMPION_DIR, 'model.zip')
        label = model_override or f"Champion (Phase {manifest.get('phase', '?')} Trial {manifest.get('trial', '?')})"
        models_to_eval.append((label, path))
    all_summaries = []
    for label, path in models_to_eval:
        print_banner(f'{label} [env={env_id}]', path)
        print(f'[Eval] Switching sim to {env_id} (will exit current scene first)...', flush=True)
        try:
            # Tell the switcher which scene is currently running so it can connect and exit it
            env = switch_track(target_env_id=env_id,
                               current_env_id=env_id,  # best guess; works even if different
                               verbose=True)
        except Exception as e:
            print(f'[Eval] FAILED to switch track: {e}', flush=True)
            continue
        env = ThrottleClampWrapper(env, throttle_min=0.2)
        env = SpeedRewardWrapper(env, speed_scale=0.1)
        print(f'[Eval] Loading model: {path}', flush=True)
        try:
            model = PPO.load(path, env=env)
            print(f'[Eval] Model loaded. Running {episodes} episodes × {max_steps} steps...', flush=True)
        except Exception as e:
            print(f'[Eval] FAILED to load: {e}', flush=True)
            env.close()
            continue
        summary, _ = run_eval(model, env, episodes, max_steps, label)
        print_summary(summary)
        save_summary(summary)
        all_summaries.append(summary)
        env.close()
        time.sleep(3)
    if compare and len(all_summaries) > 1:
        print('\n' + '=' * 68, flush=True)
        print('🏁 COMPARISON TABLE', flush=True)
        print('=' * 68, flush=True)
        print(f'{"Model":<40} {"Reward":>8} {"Steps":>7} {"Osc":>6} {"CTE":>6} {"Side":>10}', flush=True)
        print('-' * 68, flush=True)
        for s in all_summaries:
            if s['mean_cte_signed'] < -0.1:
                side = '➡️ RIGHT'
            elif s['mean_cte_signed'] > 0.1:
                side = '⬅️ LEFT'
            else:
                side = '↕️ CENTER'
            name = s['label'][:40]
            print(f'{name:<40} {s["mean_reward"]:>8.0f} {s["mean_steps"]:>7.0f} '
                  f'{s["oscillation_score"]:>6.3f} {s["mean_abs_cte"]:>6.2f} {side:>10}', flush=True)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Evaluate DonkeyCar RL model with full metrics.')
    parser.add_argument('--episodes', type=int, default=3)
    parser.add_argument('--steps', type=int, default=3000)
    parser.add_argument('--model', type=str, default=None, help='Override model path')
    parser.add_argument('--compare', action='store_true', help='Compare all top Phase 2 models')
    parser.add_argument('--env', type=str, default='donkey-generated-roads-v0',
                        help='Gym environment ID (default: donkey-generated-roads-v0)')
    args = parser.parse_args()
    main(episodes=args.episodes, max_steps=args.steps, model_override=args.model,
         compare=args.compare, env_id=args.env)