donkeycar-rl-autoresearch/agent/evaluate_champion.py

"""
Enhanced Champion Evaluator — Phase 3
======================================
Evaluates a model with full metrics:
- Total reward per episode
- Lap time (using sim's last_lap_time)
- Steering oscillation score (std of steering changes)
- Lane position histogram (CTE distribution)
- Path efficiency throughout episode
- Per-step diagnostics: speed, CTE, efficiency, reward, position
Usage:
# Evaluate current champion
python3 evaluate_champion.py
# Evaluate a specific model
python3 evaluate_champion.py --model models/trial-0020/model.zip
# Long run to see lap completion
python3 evaluate_champion.py --episodes 3 --steps 3000
# Compare all top Phase 2 models
python3 evaluate_champion.py --compare
"""
import os
import sys
import time
import json
import math
import numpy as np
from collections import deque
from datetime import datetime
import gymnasium as gym
import gym_donkeycar  # noqa: F401 -- imported for its side effect of registering the donkey-* envs
from stable_baselines3 import PPO
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # make sibling modules importable when run as a script
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from track_switcher import switch_track, AVAILABLE_TRACKS
CHAMPION_DIR = os.path.join(os.path.dirname(__file__), 'models', 'champion')
MANIFEST_PATH = os.path.join(CHAMPION_DIR, 'manifest.json')
EVAL_SUMMARY = os.path.join(os.path.dirname(__file__), 'outerloop-results', 'eval_summary.jsonl')
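
# save_summary() appends one JSON record per evaluated model to EVAL_SUMMARY.
# Illustrative record shape (field values are placeholders, not real results):
#   {"label": "...", "episodes": 3, "mean_reward": 0.0, "std_reward": 0.0,
#    "mean_steps": 0.0, "laps_completed": 0, "lap_times": [], "mean_lap_time": null,
#    "oscillation_score": 0.0, "mean_abs_cte": 0.0, "cte_std": 0.0,
#    "mean_cte_signed": 0.0, "timestamp": "2025-01-01T00:00:00"}
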
# Top Phase 2 models for comparison
PHASE2_MODELS = [
    {
        'label': 'Trial-20 Phase2-CHAMPION (n_steer=3 n_throttle=5 lr=0.000225 13k)',
        'path': 'models/trial-0020/model.zip',
        'style': 'Right lane, stable',
    },
    {
        'label': 'Trial-8 Phase2-2nd (n_steer=4 n_throttle=3 lr=0.00117 34k)',
        'path': 'models/trial-0008/model.zip',
        'style': 'Left/center, oscillating',
    },
    {
        'label': 'Trial-18 Phase2-3rd (n_steer=3 n_throttle=5 lr=0.000288 16k)',
        'path': 'models/trial-0018/model.zip',
        'style': 'Right shoulder, very accurate',
    },
]

def load_manifest():
    if os.path.exists(MANIFEST_PATH):
        with open(MANIFEST_PATH) as f:
            return json.load(f)
    return {}

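# The manifest is used for labelling only; main() reads at least the 'phase'
# and 'trial' keys, e.g. (illustrative values) {"phase": 2, "trial": 20}.
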
def compute_efficiency(pos_history):
    """Path efficiency: net displacement divided by total distance travelled."""
    if len(pos_history) < 3:
        return 1.0
    positions = list(pos_history)
    net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0]))
    total = sum(np.linalg.norm(np.array(positions[i + 1]) - np.array(positions[i]))
                for i in range(len(positions) - 1))
    return float(net / total) if total > 1e-6 else 1.0

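# Quick sanity check for compute_efficiency (illustrative; the numbers follow
# from the formula above, they are not measured results):
#   straight = deque([np.array([0., 0., 0.]), np.array([1., 0., 0.]), np.array([2., 0., 0.])])
#   compute_efficiency(straight)  # -> 1.0 (net 2.0 / total 2.0: no wasted motion)
#   zigzag = deque([np.array([0., 0., 0.]), np.array([1., 0., 0.]), np.array([0., 0., 0.])])
#   compute_efficiency(zigzag)    # -> 0.0 (net 0.0 / total 2.0: pure back-and-forth)
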
def print_banner(label, path):
    print(f'\n{"="*68}', flush=True)
    print(f'🔍 {label}', flush=True)
    print(f' {path}', flush=True)
    print(f'{"="*68}', flush=True)

def run_eval(model, env, episodes, max_steps, label=''):
    """Run evaluation and return full metrics."""
    all_rewards = []
    all_steps = []
    all_lap_times = []
    all_osc_scores = []
    all_cte_distributions = []
    all_completed = []
    for ep in range(1, episodes + 1):
        obs, info = env.reset()
        pos_hist = deque(maxlen=31)  # sliding window for path efficiency
        total_reward = 0.0
        step = 0
        cte_values = []
        steering_actions = []
        laps_completed = 0
        lap_times = []
        print(f'\n--- Episode {ep}/{episodes} ---', flush=True)
        print(f'{"Step":>5} {"Spd":>5} {"CTE":>6} {"Eff%":>5} {"Rwd":>7} {"Tot":>9} {"Laps":>5} {"Px":>7} {"Pz":>7}', flush=True)
        print('-' * 62, flush=True)
        while step < max_steps:
            action, _ = model.predict(obs, deterministic=True)
            result = env.step(action)
            # Support both the gymnasium 5-tuple and the legacy gym 4-tuple step API
            if len(result) == 5:
                obs, reward, terminated, truncated, info = result
                done = terminated or truncated
            else:
                obs, reward, done, info = result
            speed = float(info.get('speed', 0) or 0)
            cte = float(info.get('cte', 0) or 0)
            pos = info.get('pos', (0, 0, 0))
            px = pos[0] if pos else 0
            pz = pos[2] if len(pos) > 2 else 0
            lap_count = int(info.get('lap_count', 0) or 0)
            last_lap_time = float(info.get('last_lap_time', 0) or 0)
            # Track new laps
            if lap_count > laps_completed:
                laps_completed = lap_count
                if last_lap_time > 0:
                    lap_times.append(last_lap_time)
                print(f'\n 🏁 LAP {laps_completed} COMPLETE! Time={last_lap_time:.2f}s', flush=True)
            pos_hist.append(np.array([px, 0., pz]))
            cte_values.append(cte)
            # Track steering for the oscillation score
            try:
                steer = float(action[0]) if hasattr(action, '__len__') else float(action)
                steering_actions.append(steer)
            except (TypeError, IndexError):
                pass
            total_reward += reward
            step += 1
            eff = compute_efficiency(pos_hist)
            if step % 50 == 0 or done:
                print(f'{step:>5} {speed:>5.2f} {cte:>6.2f} {eff * 100:>4.0f}% '
                      f'{reward:>7.3f} {total_reward:>9.1f} {laps_completed:>5} '
                      f'{px:>7.1f} {pz:>7.1f}', flush=True)
            if done:
                print(f'\n Episode {ep} ended after {step} steps | '
                      f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
                break
        if step >= max_steps:
            print(f'\n Episode {ep} reached max {max_steps} steps | '
                  f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
        # Oscillation score: mean absolute steering change per step (lower = smoother)
        if len(steering_actions) > 1:
            deltas = [abs(steering_actions[i] - steering_actions[i - 1])
                      for i in range(1, len(steering_actions))]
            osc_score = float(np.mean(deltas))
        else:
            osc_score = 0.0
        all_rewards.append(total_reward)
        all_steps.append(step)
        all_lap_times.extend(lap_times)
        all_osc_scores.append(osc_score)
        all_cte_distributions.extend(cte_values)
        all_completed.append(laps_completed > 0)
        time.sleep(2)
    # Summary metrics
    summary = {
        'label': label,
        'episodes': episodes,
        'mean_reward': float(np.mean(all_rewards)),
        'std_reward': float(np.std(all_rewards)),
        'mean_steps': float(np.mean(all_steps)),
        'laps_completed': sum(all_completed),  # episodes that completed at least one lap
        'lap_times': all_lap_times,
        'mean_lap_time': float(np.mean(all_lap_times)) if all_lap_times else None,
        'oscillation_score': float(np.mean(all_osc_scores)),  # lower = smoother
        'mean_abs_cte': float(np.mean([abs(c) for c in all_cte_distributions])),
        'cte_std': float(np.std(all_cte_distributions)),
        'mean_cte_signed': float(np.mean(all_cte_distributions)),  # + = left, - = right
        'timestamp': datetime.now().isoformat(),
    }
    return summary, all_rewards

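# Note on the oscillation score above: it is the mean absolute steering change
# per step. Worked example (illustrative): steering actions [0.1, -0.1, 0.1]
# give deltas [0.2, 0.2], so the score is 0.2; constant steering scores 0.0.
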
def print_summary(summary):
    print(f'\n📊 Metrics for: {summary["label"]}', flush=True)
    print(f' Mean reward: {summary["mean_reward"]:.1f} ± {summary["std_reward"]:.1f}', flush=True)
    print(f' Mean steps/ep: {summary["mean_steps"]:.0f}', flush=True)
    print(f' Oscillation score: {summary["oscillation_score"]:.4f} (lower = smoother)', flush=True)
    print(f' Mean |CTE|: {summary["mean_abs_cte"]:.3f} m from centre', flush=True)
    print(f' Mean signed CTE: {summary["mean_cte_signed"]:.3f} m (+ = left, - = right)', flush=True)
    if summary['mean_cte_signed'] < -0.1:
        cte_side = 'RIGHT of centre ➡️'
    elif summary['mean_cte_signed'] > 0.1:
        cte_side = 'LEFT of centre ⬅️'
    else:
        cte_side = 'CENTRED ↕️'
    print(f' Lane position: {cte_side}', flush=True)
    if summary['lap_times']:
        print(f' Lap times: {[f"{t:.1f}s" for t in summary["lap_times"]]}', flush=True)
        print(f' Best lap time: {min(summary["lap_times"]):.1f}s', flush=True)
    print(flush=True)

def save_summary(summary):
    os.makedirs(os.path.dirname(EVAL_SUMMARY), exist_ok=True)
    with open(EVAL_SUMMARY, 'a') as f:
        f.write(json.dumps(summary) + '\n')

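# Reading results back for offline analysis (a minimal sketch, assuming the
# JSONL file exists and every line is a complete record):
#   with open(EVAL_SUMMARY) as f:
#       runs = [json.loads(line) for line in f]
#   best = max(runs, key=lambda r: r['mean_reward'])
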
def main(episodes=3, max_steps=3000, model_override=None, compare=False, env_id='donkey-generated-roads-v0'):
    manifest = load_manifest()
    models_to_eval = []
    if compare:
        for m in PHASE2_MODELS:
            models_to_eval.append((m['label'], m['path']))
    else:
        path = model_override or os.path.join(CHAMPION_DIR, 'model.zip')
        label = model_override or f"Champion (Phase {manifest.get('phase', '?')} Trial {manifest.get('trial', '?')})"
        models_to_eval.append((label, path))
    all_summaries = []
    for label, path in models_to_eval:
        print_banner(f'{label} [env={env_id}]', path)
        print(f'[Eval] Switching sim to {env_id} (will exit current scene first)...', flush=True)
        try:
            # Tell the switcher which scene is currently running so it can connect and exit it
            env = switch_track(target_env_id=env_id,
                               current_env_id=env_id,  # best guess; works even if different
                               verbose=True)
        except Exception as e:
            print(f'[Eval] FAILED to switch track: {e}', flush=True)
            continue
        env = ThrottleClampWrapper(env, throttle_min=0.2)
        env = SpeedRewardWrapper(env, speed_scale=0.1)
        print(f'[Eval] Loading model: {path}', flush=True)
        try:
            model = PPO.load(path, env=env)
            print(f'[Eval] Model loaded. Running {episodes} episodes × {max_steps} steps...', flush=True)
        except Exception as e:
            print(f'[Eval] FAILED to load: {e}', flush=True)
            env.close()
            continue
        summary, _ = run_eval(model, env, episodes, max_steps, label)
        print_summary(summary)
        save_summary(summary)
        all_summaries.append(summary)
        env.close()
        time.sleep(3)
    if compare and len(all_summaries) > 1:
        print('\n' + '=' * 68, flush=True)
        print('🏁 COMPARISON TABLE', flush=True)
        print('=' * 68, flush=True)
        print(f'{"Model":<40} {"Reward":>8} {"Steps":>7} {"Osc":>6} {"CTE":>6} {"Side":>10}', flush=True)
        print('-' * 68, flush=True)
        for s in all_summaries:
            if s['mean_cte_signed'] < -0.1:
                side = '➡️ RIGHT'
            elif s['mean_cte_signed'] > 0.1:
                side = '⬅️ LEFT'
            else:
                side = '↕️ CENTER'
            name = s['label'][:40]
            print(f'{name:<40} {s["mean_reward"]:>8.0f} {s["mean_steps"]:>7.0f} '
                  f'{s["oscillation_score"]:>6.3f} {s["mean_abs_cte"]:>6.2f} {side:>10}', flush=True)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Evaluate DonkeyCar RL model with full metrics.')
    parser.add_argument('--episodes', type=int, default=3)
    parser.add_argument('--steps', type=int, default=3000)
    parser.add_argument('--model', type=str, default=None, help='Override model path')
    parser.add_argument('--compare', action='store_true', help='Compare all top Phase 2 models')
    parser.add_argument('--env', type=str, default='donkey-generated-roads-v0',
                        help='Gym environment ID (default: donkey-generated-roads-v0)')
    args = parser.parse_args()
    main(episodes=args.episodes, max_steps=args.steps, model_override=args.model,
         compare=args.compare, env_id=args.env)