"""
Enhanced Champion Evaluator — Phase 3
======================================

Evaluates a model with full metrics:
- Total reward per episode
- Lap time (using sim's last_lap_time)
- Steering oscillation score (mean absolute steering change per step)
- Lane position histogram (CTE distribution)
- Path efficiency throughout episode
- Per-step diagnostics: speed, CTE, efficiency, reward, position

Usage:
    # Evaluate current champion
    python3 evaluate_champion.py

    # Evaluate a specific model
    python3 evaluate_champion.py --model models/trial-0020/model.zip

    # Long run to see lap completion
    python3 evaluate_champion.py --episodes 3 --steps 3000

    # Compare all top Phase 2 models
    python3 evaluate_champion.py --compare
"""

import os
import sys
import time
import json
import math
import numpy as np
from collections import deque
from datetime import datetime

import gymnasium as gym
import gym_donkeycar
from stable_baselines3 import PPO

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper

CHAMPION_DIR = os.path.join(os.path.dirname(__file__), 'models', 'champion')
|
||
MANIFEST_PATH = os.path.join(CHAMPION_DIR, 'manifest.json')
|
||
EVAL_SUMMARY = os.path.join(os.path.dirname(__file__), 'outerloop-results', 'eval_summary.jsonl')
|
||
|
||
# Top Phase 2 models for comparison
|
||
PHASE2_MODELS = [
|
||
{
|
||
'label': 'Trial-20 Phase2-CHAMPION (n_steer=3 n_throttle=5 lr=0.000225 13k)',
|
||
'path': 'models/trial-0020/model.zip',
|
||
'style': 'Right lane, stable',
|
||
},
|
||
{
|
||
'label': 'Trial-8 Phase2-2nd (n_steer=4 n_throttle=3 lr=0.00117 34k)',
|
||
'path': 'models/trial-0008/model.zip',
|
||
'style': 'Left/center, oscillating',
|
||
},
|
||
{
|
||
'label': 'Trial-18 Phase2-3rd (n_steer=3 n_throttle=5 lr=0.000288 16k)',
|
||
'path': 'models/trial-0018/model.zip',
|
||
'style': 'Right shoulder, very accurate',
|
||
},
|
||
]
|
||
|
||
|
||
def load_manifest():
|
||
if os.path.exists(MANIFEST_PATH):
|
||
with open(MANIFEST_PATH) as f:
|
||
return json.load(f)
|
||
return {}
|
||
|
||
|
||
def compute_efficiency(pos_history):
|
||
if len(pos_history) < 3:
|
||
return 1.0
|
||
positions = list(pos_history)
|
||
net = np.linalg.norm(np.array(positions[-1]) - np.array(positions[0]))
|
||
total = sum(np.linalg.norm(np.array(positions[i+1]) - np.array(positions[i]))
|
||
for i in range(len(positions)-1))
|
||
return float(net / total) if total > 1e-6 else 1.0
|
||
|
||
|
||
def print_banner(label, path):
|
||
print(f'\n{"="*68}', flush=True)
|
||
print(f'🔍 {label}', flush=True)
|
||
print(f' {path}', flush=True)
|
||
print(f'{"="*68}', flush=True)
|
||
|
||
|
||
def run_eval(model, env, episodes, max_steps, label=''):
|
||
"""Run evaluation and return full metrics."""
|
||
all_rewards = []
|
||
all_steps = []
|
||
all_lap_times = []
|
||
all_osc_scores = []
|
||
all_cte_distributions = []
|
||
all_completed = []
|
||
|
||
for ep in range(1, episodes + 1):
|
||
obs, info = env.reset()
|
||
pos_hist = deque(maxlen=31)
|
||
total_reward = 0.0
|
||
step = 0
|
||
cte_values = []
|
||
steering_actions = []
|
||
laps_completed = 0
|
||
lap_times = []
|
||
|
||
print(f'\n--- Episode {ep}/{episodes} ---', flush=True)
|
||
print(f'{"Step":>5} {"Spd":>5} {"CTE":>6} {"Eff%":>5} {"Rwd":>7} {"Tot":>9} {"Laps":>5} {"Px":>7} {"Pz":>7}', flush=True)
|
||
print('-' * 62, flush=True)
|
||
|
||
while step < max_steps:
|
||
action, _ = model.predict(obs, deterministic=True)
|
||
result = env.step(action)
|
||
if len(result) == 5:
|
||
obs, reward, terminated, truncated, info = result
|
||
done = terminated or truncated
|
||
else:
|
||
obs, reward, done, info = result
|
||
|
||
speed = float(info.get('speed', 0) or 0)
|
||
cte = float(info.get('cte', 0) or 0)
|
||
pos = info.get('pos', (0, 0, 0))
|
||
px = pos[0] if pos else 0
|
||
pz = pos[2] if len(pos) > 2 else 0
|
||
lap_count = int(info.get('lap_count', 0) or 0)
|
||
last_lap_time = float(info.get('last_lap_time', 0) or 0)
|
||
|
||
# Track new laps
|
||
if lap_count > laps_completed:
|
||
laps_completed = lap_count
|
||
if last_lap_time > 0:
|
||
lap_times.append(last_lap_time)
|
||
print(f'\n 🏁 LAP {laps_completed} COMPLETE! Time={last_lap_time:.2f}s', flush=True)
|
||
|
||
pos_hist.append(np.array([px, 0., pz]))
|
||
cte_values.append(cte)
|
||
|
||
# Track steering for oscillation score
|
||
try:
|
||
steer = float(action[0]) if hasattr(action, '__len__') else float(action)
|
||
steering_actions.append(steer)
|
||
except (TypeError, IndexError):
|
||
pass
|
||
|
||
total_reward += reward
|
||
step += 1
|
||
|
||
eff = compute_efficiency(pos_hist)
|
||
|
||
if step % 50 == 0 or done:
|
||
print(f'{step:>5} {speed:>5.2f} {cte:>6.2f} {eff*100:>4.0f}% '
|
||
f'{reward:>7.3f} {total_reward:>9.1f} {laps_completed:>5} '
|
||
f'{px:>7.1f} {pz:>7.1f}', flush=True)
|
||
|
||
if done:
|
||
print(f'\n Episode {ep} ended after {step} steps | '
|
||
f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
|
||
break
|
||
|
||
if step >= max_steps:
|
||
print(f'\n Episode {ep} reached max {max_steps} steps | '
|
||
f'total={total_reward:.1f} | laps={laps_completed}', flush=True)
|
||
|
||
# Compute oscillation score
|
||
if len(steering_actions) > 1:
|
||
deltas = [abs(steering_actions[i] - steering_actions[i-1])
|
||
for i in range(1, len(steering_actions))]
|
||
osc_score = float(np.mean(deltas))
|
||
else:
|
||
osc_score = 0.0
|
||
|
||
all_rewards.append(total_reward)
|
||
all_steps.append(step)
|
||
all_lap_times.extend(lap_times)
|
||
all_osc_scores.append(osc_score)
|
||
all_cte_distributions.extend(cte_values)
|
||
all_completed.append(laps_completed > 0)
|
||
|
||
time.sleep(2)
|
||
|
||
# Summary metrics
|
||
summary = {
|
||
'label': label,
|
||
'episodes': episodes,
|
||
'mean_reward': float(np.mean(all_rewards)),
|
||
'std_reward': float(np.std(all_rewards)),
|
||
'mean_steps': float(np.mean(all_steps)),
|
||
'laps_completed': sum(1 for r in all_rewards if r > 500), # proxy for completion
|
||
'lap_times': all_lap_times,
|
||
'mean_lap_time': float(np.mean(all_lap_times)) if all_lap_times else None,
|
||
'oscillation_score': float(np.mean(all_osc_scores)), # lower = smoother
|
||
'mean_abs_cte': float(np.mean([abs(c) for c in all_cte_distributions])),
|
||
'cte_std': float(np.std(all_cte_distributions)),
|
||
'mean_cte_signed': float(np.mean(all_cte_distributions)), # + = left, - = right
|
||
'timestamp': datetime.now().isoformat(),
|
||
}
|
||
|
||
return summary, all_rewards
|
||
|
||
|
||
def print_summary(summary):
|
||
print(f'\n📊 Metrics for: {summary["label"]}', flush=True)
|
||
print(f' Mean reward: {summary["mean_reward"]:.1f} ± {summary["std_reward"]:.1f}', flush=True)
|
||
print(f' Mean steps/ep: {summary["mean_steps"]:.0f}', flush=True)
|
||
print(f' Oscillation score: {summary["oscillation_score"]:.4f} (lower=smoother)', flush=True)
|
||
print(f' Mean |CTE|: {summary["mean_abs_cte"]:.3f} m from centre', flush=True)
|
||
print(f' Mean signed CTE: {summary["mean_cte_signed"]:.3f} m (+ =left, - =right)', flush=True)
|
||
cte_side = 'RIGHT of centre ➡️' if summary['mean_cte_signed'] < -0.1 else \
|
||
'LEFT of centre ⬅️' if summary['mean_cte_signed'] > 0.1 else 'CENTRED ↕️'
|
||
print(f' Lane position: {cte_side}', flush=True)
|
||
if summary['lap_times']:
|
||
print(f' Lap times: {[f"{t:.1f}s" for t in summary["lap_times"]]}', flush=True)
|
||
print(f' Best lap time: {min(summary["lap_times"]):.1f}s', flush=True)
|
||
print(flush=True)
|
||
|
||
|
||
def save_summary(summary):
|
||
os.makedirs(os.path.dirname(EVAL_SUMMARY), exist_ok=True)
|
||
with open(EVAL_SUMMARY, 'a') as f:
|
||
f.write(json.dumps(summary) + '\n')
|
||
|
||
|
||
def main(episodes=3, max_steps=3000, model_override=None, compare=False, env_id='donkey-generated-roads-v0'):
|
||
manifest = load_manifest()
|
||
|
||
models_to_eval = []
|
||
if compare:
|
||
for m in PHASE2_MODELS:
|
||
models_to_eval.append((m['label'], m['path']))
|
||
else:
|
||
path = model_override or CHAMPION_DIR + '/model.zip'
|
||
label = model_override or f"Champion (Phase {manifest.get('phase', '?')} Trial {manifest.get('trial', '?')})"
|
||
models_to_eval.append((label, path))
|
||
|
||
all_summaries = []
|
||
for label, path in models_to_eval:
|
||
print_banner(f'{label} [env={env_id}]', path)
|
||
|
||
print(f'[Eval] Connecting to {env_id}...', flush=True)
|
||
try:
|
||
env = gym.make(env_id)
|
||
except Exception as e:
|
||
print(f'[Eval] FAILED: {e}', flush=True)
|
||
continue
|
||
|
||
env = ThrottleClampWrapper(env, throttle_min=0.2)
|
||
env = SpeedRewardWrapper(env, speed_scale=0.1)
|
||
|
||
print(f'[Eval] Loading model: {path}', flush=True)
|
||
try:
|
||
model = PPO.load(path, env=env)
|
||
print(f'[Eval] Model loaded. Running {episodes} episodes × {max_steps} steps...', flush=True)
|
||
except Exception as e:
|
||
print(f'[Eval] FAILED to load: {e}', flush=True)
|
||
env.close()
|
||
continue
|
||
|
||
summary, rewards = run_eval(model, env, episodes, max_steps, label)
|
||
print_summary(summary)
|
||
save_summary(summary)
|
||
all_summaries.append(summary)
|
||
|
||
env.close()
|
||
time.sleep(3)
|
||
|
||
if compare and len(all_summaries) > 1:
|
||
print('\n' + '=' * 68, flush=True)
|
||
print('🏁 COMPARISON TABLE', flush=True)
|
||
print('=' * 68, flush=True)
|
||
print(f'{"Model":<40} {"Reward":>8} {"Steps":>7} {"Osc":>6} {"CTE":>6} {"Side":>10}', flush=True)
|
||
print('-' * 68, flush=True)
|
||
for s in all_summaries:
|
||
side = '➡️ RIGHT' if s['mean_cte_signed'] < -0.1 else \
|
||
'⬅️ LEFT' if s['mean_cte_signed'] > 0.1 else '↕️ CENTER'
|
||
name = s['label'][:40]
|
||
print(f'{name:<40} {s["mean_reward"]:>8.0f} {s["mean_steps"]:>7.0f} '
|
||
f'{s["oscillation_score"]:>6.3f} {s["mean_abs_cte"]:>6.2f} {side:>10}', flush=True)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
import argparse
|
||
parser = argparse.ArgumentParser(description='Evaluate DonkeyCar RL model with full metrics.')
|
||
parser.add_argument('--episodes', type=int, default=3)
|
||
parser.add_argument('--steps', type=int, default=3000)
|
||
parser.add_argument('--model', type=str, default=None, help='Override model path')
|
||
parser.add_argument('--compare', action='store_true', help='Compare all top Phase 2 models')
|
||
parser.add_argument('--env', type=str, default='donkey-generated-roads-v0',
|
||
help='Gym environment ID (default: donkey-generated-roads-v0)')
|
||
args = parser.parse_args()
|
||
main(episodes=args.episodes, max_steps=args.steps, model_override=args.model,
|
||
compare=args.compare, env_id=args.env)
|