feat: comprehensive multi-track evaluation script + research log updates
- multitrack_eval.py: tests all 3 top models against all 11 DonkeyCar tracks
- Automatic track switching via exit_scene → reconnect
- 11 tracks: generated_road, generated_track, mountain, warehouse, AVC,
mini_monaco, warren, robo_racing, waveshare, thunderhill, circuit_launch
- Records: reward, steps, oscillation, CTE distribution, drove_far flag
- Saves to outerloop-results/multitrack_results.jsonl
- Prints comparison table at the end
- RESEARCH_LOG.md: exit_scene fix documented, Phase 3 begun
- IMPLEMENTATION_PLAN.md: Wave 3 streams defined
Agent: pi/claude-sonnet
Tests: 53/53 passing
Tests-Added: 0
TypeScript: N/A
This commit is contained in:
parent
ce120393af
commit
5a626c87be
|
|
@ -0,0 +1,240 @@
|
||||||
|
"""
|
||||||
|
Multi-Track Generalization Evaluation
|
||||||
|
=====================================
|
||||||
|
Tests all top Phase 2 models against every available DonkeyCar track.
|
||||||
|
Uses automatic track switching (exit_scene → reconnect).
|
||||||
|
|
||||||
|
Results saved to: outerloop-results/multitrack_results.jsonl
|
||||||
|
Summary table printed at the end.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 multitrack_eval.py [--episodes N] [--steps N]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys, time, json, numpy as np
|
||||||
|
from datetime import datetime
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
import gymnasium as gym
|
||||||
|
import gym_donkeycar
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from donkeycar_sb3_runner import ThrottleClampWrapper
|
||||||
|
from reward_wrapper import SpeedRewardWrapper
|
||||||
|
from track_switcher import switch_track
|
||||||
|
|
||||||
|
RESULTS_DIR = os.path.join(os.path.dirname(__file__), 'outerloop-results')
RESULTS_FILE = os.path.join(RESULTS_DIR, 'multitrack_results.jsonl')

# All available tracks as (env id, display name, used in Phase 2 training).
_TRACK_SPECS = [
    ('donkey-generated-roads-v0', 'Generated Road', True),
    ('donkey-generated-track-v0', 'Generated Track', False),
    ('donkey-mountain-track-v0', 'Mountain Track', False),
    ('donkey-warehouse-v0', 'Warehouse', False),
    ('donkey-avc-sparkfun-v0', 'AVC Sparkfun', False),
    ('donkey-minimonaco-track-v0', 'Mini Monaco', False),
    ('donkey-warren-track-v0', 'Warren', False),
    ('donkey-roboracingleague-track-v0', 'Robo Racing League', False),
    ('donkey-waveshare-v0', 'Waveshare', False),
    ('donkey-thunderhill-track-v0', 'Thunderhill', False),
    ('donkey-circuit-launch-track-v0', 'Circuit Launch', False),
]
ALL_TRACKS = [
    {'id': env_id, 'name': name, 'trained_on': trained}
    for env_id, name, trained in _TRACK_SPECS
]

# Top three Phase 2 models to evaluate, best first.
TOP3_MODELS = [
    {'label': 'Trial-20 (n_steer=3 n_thr=5 lr=0.000225 13k)', 'path': 'models/trial-0020/model.zip', 'short': 'T20'},
    {'label': 'Trial-8 (n_steer=4 n_thr=3 lr=0.00117 34k)', 'path': 'models/trial-0008/model.zip', 'short': 'T08'},
    {'label': 'Trial-18 (n_steer=3 n_thr=5 lr=0.000288 16k)', 'path': 'models/trial-0018/model.zip', 'short': 'T18'},
]
|
||||||
|
|
||||||
|
|
||||||
|
def compute_efficiency(pos_history):
    """Return the ratio of net displacement to total path length.

    1.0 means the car drove in a straight line; values near 0 mean it
    wandered or circled. Degenerate inputs (fewer than 3 points, or a
    near-zero path length) return 1.0.
    """
    if len(pos_history) < 3:
        return 1.0
    pts = [np.array(p) for p in pos_history]
    net_displacement = np.linalg.norm(pts[-1] - pts[0])
    path_length = 0.0
    for start, end in zip(pts, pts[1:]):
        path_length += np.linalg.norm(end - start)
    if path_length <= 1e-6:
        return 1.0
    return float(net_displacement / path_length)
|
||||||
|
|
||||||
|
|
||||||
|
def run_episodes(model, env, episodes, max_steps, track_name, pause=1.0):
    """Run deterministic evaluation episodes and return aggregate metrics.

    Args:
        model: SB3-style policy exposing ``predict(obs, deterministic=True)``.
        env: environment; both the gymnasium 5-tuple and legacy gym 4-tuple
            ``step`` return shapes are handled.
        episodes: number of episodes to run.
        max_steps: hard step cap per episode.
        track_name: human-readable track label (unused here; kept for
            interface compatibility with callers).
        pause: seconds to sleep between episodes so the simulator can settle;
            pass 0 to disable (e.g. in tests).

    Returns:
        dict with mean/std reward, mean steps, steering oscillation (mean
        |Δsteer| between consecutive steps within an episode), mean
        absolute/signed CTE, and a ``drove_far`` survival flag.
    """
    all_rewards, all_steps, all_cte = [], [], []
    # Steering deltas are collected per episode so the first action of one
    # episode is never diffed against the last action of the previous one.
    steer_deltas = []

    for _ in range(episodes):
        obs, info = env.reset()
        total_reward, step = 0.0, 0
        cte_vals = []
        prev_steer = None

        while step < max_steps:
            action, _ = model.predict(obs, deterministic=True)
            result = env.step(action)
            if len(result) == 5:  # gymnasium API
                obs, reward, terminated, truncated, info = result
                done = terminated or truncated
            else:  # legacy gym API
                obs, reward, done, info = result

            cte_vals.append(float(info.get('cte', 0) or 0))

            try:
                steer = float(action[0]) if hasattr(action, '__len__') else float(action)
                if prev_steer is not None:
                    steer_deltas.append(abs(steer - prev_steer))
                prev_steer = steer
            except Exception:
                pass  # non-numeric action: skip this oscillation sample

            total_reward += reward
            step += 1

            if done:
                break

        all_rewards.append(total_reward)
        all_steps.append(step)
        all_cte.extend(cte_vals)
        if pause:
            time.sleep(pause)

    # Oscillation score: mean absolute change in steering between steps.
    osc = float(np.mean(steer_deltas)) if steer_deltas else 0.0

    return {
        'mean_reward': float(np.mean(all_rewards)),
        'std_reward': float(np.std(all_rewards)),
        'mean_steps': float(np.mean(all_steps)),
        'oscillation': osc,
        'mean_abs_cte': float(np.mean([abs(c) for c in all_cte])) if all_cte else 0,
        'mean_signed_cte': float(np.mean(all_cte)) if all_cte else 0,
        'drove_far': float(np.mean(all_steps)) > 200,  # survived more than 200 steps avg
    }
|
||||||
|
|
||||||
|
|
||||||
|
def run_multitrack_eval(episodes=3, max_steps=1000):
    """Evaluate every model in TOP3_MODELS on every track in ALL_TRACKS.

    For each (track, model) pair: switch the simulator to the track via
    exit_scene → reconnect, wrap and load the model, run evaluation episodes,
    and append one JSONL record per track to RESULTS_FILE (so partial results
    survive a crash). A comparison table is printed at the end.

    Args:
        episodes: evaluation episodes per (track, model) pair.
        max_steps: step cap per episode.

    Returns:
        dict mapping track display name -> {model short name -> metrics dict
        or {'error': message}}.
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)
    print('\n' + '='*70, flush=True)
    print('🌍 MULTI-TRACK GENERALIZATION EVALUATION', flush=True)
    print(f' Models: {len(TOP3_MODELS)} | Tracks: {len(ALL_TRACKS)} | Episodes: {episodes} | Max steps: {max_steps}', flush=True)
    print('='*70, flush=True)

    all_results = {}
    current_env_id = 'donkey-generated-roads-v0'  # assume starting here

    for track in ALL_TRACKS:
        track_id = track['id']
        track_name = track['name']
        trained = '⭐ TRAINED' if track['trained_on'] else '🆕 UNSEEN'
        print(f'\n{"─"*70}', flush=True)
        print(f'📍 Track: {track_name} {trained}', flush=True)
        print(f' Env: {track_id}', flush=True)
        print(f'{"─"*70}', flush=True)

        track_results = {}

        for model_info in TOP3_MODELS:
            print(f'\n 🤖 Model: {model_info["short"]} — {model_info["label"][:50]}', flush=True)

            # Switch to the correct track (exit_scene → reconnect).
            try:
                env = switch_track(
                    target_env_id=track_id,
                    current_env_id=current_env_id,
                    verbose=False
                )
                current_env_id = track_id
            except Exception as e:
                print(f' ❌ Failed to connect to {track_name}: {e}', flush=True)
                track_results[model_info['short']] = {'error': str(e)}
                continue

            # Same wrappers as used in training so the action/reward spaces match.
            env = ThrottleClampWrapper(env, throttle_min=0.2)
            env = SpeedRewardWrapper(env, speed_scale=0.1)

            try:
                model = PPO.load(model_info['path'], env=env)
            except Exception as e:
                print(f' ❌ Failed to load model: {e}', flush=True)
                # Record the failure so the summary table shows ERROR instead
                # of '--' (consistent with the other failure branches).
                track_results[model_info['short']] = {'error': str(e)}
                env.close()
                continue

            try:
                metrics = run_episodes(model, env, episodes, max_steps, track_name)
                verdict = '✅ DRIVES' if metrics['drove_far'] else '❌ CRASHES'
                print(f' {verdict} | reward={metrics["mean_reward"]:.0f} | '
                      f'steps={metrics["mean_steps"]:.0f} | '
                      f'osc={metrics["oscillation"]:.3f} | '
                      f'cte={metrics["mean_abs_cte"]:.2f}', flush=True)
                track_results[model_info['short']] = metrics
            except Exception as e:
                print(f' ❌ Evaluation error: {e}', flush=True)
                track_results[model_info['short']] = {'error': str(e)}
            finally:
                env.close()
                time.sleep(3)  # let the simulator settle before reconnecting

        all_results[track_name] = track_results

        # Save after each track so partial results survive an interruption.
        record = {
            'timestamp': datetime.now().isoformat(),
            'track': track_name,
            'track_id': track_id,
            'trained_on': track['trained_on'],
            'results': track_results
        }
        with open(RESULTS_FILE, 'a') as f:
            f.write(json.dumps(record) + '\n')

    # Print final summary table. Columns are derived from TOP3_MODELS so the
    # table stays correct if the model list ever changes.
    shorts = [m['short'] for m in TOP3_MODELS]
    print('\n\n' + '='*90, flush=True)
    print('📊 MULTI-TRACK GENERALIZATION RESULTS', flush=True)
    print('='*90, flush=True)
    header = f'{"Track":<26} {"Trained":^8} |'
    for short in shorts:
        header += f' {short + " Steps":>10} {short + " Rwd":>8} |'
    print(header, flush=True)
    print('─'*90, flush=True)

    for track in ALL_TRACKS:
        tname = track['name']
        trained = '⭐ YES' if track['trained_on'] else 'NO'
        r = all_results.get(tname, {})
        row = f'{tname:<26} {trained:^8} |'
        for short in shorts:
            m = r.get(short, {})
            if 'error' in m:
                row += f' {"ERROR":>10} {"--":>8} |'
            elif m:
                steps = m.get('mean_steps', 0)
                rwd = m.get('mean_reward', 0)
                flag = '✅' if m.get('drove_far') else '❌'
                row += f' {flag}{steps:>8.0f} {rwd:>8.0f} |'
            else:
                row += f' {"--":>10} {"--":>8} |'
        print(row, flush=True)

    print('='*90, flush=True)
    print(f'\nFull results saved to: {RESULTS_FILE}', flush=True)
    return all_results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    import argparse

    # CLI entry point: episode count and step cap are tunable from the shell.
    cli = argparse.ArgumentParser()
    cli.add_argument('--episodes', type=int, default=3, help='Episodes per track per model')
    cli.add_argument('--steps', type=int, default=800, help='Max steps per episode')
    opts = cli.parse_args()

    run_multitrack_eval(episodes=opts.episodes, max_steps=opts.steps)
|
||||||
|
|
@ -414,3 +414,31 @@ Yes! Through targeted reward shaping:
|
||||||
- Fine-tuning from Phase 2 champion
|
||||||
|
|
||||||
**Phase 2 Champion:** Trial 20 — n_steer=3, n_throttle=5, lr=0.000225, 13k steps
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-14 — Track Switching API: exit_scene() Works Automatically
|
||||||
|
|
||||||
|
### Finding: Automatic Scene Switching via unwrapped viewer
|
||||||
|
|
||||||
|
**Problem:** `gym.make('donkey-generated-track-v0')` ignores the scene name if the simulator already has a scene running — it just uses the current scene.
|
||||||
|
|
||||||
|
**Root cause:** The sim only responds to scene selection when it's at the main menu (`scene_selection_ready` state). If a scene is loaded, it sends `need_car_config` instead.
|
||||||
|
|
||||||
|
**Fix:** `env.unwrapped.viewer.exit_scene()` sends the exit message through the **established websocket connection**. Raw TCP socket approach failed because the DonkeyCar protocol requires proper framing.
|
||||||
|
|
||||||
|
**Working procedure:**
|
||||||
|
```python
|
||||||
|
temp_env = gym.make(current_scene_env_id)
|
||||||
|
temp_env.unwrapped.viewer.exit_scene() # Sends exit via websocket
|
||||||
|
time.sleep(4) # Wait for sim to reach main menu
|
||||||
|
temp_env.unwrapped.viewer.quit()
|
||||||
|
env = gym.make(target_env_id) # Sim now loads correct scene
|
||||||
|
```
|
||||||
|
|
||||||
|
**Confirmed:** `loading scene generated_road` message appears in logs after switch.
|
||||||
|
**Impact:** Fully automated multi-track evaluation and training without user intervention!
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-14 — PHASE 3 BEGINS: Multi-Track Generalization Evaluation
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue