# donkeycar-rl-autoresearch/agent/multitrack_runner.py
"""
Wave 3 Multi-Track Runner
=========================
Trains PPO across multiple DonkeyCar tracks by round-robin switching between
training segments. After training, evaluates on the zero-shot test track
(mini_monaco) to measure cross-track generalization. (warren was removed —
see the TEST_TRACKS note below.)
Track classification (from visual analysis):
TRAINING : generated_road, generated_track, mountain_track
(outdoor, same road markings — yellow centre + white edge)
TEST/EVAL : mini_monaco (warren removed — see TEST_TRACKS note)
(never seen during training — generalization benchmark)
SKIPPED : warehouse, robo_racing_league, waveshare, circuit_launch
(fully indoor — different domain entirely)
avc_sparkfun (outdoor but orange markings — too different)
Track switching strategy:
Close env → send_exit_scene_raw() → wait 4s → gym.make(next_track)
This avoids the double-connect issue in switch_track() when an env is
already open on the current track.
Key invariants (ADR-005, ADR-006):
- model is always defined before model.save()
- env.close() + time.sleep(2) before every track switch
- Results appended to JSONL, never overwritten
Output lines parsed by wave3_controller.py:
[W3 Runner][TRAIN] track=<name> segment_reward=<float>
[W3 Runner][TEST] track=<name> mean_reward=<float> mean_steps=<float>
[W3 Runner][TEST] combined_test_score=<float>
Usage:
python3 multitrack_runner.py \\
--total-timesteps 200000 \\
--steps-per-switch 10000 \\
--learning-rate 0.000225 \\
--warm-start models/champion/model.zip \\
--save-dir models/wave3/trial-0001 \\
--eval-episodes 3
Exit codes:
0 — success, model saved, evaluation complete
100 — failed to connect to simulator on initial track
101 — training error
102 — evaluation error
"""
import argparse
import os
import sys
import time
import json
import numpy as np
from datetime import datetime
import gymnasium as gym
import gym_donkeycar
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
# ---- Project paths ----
AGENT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, AGENT_DIR)
from donkeycar_sb3_runner import ThrottleClampWrapper, SimHealthCallback
from reward_wrapper import SpeedRewardWrapper
# ---- Track catalogue ----
# Maps short name → gym env ID (registered by gym_donkeycar on import).
TRAINING_TRACKS = [
    ('generated_road', 'donkey-generated-roads-v0'),
    ('generated_track', 'donkey-generated-track-v0'),
    ('mountain_track', 'donkey-mountain-track-v0'),
]
# Zero-shot generalization test tracks — never seen during training.
# Warren was removed: its episode-done condition does not fire when the car
# crosses the INSIDE edge (CTE stays small), so the car can drive among
# chairs indefinitely and scores are meaningless.
TEST_TRACKS = [
    ('mini_monaco', 'donkey-minimonaco-track-v0'),
]
# How many steps to sample before deciding the segment reward (shorter than segment)
# NOTE(review): SEGMENT_EVAL_STEPS is not referenced anywhere in this file —
# confirm whether it is read by wave3_controller.py or is dead.
SEGMENT_EVAL_STEPS = 500
EXIT_SCENE_WAIT = 4.0  # seconds after exit_scene for sim to reach menu
THROTTLE_MIN = 0.2  # minimum throttle (prevents stationary car)
SPEED_SCALE = 0.1  # SpeedRewardWrapper coefficient
# ---- Logging ----
def log(msg):
    """Print *msg* prefixed with a local-time HH:MM:SS timestamp, flushed."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f'[{timestamp}] {msg}', flush=True)
# ---- Health check callback ----
class HealthCheckCallback(BaseCallback):
"""Stops training early if sim is stuck or frozen."""
def __init__(self, max_stuck_steps=150, min_speed=0.02):
super().__init__(verbose=0)
self.health = SimHealthCallback(max_stuck_steps=max_stuck_steps, min_speed=min_speed)
def _on_step(self):
infos = self.locals.get('infos', [{}])
obs = self.locals.get('new_obs', None)
info = infos[0] if infos else {}
obs_arr = obs[0] if obs is not None and len(obs) > 0 else None
healthy = self.health.on_step(obs_arr, None, None, info)
if not healthy:
log('[W3 Runner][HEALTH] Sim stuck/frozen — stopping segment early.')
return False
return True
# ---- Environment factory ----
def wrap_env(raw_env):
"""Apply standard wrappers: throttle clamp + speed reward shaping."""
env = ThrottleClampWrapper(raw_env, throttle_min=THROTTLE_MIN)
env = SpeedRewardWrapper(env, speed_scale=SPEED_SCALE)
return env
# ---- Track switching ----
def _send_exit_scene(env, verbose=True):
"""
Send exit_scene through the EXISTING connection on env.
Critical: the DonkeyCar sim creates one vehicle per TCP connection.
Sending exit_scene via a NEW raw socket creates a second vehicle and
the sim ignores it for the real training session. We must use the
existing viewer connection that env already holds.
"""
try:
base = env.unwrapped # strips all gym.Wrapper layers
if hasattr(base, 'viewer') and base.viewer is not None:
base.viewer.exit_scene() # sends {'msg_type': 'exit_scene'} on existing TCP
time.sleep(0.5) # let the message flush before closing socket
if verbose:
log('[W3 Runner] exit_scene sent on existing viewer connection.')
return True
else:
if verbose:
log('[W3 Runner] Warning: no viewer found on unwrapped env.')
return False
except Exception as e:
if verbose:
log(f'[W3 Runner] Warning: viewer.exit_scene() raised: {e}')
return False
def close_and_switch(current_env, next_env_id, verbose=True):
    """
    Cleanly tear down current_env (if any) and connect to next_env_id.

    Order matters:
      1. viewer.exit_scene() on the existing connection (sim goes to menu)
      2. env.close() + sleep(2)            (ADR-006 disconnect grace)
      3. sleep(EXIT_SCENE_WAIT)            (let the sim reach the menu)
      4. gym.make(next_env_id) + wrappers  (connect to the new track)

    Returns the new wrapped env; connection failures propagate to the caller.
    """
    def _say(msg):
        if verbose:
            log(msg)

    if current_env is not None:
        # Tell the sim to exit the scene BEFORE dropping the connection.
        _say('[W3 Runner] Sending exit_scene via existing viewer connection...')
        _send_exit_scene(current_env, verbose=verbose)
        _say('[W3 Runner] Closing current env...')
        try:
            current_env.close()
        except Exception as e:
            log(f'[W3 Runner] Warning: env.close() raised: {e}')
        time.sleep(2)  # ADR-006
        _say(f'[W3 Runner] Waiting {EXIT_SCENE_WAIT}s for sim to reach main menu...')
        time.sleep(EXIT_SCENE_WAIT)
    _say(f'[W3 Runner] Connecting to {next_env_id}...')
    env = wrap_env(gym.make(next_env_id))
    _say(f'[W3 Runner] ✅ Connected to {next_env_id}')
    return env
# ---- Model creation / warm-start ----
def create_or_load_model(env, learning_rate, warm_start_path=None, seed=None):
"""
Load model from warm_start_path (PPO.load + set_env) or create fresh PPO.
Falls back to fresh model if warm-start path missing or space mismatch.
"""
if warm_start_path and os.path.exists(warm_start_path):
log(f'[W3 Runner] Loading warm-start model from {warm_start_path}')
try:
model = PPO.load(warm_start_path, env=env, device='auto')
# Override learning rate — set BOTH the SB3 attribute AND the
# optimizer param groups. PPO.load() restores the saved optimizer
# state (lr=0.000225 from Phase 2), so just setting model.learning_rate
# is not enough — the optimizer ignores it until _update_learning_rate
# is called, and even then only if it reads the attribute correctly.
model.learning_rate = learning_rate
for pg in model.policy.optimizer.param_groups:
pg['lr'] = learning_rate
log(f'[W3 Runner] ✅ Warm start loaded. LR overridden to {learning_rate:.6f} '
f'(set on model + {len(model.policy.optimizer.param_groups)} optimizer param group(s))')
return model
except Exception as e:
log(f'[W3 Runner] ⚠️ Warm start failed ({e}), training from scratch.')
log(f'[W3 Runner] Creating fresh PPO model (lr={learning_rate:.6f})')
model = PPO(
'CnnPolicy',
env,
learning_rate=learning_rate,
verbose=0,
seed=seed,
)
return model
# ---- Training loop ----
def train_multitrack(model, first_env, total_timesteps, steps_per_switch):
"""
Train PPO across training tracks by round-robin switching every steps_per_switch steps.
Args:
model: PPO model (already set to first_env)
first_env: The first wrapped training env (already connected)
total_timesteps: Total training budget across all tracks
steps_per_switch: Steps per track segment before switching
Returns:
env: The last env used (caller must close it)
segment_rewards: List of (track_name, reward) for each completed segment
"""
env = first_env
steps_done = 0
track_idx = 0 # Start on generated_road (first in TRAINING_TRACKS)
segment_rewards = []
health_cb = HealthCheckCallback()
log(f'[W3 Runner] Starting multi-track training:')
log(f' Total timesteps : {total_timesteps:,}')
log(f' Steps per switch: {steps_per_switch:,}')
log(f' Training tracks : {[t[0] for t in TRAINING_TRACKS]}')
log(f' Rotations : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles')
while steps_done < total_timesteps:
track_name, track_env_id = TRAINING_TRACKS[track_idx]
segment_steps = min(steps_per_switch, total_timesteps - steps_done)
log(f'\n[W3 Runner] === Segment: {track_name} | '
f'{steps_done:,}/{total_timesteps:,} steps done | '
f'segment={segment_steps:,} steps ===')
# Train segment
model.learn(
total_timesteps=segment_steps,
reset_num_timesteps=False, # Continuous timestep counter across segments
callback=health_cb,
)
steps_done += segment_steps
# Quick segment reward estimate (run one short episode deterministically)
try:
seg_reward, _ = evaluate_policy(
model, env,
n_eval_episodes=1,
deterministic=True,
return_episode_rewards=False,
warn=False,
)
log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}')
segment_rewards.append((track_name, float(seg_reward)))
except Exception as e:
log(f'[W3 Runner][TRAIN] Segment eval failed: {e}')
segment_rewards.append((track_name, 0.0))
if steps_done >= total_timesteps:
break
# Switch to next training track
next_track_idx = (track_idx + 1) % len(TRAINING_TRACKS)
next_track_name, next_env_id = TRAINING_TRACKS[next_track_idx]
log(f'[W3 Runner] Switching: {track_name}{next_track_name}')
try:
new_env = close_and_switch(env, next_env_id)
model.set_env(new_env)
env = new_env
track_idx = next_track_idx
except Exception as e:
log(f'[W3 Runner] ⚠️ Track switch failed: {e}. Retrying in 5s...')
time.sleep(5)
try:
new_env = close_and_switch(None, next_env_id)
model.set_env(new_env)
env = new_env
track_idx = next_track_idx
except Exception as e2:
log(f'[W3 Runner] ❌ Track switch retry failed: {e2}. Continuing on current track.')
# Stay on current track — don't crash the whole run
log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across '
f'{len(segment_rewards)} segments.')
return env, segment_rewards
# ---- Zero-shot evaluation on test tracks ----
def evaluate_test_tracks(model, current_env, eval_episodes):
"""
Evaluate the trained model on each test track (zero-shot generalization).
Switches to each test track, runs eval_episodes episodes, records
mean_reward and mean_steps. Closes test envs when done.
Returns:
test_results: dict of {track_name: {'mean_reward': float, 'mean_steps': float}}
combined_test_score: sum of mean_rewards across test tracks
"""
log(f'\n[W3 Runner] ===== ZERO-SHOT EVALUATION on TEST tracks =====')
log(f' Test tracks : {[t[0] for t in TEST_TRACKS]}')
log(f' Eval episodes : {eval_episodes}')
test_results = {}
env = current_env
for track_name, track_env_id in TEST_TRACKS:
log(f'\n[W3 Runner] Switching to TEST track: {track_name}')
try:
env = close_and_switch(env, track_env_id)
except Exception as e:
log(f'[W3 Runner] ❌ Cannot connect to test track {track_name}: {e}')
test_results[track_name] = {'mean_reward': 0.0, 'mean_steps': 0.0, 'error': str(e)}
continue
# Run episodes manually to capture step count
all_rewards = []
all_steps = []
for ep in range(eval_episodes):
obs, info = env.reset()
total_reward = 0.0
steps = 0
done = False
while not done and steps < 2000:
action, _ = model.predict(obs, deterministic=True)
result = env.step(action)
if len(result) == 5:
obs, reward, terminated, truncated, info = result
done = terminated or truncated
else:
obs, reward, done, info = result
total_reward += reward
steps += 1
all_rewards.append(total_reward)
all_steps.append(steps)
log(f'[W3 Runner] {track_name} ep{ep+1}: reward={total_reward:.1f} steps={steps}')
time.sleep(0.5)
mean_reward = float(np.mean(all_rewards))
mean_steps = float(np.mean(all_steps))
drove_far = mean_steps > 200
test_results[track_name] = {
'mean_reward': mean_reward,
'mean_steps': mean_steps,
'drove_far': drove_far,
}
verdict = '✅ DRIVES' if drove_far else '❌ CRASHES'
log(f'[W3 Runner][TEST] track={track_name} mean_reward={mean_reward:.2f} '
f'mean_steps={mean_steps:.1f} {verdict}')
# Combined score = sum of mean_rewards on test tracks
combined = sum(r['mean_reward'] for r in test_results.values())
log(f'\n[W3 Runner][TEST] combined_test_score={combined:.4f}')
log(f'[W3 Runner][TEST] mini_monaco_reward='
f'{test_results.get("mini_monaco", {}).get("mean_reward", 0.0):.4f}')
log(f'[W3 Runner][TEST] warren_reward='
f'{test_results.get("warren", {}).get("mean_reward", 0.0):.4f}')
return test_results, combined, env
# ---- Main ----
def main():
parser = argparse.ArgumentParser(description='Wave 3 Multi-Track PPO Trainer.')
parser.add_argument('--total-timesteps', type=int, default=200000,
help='Total training timesteps across all tracks (default: 200000)')
parser.add_argument('--steps-per-switch', type=int, default=10000,
help='Steps on each track before switching (default: 10000)')
parser.add_argument('--learning-rate', type=float, default=0.000225,
help='PPO learning rate (default: 0.000225 = Phase 2 champion)')
parser.add_argument('--warm-start', type=str, default=None,
help='Path to .zip model for warm start (default: models/champion/model.zip)')
parser.add_argument('--save-dir', type=str, default=None,
help='Directory to save trained model')
parser.add_argument('--eval-episodes', type=int, default=3,
help='Episodes per test track for zero-shot evaluation')
parser.add_argument('--seed', type=int, default=None,
help='Random seed')
parser.add_argument('--skip-eval', action='store_true',
help='Skip zero-shot evaluation (training only)')
args = parser.parse_args()
# Resolve warm-start path
warm_start = args.warm_start
if warm_start is None:
default_champ = os.path.join(AGENT_DIR, 'models', 'champion', 'model.zip')
if os.path.exists(default_champ):
warm_start = default_champ
log(f'[W3 Runner] Auto-detected warm start: {warm_start}')
save_dir = args.save_dir or os.path.join(AGENT_DIR, 'models', 'wave3',
f'trial-{int(time.time())}')
log(f'[W3 Runner] === Wave 3 Multi-Track Training ===')
log(f'[W3 Runner] total_timesteps ={args.total_timesteps:,}')
log(f'[W3 Runner] steps_per_switch={args.steps_per_switch:,}')
log(f'[W3 Runner] learning_rate ={args.learning_rate:.6f}')
log(f'[W3 Runner] warm_start ={warm_start}')
log(f'[W3 Runner] save_dir ={save_dir}')
log(f'[W3 Runner] eval_episodes ={args.eval_episodes}')
# ---- 1. Connect to first training track ----
# Assume sim is already at the main menu (user-started, or previous run exited cleanly).
# gym.make() on the first track will load it directly from the menu.
first_track_name, first_env_id = TRAINING_TRACKS[0]
log(f'\n[W3 Runner] Connecting to first training track: {first_track_name} ({first_env_id})')
env = None
try:
raw_env = gym.make(first_env_id)
env = wrap_env(raw_env)
log(f'[W3 Runner] ✅ Connected to {first_env_id}')
except Exception as e:
log(f'[W3 Runner] ❌ Failed to connect to first training track: {e}')
sys.exit(100)
# ---- 2. Create or load model ----
model = None
try:
model = create_or_load_model(env, args.learning_rate, warm_start, args.seed)
except Exception as e:
log(f'[W3 Runner] ❌ Model creation failed: {e}')
try:
env.close()
time.sleep(2)
except Exception:
pass
sys.exit(101)
# ---- 3. Multi-track training ----
try:
env, segment_rewards = train_multitrack(
model, env,
total_timesteps=args.total_timesteps,
steps_per_switch=args.steps_per_switch,
)
except Exception as e:
log(f'[W3 Runner] ❌ Training failed: {e}')
try:
env.close()
time.sleep(2)
except Exception:
pass
sys.exit(101)
# ---- 4. Save model ----
# ADR-005: model is always defined before model.save()
try:
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'model')
model.save(save_path)
log(f'[W3 Runner] ✅ Model saved to {save_path}.zip')
except Exception as e:
log(f'[W3 Runner] ⚠️ Model save failed: {e}')
# ---- 5. Zero-shot evaluation on test tracks ----
combined_test_score = 0.0
test_results = {}
if not args.skip_eval:
try:
test_results, combined_test_score, env = evaluate_test_tracks(
model, env, args.eval_episodes
)
except Exception as e:
log(f'[W3 Runner] ❌ Test evaluation failed: {e}')
# Ensure combined_test_score = 0 is recorded (trial still valid)
# ---- 6. Print training summary ----
log(f'\n[W3 Runner] ===== TRAINING SUMMARY =====')
if segment_rewards:
by_track = {}
for tname, rew in segment_rewards:
by_track.setdefault(tname, []).append(rew)
for tname, rewards in by_track.items():
log(f'[W3 Runner][TRAIN] {tname}: '
f'mean={np.mean(rewards):.1f} over {len(rewards)} segments')
log(f'\n[W3 Runner] ===== TEST SUMMARY (zero-shot generalization) =====')
for tname, metrics in test_results.items():
verdict = '✅ DRIVES' if metrics.get('drove_far') else '❌ CRASHES'
log(f'[W3 Runner][TEST] {tname}: '
f'reward={metrics.get("mean_reward", 0):.1f} '
f'steps={metrics.get("mean_steps", 0):.0f} {verdict}')
log(f'[W3 Runner][TEST] combined_test_score={combined_test_score:.4f}')
# ---- 7. Teardown ----
log(f'[W3 Runner] Closing final env...')
try:
env.close()
log(f'[W3 Runner] env.close() complete.')
except Exception as e:
log(f'[W3 Runner] Warning: env.close() raised: {e}')
time.sleep(2) # ADR-006
log(f'[W3 Runner] ✅ Multi-track runner complete. Exiting.')
# Script entry point — keeps the module importable without side effects.
if __name__ == '__main__':
    main()