539 lines
21 KiB
Python
539 lines
21 KiB
Python
"""
|
|
Wave 3 Multi-Track Runner
|
|
=========================
|
|
Trains PPO across multiple DonkeyCar tracks by round-robin switching between
|
|
training segments. After training, evaluates on zero-shot test tracks
|
|
(mini_monaco) to measure cross-track generalization.
|
|
|
|
Track classification (from visual analysis):
|
|
TRAINING : generated_road, generated_track, mountain_track
|
|
(outdoor, same road markings — yellow centre + white edge)
|
|
TEST/EVAL : mini_monaco (warren removed — see TEST_TRACKS note)
|
|
(never seen during training — generalization benchmark)
|
|
SKIPPED : warehouse, robo_racing_league, waveshare, circuit_launch
|
|
(fully indoor — different domain entirely)
|
|
avc_sparkfun (outdoor but orange markings — too different)
|
|
|
|
Track switching strategy:
|
|
Close env → send_exit_scene_raw() → wait 4s → gym.make(next_track)
|
|
This avoids the double-connect issue in switch_track() when an env is
|
|
already open on the current track.
|
|
|
|
Key invariants (ADR-005, ADR-006):
|
|
- model is always defined before model.save()
|
|
- env.close() + time.sleep(2) before every track switch
|
|
- Results appended to JSONL, never overwritten
|
|
|
|
Output lines parsed by wave3_controller.py:
|
|
[W3 Runner][TRAIN] track=<name> segment_reward=<float>
|
|
[W3 Runner][TEST] track=<name> mean_reward=<float> mean_steps=<float>
|
|
[W3 Runner][TEST] combined_test_score=<float>
|
|
|
|
Usage:
|
|
python3 multitrack_runner.py \\
|
|
--total-timesteps 200000 \\
|
|
--steps-per-switch 10000 \\
|
|
--learning-rate 0.000225 \\
|
|
--warm-start models/champion/model.zip \\
|
|
--save-dir models/wave3/trial-0001 \\
|
|
--eval-episodes 3
|
|
|
|
Exit codes:
|
|
0 — success, model saved, evaluation complete
|
|
100 — failed to connect to simulator on initial track
|
|
101 — training error
|
|
102 — evaluation error
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import numpy as np
|
|
from datetime import datetime
|
|
|
|
import gymnasium as gym
|
|
import gym_donkeycar
|
|
from stable_baselines3 import PPO
|
|
from stable_baselines3.common.utils import get_schedule_fn
|
|
from stable_baselines3.common.evaluation import evaluate_policy
|
|
from stable_baselines3.common.callbacks import BaseCallback
|
|
|
|
# ---- Project paths ----
|
|
AGENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
sys.path.insert(0, AGENT_DIR)
|
|
|
|
from donkeycar_sb3_runner import ThrottleClampWrapper, SimHealthCallback
|
|
from reward_wrapper import SpeedRewardWrapper
|
|
|
|
# ---- Track catalogue ----
# Maps short name → gym env ID.
# Training rotation: outdoor tracks that share the same road markings
# (yellow centre line + white edges), so learned visual features transfer.
TRAINING_TRACKS = [
    ('generated_road', 'donkey-generated-roads-v0'),
    ('generated_track', 'donkey-generated-track-v0'),
    ('mountain_track', 'donkey-mountain-track-v0'),
]

# Zero-shot generalization test tracks — never seen during training.
# Warren was removed: its episode-done condition does not fire when the car
# crosses the INSIDE edge (CTE stays small), so the car can drive among
# chairs indefinitely and scores are meaningless.
TEST_TRACKS = [
    ('mini_monaco', 'donkey-minimonaco-track-v0'),
]

# How many steps to sample before deciding the segment reward (shorter than segment)
# NOTE(review): SEGMENT_EVAL_STEPS is not referenced anywhere in this file —
# segment reward is actually measured with evaluate_policy(n_eval_episodes=1).
SEGMENT_EVAL_STEPS = 500
EXIT_SCENE_WAIT = 4.0  # seconds after exit_scene for sim to reach menu
THROTTLE_MIN = 0.2  # minimum throttle (prevents stationary car)
SPEED_SCALE = 0.1  # SpeedRewardWrapper coefficient
|
|
|
|
|
|
# ---- Logging ----
|
|
def log(msg):
    """Print *msg* prefixed with an HH:MM:SS timestamp, flushing immediately."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print('[' + stamp + '] ' + str(msg), flush=True)
|
|
|
|
|
|
# ---- Health check callback ----
|
|
class HealthCheckCallback(BaseCallback):
    """Halt the current training segment when the sim health monitor
    reports the car stuck or the sim frozen.

    Delegates the stuck/frozen detection to SimHealthCallback and translates
    its verdict into SB3's "return False from _on_step to stop" protocol.
    """

    def __init__(self, max_stuck_steps=150, min_speed=0.02):
        super().__init__(verbose=0)
        # Wrapped monitor that inspects per-step observation / info payloads.
        self.health = SimHealthCallback(max_stuck_steps=max_stuck_steps, min_speed=min_speed)

    def _on_step(self):
        step_infos = self.locals.get('infos', [{}])
        latest_obs = self.locals.get('new_obs', None)
        first_info = step_infos[0] if step_infos else {}
        first_obs = None
        if latest_obs is not None and len(latest_obs) > 0:
            first_obs = latest_obs[0]
        if self.health.on_step(first_obs, None, None, first_info):
            return True
        log('[W3 Runner][HEALTH] Sim stuck/frozen — stopping segment early.')
        return False
|
|
|
|
|
|
# ---- Environment factory ----
|
|
def wrap_env(raw_env):
    """Wrap *raw_env* in the project-standard stack: throttle clamp first,
    then speed-based reward shaping. Returns the outermost wrapper."""
    clamped = ThrottleClampWrapper(raw_env, throttle_min=THROTTLE_MIN)
    return SpeedRewardWrapper(clamped, speed_scale=SPEED_SCALE)
|
|
|
|
|
|
# ---- Track switching ----
|
|
def _send_exit_scene(env, verbose=True):
|
|
"""
|
|
Send exit_scene through the EXISTING connection on env.
|
|
|
|
Critical: the DonkeyCar sim creates one vehicle per TCP connection.
|
|
Sending exit_scene via a NEW raw socket creates a second vehicle and
|
|
the sim ignores it for the real training session. We must use the
|
|
existing viewer connection that env already holds.
|
|
"""
|
|
try:
|
|
base = env.unwrapped # strips all gym.Wrapper layers
|
|
if hasattr(base, 'viewer') and base.viewer is not None:
|
|
base.viewer.exit_scene() # sends {'msg_type': 'exit_scene'} on existing TCP
|
|
time.sleep(0.5) # let the message flush before closing socket
|
|
if verbose:
|
|
log('[W3 Runner] exit_scene sent on existing viewer connection.')
|
|
return True
|
|
else:
|
|
if verbose:
|
|
log('[W3 Runner] Warning: no viewer found on unwrapped env.')
|
|
return False
|
|
except Exception as e:
|
|
if verbose:
|
|
log(f'[W3 Runner] Warning: viewer.exit_scene() raised: {e}')
|
|
return False
|
|
|
|
|
|
def close_and_switch(current_env, next_env_id, verbose=True):
    """
    Tear down *current_env* (if any) and open a fresh env on *next_env_id*.

    The sequence matters:
      1. exit_scene over the existing viewer connection (sim → main menu)
      2. env.close() followed by a 2s pause (ADR-006)
      3. EXIT_SCENE_WAIT pause so the sim actually reaches the menu
      4. gym.make() on the target track, then the standard wrappers

    Returns the new wrapped env; propagates any connection failure.
    """
    chatty = verbose
    if current_env is not None:
        # 1. Tell the sim to leave the scene while the connection still exists.
        if chatty:
            log('[W3 Runner] Sending exit_scene via existing viewer connection...')
        _send_exit_scene(current_env, verbose=chatty)

        # 2. Disconnect (ADR-006: always close + 2s pause).
        if chatty:
            log('[W3 Runner] Closing current env...')
        try:
            current_env.close()
        except Exception as e:
            log(f'[W3 Runner] Warning: env.close() raised: {e}')
        time.sleep(2)  # ADR-006

    # 3. Give the sim time to land on the scene-selection menu.
    if chatty:
        log(f'[W3 Runner] Waiting {EXIT_SCENE_WAIT}s for sim to reach main menu...')
    time.sleep(EXIT_SCENE_WAIT)

    # 4. Connect to the requested track and apply the standard wrappers.
    if chatty:
        log(f'[W3 Runner] Connecting to {next_env_id}...')
    fresh = wrap_env(gym.make(next_env_id))
    if chatty:
        log(f'[W3 Runner] ✅ Connected to {next_env_id}')
    return fresh
|
|
|
|
|
|
# ---- Model creation / warm-start ----
|
|
def create_or_load_model(env, learning_rate, warm_start_path=None, seed=None):
    """
    Return a PPO model bound to *env*.

    Attempts a warm start from *warm_start_path* first; any failure
    (missing file, observation/action-space mismatch, load error) falls
    back to a freshly initialized CnnPolicy PPO.
    """
    if warm_start_path and os.path.exists(warm_start_path):
        log(f'[W3 Runner] Loading warm-start model from {warm_start_path}')
        try:
            loaded = PPO.load(warm_start_path, env=env, device='auto')
            # PPO.load() restores the saved LR, so the override must land in
            # all three places or _update_learning_rate() reverts it on the
            # very first gradient step:
            #   1. the learning_rate attribute (used to rebuild schedules),
            #   2. the lr_schedule consulted during every train() call,
            #   3. the optimizer param_groups (immediate effect).
            loaded.learning_rate = learning_rate
            loaded.lr_schedule = get_schedule_fn(learning_rate)
            groups = loaded.policy.optimizer.param_groups
            for group in groups:
                group['lr'] = learning_rate
            log(f'[W3 Runner] ✅ Warm start loaded. LR overridden to {learning_rate:.6f} '
                f'(model + lr_schedule + {len(groups)} optimizer param group(s))')
            return loaded
        except Exception as e:
            log(f'[W3 Runner] ⚠️ Warm start failed ({e}), training from scratch.')

    log(f'[W3 Runner] Creating fresh PPO model (lr={learning_rate:.6f})')
    return PPO(
        'CnnPolicy',
        env,
        learning_rate=learning_rate,
        verbose=0,
        seed=seed,
    )
|
|
|
|
|
|
# ---- Training loop ----
|
|
def train_multitrack(model, first_env, total_timesteps, steps_per_switch):
    """
    Train PPO across training tracks by round-robin switching every steps_per_switch steps.

    Args:
        model: PPO model (already set to first_env)
        first_env: The first wrapped training env (already connected)
        total_timesteps: Total training budget across all tracks
        steps_per_switch: Steps per track segment before switching

    Returns:
        env: The last env used (caller must close it)
        segment_rewards: List of (track_name, reward) for each completed segment
    """
    env = first_env
    steps_done = 0
    track_idx = 0  # Start on generated_road (first in TRAINING_TRACKS)
    segment_rewards = []
    # One callback instance reused across every segment, so its internal
    # stuck/frozen state carries over between tracks.
    health_cb = HealthCheckCallback()

    log(f'[W3 Runner] Starting multi-track training:')
    log(f' Total timesteps : {total_timesteps:,}')
    log(f' Steps per switch: {steps_per_switch:,}')
    log(f' Training tracks : {[t[0] for t in TRAINING_TRACKS]}')
    log(f' Rotations : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles')

    while steps_done < total_timesteps:
        track_name, track_env_id = TRAINING_TRACKS[track_idx]
        # The final segment may be shorter so the total budget is never exceeded.
        segment_steps = min(steps_per_switch, total_timesteps - steps_done)

        log(f'\n[W3 Runner] === Segment: {track_name} | '
            f'{steps_done:,}/{total_timesteps:,} steps done | '
            f'segment={segment_steps:,} steps ===')

        # Train segment
        model.learn(
            total_timesteps=segment_steps,
            reset_num_timesteps=False,  # Continuous timestep counter across segments
            callback=health_cb,
        )
        # learn() may stop early via the health callback; the full segment is
        # still counted here, trading exactness for a simple budget loop.
        steps_done += segment_steps

        # Quick segment reward estimate (run one short episode deterministically)
        try:
            seg_reward, _ = evaluate_policy(
                model, env,
                n_eval_episodes=1,
                deterministic=True,
                return_episode_rewards=False,
                warn=False,
            )
            # This exact line format is parsed by wave3_controller.py.
            log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}')
            segment_rewards.append((track_name, float(seg_reward)))
        except Exception as e:
            # Eval failure is non-fatal; record 0.0 so segment counts stay aligned.
            log(f'[W3 Runner][TRAIN] Segment eval failed: {e}')
            segment_rewards.append((track_name, 0.0))

        if steps_done >= total_timesteps:
            break

        # Switch to next training track
        next_track_idx = (track_idx + 1) % len(TRAINING_TRACKS)
        next_track_name, next_env_id = TRAINING_TRACKS[next_track_idx]
        log(f'[W3 Runner] Switching: {track_name} → {next_track_name}')

        try:
            new_env = close_and_switch(env, next_env_id)
            model.set_env(new_env)
            env = new_env
            track_idx = next_track_idx
        except Exception as e:
            log(f'[W3 Runner] ⚠️ Track switch failed: {e}. Retrying in 5s...')
            time.sleep(5)
            try:
                # Pass None: the first attempt already closed the old env, so
                # only the reconnect half of the switch needs retrying.
                new_env = close_and_switch(None, next_env_id)
                model.set_env(new_env)
                env = new_env
                track_idx = next_track_idx
            except Exception as e2:
                log(f'[W3 Runner] ❌ Track switch retry failed: {e2}. Continuing on current track.')
                # Stay on current track — don't crash the whole run
                # NOTE(review): at this point `env` was already closed by the
                # first failed switch attempt, so the next model.learn() call
                # will likely fail on the dead connection — confirm intended.

    log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across '
        f'{len(segment_rewards)} segments.')
    return env, segment_rewards
|
|
|
|
|
|
# ---- Zero-shot evaluation on test tracks ----
|
|
def evaluate_test_tracks(model, current_env, eval_episodes):
    """
    Evaluate the trained model on each test track (zero-shot generalization).

    Switches to each test track, runs eval_episodes episodes, records
    mean_reward and mean_steps. The last test env is returned (not closed);
    the caller is responsible for final teardown.

    Returns:
        test_results: dict of {track_name: {'mean_reward': float, 'mean_steps': float}}
        combined_test_score: sum of mean_rewards across test tracks
        env: the env left open after the last test track
    """
    log(f'\n[W3 Runner] ===== ZERO-SHOT EVALUATION on TEST tracks =====')
    log(f' Test tracks : {[t[0] for t in TEST_TRACKS]}')
    log(f' Eval episodes : {eval_episodes}')

    test_results = {}
    env = current_env

    for track_name, track_env_id in TEST_TRACKS:
        log(f'\n[W3 Runner] Switching to TEST track: {track_name}')
        try:
            env = close_and_switch(env, track_env_id)
        except Exception as e:
            # Record a zeroed entry so the combined score still sums cleanly.
            log(f'[W3 Runner] ❌ Cannot connect to test track {track_name}: {e}')
            test_results[track_name] = {'mean_reward': 0.0, 'mean_steps': 0.0, 'error': str(e)}
            continue

        # Run episodes manually to capture step count
        all_rewards = []
        all_steps = []

        for ep in range(eval_episodes):
            obs, info = env.reset()
            total_reward = 0.0
            steps = 0
            done = False

            # Hard cap at 2000 steps so a non-terminating track can't hang eval.
            while not done and steps < 2000:
                action, _ = model.predict(obs, deterministic=True)
                result = env.step(action)
                # Support both Gymnasium (5-tuple) and legacy Gym (4-tuple) step APIs.
                if len(result) == 5:
                    obs, reward, terminated, truncated, info = result
                    done = terminated or truncated
                else:
                    obs, reward, done, info = result
                total_reward += reward
                steps += 1

            all_rewards.append(total_reward)
            all_steps.append(steps)
            log(f'[W3 Runner] {track_name} ep{ep+1}: reward={total_reward:.1f} steps={steps}')
            time.sleep(0.5)  # brief pause between episodes

        mean_reward = float(np.mean(all_rewards))
        mean_steps = float(np.mean(all_steps))
        # Heuristic: >200 mean steps counts as "actually driving" vs an early crash.
        drove_far = mean_steps > 200

        test_results[track_name] = {
            'mean_reward': mean_reward,
            'mean_steps': mean_steps,
            'drove_far': drove_far,
        }
        verdict = '✅ DRIVES' if drove_far else '❌ CRASHES'
        # This exact line format is parsed by wave3_controller.py.
        log(f'[W3 Runner][TEST] track={track_name} mean_reward={mean_reward:.2f} '
            f'mean_steps={mean_steps:.1f} {verdict}')

    # Combined score = sum of mean_rewards on test tracks
    combined = sum(r['mean_reward'] for r in test_results.values())
    log(f'\n[W3 Runner][TEST] combined_test_score={combined:.4f}')
    log(f'[W3 Runner][TEST] mini_monaco_reward='
        f'{test_results.get("mini_monaco", {}).get("mean_reward", 0.0):.4f}')
    # NOTE(review): warren is no longer in TEST_TRACKS, so this always logs
    # 0.0000; presumably kept for controller compatibility — confirm the
    # parser still expects the line.
    log(f'[W3 Runner][TEST] warren_reward='
        f'{test_results.get("warren", {}).get("mean_reward", 0.0):.4f}')

    return test_results, combined, env
|
|
|
|
|
|
# ---- Main ----
|
|
def main():
    """CLI entry point: parse args, train across tracks, save the model,
    run zero-shot evaluation, and tear down.

    Exit codes used here: 100 = initial sim connection failure,
    101 = model-creation or training failure; success exits normally (0).
    """
    parser = argparse.ArgumentParser(description='Wave 3 Multi-Track PPO Trainer.')
    parser.add_argument('--total-timesteps', type=int, default=200000,
                        help='Total training timesteps across all tracks (default: 200000)')
    parser.add_argument('--steps-per-switch', type=int, default=10000,
                        help='Steps on each track before switching (default: 10000)')
    parser.add_argument('--learning-rate', type=float, default=0.000225,
                        help='PPO learning rate (default: 0.000225 = Phase 2 champion)')
    parser.add_argument('--warm-start', type=str, default=None,
                        help='Path to .zip model for warm start (default: models/champion/model.zip)')
    parser.add_argument('--save-dir', type=str, default=None,
                        help='Directory to save trained model')
    parser.add_argument('--eval-episodes', type=int, default=3,
                        help='Episodes per test track for zero-shot evaluation')
    parser.add_argument('--seed', type=int, default=None,
                        help='Random seed')
    parser.add_argument('--skip-eval', action='store_true',
                        help='Skip zero-shot evaluation (training only)')
    args = parser.parse_args()

    # Resolve warm-start path
    warm_start = args.warm_start
    if warm_start is None:
        # No explicit path: fall back to the champion model when present.
        default_champ = os.path.join(AGENT_DIR, 'models', 'champion', 'model.zip')
        if os.path.exists(default_champ):
            warm_start = default_champ
            log(f'[W3 Runner] Auto-detected warm start: {warm_start}')

    # Timestamped default save dir keeps repeat runs from overwriting each other.
    save_dir = args.save_dir or os.path.join(AGENT_DIR, 'models', 'wave3',
                                             f'trial-{int(time.time())}')

    log(f'[W3 Runner] === Wave 3 Multi-Track Training ===')
    log(f'[W3 Runner] total_timesteps ={args.total_timesteps:,}')
    log(f'[W3 Runner] steps_per_switch={args.steps_per_switch:,}')
    log(f'[W3 Runner] learning_rate ={args.learning_rate:.6f}')
    log(f'[W3 Runner] warm_start ={warm_start}')
    log(f'[W3 Runner] save_dir ={save_dir}')
    log(f'[W3 Runner] eval_episodes ={args.eval_episodes}')

    # ---- 1. Connect to first training track ----
    # Assume sim is already at the main menu (user-started, or previous run exited cleanly).
    # gym.make() on the first track will load it directly from the menu.
    first_track_name, first_env_id = TRAINING_TRACKS[0]
    log(f'\n[W3 Runner] Connecting to first training track: {first_track_name} ({first_env_id})')

    env = None
    try:
        raw_env = gym.make(first_env_id)
        env = wrap_env(raw_env)
        log(f'[W3 Runner] ✅ Connected to {first_env_id}')
    except Exception as e:
        log(f'[W3 Runner] ❌ Failed to connect to first training track: {e}')
        sys.exit(100)  # exit code 100: sim connection failure

    # ---- 2. Create or load model ----
    model = None
    try:
        model = create_or_load_model(env, args.learning_rate, warm_start, args.seed)
    except Exception as e:
        log(f'[W3 Runner] ❌ Model creation failed: {e}')
        try:
            env.close()
            time.sleep(2)  # ADR-006: pause after close
        except Exception:
            pass
        sys.exit(101)  # exit code 101: training-stage error

    # ---- 3. Multi-track training ----
    try:
        env, segment_rewards = train_multitrack(
            model, env,
            total_timesteps=args.total_timesteps,
            steps_per_switch=args.steps_per_switch,
        )
    except Exception as e:
        log(f'[W3 Runner] ❌ Training failed: {e}')
        try:
            env.close()
            time.sleep(2)
        except Exception:
            pass
        sys.exit(101)

    # ---- 4. Save model ----
    # ADR-005: model is always defined before model.save()
    try:
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, 'model')
        model.save(save_path)  # SB3 appends the .zip extension
        log(f'[W3 Runner] ✅ Model saved to {save_path}.zip')
    except Exception as e:
        # Save failure is non-fatal: evaluation results are still useful.
        log(f'[W3 Runner] ⚠️ Model save failed: {e}')

    # ---- 5. Zero-shot evaluation on test tracks ----
    combined_test_score = 0.0
    test_results = {}

    if not args.skip_eval:
        try:
            test_results, combined_test_score, env = evaluate_test_tracks(
                model, env, args.eval_episodes
            )
        except Exception as e:
            log(f'[W3 Runner] ❌ Test evaluation failed: {e}')
            # Ensure combined_test_score = 0 is recorded (trial still valid)

    # ---- 6. Print training summary ----
    log(f'\n[W3 Runner] ===== TRAINING SUMMARY =====')
    if segment_rewards:
        # Group per-segment rewards by track for a mean-per-track summary.
        by_track = {}
        for tname, rew in segment_rewards:
            by_track.setdefault(tname, []).append(rew)
        for tname, rewards in by_track.items():
            log(f'[W3 Runner][TRAIN] {tname}: '
                f'mean={np.mean(rewards):.1f} over {len(rewards)} segments')

    log(f'\n[W3 Runner] ===== TEST SUMMARY (zero-shot generalization) =====')
    for tname, metrics in test_results.items():
        verdict = '✅ DRIVES' if metrics.get('drove_far') else '❌ CRASHES'
        log(f'[W3 Runner][TEST] {tname}: '
            f'reward={metrics.get("mean_reward", 0):.1f} '
            f'steps={metrics.get("mean_steps", 0):.0f} {verdict}')
    log(f'[W3 Runner][TEST] combined_test_score={combined_test_score:.4f}')

    # ---- 7. Teardown ----
    log(f'[W3 Runner] Closing final env...')
    try:
        env.close()
        log(f'[W3 Runner] env.close() complete.')
    except Exception as e:
        log(f'[W3 Runner] Warning: env.close() raised: {e}')
    time.sleep(2)  # ADR-006

    log(f'[W3 Runner] ✅ Multi-track runner complete. Exiting.')


if __name__ == '__main__':
    main()
|