donkeycar-rl-autoresearch/agent/experiments/exp17_parallel_450k.py

200 lines
7.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 17: Parallel DummyVecEnv — generated_track + mountain_track, 450k steps.
Strategy: Exp 11b proved the parallel DummyVecEnv infrastructure is stable.
The only failure mode was insufficient training budget (~45k effective steps
per track). This experiment raises the budget fivefold to ~225k per track.
Changes from Exp 11b:
- HOST: 10.0.0.55 → localhost (WSL/Windows share ports)
- TOTAL_STEPS: 90k → 450k
- CHECKPOINT_EVERY: 6k → 20k
- SAVE_DIR: exp17-parallel-450k
Everything else identical to Exp 11b (same reward, wrappers, lr, throttle_min).
Setup — TWO sim instances required:
Sim 1: launch donkey_sim.exe, select generated_track, port 9091 (default)
Sim 2: launch a second donkey_sim.exe with --port 9093, select mountain_track
Command: donkey_sim.exe --port 9093
Both sims must be running and on the correct tracks before starting this script.
Evaluation:
- Mid-training: both training tracks evaluated at each 20k checkpoint
- End-of-training: all 4 tracks evaluated sequentially (port 9091)
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
# --- Experiment configuration -------------------------------------------
# Address used for every simulator connection made by this script.
HOST = 'localhost'
# Passed to ThrottleClampWrapper as its throttle_min argument.
THROTTLE_MIN = 0.2
# Learning rate handed to PPO.
LR = 7.25e-4
# Overall training budget, and the segment length between checkpoints.
TOTAL_STEPS = 450 * 1000
CHECKPOINT_EVERY = 20 * 1000
# Directory that receives checkpoints and the saved models.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp17-parallel-450k'
# Create the output directory (and any missing parents) up front;
# exist_ok=True keeps re-runs from failing when it is already there.
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env(track_id, port):
    """Return a zero-argument factory for one wrapped simulator environment.

    The environment is not created here; the returned callable builds it when
    invoked (this is the form DummyVecEnv expects for its env list).

    Args:
        track_id: Gym environment id of the track to load.
        port: Port of the simulator instance to connect to on HOST.

    Returns:
        A callable that creates the gym env and stacks the throttle clamp,
        stuck-termination and speed-reward wrappers on it, in that order.
    """
    def _build():
        wrapped = gym.make(track_id, conf={'host': HOST, 'port': port})
        wrapped = ThrottleClampWrapper(wrapped, throttle_min=THROTTLE_MIN)
        wrapped = StuckTerminationWrapper(wrapped, stuck_steps=40, min_displacement=0.5)
        return SpeedRewardWrapper(wrapped)

    return _build
# Startup banner: write the run configuration to the log, framed by two rules.
_RULE = '=' * 60
for _banner_line in (
    _RULE,
    'Exp 17: Parallel DummyVecEnv — 450k steps',
    f' Sim 1: {HOST}:9091 → generated_track',
    f' Sim 2: {HOST}:9093 → mountain_track',
    f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}',
    ' Reward: v6 (speed × CTE_quality, efficiency gate >= 0.15)',
    ' Stuck termination: 40 steps (~2.5s)',
    f' Checkpoints: every {CHECKPOINT_EVERY:,} steps',
    _RULE,
):
    log(_banner_line)
log('Creating DummyVecEnv with two tracks...')
# One factory per simulator instance: index 0 is generated_track on port 9091,
# index 1 is mountain_track on port 9093.
track_factories = [
    make_env('donkey-generated-track-v0', 9091),
    make_env('donkey-mountain-track-v0', 9093),
]
# Wrap the vectorised env so image observations are transposed for the CNN policy.
env = VecTransposeImage(DummyVecEnv(track_factories))
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')

model = PPO(policy='CnnPolicy', env=env, learning_rate=LR, verbose=1, device='cpu')
log('PPO created. Starting training...')
# Step cap for each mid-training evaluation episode.
MID_EVAL_MAX_STEPS = 2000


def _evaluate_on_training_env(model, env, max_steps):
    """Run one deterministic episode per sub-env on the training VecEnv.

    All sub-envs are stepped together. For each one, only the first episode
    after the reset is counted: once a sub-env reports done, its totals are
    frozen while the remaining sub-envs finish (or hit ``max_steps``).

    Args:
        model: Policy exposing ``predict(obs, deterministic=True)``.
        env: Vectorised environment the model was trained on.
        max_steps: Upper bound on the number of vectorised steps.

    Returns:
        ``(ep_rewards, ep_steps)``: per-sub-env reward totals and step counts.
    """
    obs = env.reset()
    ep_rewards = np.zeros(env.num_envs)
    ep_steps = np.zeros(env.num_envs)
    done_mask = np.zeros(env.num_envs, dtype=bool)
    for _ in range(max_steps):
        action, _state = model.predict(obs, deterministic=True)
        obs, rewards, dones, _infos = env.step(action)
        # Only sub-envs still inside their first episode accumulate.
        active = ~done_mask
        ep_rewards[active] += rewards[active]
        ep_steps[active] += 1
        done_mask |= np.asarray(dones, dtype=bool)
        if done_mask.all():
            break
    return ep_rewards, ep_steps


best_reward = float('-inf')
steps_done = 0
while steps_done < TOTAL_STEPS:
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    # learn() only stops at rollout boundaries, so it can run past the
    # requested segment. Read the real counter instead of assuming exactly
    # seg_steps were taken; otherwise checkpoint names and the budget check
    # drift further from the true step count with every segment.
    steps_done = model.num_timesteps
    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(SAVE_DIR, 'model'))
    log(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')
    # Eval on both training tracks using the existing DummyVecEnv connections
    try:
        ep_rewards, ep_steps = _evaluate_on_training_env(model, env, MID_EVAL_MAX_STEPS)
        status0 = '' if ep_steps[0] >= MID_EVAL_MAX_STEPS else f'❌@{int(ep_steps[0])}'
        status1 = '' if ep_steps[1] >= MID_EVAL_MAX_STEPS else f'❌@{int(ep_steps[1])}'
        log(f' Eval: gen_track={ep_rewards[0]:.1f}r/{int(ep_steps[0])}s {status0} '
            f'mountain={ep_rewards[1]:.1f}r/{int(ep_steps[1])}s {status1}')
        total_reward = ep_rewards.sum()
        if total_reward > best_reward:
            best_reward = total_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f' ⭐ NEW BEST: {best_reward:.1f} combined reward')
    except Exception as e:
        # Best effort: a failed evaluation must not abort the training run.
        log(f' Eval error: {e}')
        import traceback
        traceback.print_exc()
    finally:
        # The evaluation stepped the training env behind PPO's back, so the
        # observation PPO cached for its next rollout is stale. Clearing it
        # makes the next learn(reset_num_timesteps=False) reset the env and
        # start from a fresh observation.
        # NOTE(review): _last_obs is an SB3-internal attribute; re-check on upgrades.
        model._last_obs = None
model.save(os.path.join(SAVE_DIR, 'model'))
log(f'\nTraining complete. Best combined reward: {best_reward:.1f}')
env.close()
# Pause before the final evaluation reconnects on port 9091
# (presumably to let the simulators drop the training connections — TODO confirm).
time.sleep(5)
# --- Final eval on all 4 tracks (sequential, port 9091) ---
log('\n' + '=' * 60)
log('FINAL EVALUATION: best_model on 4 tracks (3 sets each)')
log('=' * 60)
# (gym env id, display name) for every track evaluated.
EVAL_TRACKS = [
    ('donkey-generated-track-v0', 'generated_track'),
    ('donkey-mountain-track-v0', 'mountain_track'),
    ('donkey-minimonaco-track-v0', 'mini_monaco'),
    ('donkey-generated-roads-v0', 'generated_road'),
]
EVAL_PORT = 9091
EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
best_model_path = os.path.join(SAVE_DIR, 'best_model.zip')
if not os.path.exists(best_model_path):
    # best_model is only written after a successful mid-training eval; if
    # none succeeded, evaluate the last saved model rather than erroring on
    # every set.
    best_model_path = os.path.join(SAVE_DIR, 'model.zip')
    log(f' best_model.zip not found; evaluating {best_model_path} instead')


def _run_eval_episode(track_id, model_path):
    """Run one deterministic episode of the saved model on ``track_id``.

    Opens a fresh simulator connection on EVAL_PORT, applies the same wrapper
    stack as training, and steps until the episode ends or EVAL_MAX_STEPS is
    reached. The environment is closed on every exit path so a failed run
    does not leave the connection open for the next one.

    Args:
        track_id: Gym environment id of the track to evaluate.
        model_path: Path to the saved PPO model (.zip).

    Returns:
        ``(total_reward, total_steps)`` for the episode.
    """
    raw = gym.make(track_id, conf={'host': HOST, 'port': EVAL_PORT})
    eval_env = None
    try:
        inner = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
        inner = StuckTerminationWrapper(inner, stuck_steps=40, min_displacement=0.5)
        inner = SpeedRewardWrapper(inner)
        eval_env = VecTransposeImage(DummyVecEnv([lambda e=inner: e]))
        eval_model = PPO.load(model_path, env=eval_env, device='cpu')
        obs = eval_env.reset()
        total_r, total_s, done = 0.0, 0, False
        while not done and total_s < EVAL_MAX_STEPS:
            action, _ = eval_model.predict(obs, deterministic=True)
            # VecEnv.step always returns (obs, rewards, dones, infos).
            obs, r, d, _info = eval_env.step(action)
            done = bool(d[0])
            total_r += float(r[0])
            total_s += 1
        return total_r, total_s
    finally:
        # Close the outermost env that exists; closing it closes the raw env too.
        (eval_env if eval_env is not None else raw).close()


results_by_track = {}
for track_id, track_name in EVAL_TRACKS:
    log(f'\n--- {track_name} ---')
    steps_list = []
    for s in range(1, EVAL_SETS + 1):
        try:
            total_r, total_s = _run_eval_episode(track_id, best_model_path)
        except Exception as e:
            # Best effort: record the failed set as 0 steps and keep going.
            log(f' Set {s}: ERROR — {e}')
            steps_list.append(0)
        else:
            status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
            log(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
            steps_list.append(total_s)
        # Pause between runs before reconnecting to the same simulator port.
        time.sleep(3)
    mean_steps = np.mean(steps_list) if steps_list else 0
    results_by_track[track_name] = steps_list
    log(f' Mean: {mean_steps:.0f} steps')
# Per-track summary: the individual step counts, their mean, and a verdict marker.
log('\n' + '=' * 60)
log('SUMMARY')
log('=' * 60)
for track_name, steps_list in results_by_track.items():
    steps_str = '/'.join(str(s) for s in steps_list)
    mean = np.mean(steps_list)
    # Three tiers by mean episode length. The failing tier must be visibly
    # different from the passing one, so it uses the same ❌ marker as the
    # per-episode eval logs (blank = ok, ❌ = failed).
    if mean >= 1500:
        verdict = ''
    elif mean >= 500:
        verdict = '⚠️'
    else:
        verdict = '❌'
    log(f' {verdict} {track_name:20s}: {steps_str} mean={mean:.0f}')
log('\n=== Exp 17 COMPLETE ===')