donkeycar-rl-autoresearch/agent/experiments/exp11b_parallel_v6.py

202 lines
7.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 11b: Parallel DummyVecEnv — generated_track + mountain_track, v6 reward.
Changes from Exp 11 (aborted):
- Reward v6: speed × CTE_quality + efficiency GATE (prevents circular driving)
- stuck_steps: 80 → 40 (faster termination when stuck against barriers)
- Everything else identical: same tracks, same hyperparameters, same parallel setup
Setup:
- Sim 1: 10.0.0.55:9091 → generated_track
- Sim 2: 10.0.0.55:9093 → mountain_track
- DummyVecEnv wraps both → PPO sees both tracks in every rollout batch
- NO env closing, NO set_env(), NO track switching
Hypothesis: v6 reward fixes circular driving exploit seen in Exp 11 while
preserving gradient signal on mountain_track hills. Parallel envs provide
stable multi-track learning (no catastrophic forgetting).
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
# Address of the machine running the simulators; every env connects here.
HOST = '10.0.0.55'
# Handed to ThrottleClampWrapper as its throttle_min argument.
THROTTLE_MIN = 0.2
# PPO learning rate.
LR = 7.25e-4
# Training budget (timesteps) requested across all training segments.
TOTAL_STEPS = 90_000
# Checkpoints, the latest model and the best model are all written here.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp11b-parallel-v6'
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds one wrapped simulator env.

    The env is created lazily (only when the returned callable is invoked),
    which is the form the ``DummyVecEnv`` constructor in this script is given.

    Args:
        track_id: Gym environment id of the track to load.
        port: Port of the simulator on ``HOST`` to connect to.

    Returns:
        A callable taking no arguments that creates the env with
        ``gym.make`` and wraps it, innermost to outermost, in
        ``ThrottleClampWrapper``, ``StuckTerminationWrapper`` and
        ``SpeedRewardWrapper``.
    """
    def _init():
        # Outermost wrapper is the v6 reward: speed×CTE + efficiency gate.
        return SpeedRewardWrapper(
            StuckTerminationWrapper(
                ThrottleClampWrapper(
                    gym.make(track_id, conf={'host': HOST, 'port': port}),
                    throttle_min=THROTTLE_MIN,
                ),
                stuck_steps=40,
                min_displacement=0.5,
            )
        )

    return _init
# Startup banner: one log() call per line, framed by two 60-character rules.
_rule = '=' * 60
for _banner_line in (
    _rule,
    'Exp 11b: Parallel DummyVecEnv — v6 reward (anti-circle gate)',
    f' Sim 1: {HOST}:9091 → generated_track',
    f' Sim 2: {HOST}:9093 → mountain_track',
    f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}',
    ' Reward: v6 (speed × CTE_quality, gated by efficiency >= 0.15)',
    ' Stuck: 40 steps (~2.5s)',
    ' Method: DummyVecEnv (both tracks in every PPO batch)',
    _rule,
):
    log(_banner_line)
# Build the two-track vectorized env: one simulator port per track, in the
# order (generated_track, mountain_track). Later code indexes envs by position.
log('Creating DummyVecEnv with two tracks...')
_train_sims = (
    ('donkey-generated-track-v0', 9091),
    ('donkey-mountain-track-v0', 9093),
)
_env_factories = [make_env(_tid, _prt) for _tid, _prt in _train_sims]
env = VecTransposeImage(DummyVecEnv(_env_factories))
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')

# PPO with a CNN policy, trained on CPU at the configured learning rate.
model = PPO(
    'CnnPolicy',
    env,
    learning_rate=LR,
    verbose=1,
    device='cpu',
)
log('PPO created. Starting training...')
# Train in segments so that a checkpoint and a quick eval land between them.
CHECKPOINT_EVERY = 6000
# Per-env step cap for the quick eval that follows every segment.
QUICK_EVAL_MAX_STEPS = 2000
best_reward = float('-inf')
steps_done = 0
while steps_done < TOTAL_STEPS:
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    # PPO only stops on whole-rollout boundaries (n_steps * num_envs), so a
    # segment normally runs past seg_steps. Take the model's own counter so
    # checkpoint names, the progress log and the TOTAL_STEPS budget reflect
    # the timesteps that were really trained.
    steps_done = model.num_timesteps

    # Save checkpoint
    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(SAVE_DIR, 'model'))  # latest for crash recovery
    log(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

    # Quick eval on both tracks simultaneously. Only the first episode of each
    # env is scored; once an env reports done it is masked out.
    # NOTE(review): this steps the *training* env between learn() calls, so
    # the observation PPO cached at the end of the last rollout may no longer
    # match the env state when the next segment starts — confirm against
    # SB3's handling of reset_num_timesteps=False.
    try:
        obs = env.reset()
        ep_rewards = np.zeros(env.num_envs)
        ep_steps = np.zeros(env.num_envs)
        done_mask = np.zeros(env.num_envs, dtype=bool)
        for _ in range(QUICK_EVAL_MAX_STEPS):
            action, _state = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            # Snapshot who is still running *before* folding in this step's
            # dones, so the terminal step itself is still counted.
            active = ~done_mask
            ep_rewards[active] += np.asarray(rewards)[active]
            ep_steps[active] += 1
            done_mask |= np.asarray(dones, dtype=bool)
            if done_mask.all():
                break
        status0 = '' if ep_steps[0] >= QUICK_EVAL_MAX_STEPS else f'❌@{int(ep_steps[0])}'
        status1 = '' if ep_steps[1] >= QUICK_EVAL_MAX_STEPS else f'❌@{int(ep_steps[1])}'
        log(f' Eval: gen_track={ep_rewards[0]:.1f}r/{int(ep_steps[0])}s {status0} '
            f'mountain={ep_rewards[1]:.1f}r/{int(ep_steps[1])}s {status1}')
        # Model selection uses the summed reward over both tracks.
        total_reward = ep_rewards.sum()
        if total_reward > best_reward:
            best_reward = total_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f' ⭐ NEW BEST: {best_reward:.1f} (combined)')
    except Exception as e:
        # Best-effort: a failed eval must not abort training.
        log(f' Eval error: {e}')
        import traceback
        traceback.print_exc()

model.save(os.path.join(SAVE_DIR, 'model'))
# Report the outcome and list everything that was written to SAVE_DIR,
# with sizes in whole mebibytes (rounded down).
log(f'\nTraining complete. Best combined reward: {best_reward:.1f}')
log(f'Checkpoints in {SAVE_DIR}:')
_bytes_per_mb = 1024 * 1024
for _entry in sorted(os.listdir(SAVE_DIR)):
    _entry_path = os.path.join(SAVE_DIR, _entry)
    _size_mb = os.path.getsize(_entry_path) // _bytes_per_mb
    log(f' {_entry} ({_size_mb}MB)')

# Close training env, then pause before the evaluation phase reconnects.
env.close()
time.sleep(5)
# --- Eval on all 4 tracks using sim 1 (port 9091) ---
# We use a single sim for sequential eval since we only need one track at a time
log('\n' + '='*60)
log('EVALUATION: best_model on 4 tracks (3 sets each)')
log('='*60)
EVAL_TRACKS = [
    ('donkey-mountain-track-v0', 'mountain_track'),
    ('donkey-generated-track-v0', 'generated_track'),
    ('donkey-generated-roads-v0', 'generated_road'),
    ('donkey-minimonaco-track-v0', 'mini_monaco'),
]
EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
EVAL_PORT = 9091
best_model_path = os.path.join(SAVE_DIR, 'best_model.zip')
if not os.path.exists(best_model_path):
    # best_model is only written after a successful quick eval; if none ever
    # succeeded, evaluate the latest saved model rather than failing every run.
    latest_model_path = os.path.join(SAVE_DIR, 'model.zip')
    log(f' WARNING: {best_model_path} not found, evaluating {latest_model_path} instead')
    best_model_path = latest_model_path
results_by_track = {}
for track_id, track_name in EVAL_TRACKS:
    log(f'\n--- {track_name} ---')
    steps_list = []
    for s in range(1, EVAL_SETS + 1):
        eval_env = None
        try:
            # Create fresh env for each eval run, with the same wrapper stack
            # as training (make_env), so eval and training cannot drift apart.
            eval_env = DummyVecEnv([make_env(track_id, EVAL_PORT)])
            eval_env = VecTransposeImage(eval_env)
            eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')
            obs = eval_env.reset()
            total_r, total_s, done = 0.0, 0, False
            while not done and total_s < EVAL_MAX_STEPS:
                action, _state = eval_model.predict(obs, deterministic=True)
                # A VecEnv step always yields (obs, rewards, dones, infos).
                obs, r, d, info = eval_env.step(action)
                done = bool(d[0])
                total_r += float(r[0])
                total_s += 1
            status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
            log(f' Set{s}: {total_r:.1f}r / {total_s}s {status}')
            steps_list.append(total_s)
        except Exception as e:
            # A failed run counts as 0 steps so it drags the track mean down.
            log(f' Set{s}: ERROR — {e}')
            steps_list.append(0)
        finally:
            # Always release the simulator connection; the next run reuses
            # the same port. A close failure is logged separately so it can
            # never add a second result entry for this set.
            if eval_env is not None:
                try:
                    eval_env.close()
                except Exception as close_err:
                    log(f' Set{s}: close failed — {close_err}')
            time.sleep(3)
    mean_steps = np.mean(steps_list) if steps_list else 0
    results_by_track[track_name] = steps_list
    log(f' Mean: {mean_steps:.0f} steps')
# Summary: one line per track with the per-set step counts, their mean and a
# verdict marker (pass at mean >= 1500 steps, warning at >= 500, fail below).
log('\n' + '='*60)
log('SUMMARY')
log('='*60)
for track_name, steps_list in results_by_track.items():
    steps_str = '/'.join(str(s) for s in steps_list)
    # Guard the empty case so np.mean never returns nan with a warning.
    mean = np.mean(steps_list) if steps_list else 0
    # Pass and fail previously both rendered as an empty string, which made
    # them indistinguishable in the log; give each outcome its own marker.
    if mean >= 1500:
        verdict = '✅'
    elif mean >= 500:
        verdict = '⚠️'
    else:
        verdict = '❌'
    log(f' {verdict} {track_name:20s}: {steps_str} mean={mean:.0f}')
log(f'\n=== Exp 11b COMPLETE ===')