donkeycar-rl-autoresearch/agent/experiments/exp11d_parallel_v61.py

176 lines
6.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 11d: Parallel DummyVecEnv, v6.1 reward (grass + rollback fixes), 180k steps.

Changes from Exp 11c (aborted):
  - Reward v6.1: adds two new termination conditions:
      1. Sustained high CTE (grass exploit fix): if CTE > 4.0 for 20 steps → terminate.
         Stops the generated_track gap exploit where the car exits through a hole
         in the boundary mesh and drives indefinitely on the grass.
      2. No track progress (mountain rollback fix): if active_node doesn't
         advance for 60 steps → terminate.
         Stops the car going up the hill, rolling back, and going up again — it IS
         moving, so StuckTerminationWrapper doesn't fire, but it never makes
         track progress.
     NOTE(review): this script only passes max_cte_terminate / cte_patience to
     SpeedRewardWrapper and logs "(grass fix only)"; confirm whether the
     no-progress terminator is enabled by the wrapper's defaults.
  - Total steps: 180k (vs 250k in 11c — enough budget, not too long)

Infrastructure (unchanged from 11b/11c):
  - DummyVecEnv with two sim instances (9091 + 9093)
  - stuck_steps=40, throttle_min=0.2, lr=0.000725
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
# --- Experiment configuration ---------------------------------------------
# Host the simulator connections are opened against (see make_env).
HOST = '10.0.0.55'
# Forwarded as throttle_min to ThrottleClampWrapper.
THROTTLE_MIN = 0.2
# Learning rate handed to PPO.
LR = 7.25e-4
# Nominal training budget, consumed in checkpoint-sized segments.
TOTAL_STEPS = 180_000
# Directory that receives checkpoints, the rolling 'model' and 'best_model'.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp11d-parallel-v61'

# Create the output directory up front; an existing directory is fine.
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds one wrapped simulator env.

    The factory is what DummyVecEnv expects: nothing is connected until the
    returned callable is invoked.

    Args:
        track_id: Gym environment id passed to ``gym.make``.
        port: Simulator port; combined with the module-level ``HOST``.

    Returns:
        A callable that creates the environment and wraps it with
        ThrottleClampWrapper, StuckTerminationWrapper and SpeedRewardWrapper,
        in that order (innermost first).
    """

    def _init():
        sim = gym.make(track_id, conf={'host': HOST, 'port': port})
        clamped = ThrottleClampWrapper(sim, throttle_min=THROTTLE_MIN)
        guarded = StuckTerminationWrapper(
            clamped, stuck_steps=40, min_displacement=0.5
        )
        # Grass fix: terminate if CTE > 4m for 20 steps.
        return SpeedRewardWrapper(
            guarded, max_cte_terminate=4.0, cte_patience=20
        )

    return _init
# --- Startup banner -------------------------------------------------------
_banner_rule = '=' * 60
_banner_lines = (
    _banner_rule,
    'Exp 11d: Parallel DummyVecEnv, v6.1 reward, 180k steps',
    f' Sim 1: {HOST}:9091 → generated_track',
    f' Sim 2: {HOST}:9093 → mountain_track',
    f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}',
    ' Reward v6.1: speed×CTE + efficiency gate + grass exploit terminator',
    ' max_cte_terminate=4.0, cte_patience=20 (grass fix only)',
    ' Stuck: 40 steps',
    _banner_rule,
)
for _banner_line in _banner_lines:
    log(_banner_line)

# --- Vectorised training environment --------------------------------------
# One simulator per track. The order is significant: the periodic evaluation
# in the training loop reports index 0 as gen_track and index 1 as mountain.
_sim_assignments = (
    ('donkey-generated-track-v0', 9091),
    ('donkey-mountain-track-v0', 9093),
)
env = VecTransposeImage(
    DummyVecEnv([make_env(track, port) for track, port in _sim_assignments])
)
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')

# --- Policy ---------------------------------------------------------------
model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu')
log('PPO created. Starting training...')
# --- Training loop: learn in segments, checkpoint and evaluate after each ---
CHECKPOINT_EVERY = 10000
best_reward = float('-inf')
steps_done = 0
while steps_done < TOTAL_STEPS:
    # The last segment may be shorter so the nominal budget is not exceeded.
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    # NOTE(review): steps_done counts the *requested* steps. PPO collects
    # whole rollouts, so the real number of env steps may be higher —
    # compare against model.num_timesteps to confirm.
    steps_done += seg_steps

    # Keep a numbered checkpoint plus a rolling 'model' file.
    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(SAVE_DIR, 'model'))
    log(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved')

    # Quick deterministic evaluation on the training envs: each env
    # contributes only its first episode, capped at 2000 steps.
    try:
        obs = env.reset()
        ep_rewards = np.zeros(env.num_envs)
        ep_steps = np.zeros(env.num_envs)
        done_mask = np.zeros(env.num_envs, dtype=bool)
        for _ in range(2000):
            action, _state = model.predict(obs, deterministic=True)
            obs, rewards, dones, _infos = env.step(action)
            # Accumulate only for envs whose first episode is still running.
            active = ~done_mask
            ep_rewards[active] += np.asarray(rewards)[active]
            ep_steps[active] += 1
            done_mask |= np.asarray(dones, dtype=bool)
            if done_mask.all():
                break
        status0 = '' if ep_steps[0] >= 2000 else f'❌@{int(ep_steps[0])}'
        status1 = '' if ep_steps[1] >= 2000 else f'❌@{int(ep_steps[1])}'
        log(f' Eval: gen_track={ep_rewards[0]:.1f}r/{int(ep_steps[0])}s {status0} '
            f'mountain={ep_rewards[1]:.1f}r/{int(ep_steps[1])}s {status1}')
        # Model selection uses the summed reward across both tracks.
        total_reward = ep_rewards.sum()
        if total_reward > best_reward:
            best_reward = total_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f' ⭐ NEW BEST: {best_reward:.1f} (combined)')
    except Exception as e:
        # Evaluation is best-effort; a failure must not abort training.
        log(f' Eval error: {e}')
    finally:
        # The evaluation above reset and stepped the *training* envs. With
        # reset_num_timesteps=False, learn() resumes from the observation it
        # cached at the end of the previous rollout, which no longer matches
        # the simulator state. Clearing the cache makes the next learn() call
        # reset the envs and start from a consistent observation.
        # HACK: relies on the private SB3 attribute `_last_obs`.
        model._last_obs = None

model.save(os.path.join(SAVE_DIR, 'model'))
log(f'\nTraining complete. Best combined reward: {best_reward:.1f}')
# Release both training simulators, then pause before reconnecting for the
# final evaluation.
env.close()
time.sleep(5)

# --- Eval on all 4 tracks ---
_eval_rule = '=' * 60
log('\n' + _eval_rule)
log('EVALUATION: best_model on 4 tracks (3 sets each)')
log(_eval_rule)

# (gym environment id, short name used in logs and the summary)
EVAL_TRACKS = [
    ('donkey-mountain-track-v0', 'mountain_track'),
    ('donkey-generated-track-v0', 'generated_track'),
    ('donkey-generated-roads-v0', 'generated_road'),
    ('donkey-minimonaco-track-v0', 'mini_monaco'),
]
# Every evaluation run connects to this single simulator port.
EVAL_PORT = 9091
best_model_path = os.path.join(SAVE_DIR, 'best_model.zip')
# track name -> list of episode lengths, one entry per evaluation set
results_by_track = {}
# Run 3 evaluation sets per track with the best saved model. Each set builds
# a fresh single-env stack, plays one deterministic episode (capped at 2000
# steps) and records how many steps it lasted; a failed set is recorded as 0.
for track_id, track_name in EVAL_TRACKS:
    log(f'\n--- {track_name} ---')
    steps_list = []
    for s in range(1, 4):
        # Outermost object created so far; closed in `finally` so a failure
        # part-way through does not leave the simulator connection open for
        # the next set, which reuses the same port.
        to_close = None
        try:
            raw = gym.make(track_id, conf={'host': HOST, 'port': EVAL_PORT})
            to_close = raw
            ei = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
            ei = StuckTerminationWrapper(ei, stuck_steps=40, min_displacement=0.5)
            ei = SpeedRewardWrapper(ei, max_cte_terminate=4.0, cte_patience=20)
            # Bind the env as a default argument so the lambda returns this
            # exact instance.
            ev = VecTransposeImage(DummyVecEnv([lambda e=ei: e]))
            to_close = ev
            m = PPO.load(best_model_path, env=ev, device='cpu')
            obs = ev.reset()
            total_r, total_s, done = 0.0, 0, False
            while not done and total_s < 2000:
                action, _ = m.predict(obs, deterministic=True)
                result = ev.step(action)
                # Accept both the 4-tuple and the 5-tuple step signature.
                if len(result) == 4:
                    obs, r, d, _ = result
                    done = bool(d[0])
                else:
                    obs, r, t, tr, _ = result
                    done = bool(t[0] or tr[0])
                total_r += float(r[0])
                total_s += 1
            status = '' if total_s >= 2000 else f'❌@{total_s}'
            log(f' Set{s}: {total_r:.1f}r / {total_s}s {status}')
            steps_list.append(total_s)
        except Exception as e:
            log(f' Set{s}: ERROR — {e}')
            steps_list.append(0)
        finally:
            if to_close is not None:
                try:
                    to_close.close()
                except Exception as close_err:
                    # A close failure must not add a second result for this
                    # set or stop the remaining evaluations.
                    log(f' Set{s}: close failed — {close_err}')
            # Pause before the next set reconnects to the simulator.
            time.sleep(3)
    results_by_track[track_name] = steps_list
    log(f' Mean: {np.mean(steps_list):.0f} steps')
# --- Final summary: one line per track with per-set steps and their mean ---
_summary_rule = '=' * 60
log('\n' + _summary_rule)
log('SUMMARY')
log(_summary_rule)
for track_name, steps_list in results_by_track.items():
    steps_str = '/'.join(map(str, steps_list))
    mean = np.mean(steps_list)
    # NOTE(review): the markers for mean >= 1500 and mean < 500 are both empty
    # strings here, so only the middle band is visibly flagged — confirm
    # whether glyphs were lost from these literals.
    if mean >= 1500:
        verdict = ''
    elif mean >= 500:
        verdict = '⚠️'
    else:
        verdict = ''
    log(f' {verdict} {track_name:20s}: {steps_str} mean={mean:.0f}')
log('\n=== Exp 11d COMPLETE ===')