donkeycar-rl-autoresearch/agent/experiments/exp9_mountain_v5_throttle02.py

129 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 9: mountain_track, v5 reward, throttle_min=0.2
ONE VARIABLE CHANGED from Exp8: throttle_min 0.5 → 0.2
Hypothesis: v5 reward (speed × CTE) has non-zero gradient on hill.
Model can learn to output high throttle when needed even with 0.2 floor.
Full throttle range [0.2, 1.0] allows model to also slow for corners.
If this works: can drive mountain_track AND potentially mini_monaco corners.
If this fails: car stalls on hill, confirming 0.5 minimum is physically required.
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, _send_exit_scene, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym, numpy as np
# Experiment hyperparameters. Per the module docstring, only THROTTLE_MIN
# differs from Exp8 (0.5 -> 0.2); everything else is held constant.
THROTTLE_MIN = 0.2  # ← ONLY CHANGE from Exp8
LR = 7.25e-4  # same as Exp8
TOTAL_STEPS = 90_000  # same as Exp8
STEPS_PER_SEG = 6_000  # same as Exp8 — 90,000 / 6,000 = 15 checkpoints
# Output directory for this experiment's saved models.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp9-mountain-v5-throttle02'
# Create the output directory (and any missing parents) before training starts;
# exist_ok=True means an already-existing directory is not an error.
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env():
    """Build the wrapped mountain-track environment used for training.

    The raw 'donkey-mountain-track-v0' env is wrapped, innermost first, by
    ThrottleClampWrapper (throttle_min=THROTTLE_MIN), StuckTerminationWrapper
    (stuck_steps=80, min_displacement=0.5) and SpeedRewardWrapper.

    Returns:
        The outermost wrapper (a SpeedRewardWrapper instance).
    """
    return SpeedRewardWrapper(
        StuckTerminationWrapper(
            ThrottleClampWrapper(
                gym.make('donkey-mountain-track-v0'),
                throttle_min=THROTTLE_MIN,
            ),
            stuck_steps=80,
            min_displacement=0.5,
        )
    )
# Startup banner: write the experiment configuration to the log, framed by
# two 60-character rules.
_banner_rule = '=' * 60
for _banner_line in (
    _banner_rule,
    'Exp 9: mountain_track, v5 reward, throttle_min=0.2',
    'ONE CHANGE from Exp8: throttle_min 0.5 → 0.2',
    f' lr={LR}, total_steps={TOTAL_STEPS:,}, steps_per_seg={STEPS_PER_SEG:,}',
    ' Hypothesis: v5 gradient non-zero on hill → model learns high throttle',
    f' Save: {SAVE_DIR}',
    _banner_rule,
):
    log(_banner_line)
# Clear previous sim state: open a throwaway env on the training track, send
# the exit-scene request through it, then close it again.
# NOTE(review): the sleeps presumably give the simulator time to settle between
# steps — confirm the durations against the simulator's behavior.
log('Clearing sim state...')
tmp = gym.make('donkey-mountain-track-v0')
time.sleep(2)
_send_exit_scene(tmp, verbose=False)
tmp.close()
time.sleep(5)
# Single connection for entire run: one wrapped env inside a DummyVecEnv,
# passed through VecTransposeImage before being handed to the CNN policy.
_train_vec_env = DummyVecEnv([make_env])
env = VecTransposeImage(_train_vec_env)
model = PPO(
    'CnnPolicy',
    env,
    learning_rate=LR,
    verbose=1,
    device='cpu',
)
log('Connected. Training begins on mountain_track with throttle_min=0.2')
log('Watch: does model get over the hill?')

# Loop state for the segmented training run that follows.
best_reward = float('-inf')  # any finite eval reward beats this
steps_done = 0
seg_num = 0
# Train in segments of at most STEPS_PER_SEG steps. After each segment: save a
# numbered checkpoint, run one deterministic evaluation episode (capped at
# 2000 steps) on the training env, and save 'best_model' whenever the episode
# reward beats the best seen so far.
while steps_done < TOTAL_STEPS:
    seg_steps = min(STEPS_PER_SEG, TOTAL_STEPS - steps_done)
    seg_num += 1
    # Fixed: the two step counts were printed with no separator between them.
    log(f'\n[Seg {seg_num}] steps {steps_done:,} → {steps_done + seg_steps:,}')
    # reset_num_timesteps=False continues the same run across segments.
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    steps_done += seg_steps

    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    log(f'[Seg {seg_num}] Checkpoint: {ckpt}.zip')

    # Evaluation is best-effort: any failure is logged and training continues
    # with the next segment.
    # NOTE(review): this steps the same env the model trains on; with
    # reset_num_timesteps=False the next learn() call may resume from the
    # observation it cached before this episode — confirm that is acceptable.
    try:
        obs = env.reset()
        ep_reward, ep_steps, done = 0.0, 0, False
        while not done and ep_steps < 2000:
            action, _ = model.predict(obs, deterministic=True)
            result = env.step(action)
            # Accept both the 5-tuple (terminated, truncated) and the 4-tuple
            # (done) step formats; values are indexed [0] for the single env.
            if len(result) == 5:
                obs, r, t, tr, _ = result
                done = bool(t[0] or tr[0])
            else:
                obs, r, d, _ = result
                done = bool(d[0])
            ep_reward += float(r[0])
            ep_steps += 1
        log(f'[Seg {seg_num}] Eval: {ep_reward:.1f} reward / {ep_steps} steps (deterministic)')
        if ep_reward > best_reward:
            best_reward = ep_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f'[Seg {seg_num}] ⭐ NEW BEST: {best_reward:.1f}')
    except Exception as e:
        log(f'[Seg {seg_num}] Eval error: {e}')

# Training finished: release the training env before the per-track evaluation
# opens its own connections.
env.close()
time.sleep(2)
# Eval best_model on all tracks
# Path of the 'best_model' archive written during training; the per-track
# evaluation loads the policy from here.
best_path = os.path.join(SAVE_DIR, 'best_model.zip')
_summary_msg = f'\nTraining complete. Best reward: {best_reward:.1f}'
log(_summary_msg)
def eval_track(current_id, track_id, name, n=3):
    """Evaluate the saved best model on one track and log per-episode results.

    First opens a throwaway env for ``current_id`` and sends the exit-scene
    request through it, then builds a fresh env for ``track_id`` with the same
    wrapper stack and parameters as ``make_env`` and loads the policy from the
    module-level ``best_path``. Runs ``n`` deterministic episodes, each capped
    at 2000 steps, and logs reward, step count and the mean step count.

    Args:
        current_id: Gym id of the scene to exit before evaluating.
        track_id: Gym id of the track to evaluate on.
        name: Human-readable label used in the log header.
        n: Number of evaluation episodes.

    Returns:
        ``track_id``, so the caller can pass it as ``current_id`` next time.
    """
    log(f'\n--- EVAL: {name} ---')

    # Leave the previously loaded scene before connecting to the new track.
    tmp2 = gym.make(current_id)
    time.sleep(2)
    _send_exit_scene(tmp2, verbose=False)
    tmp2.close()
    time.sleep(5)

    def _make_eval_env():
        # Keyword arguments mirror make_env() rather than relying on the
        # positional order of StuckTerminationWrapper's parameters.
        return SpeedRewardWrapper(
            StuckTerminationWrapper(
                ThrottleClampWrapper(gym.make(track_id), throttle_min=THROTTLE_MIN),
                stuck_steps=80,
                min_displacement=0.5,
            )
        )

    ev = VecTransposeImage(DummyVecEnv([_make_eval_env]))
    try:
        # Loading happens inside the try so a missing or corrupt best_model
        # still releases the env that was just opened.
        m = PPO.load(best_path, env=ev, device='cpu')
        results = []
        for ep in range(1, n + 1):
            obs = ev.reset()
            total, steps, done = 0.0, 0, False
            while not done and steps < 2000:
                action, _ = m.predict(obs, deterministic=True)
                result = ev.step(action)
                # Accept both the 5-tuple and 4-tuple step formats; values are
                # indexed [0] for the single wrapped env.
                if len(result) == 5:
                    obs, r, t, tr, _ = result
                    done = bool(t[0] or tr[0])
                else:
                    obs, r, d, _ = result
                    done = bool(d[0])
                total += float(r[0])
                steps += 1
            status = '✅ FULL' if steps >= 2000 else f'❌ crash@{steps}'
            log(f' ep{ep}: {total:.1f} reward / {steps} steps — {status}')
            results.append(steps)
            time.sleep(1)
        # Skip the mean when no episodes ran (n <= 0); np.mean([]) yields nan
        # plus a RuntimeWarning.
        if results:
            log(f' Mean steps: {np.mean(results):.0f}')
    finally:
        # Always release the evaluation env, even when loading or an episode fails.
        ev.close()
    time.sleep(3)
    return track_id
# Evaluate the best model on the training track first, then on three other
# tracks. Each call returns the track it just used, which becomes the scene
# to exit at the start of the next call.
current = 'donkey-mountain-track-v0'
for _eval_track_id, _eval_label in (
    ('donkey-mountain-track-v0', 'mountain_track (training)'),
    ('donkey-generated-track-v0', 'generated_track (zero-shot)'),
    ('donkey-minimonaco-track-v0', 'mini_monaco (zero-shot)'),
    ('donkey-generated-roads-v0', 'generated_road (zero-shot)'),
):
    current = eval_track(current, _eval_track_id, _eval_label)

# Closing summary: Exp8 reference numbers (hard-coded here) for manual comparison.
for _closing_line in (
    '\n=== Exp 9 COMPLETE ===',
    'Compare with Exp8 best_model results:',
    ' mountain_track: 382/529/182 (mean=364)',
    ' mini_monaco: 154/155/104 (mean=138) ← crashed at one corner',
):
    log(_closing_line)