# Source path: donkeycar-rl-autoresearch/agent/experiments/exp12_mountain_single.py
# Repository-viewer listing: 151 lines, 5.3 KiB, Python.
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If their use is intentional, the warning can safely be ignored.

"""
Exp 12: Single track — mountain_track, v6.1 reward, lap-based stopping.
Strategy: train until the car consistently completes multiple laps.
No fixed step count — eval every 5k steps, stop when 3 consecutive laps
achieved in a single eval episode (2000 steps).
Key fixes applied:
- v6.1 reward: speed × CTE, efficiency gate, CTE patience (grass fix),
track progress patience (circle/stuck fix using active_node)
- stuck_steps=40, wall-clock timeout=12s
- Single track: all training budget goes to mountain_track
- throttle_min=0.2 (proven to work in Exp 9 with v5 reward + 90k steps)
Based on Exp 9 success: single track + v5/v6 reward + throttle_min=0.2
CAN learn mountain. The circle/stuck exploit was contaminating multi-track
training. Single track eliminates that interaction.
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
# --- Simulator connection ---------------------------------------------
HOST = '10.0.0.55'
PORT = 9091

# --- Track selection ----------------------------------------------------
TRACK_ID = 'donkey-mountain-track-v0'
TRACK_NAME = 'mountain_track'

# --- Policy / optimiser settings ----------------------------------------
THROTTLE_MIN = 0.2
LR = 7.25e-4

# --- Training schedule and stopping rule --------------------------------
MAX_STEPS = 300_000        # hard safety ceiling; the lap criterion should fire first
EVAL_EVERY = 5_000         # training steps requested between evaluations
EVAL_MAX_STEPS = 2_000     # step budget of a single evaluation episode
LAP_STOP_THRESHOLD = 3     # laps within one eval episode that end training

# --- Output location -----------------------------------------------------
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp12-mountain-single'
# Create the output directory (and any missing parents) before training starts;
# exist_ok=True keeps re-runs from failing when the directory is already there.
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env():
    """Return a zero-argument factory that builds the wrapped track env.

    The factory is what ``DummyVecEnv`` expects: nothing is connected or
    constructed until the returned callable is invoked. When invoked it
    creates the ``TRACK_ID`` gym env against ``HOST``/``PORT`` and layers,
    innermost first: ``ThrottleClampWrapper`` -> ``StuckTerminationWrapper``
    -> ``SpeedRewardWrapper``.
    """
    def _build():
        sim_env = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT})
        clamped = ThrottleClampWrapper(sim_env, throttle_min=THROTTLE_MIN)
        guarded = StuckTerminationWrapper(
            clamped,
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=12.0,
        )
        return SpeedRewardWrapper(
            guarded,
            max_cte_terminate=4.0,
            cte_patience=20,
            # Per the experiment notes: end the episode when the furthest
            # active_node reached has not advanced for 60 steps.
            progress_patience=60,
        )

    return _build
# Startup banner: echo the run configuration so the log is self-describing.
_banner_rule = '=' * 60
for _banner_line in (
    _banner_rule,
    f'Exp 12: Single track — {TRACK_NAME}',
    f' Host: {HOST}:{PORT}',
    f' throttle_min={THROTTLE_MIN}, lr={LR}',
    ' Reward: v6.1 (speed×CTE + efficiency gate + progress termination)',
    f' Stopping: eval every {EVAL_EVERY:,} steps, stop at {LAP_STOP_THRESHOLD} laps',
    f' Safety ceiling: {MAX_STEPS:,} steps',
    _banner_rule,
):
    log(_banner_line)

# One simulator-backed env, vectorised and wrapped in VecTransposeImage
# before being handed to the CNN policy.
_vec_env = DummyVecEnv([make_env()])
env = VecTransposeImage(_vec_env)
log(f' obs={env.observation_space.shape}')

# PPO with an image (CNN) policy, pinned to the CPU.
model = PPO(
    'CnnPolicy',
    env,
    learning_rate=LR,
    verbose=1,
    device='cpu',
)
log('PPO created. Training begins...')
# Errors that a missing or malformed lap-count payload can raise. Such steps are
# skipped (best effort) instead of aborting the evaluation episode.
_LAP_INFO_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def _lap_count_from_info(info):
    """Return the integer ``lap_count`` carried by a step's info payload.

    ``info`` may be a list/tuple of per-env dicts (the first entry is used)
    or a single dict. A missing or falsy ``lap_count`` yields 0.

    Raises:
        One of ``_LAP_INFO_ERRORS`` when the payload has an unexpected shape.
    """
    payload = info[0] if isinstance(info, (list, tuple)) else info
    return int(payload.get('lap_count', 0) or 0)


def _run_eval_episode(policy, vec_env, max_steps):
    """Run one deterministic episode and return ``(reward, steps, laps)``.

    Args:
        policy: object exposing ``predict(obs, deterministic=True)``.
        vec_env: vectorised env; only index 0 of its outputs is read.
        max_steps: step budget; the episode also ends on the first ``done``.

    Returns:
        Tuple of accumulated reward (float), steps taken (int) and the
        highest ``lap_count`` reported by the env during the episode (int).
    """
    obs = vec_env.reset()
    total_reward, n_steps, laps_seen = 0.0, 0, 0
    for _ in range(max_steps):
        action, _state = policy.predict(obs, deterministic=True)
        outcome = vec_env.step(action)
        if len(outcome) == 4:
            # VecEnv-style return: a single done flag per env.
            obs, rewards, dones, info = outcome
            finished = bool(dones[0])
        else:
            # Gymnasium-style return: terminated and truncated are separate.
            obs, rewards, terminated, truncated, info = outcome
            finished = bool(terminated[0] or truncated[0])
        total_reward += float(rewards[0])
        n_steps += 1
        try:
            # Keep the highest lap counter reported so far.
            laps_seen = max(laps_seen, _lap_count_from_info(info))
        except _LAP_INFO_ERRORS:
            # No usable lap information on this step; carry on.
            pass
        if finished:
            break
    return total_reward, n_steps, laps_seen


# Running bests and loop bookkeeping (read again by the final summary).
best_reward = float('-inf')
best_laps = 0
steps_done = 0
stop_reason = None

while steps_done < MAX_STEPS:
    seg = min(EVAL_EVERY, MAX_STEPS - steps_done)
    model.learn(total_timesteps=seg, reset_num_timesteps=False)
    # PPO trains in whole rollouts, so a segment can overshoot `seg`. Read the
    # algorithm's own counter so logs, checkpoint names and the MAX_STEPS
    # ceiling reflect the number of steps actually trained.
    steps_done = int(model.num_timesteps)

    # Save a numbered checkpoint plus a rolling "latest" copy.
    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(SAVE_DIR, 'model'))

    # Eval: count laps in one deterministic episode.
    try:
        ep_reward, ep_steps, laps = _run_eval_episode(model, env, EVAL_MAX_STEPS)
        status = '' if ep_steps >= EVAL_MAX_STEPS else f'❌@{ep_steps}'
        log(f'[{steps_done:,}/{MAX_STEPS:,}] reward={ep_reward:.1f} '
            f'steps={ep_steps} laps={laps} {status}')
        if ep_reward > best_reward:
            best_reward = ep_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f' ⭐ NEW BEST reward: {best_reward:.1f}')
        if laps > best_laps:
            best_laps = laps
            log(f' 🏆 NEW BEST laps: {best_laps}')
        if laps >= LAP_STOP_THRESHOLD:
            stop_reason = f'achieved {laps} laps in eval at {steps_done:,} steps'
            log(f' 🎯 STOPPING CRITERION MET: {stop_reason}')
            break
    except Exception as e:
        # Evaluation is best effort: report the failure and keep training.
        log(f' Eval error: {e}')
        import traceback
        traceback.print_exc()

    # Evaluation stepped the training env directly, so the observation PPO
    # cached from its last rollout no longer matches the simulator state.
    # Re-attaching the env (force_reset defaults to True) makes the next
    # learn() call begin from a fresh reset instead of that stale observation.
    model.set_env(env)
# Release the simulator connection, then pause for three seconds before the
# final report is written.
env.close()
time.sleep(3)

# Final report: why training stopped and the best results observed.
_summary_rule = '=' * 60
_summary_lines = [
    '\n' + _summary_rule,
    'Training complete.',
    f' Stop reason : {stop_reason or "reached max steps"}',
    f' Total steps : {steps_done:,}',
    f' Best laps : {best_laps}',
    f' Best reward : {best_reward:.1f}',
    f' Best model : {SAVE_DIR}/best_model.zip',
    _summary_rule,
    '\n=== Exp 12 COMPLETE ===',
]
for _summary_line in _summary_lines:
    log(_summary_line)