# 210 lines · 6.9 KiB · Python
"""
Exp 14: Single track — mountain_track, v5 reward, lap-based stopping.

v5 reward (speed x CTE) is required for mountain_track hills:
  - v4 (base x efficiency x speed) gives ZERO gradient on hills
    (efficiency=0, speed=0, all terms=0 simultaneously → no learning signal)
  - v5 (speed x CTE_quality) has non-zero gradient on hills:
    reward = speed_norm x cte_quality → dR/dspeed > 0 whenever
    cte_quality > 0 and speed is below the normalisation cap.
    Model CAN learn to apply more throttle on the hill.
  - Proved in Exp 9 (mountain only, v5, throttle_min=0.2 → 2000/2000 steps).

Circle exploit risk on mountain_track is lower than generated_track:
  - Mountain track geometry doesn't have flat open circling areas
  - The hill itself prevents sustained circling
  - Exp 9 succeeded without circle detection

Same approach as Exp 13 (which worked):
  - Single track, simple wrapper stack, lap-based stopping
  - throttle_min=0.2 (v5 gradient teaches model to self-select high throttle)
"""
|
|
import math
import os
import sys
import time
import traceback
from datetime import datetime

# The runner module lives outside site-packages, so the path must be extended
# before it is imported. Its position ahead of the third-party imports is kept
# from the original file in case it has import-time side effects.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

from donkeycar_sb3_runner import ThrottleClampWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
|
|
|
|
# ---- Experiment configuration ----
# Simulator endpoint; passed as the `conf` dict to gym.make().
HOST = '10.0.0.55'
PORT = 9091

# Gym environment id used for training, and a human-readable name for logs.
TRACK_ID = 'donkey-mountain-track-v0'
TRACK_NAME = 'mountain_track'

# Lower throttle bound handed to ThrottleClampWrapper.
THROTTLE_MIN = 0.2

# PPO learning rate.
LR = 0.000725

# Hard ceiling on total training timesteps (safety stop).
MAX_STEPS = 300_000

# Training is run in segments of this many timesteps, each followed by one
# evaluation episode and a checkpoint.
EVAL_EVERY = 5000

# Training stops as soon as an evaluation episode completes this many laps.
LAP_STOP = 3

# Directory that receives checkpoints, the rolling model and the best model.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp14-mountain-v5'
|
|
# Create the output directory before training starts; exist_ok=True makes a
# re-run against an already existing directory a no-op instead of an error.
os.makedirs(SAVE_DIR, exist_ok=True)
|
|
|
|
|
|
def log(msg):
    """Print *msg* to stdout prefixed with the local wall-clock time.

    The prefix has the form ``[HH:MM:SS]``. Output is flushed immediately so
    progress lines appear promptly when stdout is redirected or piped.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'[{stamp}] {msg}', flush=True)
|
|
|
|
|
|
# ---- v5 reward (speed x CTE_quality) ----
|
|
class V5RewardWrapper(gym.Wrapper):
    """
    v5 reward: speed_norm x cte_quality.

    Non-zero gradient on hills — model learns to apply throttle.
    Simple, no efficiency gate, no extra terminators.

    The wrapped environment's own reward is discarded and replaced by:

    * ``-1.0`` on any step where the wrapped env reports the episode as
      finished (terminated or truncated);
    * a scaled negative penalty plus forced termination when a newly counted
      lap is shorter than ``min_lap_time`` (short-lap exploit);
    * otherwise ``cte_quality * speed_norm``, both factors in ``[0, 1]``.

    All values are read from the ``info`` dict returned by the wrapped env
    (keys ``lap_count``, ``last_lap_time``, ``cte``, ``speed``). Missing,
    unparsable or non-finite values fall back to neutral defaults so the
    reward handed to the learner is always a finite number.

    Args:
        env: Environment to wrap. Both the 5-tuple and the legacy 4-tuple
            ``step`` return shapes are supported.
        max_cte: Cross-track error at which ``cte_quality`` reaches 0.
            Must be positive.
        min_lap_time: Laps reported faster than this are treated as an
            exploit and end the episode with a penalty.

    Raises:
        ValueError: If ``max_cte`` is not strictly positive.
    """

    # Speed at which speed_norm saturates at 1.0.
    _SPEED_CAP = 10.0

    def __init__(self, env, max_cte=8.0, min_lap_time=5.0):
        # A non-positive max_cte would divide by zero or push the reward
        # above 1, so reject it up front.
        if max_cte <= 0:
            raise ValueError(f'max_cte must be > 0, got {max_cte!r}')
        super().__init__(env)
        self.max_cte = max_cte
        self.min_lap_time = min_lap_time
        self._last_lc = 0  # highest lap count seen in the current episode

    def reset(self, **kwargs):
        """Reset the lap tracker and forward the reset to the wrapped env."""
        self._last_lc = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and substitute the v5 reward.

        The return tuple has the same arity (4 or 5) as the wrapped env's.
        """
        result = self.env.step(action)
        five_tuple = len(result) == 5
        if five_tuple:
            obs, _env_reward, terminated, truncated, info = result
        else:
            obs, _env_reward, done, info = result
            terminated, truncated = done, False

        reward, force_term = self._compute(terminated or truncated, info)
        if force_term:
            terminated = True

        if five_tuple:
            return obs, reward, terminated, truncated, info
        return obs, reward, terminated or truncated, info

    @staticmethod
    def _finite_float(info, key, default):
        """Return ``info[key]`` as a finite float, else ``default``.

        ``default`` is used when the key is missing, falsy (None, 0, ''),
        not convertible to float, or NaN/inf.
        """
        try:
            value = float(info.get(key, default) or default)
        except (TypeError, ValueError):
            return default
        return value if math.isfinite(value) else default

    def _compute(self, done, info):
        """Return ``(reward, force_terminate)`` for the current step."""
        if done:
            return -1.0, False

        # Short-lap exploit check
        try:
            lap_count = int(info.get('lap_count', 0) or 0)
        except (TypeError, ValueError, OverflowError):
            # Unusable lap counter: behave as if no new lap was recorded.
            lap_count = self._last_lc
        if lap_count > self._last_lc:
            self._last_lc = lap_count
            lap_time = self._finite_float(info, 'last_lap_time', 999.0)
            if lap_time < self.min_lap_time:
                # Penalty grows as the lap gets shorter; the 0.1 floor keeps
                # it bounded for near-zero lap times.
                penalty = -10.0 * (self.min_lap_time / max(lap_time, 0.1))
                return penalty, True

        cte = self._finite_float(info, 'cte', 0.0)
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)

        # Negative speed contributes nothing.
        speed = max(0.0, self._finite_float(info, 'speed', 0.0))
        speed_norm = min(speed / self._SPEED_CAP, 1.0)

        return cte_quality * speed_norm, False
|
|
|
|
|
|
def make_env():
    """Return a zero-argument factory that builds the training environment.

    The factory (suitable for ``DummyVecEnv``) connects to the simulator at
    ``HOST:PORT`` on track ``TRACK_ID`` and applies, innermost first,
    ``ThrottleClampWrapper`` (with ``THROTTLE_MIN``) and ``V5RewardWrapper``.
    Nothing is created until the returned callable is invoked.
    """
    def _init():
        raw = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT})
        env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
        env = V5RewardWrapper(env)
        return env

    return _init
|
|
|
|
|
|
# ---- Startup banner: record the run configuration in the log ----
log('=' * 60)
log(f'Exp 14: {TRACK_NAME}, v5 reward')
log(f' Host: {HOST}:{PORT}')
log(f' throttle_min={THROTTLE_MIN}, lr={LR}')
log(' Reward: v5 (speed x CTE_quality) — non-zero gradient on hills')
log(' Wrappers: ThrottleClamp + V5Reward ONLY')
log(f' Stop: eval every {EVAL_EVERY:,} steps, stop at {LAP_STOP} laps')
log(f' Safety ceiling: {MAX_STEPS:,} steps')
log('=' * 60)
|
|
|
|
# ---- Switch sim to mountain_track ----
# Must exit current scene via existing connection, then reconnect fresh.
log('Switching sim to mountain_track...')
_tmp = gym.make('donkey-generated-track-v0', conf={'host': HOST, 'port': PORT})
try:
    time.sleep(2)
    _tmp.unwrapped.viewer.exit_scene()
    time.sleep(0.5)
except Exception as e:
    # Best effort: a failed scene exit is logged and the run continues.
    log(f' exit_scene warning: {e}')
finally:
    # Always release the temporary connection, including on Ctrl-C.
    _tmp.close()
time.sleep(6)  # wait for sim to return to main menu
log('Sim should now be at main menu. Connecting to mountain_track...')
|
|
|
|
# ---- Build the vectorised environment and the PPO learner ----
# Single environment wrapped in DummyVecEnv, then VecTransposeImage, before
# being handed to the CNN policy.
env = VecTransposeImage(DummyVecEnv([make_env()]))
try:
    model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu')
except Exception:
    # Do not leave the simulator connection open if model construction fails.
    env.close()
    raise
log('PPO created. Training...')
|
|
|
|
# ---- Training loop: learn in segments, checkpoint, evaluate, stop on laps ----
EVAL_MAX_STEPS = 2000  # step budget of one evaluation episode


def _run_eval_episode(model, env, max_steps=EVAL_MAX_STEPS):
    """Run one deterministic episode and return ``(reward, steps, laps)``.

    ``env`` is stepped through the vectorised API (index 0 of each returned
    batch is used). The episode ends at the first ``done`` flag or after
    ``max_steps`` steps. ``laps`` is the highest ``lap_count`` seen in the
    step infos; an unreadable lap counter leaves it unchanged.
    """
    obs = env.reset()
    total_reward = 0.0
    steps = 0
    laps = 0
    for _ in range(max_steps):
        action, _state = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = env.step(action)
        total_reward += float(rewards[0])
        steps += 1
        try:
            info = infos[0] if isinstance(infos, (list, tuple)) else infos
            lap_count = int(info.get('lap_count', 0) or 0)
        except (AttributeError, IndexError, TypeError, ValueError, OverflowError):
            lap_count = laps
        laps = max(laps, lap_count)
        if bool(dones[0]):
            break
    return total_reward, steps, laps


best_reward = float('-inf')
best_laps = 0
steps_done = 0

try:
    while steps_done < MAX_STEPS:
        seg = min(EVAL_EVERY, MAX_STEPS - steps_done)
        model.learn(total_timesteps=seg, reset_num_timesteps=False)
        steps_done += seg

        # Numbered checkpoint plus a rolling "latest" copy.
        ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
        model.save(ckpt)
        model.save(os.path.join(SAVE_DIR, 'model'))

        try:
            ep_r, ep_s, laps = _run_eval_episode(model, env)

            status = '✅' if ep_s >= EVAL_MAX_STEPS else f'❌@{ep_s}'
            log(f'[{steps_done:,}] reward={ep_r:.1f} steps={ep_s} laps={laps} {status}')

            if ep_r > best_reward:
                best_reward = ep_r
                model.save(os.path.join(SAVE_DIR, 'best_model'))
                log(f' ⭐ NEW BEST: {best_reward:.1f}')
            if laps > best_laps:
                best_laps = laps
                log(f' 🏆 BEST LAPS: {best_laps}')
            if laps >= LAP_STOP:
                log(f' 🎯 {laps} laps at {steps_done:,} steps — STOPPING')
                break
        except Exception as e:
            # An evaluation failure must not abort training; report and go on.
            log(f' Eval error: {e}')
            traceback.print_exc()
        finally:
            # Evaluation stepped the same env the learner trains on, so the
            # observation cached by the learner no longer matches the env.
            # Clearing it makes the next learn() call reset the env first.
            # NOTE(review): relies on the SB3-internal `_last_obs` attribute.
            model._last_obs = None
finally:
    # Release the simulator connection even if training raised.
    env.close()

time.sleep(3)
log(f'\nDone. best_laps={best_laps} best_reward={best_reward:.1f}')
log(f'Best model: {SAVE_DIR}/best_model.zip')
log('=== Exp 14 COMPLETE ===')
|