feat(exp24): discrete steering + speed-based stuck detection

StuckTerminationWrapper: add low_speed_threshold + max_low_speed_seconds params.
Car pinned against a barrier has speed≈0 even while sliding laterally — lateral
drift was resetting the position-based displacement timer, leaving the car stuck
for up to max_episode_seconds. Speed-based check terminates after 2s at speed<0.5.

Exp24: 7-bin discrete steering (DiscretizedActionWrapper) eliminates Gaussian policy
noise that caused rapid oscillation in exp23. max_episode_seconds reduced to 30s
since speed-based stuck detection now handles the barrier-contact cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Paul Huliganga 2026-05-05 17:41:42 -04:00
parent c05e79d30c
commit 924615ca60
2 changed files with 299 additions and 13 deletions

View File

@ -0,0 +1,259 @@
"""
Exp 24: Discrete steering + speed-based stuck detection.
What changed from exp23:
- Discrete action space: 7 steering bins × 1 throttle = 7 actions.
Eliminates Gaussian policy noise that caused rapid steering oscillation.
Bins: steer ∈ {-1, -0.67, -0.33, 0, 0.33, 0.67, 1}, throttle=0 → clamped to 0.2.
- Speed-based stuck detection: if speed < 0.5 m/s for 2 wall-clock seconds,
terminate. Catches car pinned against a barrier regardless of lateral sliding
(lateral drift was resetting the position-based timer in exp23, leaving the car
against the wall for up to max_episode_seconds).
- max_episode_seconds reduced to 30s (stuck detection catches the bad cases faster;
120s was a consequence of stuck detection not working, not a design choice).
- Single track: generated_road on port 9091.
- Fresh PPO (CnnPolicy, not MlpPolicy — Discrete action space, same CNN obs encoder).
- Total steps: 200k.
"""
import os
import sys
import time
from datetime import datetime
# Make the project's agent/ directory importable regardless of the current
# working directory; the project-local imports further down depend on it.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

# All artefacts of this experiment (checkpoints, logs, PID file) live here.
_SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete'
_PIDFILE = os.path.join(_SAVE_DIR, 'current.pid')
os.makedirs(_SAVE_DIR, exist_ok=True)

# Single-instance guard: refuse to start when the PID recorded by a previous
# launch still belongs to a live process.
if os.path.exists(_PIDFILE):
    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(_PIDFILE) as _pid_fh:
            _old = int(_pid_fh.read().strip())
        # PIDs <= 0 address process groups rather than a single process, so a
        # corrupt file containing e.g. "0" must not be probed: the probe would
        # always succeed and lock out every future start.
        if _old > 0 and _old != os.getpid():
            # Signal 0 delivers nothing; it only checks that the PID exists
            # and raises OSError when it does not.
            os.kill(_old, 0)
            print(f'[exp24] Another instance already running (PID {_old}). Exiting.', flush=True)
            sys.exit(1)
    except (OSError, ValueError):
        # Stale PID (process gone), unreadable file, or non-numeric content:
        # treat as "no other instance" and carry on.
        pass
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from discretize_action import DiscretizedActionWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper
# --- Simulator connection and optimiser settings ---------------------------
HOST = 'localhost'
THROTTLE_MIN = 0.2
LR = 3e-4
TOTAL_STEPS = 200_000
CHECKPOINT_EVERY = 10_000

# --- Discrete action grid ---------------------------------------------------
# Steering bins: -1, -0.67, -0.33, 0, 0.33, 0.67, 1
N_STEER = 7
# Single throttle bin fixed at 0.0, clamped to THROTTLE_MIN by ThrottleClampWrapper
N_THROTTLE = 1

# --- Reward wrapper parameters (same as exp23 v7) ---------------------------
EFFICIENCY_WINDOW = 30
MIN_EFFICIENCY = 0.15
MAX_CTE = 8.0
MIN_LAP_TIME = 12.0
PROGRESS_PATIENCE = 100

# --- StuckTerminationWrapper -------------------------------------------------
# The speed-based check is the primary stuck detector now.
# Position-based: 0.5m displacement timer.
MAX_STUCK_SECONDS = 5.0
# Hard cap (reduced from 120s because the speed check handles stuck cars).
MAX_EPISODE_SECONDS = 30.0
# m/s; any speed below this counts as "stuck".
LOW_SPEED_THRESHOLD = 0.5
# Seconds at low speed before termination.
MAX_LOW_SPEED_SECONDS = 2.0
def log(msg):
    """Print *msg* to stdout, prefixed with the current HH:MM:SS time.

    Output is flushed immediately so progress is visible when stdout is
    redirected to a file or pipe.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'[{stamp}] {msg}', flush=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds the wrapped training env.

    Nothing is created here; the simulator connection is only opened when the
    returned callable is invoked (which is what ``DummyVecEnv`` expects).

    Args:
        track_id: Gym environment id passed to ``gym.make``.
        port: Simulator port to connect to on ``HOST``.

    Returns:
        A callable that builds the env wrapped, innermost to outermost, in
        ThrottleClampWrapper, DiscretizedActionWrapper,
        StuckTerminationWrapper and SpeedRewardWrapper.
    """
    def _build_env():
        # Settings for the stuck detector (position-, speed- and time-based).
        stuck_kwargs = dict(
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=MAX_STUCK_SECONDS,
            max_episode_seconds=MAX_EPISODE_SECONDS,
            low_speed_threshold=LOW_SPEED_THRESHOLD,
            max_low_speed_seconds=MAX_LOW_SPEED_SECONDS,
        )
        # Settings for the reward shaping wrapper.
        reward_kwargs = dict(
            window_size=EFFICIENCY_WINDOW,
            min_efficiency=MIN_EFFICIENCY,
            max_cte=MAX_CTE,
            min_lap_time=MIN_LAP_TIME,
            progress_patience=PROGRESS_PATIENCE,
        )

        wrapped = gym.make(track_id, conf={'host': HOST, 'port': port})
        wrapped = ThrottleClampWrapper(wrapped, throttle_min=THROTTLE_MIN)
        wrapped = DiscretizedActionWrapper(wrapped, n_steer=N_STEER, n_throttle=N_THROTTLE)
        wrapped = StuckTerminationWrapper(wrapped, **stuck_kwargs)
        return SpeedRewardWrapper(wrapped, **reward_kwargs)

    return _build_env
def make_eval_env(track_id, port):
    """Build one wrapped env now and return it as a transposed-image VecEnv.

    The env is constructed eagerly via ``make_env``; the vectorised wrapper is
    then handed a factory that simply returns that already-built instance.

    Args:
        track_id: Gym environment id passed through to ``make_env``.
        port: Simulator port passed through to ``make_env``.
    """
    single_env = make_env(track_id, port)()

    def _existing_env():
        return single_env

    return VecTransposeImage(DummyVecEnv([_existing_env]))
# Startup banner: one log line per configuration item, framed by two rules.
_RULE = '=' * 60
for _line in (
    _RULE,
    'Exp 24: generated_road — discrete steering, speed-based stuck',
    f' Sim: {HOST}:9091 -> generated_road',
    f' Discrete steering: {N_STEER} bins, throttle fixed at {THROTTLE_MIN}',
    f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}',
    ' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)',
    f' Stuck: position≥0.5m/{MAX_STUCK_SECONDS}s OR speed<{LOW_SPEED_THRESHOLD}/{MAX_LOW_SPEED_SECONDS}s',
    f' Episode cap: {MAX_EPISODE_SECONDS}s (safety net)',
    f' Checkpoints every {CHECKPOINT_EVERY:,} steps',
    _RULE,
):
    log(_line)

# Single-env training VecEnv on the generated_road track (port 9091), wrapped
# in VecTransposeImage before being handed to the policy.
log('Creating DummyVecEnv on generated_road...')
env = VecTransposeImage(DummyVecEnv([make_env('donkey-generated-roads-v0', 9091)]))
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')
log(f' Action space: {env.action_space}')
# Fresh PPO agent (no weights loaded) using the 'CnnPolicy' architecture on
# the training env; everything runs on CPU.
_PPO_HYPERPARAMS = dict(
    learning_rate=LR,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    verbose=1,
    device='cpu',
)
model = PPO('CnnPolicy', env, **_PPO_HYPERPARAMS)
# Record our PID so a second launch can detect this running instance.
with open(_PIDFILE, 'w') as f:
    f.write(str(os.getpid()))


def _remove_pidfile():
    """Delete the PID file on exit, but only if it still records our PID.

    Without this the file outlives the process, and a later launch could
    mistake an unrelated process that reused the PID for a running instance.
    """
    try:
        with open(_PIDFILE) as fh:
            if fh.read().strip() != str(os.getpid()):
                return
        os.remove(_PIDFILE)
    except OSError:
        # Best effort: the file may already be gone or be unreadable.
        pass


import atexit
atexit.register(_remove_pidfile)

log(f'Fresh PPO model created (Discrete({N_STEER * N_THROTTLE}) actions). Starting training...')

# Best checkpoint so far, ranked by eval steps survived, then by eval reward.
best_total_steps = float('-inf')
best_total_reward = float('-inf')
steps_done = 0

run_tag = datetime.now().strftime('%Y-%m-%d_%H%M%S') + '_discrete'
log_path = os.path.join(_SAVE_DIR, f'run_{run_tag}.log')
best_model_path = os.path.join(_SAVE_DIR, 'best_model.zip')

import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[
        # Explicit UTF-8: log messages contain non-ASCII characters, and the
        # default file encoding is locale-dependent.
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
file_log = logging.getLogger('exp24')


def flog(msg):
    """Log *msg* with an HH:MM:SS prefix to both the run log file and stdout."""
    ts = datetime.now().strftime('%H:%M:%S')
    file_log.info(f'[{ts}] {msg}')


flog('=' * 60)
flog(f'Exp 24 started — PID {os.getpid()}')
flog(f'Log: {log_path}')
flog('=' * 60)
# Step budget for the quick deterministic evaluation after each checkpoint.
_CKPT_EVAL_MAX_STEPS = 2000

# Train in CHECKPOINT_EVERY-sized segments; after each segment save a
# checkpoint, run a short evaluation, and keep the best model seen so far.
while steps_done < TOTAL_STEPS:
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    # reset_num_timesteps=False continues the same run across segments.
    # NOTE(review): PPO presumably collects whole n_steps rollouts, so the real
    # timestep count may run slightly ahead of steps_done; confirm if it matters.
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    steps_done += seg_steps

    ckpt = os.path.join(_SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(_SAVE_DIR, 'model'))
    flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

    try:
        obs = env.reset()
        ep_rewards = np.zeros(env.num_envs)
        ep_steps = np.zeros(env.num_envs)
        done_mask = np.zeros(env.num_envs, dtype=bool)
        for _ in range(_CKPT_EVAL_MAX_STEPS):
            action, _state = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            # Only envs still in their first episode accumulate reward/steps.
            active = ~done_mask
            ep_rewards[active] += rewards[active]
            ep_steps[active] += 1
            done_mask |= np.asarray(dones, dtype=bool)
            if done_mask.all():
                break

        total_steps_eval = int(ep_steps.sum())
        total_reward_eval = float(ep_rewards.sum())
        status = '' if ep_steps[0] >= _CKPT_EVAL_MAX_STEPS else f'❌@{int(ep_steps[0])}'
        flog(f' Eval: gen_road={total_reward_eval:.1f}r/{int(ep_steps[0])}s {status}')

        # Rank by steps survived first, reward second.
        if (total_steps_eval > best_total_steps
                or (total_steps_eval == best_total_steps
                    and total_reward_eval > best_total_reward)):
            best_total_steps = total_steps_eval
            best_total_reward = total_reward_eval
            model.save(best_model_path)
            flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
    except Exception as e:
        # A failed evaluation must not abort training; log it and continue.
        flog(f' Eval error: {e}')
    finally:
        # The evaluation stepped the training env, so the observation PPO
        # cached at the end of its last rollout no longer matches the env
        # state. Clearing it makes the next learn() call reset the env rather
        # than resume from a stale observation.
        model._last_obs = None
# Training is finished: release the training env before opening eval envs on
# the same simulator port.
env.close()

flog('=' * 60)
flog('FINAL EVALUATION: best_model on generated_road')
flog('=' * 60)

EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
steps_list = []
reward_list = []

for s in range(1, EVAL_SETS + 1):
    eval_env = None
    try:
        eval_env = make_eval_env('donkey-generated-roads-v0', 9091)
        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')
        obs = eval_env.reset()
        done = False
        total_s = 0
        total_r = 0.0
        # One episode per set, capped at EVAL_MAX_STEPS.
        while not done and total_s < EVAL_MAX_STEPS:
            action, _ = eval_model.predict(obs, deterministic=True)
            result = eval_env.step(action)
            obs, r, done = result[0], result[1], result[2]
            # Unwrap the single env's entry when step() returns arrays.
            if hasattr(done, '__len__'):
                done = bool(done[0])
            total_r += float(r) if not hasattr(r, '__len__') else float(r[0])
            total_s += 1
        status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
        flog(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
        steps_list.append(total_s)
        reward_list.append(total_r)
    except Exception as e:
        flog(f' Set {s} error: {e}')
    finally:
        # Always release the simulator connection, even when the set failed;
        # otherwise the next set opens a second connection while this one is
        # still alive.
        if eval_env is not None:
            try:
                eval_env.close()
            except Exception as e:
                flog(f' Set {s} error: {e}')

if steps_list:
    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
flog('Exp 24 complete.')

View File

@ -134,31 +134,40 @@ class StuckTerminationWrapper(gym.Wrapper):
can take 1+ minutes of wall-clock time. The wall-clock timeout catches can take 1+ minutes of wall-clock time. The wall-clock timeout catches
this case regardless of sim speed. this case regardless of sim speed.
Handles two cases the sim misses: Handles three cases the sim misses:
1. Car pressed slowly against a barrier Unity's hit detection needs a 1. Car pressed slowly against a barrier Unity's OnCollisionEnter fires
velocity threshold; slow contact leaves hit='none' and episode open. once then resets; Python never sees sustained contact. Speed-based check
2. Car circling off the start/finish line efficiency0 gives zero reward terminates after max_low_speed_seconds at speed < low_speed_threshold.
2. Car sliding laterally along a barrier — position displacement > 0.5m
keeps resetting the wall-clock timer; speed stays ≈0. Speed-based check
catches this; position-based check cannot.
3. Car circling off the start/finish line — efficiency≈0 gives zero reward
but the episode never ends, wasting training steps with no signal. but the episode never ends, wasting training steps with no signal.
When stuck is detected: terminated=True so SpeedRewardWrapper returns -1.0. When stuck is detected: terminated=True so SpeedRewardWrapper returns -1.0.
""" """
def __init__(self, env, stuck_steps: int = 80, min_displacement: float = 0.5, def __init__(self, env, stuck_steps: int = 80, min_displacement: float = 0.5,
max_stuck_seconds: float = 12.0, max_episode_seconds: float = 30.0): max_stuck_seconds: float = 12.0, max_episode_seconds: float = 30.0,
low_speed_threshold: float = 0.5, max_low_speed_seconds: float = 3.0):
super().__init__(env) super().__init__(env)
self.stuck_steps = stuck_steps self.stuck_steps = stuck_steps
self.min_displacement = min_displacement self.min_displacement = min_displacement
self.max_stuck_seconds = max_stuck_seconds self.max_stuck_seconds = max_stuck_seconds
self.max_episode_seconds = max_episode_seconds self.max_episode_seconds = max_episode_seconds
self.low_speed_threshold = low_speed_threshold
self.max_low_speed_seconds = max_low_speed_seconds
self._pos_buf: deque = deque(maxlen=stuck_steps) self._pos_buf: deque = deque(maxlen=stuck_steps)
self._last_progress_pos = None self._last_progress_pos = None
self._last_progress_t = None self._last_progress_t = None
self._episode_start_t = None self._episode_start_t = None
self._low_speed_start_t = None
def reset(self, **kwargs): def reset(self, **kwargs):
self._pos_buf.clear() self._pos_buf.clear()
self._last_progress_pos = None self._last_progress_pos = None
self._last_progress_t = None self._last_progress_t = None
self._episode_start_t = time.time() self._episode_start_t = time.time()
self._low_speed_start_t = None
return self.env.reset(**kwargs) return self.env.reset(**kwargs)
def step(self, action): def step(self, action):
@ -197,6 +206,24 @@ class StuckTerminationWrapper(gym.Wrapper):
except (TypeError, ValueError): except (TypeError, ValueError):
pass pass
# Speed-based stuck detection: catches car pinned against a barrier.
# A car pressed against a wall has speed≈0 even while sliding laterally
# (accumulating displacement that resets the position-based timer above).
if not terminated:
    # Missing or unparseable telemetry is treated as "moving" (999.0) so it
    # can never trigger a termination. Test for None explicitly: the
    # `value or 999.0` idiom also replaces a genuine reading of exactly 0.0,
    # the most-stuck case, and would reset the timer instead of starting it.
    raw_speed = info.get('speed')
    try:
        speed = 999.0 if raw_speed is None else float(raw_speed)
    except (TypeError, ValueError):
        speed = 999.0
    if speed < self.low_speed_threshold:
        if self._low_speed_start_t is None:
            # First low-speed sample: start the wall-clock timer.
            self._low_speed_start_t = now
        elif (now - self._low_speed_start_t) > self.max_low_speed_seconds:
            terminated = True
            info['stuck_termination'] = True
            info['stuck_reason'] = 'low_speed_timeout'
    else:
        # Car is moving again: clear the timer.
        self._low_speed_start_t = None
# Hard episode wall-clock limit — fires regardless of car position or sim fps. # Hard episode wall-clock limit — fires regardless of car position or sim fps.
# Catches cars sliding slowly along barriers that keep resetting the # Catches cars sliding slowly along barriers that keep resetting the
# max_stuck_seconds timer by drifting 0.5m at a time. # max_stuck_seconds timer by drifting 0.5m at a time.