From 924615ca6055de6fbc1718429c5cc46e7db0e19a Mon Sep 17 00:00:00 2001
From: Paul Huliganga
Date: Tue, 5 May 2026 17:41:42 -0400
Subject: [PATCH] feat(exp24): discrete steering + speed-based stuck detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

StuckTerminationWrapper: add low_speed_threshold + max_low_speed_seconds
params. Car pinned against a barrier has speed≈0 even while sliding
laterally — lateral drift was resetting the position-based displacement
timer, leaving the car stuck for up to max_episode_seconds. Speed-based
check terminates after 2s at speed<0.5.

Exp24: 7-bin discrete steering (DiscretizedActionWrapper) eliminates
Gaussian policy noise that caused rapid oscillation in exp23.
max_episode_seconds reduced to 30s since speed-based stuck detection now
handles the barrier-contact cases.

Co-Authored-By: Claude Sonnet 4.6
---
 .../exp24_generated_road_discrete.py          | 259 ++++++++++++++++++
 agent/multitrack_runner.py                    |  53 +++-
 2 files changed, 299 insertions(+), 13 deletions(-)
 create mode 100644 agent/experiments/exp24_generated_road_discrete.py

diff --git a/agent/experiments/exp24_generated_road_discrete.py b/agent/experiments/exp24_generated_road_discrete.py
new file mode 100644
index 0000000..a0d61b1
--- /dev/null
+++ b/agent/experiments/exp24_generated_road_discrete.py
@@ -0,0 +1,259 @@
+"""
+Exp 24: Discrete steering + speed-based stuck detection.
+
+What changed from exp23:
+  - Discrete action space: 7 steering bins × 1 throttle = 7 actions.
+    Eliminates Gaussian policy noise that caused rapid steering oscillation.
+    Bins: steer ∈ {-1, -0.67, -0.33, 0, 0.33, 0.67, 1}, throttle=0→clamped to 0.2.
+  - Speed-based stuck detection: if speed < 0.5 m/s for 2 wall-clock seconds
+    → terminate. Catches car pinned against a barrier regardless of lateral sliding
+    (lateral drift was resetting the position-based timer in exp23, leaving the car
+    against the wall for up to max_episode_seconds).
+  - max_episode_seconds reduced to 30s (stuck detection catches the bad cases faster;
+    120s was a consequence of stuck detection not working, not a design choice).
+  - Single track: generated_road on port 9091.
+  - Fresh PPO (CnnPolicy — CNN obs encoder, Discrete action space).
+  - Total steps: 200k.
+"""
+import os
+import sys
+import time
+from datetime import datetime
+
+sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
+
+_SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete'
+_PIDFILE = os.path.join(_SAVE_DIR, 'current.pid')
+os.makedirs(_SAVE_DIR, exist_ok=True)
+
+if os.path.exists(_PIDFILE):
+    try:
+        with open(_PIDFILE) as _f:  # close the handle instead of leaking it
+            _old = int(_f.read().strip())
+        if _old != os.getpid():
+            os.kill(_old, 0)  # signal 0 = existence probe only; raises OSError if the PID is gone
+            print(f'[exp24] Another instance already running (PID {_old}). Exiting.', flush=True)
+            sys.exit(1)
+    except (OSError, ValueError):
+        pass  # stale or unreadable pidfile: continue; it is rewritten once the model is built
+
+import gymnasium as gym
+import numpy as np
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
+
+from discretize_action import DiscretizedActionWrapper
+from donkeycar_sb3_runner import ThrottleClampWrapper
+from multitrack_runner import StuckTerminationWrapper
+from reward_wrapper import SpeedRewardWrapper
+
+
+HOST = 'localhost'
+THROTTLE_MIN = 0.2
+LR = 0.0003
+TOTAL_STEPS = 200_000
+CHECKPOINT_EVERY = 10_000
+
+N_STEER = 7  # steering bins: -1, -0.67, -0.33, 0, 0.33, 0.67, 1
+N_THROTTLE = 1  # fixed at 0.0 → clamped to THROTTLE_MIN by ThrottleClampWrapper
+
+# Reward wrapper params (same as exp23 v7)
+EFFICIENCY_WINDOW = 30
+MIN_EFFICIENCY = 0.15
+MAX_CTE = 8.0
+MIN_LAP_TIME = 12.0
+PROGRESS_PATIENCE = 100
+
+# StuckTerminationWrapper — speed-based check is the primary stuck detector now
+MAX_STUCK_SECONDS = 5.0  # position-based: 0.5m displacement timer
+MAX_EPISODE_SECONDS = 30.0  # hard cap (reduced from 120s — speed check handles it)
+LOW_SPEED_THRESHOLD = 0.5  # m/s — below this counts as "stuck"
+MAX_LOW_SPEED_SECONDS = 2.0  # seconds at low speed before termination
+
+
+def log(msg):
+    print(f'[{datetime.now().strftime("%H:%M:%S")}] {msg}', flush=True)
+
+
+def make_env(track_id, port):
+    def _init():
+        raw = gym.make(track_id, conf={'host': HOST, 'port': port})
+        env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
+        env = DiscretizedActionWrapper(env, n_steer=N_STEER, n_throttle=N_THROTTLE)
+        env = StuckTerminationWrapper(
+            env,
+            stuck_steps=40,
+            min_displacement=0.5,
+            max_stuck_seconds=MAX_STUCK_SECONDS,
+            max_episode_seconds=MAX_EPISODE_SECONDS,
+            low_speed_threshold=LOW_SPEED_THRESHOLD,
+            max_low_speed_seconds=MAX_LOW_SPEED_SECONDS,
+        )
+        env = SpeedRewardWrapper(
+            env,
+            window_size=EFFICIENCY_WINDOW,
+            min_efficiency=MIN_EFFICIENCY,
+            max_cte=MAX_CTE,
+            min_lap_time=MIN_LAP_TIME,
+            progress_patience=PROGRESS_PATIENCE,
+        )
+        return env
+    return _init
+
+
+def make_eval_env(track_id, port):
+    inner = make_env(track_id, port)()
+    return VecTransposeImage(DummyVecEnv([lambda e=inner: e]))
+
+
+log('=' * 60)
+log('Exp 24: generated_road — discrete steering, speed-based stuck')
+log(f' Sim: {HOST}:9091 -> generated_road')
+log(f' Discrete steering: {N_STEER} bins, throttle fixed at {THROTTLE_MIN}')
+log(f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}')
+log(f' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)')
+log(f' Stuck: position≥0.5m/{MAX_STUCK_SECONDS}s OR speed<{LOW_SPEED_THRESHOLD}/{MAX_LOW_SPEED_SECONDS}s')
+log(f' Episode cap: {MAX_EPISODE_SECONDS}s (safety net)')
+log(f' Checkpoints every {CHECKPOINT_EVERY:,} steps')
+log('=' * 60)
+
+log('Creating DummyVecEnv on generated_road...')
+env = DummyVecEnv([make_env('donkey-generated-roads-v0', 9091)])
+env = VecTransposeImage(env)
+log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')
+log(f' Action space: {env.action_space}')
+
+model = PPO(
+    'CnnPolicy',
+    env,
+    learning_rate=LR,
+    n_steps=2048,
+    batch_size=64,
+    n_epochs=10,
+    gamma=0.99,
+    gae_lambda=0.95,
+    clip_range=0.2,
+    ent_coef=0.01,
+    verbose=1,
+    device='cpu',
+)
+
+with open(_PIDFILE, 'w') as f:
+    f.write(str(os.getpid()))
+
+log(f'Fresh PPO model created (Discrete({N_STEER * N_THROTTLE}) actions). Starting training...')
+
+best_total_steps = float('-inf')
+best_total_reward = float('-inf')
+steps_done = 0
+run_tag = datetime.now().strftime('%Y-%m-%d_%H%M%S') + '_discrete'
+log_path = os.path.join(_SAVE_DIR, f'run_{run_tag}.log')
+best_model_path = os.path.join(_SAVE_DIR, 'best_model.zip')
+
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(message)s',
+    handlers=[logging.FileHandler(log_path), logging.StreamHandler(sys.stdout)],
+)
+file_log = logging.getLogger('exp24')
+
+
+def flog(msg):
+    ts = datetime.now().strftime('%H:%M:%S')
+    file_log.info(f'[{ts}] {msg}')
+
+
+flog('=' * 60)
+flog(f'Exp 24 started — PID {os.getpid()}')
+flog(f'Log: {log_path}')
+flog('=' * 60)
+
+while steps_done < TOTAL_STEPS:
+    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
+    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
+    steps_done += seg_steps
+
+    ckpt = os.path.join(_SAVE_DIR, f'checkpoint_{steps_done:07d}')
+    model.save(ckpt)
+    model.save(os.path.join(_SAVE_DIR, 'model'))
+    flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')
+
+    try:
+        obs = env.reset()
+        ep_rewards = np.zeros(env.num_envs)
+        ep_steps = np.zeros(env.num_envs)
+        done_mask = np.zeros(env.num_envs, dtype=bool)
+
+        for _ in range(2000):
+            action, _ = model.predict(obs, deterministic=True)
+            obs, rewards, dones, infos = env.step(action)
+            for i in range(env.num_envs):
+                if not done_mask[i]:
+                    ep_rewards[i] += rewards[i]
+                    ep_steps[i] += 1
+                    if dones[i]:
+                        done_mask[i] = True
+            if done_mask.all():
+                break
+
+        total_steps_eval = int(ep_steps.sum())
+        total_reward_eval = float(ep_rewards.sum())
+
+        status = '✅' if ep_steps[0] >= 2000 else f'❌@{int(ep_steps[0])}'
+        flog(f' Eval: gen_road={total_reward_eval:.1f}r/{int(ep_steps[0])}s {status}')
+
+        if (total_steps_eval > best_total_steps
+                or (total_steps_eval == best_total_steps
+                    and total_reward_eval > best_total_reward)):
+            best_total_steps = total_steps_eval
+            best_total_reward = total_reward_eval
+            model.save(best_model_path)
+            flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
+
+    except Exception as e:
+        flog(f' Eval error: {e}')
+
+env.close()
+
+flog('=' * 60)
+flog('FINAL EVALUATION: best_model on generated_road')
+flog('=' * 60)
+
+EVAL_SETS = 3
+EVAL_MAX_STEPS = 2000
+
+steps_list = []
+reward_list = []
+
+for s in range(1, EVAL_SETS + 1):
+    try:
+        eval_env = make_eval_env('donkey-generated-roads-v0', 9091)
+        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')
+        obs = eval_env.reset()
+        done = False
+        total_s = 0
+        total_r = 0.0
+
+        while not done and total_s < EVAL_MAX_STEPS:
+            action, _ = eval_model.predict(obs, deterministic=True)
+            result = eval_env.step(action)
+            obs, r, done = result[0], result[1], result[2]
+            if hasattr(done, '__len__'):
+                done = bool(done[0])
+            total_r += float(r) if not hasattr(r, '__len__') else float(r[0])
+            total_s += 1
+
+        status = '✅' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
+        flog(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
+        steps_list.append(total_s)
+        reward_list.append(total_r)
+        eval_env.close()
+
+    except Exception as e:
+        flog(f' Set {s} error: {e}')
+
+if steps_list:
+    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
+
+flog('Exp 24 complete.')
diff --git a/agent/multitrack_runner.py b/agent/multitrack_runner.py
index 250d8a5..389e287 100644
--- a/agent/multitrack_runner.py
+++ b/agent/multitrack_runner.py
@@ -134,31 +134,40 @@ class StuckTerminationWrapper(gym.Wrapper):
     can take 1+ minutes of wall-clock time. The wall-clock timeout catches this
     case regardless of sim speed.
 
-    Handles two cases the sim misses:
-      1. Car pressed slowly against a barrier — Unity's hit detection needs a
-         velocity threshold; slow contact leaves hit='none' and episode open.
-      2. Car circling off the start/finish line — efficiency→0 gives zero reward
+    Handles three cases the sim misses:
+      1. Car pressed slowly against a barrier — Unity's OnCollisionEnter fires
+         once then resets; Python never sees sustained contact. Speed-based check
+         terminates after max_low_speed_seconds at speed < low_speed_threshold.
+      2. Car sliding laterally along a barrier — position displacement > 0.5m
+         keeps resetting the wall-clock timer; speed stays ≈0. Speed-based check
+         catches this; position-based check cannot.
+      3. Car circling off the start/finish line — efficiency→0 gives zero reward
          but the episode never ends, wasting training steps with no signal.
 
     When stuck is detected: terminated=True so SpeedRewardWrapper returns -1.0.
     """
     def __init__(self, env, stuck_steps: int = 80, min_displacement: float = 0.5,
-                 max_stuck_seconds: float = 12.0, max_episode_seconds: float = 30.0):
+                 max_stuck_seconds: float = 12.0, max_episode_seconds: float = 30.0,
+                 low_speed_threshold: float = 0.5, max_low_speed_seconds: float = 3.0):
         super().__init__(env)
-        self.stuck_steps         = stuck_steps
-        self.min_displacement    = min_displacement
-        self.max_stuck_seconds   = max_stuck_seconds
-        self.max_episode_seconds = max_episode_seconds
-        self._pos_buf: deque     = deque(maxlen=stuck_steps)
-        self._last_progress_pos  = None
-        self._last_progress_t    = None
-        self._episode_start_t    = None
+        self.stuck_steps           = stuck_steps
+        self.min_displacement      = min_displacement
+        self.max_stuck_seconds     = max_stuck_seconds
+        self.max_episode_seconds   = max_episode_seconds
+        self.low_speed_threshold   = low_speed_threshold
+        self.max_low_speed_seconds = max_low_speed_seconds
+        self._pos_buf: deque       = deque(maxlen=stuck_steps)
+        self._last_progress_pos    = None
+        self._last_progress_t      = None
+        self._episode_start_t      = None
+        self._low_speed_start_t    = None
 
     def reset(self, **kwargs):
         self._pos_buf.clear()
         self._last_progress_pos = None
         self._last_progress_t = None
         self._episode_start_t = time.time()
+        self._low_speed_start_t = None
         return self.env.reset(**kwargs)
 
     def step(self, action):
@@ -197,6 +206,24 @@ class StuckTerminationWrapper(gym.Wrapper):
             except (TypeError, ValueError):
                 pass
 
+        # Speed-based stuck detection: catches car pinned against a barrier.
+        # A car pressed against a wall has speed≈0 even while sliding laterally
+        # (accumulating displacement that resets the position-based timer above).
+        if not terminated:
+            try:
+                speed = float(info.get('speed', 999.0))  # no `or` fallback: exact 0.0 must count as low speed
+            except (TypeError, ValueError):  # None or non-numeric speed → treat as "moving"
+                speed = 999.0
+            if speed < self.low_speed_threshold:
+                if self._low_speed_start_t is None:
+                    self._low_speed_start_t = now
+                elif (now - self._low_speed_start_t) > self.max_low_speed_seconds:
+                    terminated = True
+                    info['stuck_termination'] = True
+                    info['stuck_reason'] = 'low_speed_timeout'
+            else:
+                self._low_speed_start_t = None
+
         # Hard episode wall-clock limit — fires regardless of car position or sim fps.
         # Catches cars sliding slowly along barriers that keep resetting the
         # max_stuck_seconds timer by drifting 0.5m at a time.