From 813f888502851a05678cc621983c6d234892594c Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Sun, 19 Apr 2026 17:01:41 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20reward=20v6.1=20=E2=80=94=20active=5Fnod?= =?UTF-8?q?e=20progress=20terminator=20kills=20circle/stuck=20exploits?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User's insight: a circling car stays near the same track waypoints, so active_node (sim's track progress indicator) never advances. Track the maximum active_node reached this episode. If it hasn't increased in progress_patience=60 steps (~3.3s), terminate. This catches: - Circular driving (active_node oscillates, max never increases) - Stuck on cone/barrier (active_node frozen) - NOT triggered by: legitimate cornering, slow forward progress, lap resets On lap completion, active_node wraps to 0 — reset max_node_seen and counter. Also: Exp 12 — single track mountain training with lap-based stopping criterion. Train until 3 consecutive laps in eval, not fixed step count. --- agent/experiments/exp12_mountain_single.py | 150 +++++++++++++++++++++ agent/reward_wrapper.py | 83 +++++++++--- tests/test_reward_wrapper.py | 96 ++++++++++++- 3 files changed, 300 insertions(+), 29 deletions(-) create mode 100644 agent/experiments/exp12_mountain_single.py diff --git a/agent/experiments/exp12_mountain_single.py b/agent/experiments/exp12_mountain_single.py new file mode 100644 index 0000000..3664f6c --- /dev/null +++ b/agent/experiments/exp12_mountain_single.py @@ -0,0 +1,150 @@ +""" +Exp 12: Single track — mountain_track, v6.1 reward, lap-based stopping. + +Strategy: train until the car consistently completes multiple laps. +No fixed step count — eval every 5k steps, stop when 3 consecutive laps +achieved in a single eval episode (2000 steps). + +Key fixes applied: + - v6.1 reward: speed × CTE, efficiency gate, CTE patience (grass fix), + track progress patience (circle/stuck fix using active_node) + - stuck_steps=40, wall-clock timeout=12s + - Single track: all training budget goes to mountain_track + - throttle_min=0.2 (proven to work in Exp 9 with v5 reward + 90k steps) + +Based on Exp 9 success: single track + v5/v6 reward + throttle_min=0.2 +CAN learn mountain. The circle/stuck exploit was contaminating multi-track +training. Single track eliminates that interaction. +""" +import sys, os, time +sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent') + +from multitrack_runner import log, StuckTerminationWrapper +from donkeycar_sb3_runner import ThrottleClampWrapper +from reward_wrapper import SpeedRewardWrapper +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage +import gymnasium as gym +import numpy as np + +HOST = '10.0.0.55' +PORT = 9091 +TRACK_ID = 'donkey-mountain-track-v0' +TRACK_NAME = 'mountain_track' +THROTTLE_MIN = 0.2 +LR = 0.000725 +MAX_STEPS = 300000 # safety ceiling — stop via lap criterion first +EVAL_EVERY = 5000 # eval after every N training steps +EVAL_MAX_STEPS = 2000 # steps per eval episode +LAP_STOP_THRESHOLD = 3 # stop when eval achieves this many laps in one episode +SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp12-mountain-single' +os.makedirs(SAVE_DIR, exist_ok=True) + +def make_env(): + def _init(): + raw = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT}) + env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN) + env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5, + max_stuck_seconds=12.0) + env = SpeedRewardWrapper(env, + max_cte_terminate=4.0, + cte_patience=20, + progress_patience=60, # terminate if active_node max not advancing for 60 steps + ) + return env + return _init + +log('='*60) +log(f'Exp 12: Single track — {TRACK_NAME}') +log(f' Host: {HOST}:{PORT}') +log(f' throttle_min={THROTTLE_MIN}, lr={LR}') +log(f' Reward: v6.1 (speed×CTE + efficiency gate + progress termination)') +log(f' Stopping: eval every {EVAL_EVERY:,} steps, stop at {LAP_STOP_THRESHOLD} laps') +log(f' Safety ceiling: {MAX_STEPS:,} steps') +log('='*60) + +env = VecTransposeImage(DummyVecEnv([make_env()])) +log(f' obs={env.observation_space.shape}') + +model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu') +log('PPO created. Training begins...') + +best_reward = float('-inf') +best_laps = 0 +steps_done = 0 +stop_reason = None + +while steps_done < MAX_STEPS: + seg = min(EVAL_EVERY, MAX_STEPS - steps_done) + model.learn(total_timesteps=seg, reset_num_timesteps=False) + steps_done += seg + + # Save checkpoint + ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}') + model.save(ckpt) + model.save(os.path.join(SAVE_DIR, 'model')) + + # Eval: count laps in one deterministic episode + try: + obs = env.reset() + ep_reward, ep_steps, laps = 0.0, 0, 0 + prev_lap_count = 0 + for _ in range(EVAL_MAX_STEPS): + action, _ = model.predict(obs, deterministic=True) + result = env.step(action) + if len(result) == 4: + obs, r, d, info = result + done = bool(d[0]) + else: + obs, r, t, tr, info = result + done = bool(t[0] or tr[0]) + ep_reward += float(r[0]) + ep_steps += 1 + + # Count laps from info + try: + lc = int((info[0] if isinstance(info, (list, tuple)) else info) + .get('lap_count', 0) or 0) + if lc > prev_lap_count: + laps = lc + prev_lap_count = lc + except Exception: + pass + + if done: + break + + status = '✅' if ep_steps >= EVAL_MAX_STEPS else f'❌@{ep_steps}' + log(f'[{steps_done:,}/{MAX_STEPS:,}] reward={ep_reward:.1f} ' + f'steps={ep_steps} laps={laps} {status}') + + if ep_reward > best_reward: + best_reward = ep_reward + model.save(os.path.join(SAVE_DIR, 'best_model')) + log(f' ⭐ NEW BEST reward: {best_reward:.1f}') + + if laps > best_laps: + best_laps = laps + log(f' 🏆 NEW BEST laps: {best_laps}') + + if laps >= LAP_STOP_THRESHOLD: + stop_reason = f'achieved {laps} laps in eval at {steps_done:,} steps' + log(f' 🎯 STOPPING CRITERION MET: {stop_reason}') + break + + except Exception as e: + log(f' Eval error: {e}') + import traceback; traceback.print_exc() + +env.close() +time.sleep(3) + +log(f'\n{"="*60}') +log(f'Training complete.') +log(f' Stop reason : {stop_reason or "reached max steps"}') +log(f' Total steps : {steps_done:,}') +log(f' Best laps : {best_laps}') +log(f' Best reward : {best_reward:.1f}') +log(f' Best model : {SAVE_DIR}/best_model.zip') +log(f'{"="*60}') +log(f'\n=== Exp 12 COMPLETE ===') diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index 67ae49e..14274f6 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -67,15 +67,23 @@ class SpeedRewardWrapper(gym.Wrapper): Completely ignores the sim's own reward (which uses forward_vel and is exploitable by circular/spinning motion). + Exploit termination: + - Sustained high CTE (> max_cte_terminate for cte_patience steps): grass exploit + - No track progress (active_node max not advancing for progress_patience steps): + catches circular driving, stuck-on-cone, stuck-on-barrier. + A circling car stays near the same waypoints — active_node never advances. + A stuck car never advances either. Forward driving always advances. + Args: - env: gymnasium environment - speed_scale: speed bonus multiplier (default 0.1) - window_size: steps for efficiency calculation (default 30) - min_efficiency: efficiency below which no reward (default 0.15) - max_cte: track half-width for normalization (default 8.0) - min_lap_time: laps faster than this are penalised as exploits - max_cte_terminate: terminate if CTE exceeds this for cte_patience steps - cte_patience: steps of sustained high CTE before termination (default 20) + env: gymnasium environment + speed_scale: speed bonus multiplier (default 0.1) + window_size: steps for efficiency gate (default 30) + min_efficiency: efficiency gate threshold (default 0.15) + max_cte: track half-width for reward normalization (default 8.0) + min_lap_time: laps faster than this are penalised as exploits + max_cte_terminate: terminate if CTE > this for cte_patience steps + cte_patience: steps of sustained high CTE before termination + progress_patience: steps without new max active_node before termination """ def __init__( @@ -86,26 +94,32 @@ class SpeedRewardWrapper(gym.Wrapper): min_efficiency: float = 0.15, max_cte: float = 8.0, min_lap_time: float = 5.0, - max_cte_terminate: float = 4.0, # terminate early if CTE sustained > 4m - cte_patience: int = 20, # steps of high CTE before terminate + max_cte_terminate: float = 4.0, + cte_patience: int = 20, + progress_patience: int = 60, # ~3.3s at 18 steps/sec ): super().__init__(env) - self.speed_scale = speed_scale - self.window_size = window_size - self.min_efficiency = min_efficiency - self.max_cte = max_cte - self.min_lap_time = min_lap_time - self.max_cte_terminate = max_cte_terminate - self.cte_patience = cte_patience - self._pos_history = deque(maxlen=window_size + 1) - self._last_lap_count = 0 - self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate + self.speed_scale = speed_scale + self.window_size = window_size + self.min_efficiency = min_efficiency + self.max_cte = max_cte + self.min_lap_time = min_lap_time + self.max_cte_terminate = max_cte_terminate + self.cte_patience = cte_patience + self.progress_patience = progress_patience + self._pos_history = deque(maxlen=window_size + 1) + self._last_lap_count = 0 + self._high_cte_steps = 0 + self._max_node_seen = -1 # highest active_node reached this episode + self._no_progress_steps = 0 # steps since max_node last increased def reset(self, **kwargs): result = self.env.reset(**kwargs) self._pos_history.clear() self._last_lap_count = 0 self._high_cte_steps = 0 + self._max_node_seen = -1 + self._no_progress_steps = 0 return result def step(self, action): @@ -176,14 +190,39 @@ class SpeedRewardWrapper(gym.Wrapper): else: self._high_cte_steps = 0 - # --- Short-lap exploit detection --- + # --- Circle / stuck exploit: no track progress termination --- + # Track the highest active_node (track waypoint) reached this episode. + # A circling car stays near the same waypoints — max_node never advances. + # A stuck car never advances either. Only genuine forward driving advances. + # On lap completion, active_node resets to 0 — we reset our tracker too. + try: + active_node = int(info.get('active_node', -1) or 0) + total_nodes = int(info.get('total_nodes', 1) or 1) + except (TypeError, ValueError): + active_node = -1 + total_nodes = 1 + + if active_node >= 0: + if active_node > self._max_node_seen: + # New furthest point reached — genuine forward progress + self._max_node_seen = active_node + self._no_progress_steps = 0 + else: + self._no_progress_steps += 1 + if self._no_progress_steps >= self.progress_patience: + return -1.0, True # no forward progress — terminate + + try: current_lap_count = int(info.get('lap_count', 0) or 0) except (TypeError, ValueError): current_lap_count = self._last_lap_count if current_lap_count > self._last_lap_count: - self._last_lap_count = current_lap_count + self._last_lap_count = current_lap_count + # Reset progress tracker — active_node wraps to 0 on new lap + self._max_node_seen = -1 + self._no_progress_steps = 0 try: lap_time = float(info.get('last_lap_time', 999.0) or 999.0) except (TypeError, ValueError): diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py index b3f97af..956f79a 100644 --- a/tests/test_reward_wrapper.py +++ b/tests/test_reward_wrapper.py @@ -362,16 +362,98 @@ def test_high_cte_resets_when_back_on_track(): def test_no_track_progress_terminates_episode(): """ - REMOVED - progress_patience terminator removed from v6.1. - Mountain rollback is a learning issue, not a termination issue (ADR-020). + Circle/stuck exploit fix: if max active_node doesn't advance for + progress_patience steps, the episode must be force-terminated. + A circling car stays near the same waypoints — max_node never increases. """ - pass # placeholder + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=10) + wrapper.reset() + + # First step initialises max_node to 5, then 10 more steps stuck at 5 → terminate + for i in range(12): + info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(i)*0.1, 0., 0.), + 'active_node': 5, 'total_nodes': 100, + 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + if ft: + break + + assert ft == True, 'Should terminate when max active_node not advancing' + assert r == -1.0 def test_track_progress_resets_counter(): """ - Node advancement must reset the no-progress counter. - REMOVED - progress_patience terminator removed from v6.1. - Mountain rollback is a learning issue, not a termination issue (ADR-020). + Advancing to a new max active_node must reset the no-progress counter. """ - pass # placeholder to keep test count stable + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=5) + wrapper.reset() + + # Step forward: nodes 0, 1, 2, 3 — each new node resets counter + for node in range(4): + info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(node)*0.5, 0., 0.), + 'active_node': node, 'total_nodes': 100, + 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + assert ft == False, f'Should not terminate when advancing (node {node})' + assert wrapper._no_progress_steps == 0, 'Counter should reset on new max node' + + +def test_circle_exploit_terminates(): + """ + A car circling near the same spot should be terminated. + active_node oscillates but never exceeds the initial max. + """ + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=10) + wrapper.reset() + + # Set max_node to 10 + info = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.), + 'active_node': 10, 'total_nodes': 100, + 'lap_count': 0, 'last_lap_time': 0.0} + wrapper._compute_reward_and_done(done=False, info=info) + + # Now oscillate between nodes 8-10 (circling near node 10) + terminated = False + for i in range(20): + node = 8 + (i % 3) # oscillates 8, 9, 10, 8, 9, 10... + info = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.), + 'active_node': node, 'total_nodes': 100, + 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + if ft: + terminated = True + break + + assert terminated, 'Circling (oscillating active_node, no new max) should terminate' + + +def test_lap_completion_resets_progress_tracker(): + """ + On lap completion, active_node resets to 0. Progress tracker must also + reset so the car isn't immediately terminated for 'no progress'. + """ + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=5, min_lap_time=5.0) + wrapper.reset() + + # Drive to near end of track + info = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.), + 'active_node': 99, 'total_nodes': 100, + 'lap_count': 0, 'last_lap_time': 0.0} + wrapper._compute_reward_and_done(done=False, info=info) + assert wrapper._max_node_seen == 99 + + # Complete a valid lap + info = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 0, 'total_nodes': 100, + 'lap_count': 1, 'last_lap_time': 12.0} # 12s lap = valid + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + + # Progress tracker should be reset + assert wrapper._max_node_seen == -1, 'max_node_seen should reset on lap completion' + assert wrapper._no_progress_steps == 0 + assert ft == False, 'Valid lap should not terminate'