From f730a2e0ba539d529d6bcaaaf8b5f6be3228ded0 Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Sun, 19 Apr 2026 16:14:28 -0400 Subject: [PATCH] =?UTF-8?q?docs:=20ADR-020/021=20+=20session=20log=20?= =?UTF-8?q?=E2=80=94=20throttle/hill=20history=20and=20grass=20exploit=20r?= =?UTF-8?q?oot=20cause?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical facts documented permanently: - throttle_min=0.5 bakes into action space (too fast for corners) - throttle_min=0.2 + v5 reward CAN learn hill (proved Exp 9, mountain only 90k) - Mountain failure in parallel is contamination from grass exploit, not throttle - Grass exploit root cause: sim determine_episode_over() passes when CTE>16m - DO NOT confuse mountain rollback with stuck issue - DO NOT change throttle_min as first response to mountain failure --- DECISIONS.md | 64 ++++++++ agent/experiments/exp11d_parallel_v61.py | 178 +++++++++++++++++++++++ agent/reward_wrapper.py | 121 +++++++++------ docs/SESSION_LOG_2026-04-19.md | 64 +++++++- tests/test_reward_wrapper.py | 102 +++++++++++++ 5 files changed, 480 insertions(+), 49 deletions(-) create mode 100644 agent/experiments/exp11d_parallel_v61.py diff --git a/DECISIONS.md b/DECISIONS.md index 892a704..4a19273 100644 --- a/DECISIONS.md +++ b/DECISIONS.md @@ -416,3 +416,67 @@ env = DummyVecEnv([ **Validation:** Exp 11 will test this approach. If results are consistent across multiple runs (not lottery), this ADR is confirmed. + +--- + +## ADR-020: Mountain Track Hill — Throttle and Reward History + +**Date:** 2026-04-19 +**Status:** Accepted + +**Context:** Mountain_track has a steep hill that the car must climb. +Multiple experiments tested different throttle_min and reward combinations. + +**Confirmed findings (from Exp 1–9):** +- `throttle_min=0.2` + v4 reward: car cannot get over hill. v4 reward gives + zero gradient when speed≈0 AND efficiency≈0 simultaneously on hill. 
+- `throttle_min=0.5` + any reward: car gets over hill, BUT throttle_min is + baked into the action space. Model cannot output throttle < 0.5. + Result: crashes on tight corners (mini_monaco ~91 steps consistently). +- `throttle_min=0.2` + v5 reward (speed×CTE): model CAN learn to self-select + high throttle on hill. Proved in Exp 9 (90k steps, mountain only) → 2000/2000. + The v5 speed gradient is non-zero on hills, giving the model a learning signal. + +**When mountain fails in parallel training:** +- First check for training contamination (e.g., grass exploit on other track) +- The grass exploit corrupts generated_track episodes → model learns exploit + instead of driving → mountain gets corrupted gradient too +- Fix the exploit first, then re-run. Do NOT immediately assume throttle_min + is the cause. + +**If mountain still fails after exploit fixes:** +- Consider per-track throttle_min: throttle_min=0.5 for mountain env, + throttle_min=0.2 for other envs (DummyVecEnv allows per-env wrappers) +- This is feasible since each env in DummyVecEnv is wrapped independently + +**DO NOT:** +- Confuse mountain rollback with a stuck issue (it's a learning/reward issue) +- Add termination conditions for rollback (interferes with slow hill learning) — NOTE: the v6.1 `progress_patience` no-progress terminator in `SpeedRewardWrapper` (enabled in Exp 11d) is such a condition and conflicts with this rule; reconcile the two before relying on Exp 11d mountain results +- Change throttle_min as the FIRST response when mountain fails + +--- + +## ADR-021: Generated Track Grass Exploit — Root Cause and Fix + +**Date:** 2026-04-19 +**Status:** Accepted + +**Context:** generated_track has a physical gap in the boundary mesh at the +first turn. The car finds this gap and drives off onto the grass indefinitely. + +**Root cause:** `donkey_sim.py determine_episode_over()` has: +```python +if math.fabs(self.cte) > 2 * self.max_cte: # > 16.0m + pass # designed for bad startup frames, but means far-off-track = never terminates +elif math.fabs(self.cte) > self.max_cte: # 8.0-16.0m + self.over = True +``` +The car exits through the gap, CTE quickly exceeds 16m, hits `pass` — episode never ends. 
+ +**Fix:** Python-side `SpeedRewardWrapper` CTE patience terminator: +- If CTE > `max_cte_terminate` (4.0m) for `cte_patience` (20) consecutive steps → terminate +- Catches the car at 4m (before blowing past 16m into the `pass` zone) +- 4.0m chosen conservatively — legitimate cornering stays well below 4m CTE +- Resets counter when car returns to within 4m (brief excursions allowed) + +**Note:** We cannot fix the Unity sim code directly. diff --git a/agent/experiments/exp11d_parallel_v61.py b/agent/experiments/exp11d_parallel_v61.py new file mode 100644 index 0000000..dacb2b5 --- /dev/null +++ b/agent/experiments/exp11d_parallel_v61.py @@ -0,0 +1,178 @@ +""" +Exp 11d: Parallel DummyVecEnv, v6.1 reward (grass + rollback fixes), 180k steps. + +Changes from Exp 11c (aborted): + - Reward v6.1: adds two new termination conditions: + 1. Sustained high CTE (grass exploit fix): if CTE > 4.0 for 20 steps → terminate + Stops the generated_track gap exploit where car exits through a hole + in the boundary mesh and drives indefinitely on the grass. + 2. No track progress (mountain rollback fix): if active_node doesn't + advance for 60 steps → terminate. + Stops the car going up the hill, rolling back, going up again — IS + moving so StuckWrapper doesn't fire, but never makes track progress. 
+ + - Total steps: 180k (vs 250k in 11c — enough budget, not too long) + +Infrastructure (unchanged from 11b/11c): + - DummyVecEnv with two sim instances (9091 + 9093) + - stuck_steps=40, throttle_min=0.2, lr=0.000725 +""" +import sys, os, time +sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent') + +from multitrack_runner import log, StuckTerminationWrapper +from donkeycar_sb3_runner import ThrottleClampWrapper +from reward_wrapper import SpeedRewardWrapper +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage +import gymnasium as gym +import numpy as np + +HOST = '10.0.0.55' +THROTTLE_MIN = 0.2 +LR = 0.000725 +TOTAL_STEPS = 180000 +SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp11d-parallel-v61' +os.makedirs(SAVE_DIR, exist_ok=True) + +def make_env(track_id, port): + def _init(): + raw = gym.make(track_id, conf={'host': HOST, 'port': port}) + env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN) + env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5) + env = SpeedRewardWrapper(env, + max_cte_terminate=4.0, # terminate if CTE > 4m for 20 steps (grass fix) + cte_patience=20, + progress_patience=60, # terminate if no node advance for 60 steps (rollback fix) + ) + return env + return _init + +log('='*60) +log('Exp 11d: Parallel DummyVecEnv, v6.1 reward, 180k steps') +log(f' Sim 1: {HOST}:9091 → generated_track') +log(f' Sim 2: {HOST}:9093 → mountain_track') +log(f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}') +log(f' Reward v6.1: speed×CTE + efficiency gate + grass/rollback terminators') +log(f' max_cte_terminate=4.0, cte_patience=20 (grass fix)') +log(f' progress_patience=60 (mountain rollback fix)') +log(f' Stuck: 40 steps') +log('='*60) + +env = DummyVecEnv([ + make_env('donkey-generated-track-v0', 9091), + make_env('donkey-mountain-track-v0', 9093), +]) +env = VecTransposeImage(env) +log(f' VecEnv 
num_envs={env.num_envs}, obs={env.observation_space.shape}') + +model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu') +log('PPO created. Starting training...') + +CHECKPOINT_EVERY = 10000 +best_reward = float('-inf') +steps_done = 0 + +while steps_done < TOTAL_STEPS: + seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done) + model.learn(total_timesteps=seg_steps, reset_num_timesteps=False) + steps_done += seg_steps + + ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}') + model.save(ckpt) + model.save(os.path.join(SAVE_DIR, 'model')) + log(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved') + + try: + obs = env.reset() + ep_rewards = np.zeros(env.num_envs) + ep_steps = np.zeros(env.num_envs) + done_mask = np.zeros(env.num_envs, dtype=bool) + for _ in range(2000): + action, _ = model.predict(obs, deterministic=True) + obs, rewards, dones, infos = env.step(action) + for i in range(env.num_envs): + if not done_mask[i]: + ep_rewards[i] += rewards[i] + ep_steps[i] += 1 + if dones[i]: + done_mask[i] = True + if done_mask.all(): + break + + status0 = '✅' if ep_steps[0] >= 2000 else f'❌@{int(ep_steps[0])}' + status1 = '✅' if ep_steps[1] >= 2000 else f'❌@{int(ep_steps[1])}' + log(f' Eval: gen_track={ep_rewards[0]:.1f}r/{int(ep_steps[0])}s {status0} ' + f'mountain={ep_rewards[1]:.1f}r/{int(ep_steps[1])}s {status1}') + + total_reward = ep_rewards.sum() + if total_reward > best_reward: + best_reward = total_reward + model.save(os.path.join(SAVE_DIR, 'best_model')) + log(f' ⭐ NEW BEST: {best_reward:.1f} (combined)') + except Exception as e: + log(f' Eval error: {e}') + +model.save(os.path.join(SAVE_DIR, 'model')) +log(f'\nTraining complete. 
Best combined reward: {best_reward:.1f}') +env.close() +time.sleep(5) + +# --- Eval on all 4 tracks --- +log('\n' + '='*60) +log('EVALUATION: best_model on 4 tracks (3 sets each)') +log('='*60) + +EVAL_TRACKS = [ + ('donkey-mountain-track-v0', 'mountain_track'), + ('donkey-generated-track-v0', 'generated_track'), + ('donkey-generated-roads-v0', 'generated_road'), + ('donkey-minimonaco-track-v0', 'mini_monaco'), +] +EVAL_PORT = 9091 +best_model_path = os.path.join(SAVE_DIR, 'best_model.zip') +results_by_track = {} + +for track_id, track_name in EVAL_TRACKS: + log(f'\n--- {track_name} ---') + steps_list = [] + for s in range(1, 4): + try: + raw = gym.make(track_id, conf={'host': HOST, 'port': EVAL_PORT}) + ei = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN) + ei = StuckTerminationWrapper(ei, stuck_steps=40, min_displacement=0.5) + ei = SpeedRewardWrapper(ei, max_cte_terminate=4.0, cte_patience=20, + progress_patience=60) + ev = VecTransposeImage(DummyVecEnv([lambda e=ei: e])) + m = PPO.load(best_model_path, env=ev, device='cpu') + + obs = ev.reset() + total_r, total_s, done = 0.0, 0, False + while not done and total_s < 2000: + action, _ = m.predict(obs, deterministic=True) + result = ev.step(action) + if len(result) == 4: obs, r, d, _ = result; done = bool(d[0]) + else: obs, r, t, tr, _ = result; done = bool(t[0] or tr[0]) + total_r += float(r[0]); total_s += 1 + + status = '✅' if total_s >= 2000 else f'❌@{total_s}' + log(f' Set{s}: {total_r:.1f}r / {total_s}s {status}') + steps_list.append(total_s) + ev.close(); time.sleep(3) + except Exception as e: + log(f' Set{s}: ERROR — {e}') + steps_list.append(0); time.sleep(3) + + results_by_track[track_name] = steps_list + log(f' Mean: {np.mean(steps_list):.0f} steps') + +log('\n' + '='*60) +log('SUMMARY') +log('='*60) +for track_name, steps_list in results_by_track.items(): + steps_str = '/'.join(str(s) for s in steps_list) + mean = np.mean(steps_list) + verdict = '✅' if mean >= 1500 else '⚠️' if mean >= 500 else 
'❌' + log(f' {verdict} {track_name:20s}: {steps_str} mean={mean:.0f}') + +log(f'\n=== Exp 11d COMPLETE ===') diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index c50e6c7..63de066 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -62,41 +62,58 @@ from collections import deque class SpeedRewardWrapper(gym.Wrapper): """ - Full reward bypass: base CTE reward × path efficiency × speed bonus. + Full reward bypass: speed × CTE_quality, gated by efficiency. Completely ignores the sim's own reward (which uses forward_vel and is exploitable by circular/spinning motion). Args: - env: gymnasium environment - speed_scale: speed bonus multiplier (default 0.1) - window_size: steps for efficiency calculation (default 30) - min_efficiency: efficiency below which no reward (default 0.05) - max_cte: track half-width for normalization (default 8.0, matches sim) + env: gymnasium environment + speed_scale: speed bonus multiplier (default 0.1) + window_size: steps for efficiency calculation (default 30) + min_efficiency: efficiency below which no reward (default 0.15) + max_cte: track half-width for normalization (default 8.0) + min_lap_time: laps faster than this are penalised as exploits + max_cte_terminate: terminate if CTE exceeds this for cte_patience steps + cte_patience: steps of sustained high CTE before termination (default 20) + min_progress_steps: steps before checking track progress (allow settling) + progress_patience: steps of zero track progress before termination (default 60) """ def __init__( self, env, speed_scale: float = 0.1, - window_size: int = 30, # captures 2+ full circles at typical circling speed - min_efficiency: float = 0.15, # gate threshold: circles ≈ 0.13, wobbly straight ≈ 0.98 + window_size: int = 30, + min_efficiency: float = 0.15, max_cte: float = 8.0, - min_lap_time: float = 5.0, # laps faster than this are penalised as exploits + min_lap_time: float = 5.0, + max_cte_terminate: float = 4.0, # terminate early if CTE 
sustained > 4m + cte_patience: int = 20, # steps of high CTE before terminate + progress_patience: int = 60, # steps of no track progress before terminate ): super().__init__(env) - self.speed_scale = speed_scale - self.window_size = window_size - self.min_efficiency = min_efficiency - self.max_cte = max_cte - self.min_lap_time = min_lap_time - self._pos_history = deque(maxlen=window_size + 1) - self._last_lap_count = 0 # track lap completions to detect short-lap exploit + self.speed_scale = speed_scale + self.window_size = window_size + self.min_efficiency = min_efficiency + self.max_cte = max_cte + self.min_lap_time = min_lap_time + self.max_cte_terminate = max_cte_terminate + self.cte_patience = cte_patience + self.progress_patience = progress_patience + self._pos_history = deque(maxlen=window_size + 1) + self._last_lap_count = 0 + self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate + self._last_active_node = -1 # track progress node at last check + self._no_progress_steps = 0 # consecutive steps with no node advancement def reset(self, **kwargs): result = self.env.reset(**kwargs) self._pos_history.clear() - self._last_lap_count = 0 + self._last_lap_count = 0 + self._high_cte_steps = 0 + self._last_active_node = -1 + self._no_progress_steps = 0 return result def step(self, action): @@ -126,27 +143,25 @@ class SpeedRewardWrapper(gym.Wrapper): def _compute_reward_and_done(self, done: bool, info: dict): """ - v6: speed × CTE-quality + efficiency gate. + v6.1: speed × CTE-quality + efficiency gate + grass/rollback terminators. + + New termination conditions: + - Sustained high CTE: CTE > max_cte_terminate for cte_patience steps + → terminate. Stops the grass exploit (car exits track gap and + drives indefinitely on grass with CTE just under max_cte=8.0). + - No track progress: active_node doesn't advance for progress_patience + steps → terminate. 
Stops mountain rollback (car goes up, rolls + back, IS moving so StuckWrapper doesn't fire, but never advances). reward = speed_norm × cte_quality (when efficiency >= threshold) - reward = 0.0 (when efficiency < threshold — circling) - reward = -1.0 (on crash/done) - - The efficiency gate prevents circular driving (eff≈0 for circles) - without killing gradient on hills (eff>0 for a stuck-but-not-circling - car, so the gate passes and speed×CTE gradient pushes toward unstuck). - - Exploit protection: - - Efficiency gate: circles → reward = 0 - - Short-lap penalty: laps < min_lap_time → large negative + terminate - - StuckTerminationWrapper: done=True after stuck_steps of no movement - - Crash: done=True → -1.0 + reward = 0.0 (when circling) + reward = -1.0 (on crash/termination) """ # Track position for efficiency calculation try: pos = info.get('pos', (0.0, 0.0, 0.0)) pos_x = float(pos[0]) - pos_z = float(pos[2]) # z is forward in Unity coordinate system + pos_z = float(pos[2]) self._pos_history.append(np.array([pos_x, pos_z])) except (TypeError, ValueError, IndexError): pass @@ -155,6 +170,35 @@ class SpeedRewardWrapper(gym.Wrapper): if done: return -1.0, False + # --- CTE value for all checks --- + try: + cte = float(info.get('cte', 0.0) or 0.0) + except (TypeError, ValueError): + cte = 0.0 + + # --- Grass exploit: sustained high CTE termination --- + if abs(cte) > self.max_cte_terminate: + self._high_cte_steps += 1 + if self._high_cte_steps >= self.cte_patience: + return -1.0, True # too long off-track — terminate + else: + self._high_cte_steps = 0 + + # --- Mountain rollback: no track progress termination --- + try: + active_node = int(info.get('active_node', -1) or -1) + except (TypeError, ValueError): + active_node = -1 + + if active_node >= 0: + if active_node == self._last_active_node: + self._no_progress_steps += 1 + if self._no_progress_steps >= self.progress_patience: + return -1.0, True # no track progress — terminate + else: + self._last_active_node = 
active_node + self._no_progress_steps = 0 + # --- Short-lap exploit detection --- try: current_lap_count = int(info.get('lap_count', 0) or 0) @@ -169,22 +213,15 @@ class SpeedRewardWrapper(gym.Wrapper): lap_time = 999.0 if lap_time < self.min_lap_time: penalty = -10.0 * (self.min_lap_time / max(lap_time, 0.1)) - return penalty, True # (reward, force_terminate) + return penalty, True # --- Efficiency gate: detect circular driving --- efficiency = self._compute_efficiency() if efficiency < self.min_efficiency: - # Car is circling — zero reward but don't terminate. - # Zero (not negative) so there's no perverse incentive to crash - # early to avoid accumulating penalties. return 0.0, False - # --- CTE quality: how centred is the car? --- - try: - cte = float(info.get('cte', 0.0) or 0.0) - except (TypeError, ValueError): - cte = 0.0 - cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0) # 0=off track, 1=centred + # --- CTE quality --- + cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0) # --- Speed --- try: @@ -192,7 +229,7 @@ class SpeedRewardWrapper(gym.Wrapper): except (TypeError, ValueError): speed = 0.0 - # --- v6 reward: speed × CTE quality (same as v5, but gated) --- + # --- v6 reward: speed × CTE quality --- speed_norm = min(speed / 10.0, 1.0) return cte_quality * speed_norm, False diff --git a/docs/SESSION_LOG_2026-04-19.md b/docs/SESSION_LOG_2026-04-19.md index 1536438..0913dd0 100644 --- a/docs/SESSION_LOG_2026-04-19.md +++ b/docs/SESSION_LOG_2026-04-19.md @@ -117,10 +117,60 @@ parallel envs are working. 
- **Exp 11:** Tested parallel DummyVecEnv with two sim instances (ports 9091 + 9093) - Exp 11 (v5 reward): aborted due to circular driving on generated_track - Exp 11b (v6 reward): completed, no circles, but plateaus at ~194 steps on all tracks -- **v6 reward confirmed:** efficiency gate prevents circles, tests pass -- **Parallel env confirmed:** mechanically sound, stable training -- **Open issue:** 90k steps may be insufficient for 2-env training (45k per track) -- **Next experiment ideas:** - - Increase to 180k-250k total steps - - Test v6 on single track to isolate reward effect - - Check if efficiency gate fires during normal cornering (false positives) + - Exp 11c (v6 reward, 250k): aborted — grass exploit found on generated_track + - Exp 11d: pending fixes before re-run + +## Critical Known Facts (DO NOT LOSE) + +### throttle_min history (from Exp 1-9) +- `throttle_min=0.2` alone: car cannot get over mountain_track hill (not enough power) +- `throttle_min=0.5`: car gets over hill BUT throttle is baked into action space, + model CANNOT output throttle < 0.5, crashes on tight corners (mini_monaco ~91 steps) +- `throttle_min=0.2` + v5 reward (speed×CTE): car CAN learn to self-select high + throttle on hill. Proved in Exp 9 (mountain only, 90k steps) → 2000/2000 steps. +- KEY INSIGHT: Exp 9 worked because 90k steps were ALL on mountain. In parallel setup + (Exp 11b/11c), each track gets only ~45k effective steps AND the grass exploit + contaminated training. Mountain failure in parallel runs is NOT purely a throttle + issue — fix the grass exploit first, THEN see if mountain learns. 
+ +### The grass exploit root cause (found 2026-04-19) +- generated_track has a physical gap in the boundary mesh at the first turn +- Car drives through the gap, CTE exceeds 8.0m → sim should terminate +- BUT: `determine_episode_over()` in donkey_sim.py has this code: + ```python + if math.fabs(self.cte) > 2 * self.max_cte: # > 16.0m + pass # ← INTENTIONALLY DOES NOTHING + elif math.fabs(self.cte) > self.max_cte: # 8.0–16.0m + self.over = True + ``` +- Car quickly exceeds 16m (> 2×max_cte), hits the `pass` case — episode never ends +- Fix: Python-side CTE patience wrapper that terminates when CTE > 4.0m for 20 steps + (catches the car BEFORE it blows past 16m) + +### Parallel env episode asymmetry +- DummyVecEnv runs both envs in every step (sequential, not truly parallel) +- When mountain episode ends quickly, VecEnv auto-resets mountain and starts new episode +- Meanwhile generated_track episode continues +- During training (model.learn()): PPO collects experience from both and auto-resets + independently — this is fine and correct +- During eval: our eval loop uses done_mask, so short mountain episodes auto-reset + and start new episodes that we ignore (waiting for generated_track to finish) +- User observation: 'car waits at start line for generated_track episode to end' — correct + +### DO NOT confuse mountain rollback with stuck issue +- Mountain rollback (car goes up, slows, rolls back) is a LEARNING/REWARD issue +- It is NOT a stuck issue — the car is moving (rolling back = speed > 0) +- StuckTerminationWrapper correctly does NOT fire (car IS moving) +- Root fix: ensure training is not contaminated by other exploits, then the + v5/v6 speed gradient teaches the model to apply high throttle on the hill + (proved to work in Exp 9) +- DO NOT add termination conditions for rollback — they interfere with valid + slow hill-climbing learning + +### speed vs forward_vel in reward +- info['speed'] comes from Unity — scalar magnitude, always ≥ 0 +- 
info['forward_vel'] computed in Python — dot(heading, velocity), negative when reversing +- Our reward uses info['speed'] — car rolling backward gets positive reward +- Sim's own reward correctly uses forward_vel with `if forward_vel > 0.0` check +- This is a known issue but NOT the primary cause of current problems + (when the car climbs and then rolls back, net displacement ≈ 0 → the efficiency gate gives 0 reward) diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py index 2408866..ecb8015 100644 --- a/tests/test_reward_wrapper.py +++ b/tests/test_reward_wrapper.py @@ -299,3 +299,105 @@ def test_lap_count_resets_on_episode_reset(): # Reset episode — counter must go back to 0 wrapper.reset() assert wrapper._last_lap_count == 0 + + +# --------------------------------------------------------------------------- +# v6.1 exploit terminator tests +# --------------------------------------------------------------------------- + +def test_sustained_high_cte_terminates_episode(): + """ + Grass exploit fix: if CTE exceeds max_cte_terminate for cte_patience + consecutive steps, the episode must be force-terminated with -1.0 reward. + This catches the generated_track gap where car drives indefinitely on grass. 
+ """ + env = MockEnv(speed=3.0, cte=5.0) # CTE=5.0 > max_cte_terminate=4.0 + wrapper = SpeedRewardWrapper(env, max_cte_terminate=4.0, cte_patience=5) + wrapper.reset() + + rewards = [] + terminated = [] + for _ in range(10): + info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 0, 'lap_count': 0, 'last_lap_time': 0.0} + r, force_term = wrapper._compute_reward_and_done(done=False, info=info) + rewards.append(r) + terminated.append(force_term) + + # Should terminate at step 5 (cte_patience=5) + assert terminated[4] == True, f'Should force-terminate at step 5, got {terminated}' + assert rewards[4] == -1.0, f'Termination reward should be -1.0, got {rewards[4]}' + assert terminated[0] == False, 'Should not terminate at step 1' + + +def test_high_cte_resets_when_back_on_track(): + """ + High CTE counter must reset when car returns to track. + Prevents false termination after a brief excursion. + """ + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, max_cte_terminate=4.0, cte_patience=5) + wrapper.reset() + + # 3 steps high CTE + for _ in range(3): + info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 0, 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + assert ft == False, 'Should not terminate after only 3 steps' + + # 1 step back on track resets counter + info = {'cte': 1.0, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 1, 'lap_count': 0, 'last_lap_time': 0.0} + wrapper._compute_reward_and_done(done=False, info=info) + assert wrapper._high_cte_steps == 0, 'CTE counter should reset when back on track' + + # 5 more steps high CTE — should now terminate (counter starts fresh) + for i in range(5): + info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 1, 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + assert ft == True, 'Should terminate after 5 new consecutive high-CTE 
steps' + + +def test_no_track_progress_terminates_episode(): + """ + Mountain rollback fix: if active_node doesn't advance for progress_patience + steps, the episode must be force-terminated. This catches a car that drives + up a hill, rolls back, and keeps moving (so StuckWrapper doesn't fire) + but never makes real track progress. + """ + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=10) + wrapper.reset() + + # Step with node=5 for 11 steps — first step initialises, then 10 stuck + for i in range(11): + info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(i)*0.1, 0., 0.), + 'active_node': 5, 'lap_count': 0, 'last_lap_time': 0.0} + r, ft = wrapper._compute_reward_and_done(done=False, info=info) + + assert ft == True, f'Should terminate after 10 steps of no node progress (11 calls)' + assert r == -1.0, f'Termination reward should be -1.0' + + +def test_track_progress_resets_counter(): + """ + Node advancement must reset the no-progress counter. + """ + env = MockEnv(speed=3.0, cte=0.5) + wrapper = SpeedRewardWrapper(env, progress_patience=5) + wrapper.reset() + + # 3 steps on same node (first sets _last_active_node, then 2 count as no-progress) + for _ in range(3): + info = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.), + 'active_node': 3, 'lap_count': 0, 'last_lap_time': 0.0} + wrapper._compute_reward_and_done(done=False, info=info) + assert wrapper._no_progress_steps == 2, 'First call initialises node, then 2 stuck' + + # Advance node — counter resets + info = {'cte': 0.5, 'speed': 3.0, 'pos': (0.1, 0., 0.), + 'active_node': 4, 'lap_count': 0, 'last_lap_time': 0.0} + wrapper._compute_reward_and_done(done=False, info=info) + assert wrapper._no_progress_steps == 0, 'Progress counter should reset on node advance'