From f730a2e0ba539d529d6bcaaaf8b5f6be3228ded0 Mon Sep 17 00:00:00 2001
From: Paul Huliganga <paje0101@gmail.com>
Date: Sun, 19 Apr 2026 16:14:28 -0400
Subject: [PATCH] =?UTF-8?q?docs:=20ADR-020/021=20+=20session=20log=20?=
 =?UTF-8?q?=E2=80=94=20throttle/hill=20history=20and=20grass=20exploit=20r?=
 =?UTF-8?q?oot=20cause?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical facts documented permanently:
- throttle_min=0.5 is baked into the action space (too fast for corners)
- throttle_min=0.2 + v5 reward CAN learn hill (proved Exp 9, mountain only 90k)
- Mountain failure in parallel is contamination from grass exploit, not throttle
- Grass exploit root cause: sim determine_episode_over() hits a no-op `pass` (never terminates) when |CTE| > 16m
- DO NOT confuse mountain rollback with stuck issue
- DO NOT change throttle_min as first response to mountain failure
---
 DECISIONS.md                             |  64 ++++++++
 agent/experiments/exp11d_parallel_v61.py | 178 +++++++++++++++++++++++
 agent/reward_wrapper.py                  | 121 +++++++++------
 docs/SESSION_LOG_2026-04-19.md           |  64 +++++++-
 tests/test_reward_wrapper.py             | 102 +++++++++++++
 5 files changed, 480 insertions(+), 49 deletions(-)
 create mode 100644 agent/experiments/exp11d_parallel_v61.py

diff --git a/DECISIONS.md b/DECISIONS.md
index 892a704..4a19273 100644
--- a/DECISIONS.md
+++ b/DECISIONS.md
@@ -416,3 +416,67 @@ env = DummyVecEnv([
 
 **Validation:** Exp 11 will test this approach. If results are consistent
 across multiple runs (not lottery), this ADR is confirmed.
+
+---
+
+## ADR-020: Mountain Track Hill — Throttle and Reward History
+
+**Date:** 2026-04-19
+**Status:** Accepted
+
+**Context:** Mountain_track has a steep hill that the car must climb.
+Multiple experiments tested different throttle_min and reward combinations.
+
+**Confirmed findings (from Exp 1–9):**
+- `throttle_min=0.2` + v4 reward: car cannot get over hill. v4 reward gives
+  zero gradient when speed≈0 AND efficiency≈0 simultaneously on hill.
+- `throttle_min=0.5` + any reward: car gets over hill, BUT throttle_min is
+  baked into the action space. Model cannot output throttle < 0.5.
+  Result: crashes on tight corners (mini_monaco ~91 steps consistently).
+- `throttle_min=0.2` + v5 reward (speed×CTE): model CAN learn to self-select
+  high throttle on hill. Proved in Exp 9 (90k steps, mountain only) → 2000/2000.
+  The v5 speed gradient is non-zero on hills, giving the model a learning signal.
+
+**When mountain fails in parallel training:**
+- First check for training contamination (e.g., grass exploit on other track)
+- The grass exploit corrupts generated_track episodes → model learns exploit
+  instead of driving → mountain gets corrupted gradient too
+- Fix the exploit first, then re-run. Do NOT immediately assume throttle_min
+  is the cause.
+
+**If mountain still fails after exploit fixes:**
+- Consider per-track throttle_min: throttle_min=0.5 for mountain env,
+  throttle_min=0.2 for other envs (DummyVecEnv allows per-env wrappers)
+- This is feasible since each env in DummyVecEnv is wrapped independently
+
+**DO NOT:**
+- Confuse mountain rollback with a stuck issue (it's a learning/reward issue)
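+- Add termination conditions for rollback (interferes with slow hill learning) — note: `progress_patience` in `SpeedRewardWrapper` v6.1 (default 60, used by Exp 11d) is such a condition and must be reconciled with this rule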
+- Change throttle_min as the FIRST response when mountain fails
+
+---
+
+## ADR-021: Generated Track Grass Exploit — Root Cause and Fix
+
+**Date:** 2026-04-19
+**Status:** Accepted
+
+**Context:** generated_track has a physical gap in the boundary mesh at the
+first turn. The car finds this gap and drives off onto the grass indefinitely.
+
+**Root cause:** `donkey_sim.py determine_episode_over()` has:
+```python
+if math.fabs(self.cte) > 2 * self.max_cte:  # > 16.0m
+    pass   # designed for bad startup frames, but means far-off-track = never terminates
+elif math.fabs(self.cte) > self.max_cte:    # 8.0-16.0m
+    self.over = True
+```
+The car exits through the gap, CTE quickly exceeds 16m, hits `pass` — episode never ends.
+
+**Fix:** Python-side `SpeedRewardWrapper` CTE patience terminator:
+- If |CTE| > `max_cte_terminate` (4.0m) for `cte_patience` (20) consecutive steps → terminate with reward −1.0
+- Catches the car at 4m (before blowing past 16m into the `pass` zone)
+- 4.0m chosen conservatively — legitimate cornering stays well below 4m CTE
+- Resets counter when car returns to within 4m (brief excursions allowed)
+
+**Note:** We cannot fix the Unity sim code directly.
diff --git a/agent/experiments/exp11d_parallel_v61.py b/agent/experiments/exp11d_parallel_v61.py
new file mode 100644
index 0000000..dacb2b5
--- /dev/null
+++ b/agent/experiments/exp11d_parallel_v61.py
@@ -0,0 +1,178 @@
+"""
+Exp 11d: Parallel DummyVecEnv, v6.1 reward (grass + rollback fixes), 180k steps.
+
+Changes from Exp 11c (aborted):
+  - Reward v6.1: adds two new termination conditions:
+      1. Sustained high CTE (grass exploit fix): if CTE > 4.0 for 20 steps → terminate
+         Stops the generated_track gap exploit where car exits through a hole
+         in the boundary mesh and drives indefinitely on the grass.
+      2. No track progress (mountain rollback fix): if active_node doesn't
+         advance for 60 steps → terminate.
+         Stops the loop where the car climbs the hill, rolls back, and climbs
+         again: it is moving, so StuckWrapper never fires, yet it makes no track progress.
+
+  - Total steps: 180k (vs 250k in 11c — enough budget, not too long)
+
+Infrastructure (unchanged from 11b/11c):
+  - DummyVecEnv with two sim instances (9091 + 9093)
+  - stuck_steps=40, throttle_min=0.2, lr=0.000725
+"""
+import sys, os, time
+sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
+
+from multitrack_runner import log, StuckTerminationWrapper
+from donkeycar_sb3_runner import ThrottleClampWrapper
+from reward_wrapper import SpeedRewardWrapper
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
+import gymnasium as gym
+import numpy as np
+
+HOST         = '10.0.0.55'
+THROTTLE_MIN = 0.2
+LR           = 0.000725
+TOTAL_STEPS  = 180000
+SAVE_DIR     = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp11d-parallel-v61'
+os.makedirs(SAVE_DIR, exist_ok=True)
+
+def make_env(track_id, port):
+    def _init():
+        raw = gym.make(track_id, conf={'host': HOST, 'port': port})
+        env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
+        env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5)
+        env = SpeedRewardWrapper(env,
+            max_cte_terminate=4.0,   # terminate if CTE > 4m for 20 steps (grass fix)
+            cte_patience=20,
+            progress_patience=60,    # terminate if no node advance for 60 steps (rollback fix)
+        )
+        return env
+    return _init
+
+log('='*60)
+log('Exp 11d: Parallel DummyVecEnv, v6.1 reward, 180k steps')
+log(f'  Sim 1: {HOST}:9091 → generated_track')
+log(f'  Sim 2: {HOST}:9093 → mountain_track')
+log(f'  throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}')
+log(f'  Reward v6.1: speed×CTE + efficiency gate + grass/rollback terminators')
+log(f'    max_cte_terminate=4.0, cte_patience=20 (grass fix)')
+log(f'    progress_patience=60 (mountain rollback fix)')
+log(f'  Stuck: 40 steps')
+log('='*60)
+
+env = DummyVecEnv([
+    make_env('donkey-generated-track-v0', 9091),
+    make_env('donkey-mountain-track-v0', 9093),
+])
+env = VecTransposeImage(env)
+log(f'  VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')
+
+model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu')
+log('PPO created. Starting training...')
+
+CHECKPOINT_EVERY = 10000
+best_reward = float('-inf')
+steps_done = 0
+
+while steps_done < TOTAL_STEPS:
+    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
+    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
+    steps_done += seg_steps
+
+    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
+    model.save(ckpt)
+    model.save(os.path.join(SAVE_DIR, 'model'))
+    log(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved')
+
+    try:
+        obs = env.reset()
+        ep_rewards = np.zeros(env.num_envs)
+        ep_steps = np.zeros(env.num_envs)
+        done_mask = np.zeros(env.num_envs, dtype=bool)
+        for _ in range(2000):
+            action, _ = model.predict(obs, deterministic=True)
+            obs, rewards, dones, infos = env.step(action)
+            for i in range(env.num_envs):
+                if not done_mask[i]:
+                    ep_rewards[i] += rewards[i]
+                    ep_steps[i] += 1
+                    if dones[i]:
+                        done_mask[i] = True
+            if done_mask.all():
+                break
+
+        status0 = '✅' if ep_steps[0] >= 2000 else f'❌@{int(ep_steps[0])}'
+        status1 = '✅' if ep_steps[1] >= 2000 else f'❌@{int(ep_steps[1])}'
+        log(f'  Eval: gen_track={ep_rewards[0]:.1f}r/{int(ep_steps[0])}s {status0}  '
+            f'mountain={ep_rewards[1]:.1f}r/{int(ep_steps[1])}s {status1}')
+
+        total_reward = ep_rewards.sum()
+        if total_reward > best_reward:
+            best_reward = total_reward
+            model.save(os.path.join(SAVE_DIR, 'best_model'))
+            log(f'  ⭐ NEW BEST: {best_reward:.1f} (combined)')
+    except Exception as e:
+        log(f'  Eval error: {e}')
+
+model.save(os.path.join(SAVE_DIR, 'model'))
+log(f'\nTraining complete. Best combined reward: {best_reward:.1f}')
+env.close()
+time.sleep(5)
+
+# --- Eval on all 4 tracks ---
+log('\n' + '='*60)
+log('EVALUATION: best_model on 4 tracks (3 sets each)')
+log('='*60)
+
+EVAL_TRACKS = [
+    ('donkey-mountain-track-v0',   'mountain_track'),
+    ('donkey-generated-track-v0',  'generated_track'),
+    ('donkey-generated-roads-v0',  'generated_road'),
+    ('donkey-minimonaco-track-v0', 'mini_monaco'),
+]
+EVAL_PORT = 9091
+best_model_path = os.path.join(SAVE_DIR, 'best_model.zip')
+results_by_track = {}
+
+for track_id, track_name in EVAL_TRACKS:
+    log(f'\n--- {track_name} ---')
+    steps_list = []
+    for s in range(1, 4):
+        try:
+            raw = gym.make(track_id, conf={'host': HOST, 'port': EVAL_PORT})
+            ei = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
+            ei = StuckTerminationWrapper(ei, stuck_steps=40, min_displacement=0.5)
+            ei = SpeedRewardWrapper(ei, max_cte_terminate=4.0, cte_patience=20,
+                                    progress_patience=60)
+            ev = VecTransposeImage(DummyVecEnv([lambda e=ei: e]))
+            m = PPO.load(best_model_path, env=ev, device='cpu')
+
+            obs = ev.reset()
+            total_r, total_s, done = 0.0, 0, False
+            while not done and total_s < 2000:
+                action, _ = m.predict(obs, deterministic=True)
+                result = ev.step(action)
+                if len(result) == 4: obs, r, d, _ = result; done = bool(d[0])
+                else: obs, r, t, tr, _ = result; done = bool(t[0] or tr[0])
+                total_r += float(r[0]); total_s += 1
+
+            status = '✅' if total_s >= 2000 else f'❌@{total_s}'
+            log(f'  Set{s}: {total_r:.1f}r / {total_s}s {status}')
+            steps_list.append(total_s)
+            ev.close(); time.sleep(3)
+        except Exception as e:
+            log(f'  Set{s}: ERROR — {e}')
+            steps_list.append(0); time.sleep(3)
+
+    results_by_track[track_name] = steps_list
+    log(f'  Mean: {np.mean(steps_list):.0f} steps')
+
+log('\n' + '='*60)
+log('SUMMARY')
+log('='*60)
+for track_name, steps_list in results_by_track.items():
+    steps_str = '/'.join(str(s) for s in steps_list)
+    mean = np.mean(steps_list)
+    verdict = '✅' if mean >= 1500 else '⚠️' if mean >= 500 else '❌'
+    log(f'  {verdict} {track_name:20s}: {steps_str}  mean={mean:.0f}')
+
+log(f'\n=== Exp 11d COMPLETE ===')
diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py
index c50e6c7..63de066 100644
--- a/agent/reward_wrapper.py
+++ b/agent/reward_wrapper.py
@@ -62,41 +62,58 @@ from collections import deque
 
 class SpeedRewardWrapper(gym.Wrapper):
     """
-    Full reward bypass: base CTE reward × path efficiency × speed bonus.
+    Full reward bypass: speed × CTE_quality, gated by efficiency.
 
     Completely ignores the sim's own reward (which uses forward_vel and is
     exploitable by circular/spinning motion).
 
     Args:
-        env:          gymnasium environment
-        speed_scale:  speed bonus multiplier (default 0.1)
-        window_size:  steps for efficiency calculation (default 30)
-        min_efficiency: efficiency below which no reward (default 0.05)
-        max_cte:      track half-width for normalization (default 8.0, matches sim)
+        env:                gymnasium environment
+        speed_scale:        speed bonus multiplier (default 0.1)
+        window_size:        steps for efficiency calculation (default 30)
+        min_efficiency:     efficiency below which no reward (default 0.15)
+        max_cte:            track half-width for normalization (default 8.0)
+        min_lap_time:       laps faster than this are penalised as exploits
+        max_cte_terminate:  terminate if |CTE| exceeds this for cte_patience steps
+                            (default 4.0)
+        cte_patience:       steps of sustained high CTE before termination (default 20)
+        progress_patience:  steps of zero track progress before termination (default 60)
     """
 
     def __init__(
         self,
         env,
         speed_scale: float = 0.1,
-        window_size: int = 30,         # captures 2+ full circles at typical circling speed
-        min_efficiency: float = 0.15,  # gate threshold: circles ≈ 0.13, wobbly straight ≈ 0.98
+        window_size: int = 30,
+        min_efficiency: float = 0.15,
         max_cte: float = 8.0,
-        min_lap_time: float = 5.0,    # laps faster than this are penalised as exploits
+        min_lap_time: float = 5.0,
+        max_cte_terminate: float = 4.0,   # terminate early if CTE sustained > 4m
+        cte_patience: int = 20,            # steps of high CTE before terminate
+        progress_patience: int = 60,       # steps of no track progress before terminate
     ):
         super().__init__(env)
-        self.speed_scale   = speed_scale
-        self.window_size   = window_size
-        self.min_efficiency = min_efficiency
-        self.max_cte       = max_cte
-        self.min_lap_time  = min_lap_time
-        self._pos_history  = deque(maxlen=window_size + 1)
-        self._last_lap_count = 0      # track lap completions to detect short-lap exploit
+        self.speed_scale        = speed_scale
+        self.window_size        = window_size
+        self.min_efficiency     = min_efficiency
+        self.max_cte            = max_cte
+        self.min_lap_time       = min_lap_time
+        self.max_cte_terminate  = max_cte_terminate
+        self.cte_patience       = cte_patience
+        self.progress_patience  = progress_patience
+        self._pos_history       = deque(maxlen=window_size + 1)
+        self._last_lap_count    = 0
+        self._high_cte_steps    = 0    # consecutive steps with CTE > max_cte_terminate
+        self._last_active_node  = -1   # track progress node at last check
+        self._no_progress_steps = 0    # consecutive steps with no node advancement
 
     def reset(self, **kwargs):
         result = self.env.reset(**kwargs)
         self._pos_history.clear()
-        self._last_lap_count = 0
+        self._last_lap_count    = 0
+        self._high_cte_steps    = 0
+        self._last_active_node  = -1
+        self._no_progress_steps = 0
         return result
 
     def step(self, action):
@@ -126,27 +143,25 @@ class SpeedRewardWrapper(gym.Wrapper):
 
     def _compute_reward_and_done(self, done: bool, info: dict):
         """
-        v6: speed × CTE-quality + efficiency gate.
+        v6.1: speed × CTE-quality + efficiency gate + grass/rollback terminators.
+
+        New termination conditions:
+          - Sustained high CTE: |CTE| > max_cte_terminate for cte_patience steps
+            → terminate. Stops the grass exploit (car exits track gap and the sim
+            never ends the episode once |CTE| exceeds 2×max_cte = 16.0).
+          - No track progress: active_node doesn't advance for progress_patience
+            steps → terminate. Stops mountain rollback (car goes up, rolls
+            back, IS moving so StuckWrapper doesn't fire, but never advances).
 
         reward = speed_norm × cte_quality   (when efficiency >= threshold)
-        reward = 0.0                        (when efficiency < threshold — circling)
-        reward = -1.0                       (on crash/done)
-
-        The efficiency gate prevents circular driving (eff≈0 for circles)
-        without killing gradient on hills (eff>0 for a stuck-but-not-circling
-        car, so the gate passes and speed×CTE gradient pushes toward unstuck).
-
-        Exploit protection:
-        - Efficiency gate: circles → reward = 0
-        - Short-lap penalty: laps < min_lap_time → large negative + terminate
-        - StuckTerminationWrapper: done=True after stuck_steps of no movement
-        - Crash: done=True → -1.0
+        reward = 0.0                        (when circling)
+        reward = -1.0                       (on crash/termination)
         """
         # Track position for efficiency calculation
         try:
             pos = info.get('pos', (0.0, 0.0, 0.0))
             pos_x = float(pos[0])
-            pos_z = float(pos[2])  # z is forward in Unity coordinate system
+            pos_z = float(pos[2])
             self._pos_history.append(np.array([pos_x, pos_z]))
         except (TypeError, ValueError, IndexError):
             pass
@@ -155,6 +170,35 @@ class SpeedRewardWrapper(gym.Wrapper):
         if done:
             return -1.0, False
 
+        # --- CTE value for all checks ---
+        try:
+            cte = float(info.get('cte', 0.0) or 0.0)
+        except (TypeError, ValueError):
+            cte = 0.0
+
+        # --- Grass exploit: sustained high CTE termination ---
+        if abs(cte) > self.max_cte_terminate:
+            self._high_cte_steps += 1
+            if self._high_cte_steps >= self.cte_patience:
+                return -1.0, True  # too long off-track — terminate
+        else:
+            self._high_cte_steps = 0
+
+        # --- Mountain rollback: no track progress termination ---
+        try:
+            active_node = int(info.get('active_node', -1) or -1)
+        except (TypeError, ValueError):
+            active_node = -1
+
+        if active_node >= 0:
+            if active_node == self._last_active_node:
+                self._no_progress_steps += 1
+                if self._no_progress_steps >= self.progress_patience:
+                    return -1.0, True  # no track progress — terminate
+            else:
+                self._last_active_node  = active_node
+                self._no_progress_steps = 0
+
         # --- Short-lap exploit detection ---
         try:
             current_lap_count = int(info.get('lap_count', 0) or 0)
@@ -169,22 +213,15 @@ class SpeedRewardWrapper(gym.Wrapper):
                 lap_time = 999.0
             if lap_time < self.min_lap_time:
                 penalty = -10.0 * (self.min_lap_time / max(lap_time, 0.1))
-                return penalty, True   # (reward, force_terminate)
+                return penalty, True
 
         # --- Efficiency gate: detect circular driving ---
         efficiency = self._compute_efficiency()
         if efficiency < self.min_efficiency:
-            # Car is circling — zero reward but don't terminate.
-            # Zero (not negative) so there's no perverse incentive to crash
-            # early to avoid accumulating penalties.
             return 0.0, False
 
-        # --- CTE quality: how centred is the car? ---
-        try:
-            cte = float(info.get('cte', 0.0) or 0.0)
-        except (TypeError, ValueError):
-            cte = 0.0
-        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)   # 0=off track, 1=centred
+        # --- CTE quality ---
+        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)
 
         # --- Speed ---
         try:
@@ -192,7 +229,7 @@ class SpeedRewardWrapper(gym.Wrapper):
         except (TypeError, ValueError):
             speed = 0.0
 
-        # --- v6 reward: speed × CTE quality (same as v5, but gated) ---
+        # --- v6 reward: speed × CTE quality ---
         speed_norm = min(speed / 10.0, 1.0)
         return cte_quality * speed_norm, False
 
diff --git a/docs/SESSION_LOG_2026-04-19.md b/docs/SESSION_LOG_2026-04-19.md
index 1536438..0913dd0 100644
--- a/docs/SESSION_LOG_2026-04-19.md
+++ b/docs/SESSION_LOG_2026-04-19.md
@@ -117,10 +117,60 @@ parallel envs are working.
 - **Exp 11:** Tested parallel DummyVecEnv with two sim instances (ports 9091 + 9093)
   - Exp 11 (v5 reward): aborted due to circular driving on generated_track
   - Exp 11b (v6 reward): completed, no circles, but plateaus at ~194 steps on all tracks
-- **v6 reward confirmed:** efficiency gate prevents circles, tests pass
-- **Parallel env confirmed:** mechanically sound, stable training
-- **Open issue:** 90k steps may be insufficient for 2-env training (45k per track)
-- **Next experiment ideas:**
-  - Increase to 180k-250k total steps
-  - Test v6 on single track to isolate reward effect
-  - Check if efficiency gate fires during normal cornering (false positives)
+  - Exp 11c (v6 reward, 250k): aborted — grass exploit found on generated_track
+  - Exp 11d: pending fixes before re-run
+
+## Critical Known Facts (DO NOT LOSE)
+
+### throttle_min history (from Exp 1-9)
+- `throttle_min=0.2` + v4 reward: car cannot get over mountain_track hill (v4 gives zero gradient on the hill, so the model never learns to apply more throttle)
+- `throttle_min=0.5`: car gets over hill BUT throttle is baked into action space,
+  model CANNOT output throttle < 0.5, crashes on tight corners (mini_monaco ~91 steps)
+- `throttle_min=0.2` + v5 reward (speed×CTE): car CAN learn to self-select high
+  throttle on hill. Proved in Exp 9 (mountain only, 90k steps) → 2000/2000 steps.
+- KEY INSIGHT: Exp 9 worked because 90k steps were ALL on mountain. In parallel setup
+  (Exp 11b/11c), each track gets only ~45k effective steps AND the grass exploit
+  contaminated training. Mountain failure in parallel runs is NOT purely a throttle
+  issue — fix the grass exploit first, THEN see if mountain learns.
+
+### The grass exploit root cause (found 2026-04-19)
+- generated_track has a physical gap in the boundary mesh at the first turn
+- Car drives through the gap, CTE exceeds 8.0m → sim should terminate
+- BUT: `determine_episode_over()` in donkey_sim.py has this code:
+  ```python
+  if math.fabs(self.cte) > 2 * self.max_cte:  # > 16.0m
+      pass   # ← INTENTIONALLY DOES NOTHING
+  elif math.fabs(self.cte) > self.max_cte:     # 8.0–16.0m
+      self.over = True
+  ```
+- Car quickly exceeds 16m (> 2×max_cte), hits the `pass` case — episode never ends
+- Fix: Python-side CTE patience wrapper that terminates when CTE > 4.0m for 20 steps
+  (catches the car BEFORE it blows past 16m)
+
+### Parallel env episode asymmetry
+- DummyVecEnv runs both envs in every step (sequential, not truly parallel)
+- When mountain episode ends quickly, VecEnv auto-resets mountain and starts new episode
+- Meanwhile generated_track episode continues
+- During training (model.learn()): PPO collects experience from both and auto-resets
+  independently — this is fine and correct
+- During eval: our eval loop uses done_mask, so short mountain episodes auto-reset
+  and start new episodes that we ignore (waiting for generated_track to finish)
+- User observation: 'car waits at start line for generated_track episode to end' — correct
+
+### DO NOT confuse mountain rollback with stuck issue
+- Mountain rollback (car goes up, slows, rolls back) is a LEARNING/REWARD issue
+- It is NOT a stuck issue — the car is moving (rolling back = speed > 0)
+- StuckTerminationWrapper correctly does NOT fire (car IS moving)
+- Root fix: ensure training is not contaminated by other exploits, then the
+  v5/v6 speed gradient teaches the model to apply high throttle on the hill
+  (proved to work in Exp 9)
+- DO NOT add termination conditions for rollback — they interfere with valid
+  slow hill-climbing learning
+
+### speed vs forward_vel in reward
+- info['speed'] comes from Unity — scalar magnitude, always ≥ 0
+- info['forward_vel'] computed in Python — dot(heading, velocity), negative when reversing
+- Our reward uses info['speed'] — car rolling backward gets positive reward
+- Sim's own reward correctly uses forward_vel with `if forward_vel > 0.0` check
+- This is a known issue but NOT the primary cause of current problems
+  (when rolling back, net displacement ≈ 0, so the efficiency gate zeroes the reward)
diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py
index 2408866..ecb8015 100644
--- a/tests/test_reward_wrapper.py
+++ b/tests/test_reward_wrapper.py
@@ -299,3 +299,105 @@ def test_lap_count_resets_on_episode_reset():
     # Reset episode — counter must go back to 0
     wrapper.reset()
     assert wrapper._last_lap_count == 0
+
+
+# ---------------------------------------------------------------------------
+# v6.1 exploit terminator tests
+# ---------------------------------------------------------------------------
+
+def test_sustained_high_cte_terminates_episode():
+    """
+    Grass exploit fix: if CTE exceeds max_cte_terminate for cte_patience
+    consecutive steps, the episode must be force-terminated with -1.0 reward.
+    This catches the generated_track gap where car drives indefinitely on grass.
+    """
+    env = MockEnv(speed=3.0, cte=5.0)  # CTE=5.0 > max_cte_terminate=4.0
+    wrapper = SpeedRewardWrapper(env, max_cte_terminate=4.0, cte_patience=5)
+    wrapper.reset()
+
+    rewards = []
+    terminated = []
+    for _ in range(10):
+        info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.),
+                'active_node': 0, 'lap_count': 0, 'last_lap_time': 0.0}
+        r, force_term = wrapper._compute_reward_and_done(done=False, info=info)
+        rewards.append(r)
+        terminated.append(force_term)
+
+    # Should terminate at step 5 (cte_patience=5)
+    assert terminated[4] == True, f'Should force-terminate at step 5, got {terminated}'
+    assert rewards[4] == -1.0, f'Termination reward should be -1.0, got {rewards[4]}'
+    assert terminated[0] == False, 'Should not terminate at step 1'
+
+
+def test_high_cte_resets_when_back_on_track():
+    """
+    High CTE counter must reset when car returns to track.
+    Prevents false termination after a brief excursion.
+    """
+    env = MockEnv(speed=3.0, cte=0.5)
+    wrapper = SpeedRewardWrapper(env, max_cte_terminate=4.0, cte_patience=5)
+    wrapper.reset()
+
+    # 3 steps high CTE
+    for _ in range(3):
+        info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.),
+                'active_node': 0, 'lap_count': 0, 'last_lap_time': 0.0}
+        r, ft = wrapper._compute_reward_and_done(done=False, info=info)
+        assert ft == False, 'Should not terminate after only 3 steps'
+
+    # 1 step back on track resets counter
+    info = {'cte': 1.0, 'speed': 3.0, 'pos': (0., 0., 0.),
+            'active_node': 1, 'lap_count': 0, 'last_lap_time': 0.0}
+    wrapper._compute_reward_and_done(done=False, info=info)
+    assert wrapper._high_cte_steps == 0, 'CTE counter should reset when back on track'
+
+    # 5 more steps high CTE — should now terminate (counter starts fresh)
+    for i in range(5):
+        info = {'cte': 5.0, 'speed': 3.0, 'pos': (0., 0., 0.),
+                'active_node': 1, 'lap_count': 0, 'last_lap_time': 0.0}
+        r, ft = wrapper._compute_reward_and_done(done=False, info=info)
+    assert ft == True, 'Should terminate after 5 new consecutive high-CTE steps'
+
+
+def test_no_track_progress_terminates_episode():
+    """
+    Mountain rollback fix: if active_node doesn't advance for progress_patience
+    steps, the episode must be force-terminated. This catches a car that drives
+    up a hill, rolls back, and keeps moving (so StuckWrapper doesn't fire)
+    but never makes real track progress.
+    """
+    env = MockEnv(speed=3.0, cte=0.5)
+    wrapper = SpeedRewardWrapper(env, progress_patience=10)
+    wrapper.reset()
+
+    # Step with node=5 for 11 steps — first step initialises, then 10 stuck
+    for i in range(11):
+        info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(i)*0.1, 0., 0.),
+                'active_node': 5, 'lap_count': 0, 'last_lap_time': 0.0}
+        r, ft = wrapper._compute_reward_and_done(done=False, info=info)
+
+    assert ft == True, f'Should terminate after 10 steps of no node progress (11 calls)'
+    assert r == -1.0, f'Termination reward should be -1.0'
+
+
+def test_track_progress_resets_counter():
+    """
+    Node advancement must reset the no-progress counter.
+    """
+    env = MockEnv(speed=3.0, cte=0.5)
+    wrapper = SpeedRewardWrapper(env, progress_patience=5)
+    wrapper.reset()
+
+    # 3 steps on same node (first sets _last_active_node, then 2 count as no-progress)
+    for _ in range(3):
+        info = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.),
+                'active_node': 3, 'lap_count': 0, 'last_lap_time': 0.0}
+        wrapper._compute_reward_and_done(done=False, info=info)
+    assert wrapper._no_progress_steps == 2, 'First call initialises node, then 2 stuck'
+
+    # Advance node — counter resets
+    info = {'cte': 0.5, 'speed': 3.0, 'pos': (0.1, 0., 0.),
+            'active_node': 4, 'lap_count': 0, 'last_lap_time': 0.0}
+    wrapper._compute_reward_and_done(done=False, info=info)
+    assert wrapper._no_progress_steps == 0, 'Progress counter should reset on node advance'