From 9ffe1c5d40e3b7d88fb55a1f73e7afe125e32a90 Mon Sep 17 00:00:00 2001
From: Paul Huliganga <paje0101@gmail.com>
Date: Sun, 19 Apr 2026 17:26:38 -0400
Subject: [PATCH] fix: efficiency gate now TERMINATES after 20 low-efficiency
 steps (was zero-reward only)

Previously, circling episodes ran for 20+ seconds because the efficiency
gate only returned 0 reward without terminating. Now, after 20 consecutive
steps with efficiency < 0.15 (~0.7 seconds at 27 steps/sec), the episode
terminates with a reward of -1.0.

Also confirmed from the telemetry diagnostic: CTE is reported correctly
when the car goes off-track (it rises steadily to 6.2m before the tree
collision). The grass exploit runs long only when the open grass area has
no obstacles. Efficiency gate termination is the most reliable catch for
both circles and open-grass driving. The one exception is straight-line
grass driving, which has high efficiency and therefore passes the gate,
but the active_node progress terminator catches that case.
---
 agent/reward_wrapper.py | 42 ++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py
index 14274f6..3563d1a 100644
--- a/agent/reward_wrapper.py
+++ b/agent/reward_wrapper.py
@@ -96,22 +96,25 @@ class SpeedRewardWrapper(gym.Wrapper):
         min_lap_time: float = 5.0,
         max_cte_terminate: float = 4.0,
         cte_patience: int = 20,
-        progress_patience: int = 60,   # ~3.3s at 18 steps/sec
+        progress_patience: int = 60,
+        efficiency_patience: int = 20,  # steps of low efficiency before termination
     ):
         super().__init__(env)
-        self.speed_scale       = speed_scale
-        self.window_size       = window_size
-        self.min_efficiency    = min_efficiency
-        self.max_cte           = max_cte
-        self.min_lap_time      = min_lap_time
-        self.max_cte_terminate = max_cte_terminate
-        self.cte_patience      = cte_patience
-        self.progress_patience = progress_patience
-        self._pos_history      = deque(maxlen=window_size + 1)
-        self._last_lap_count   = 0
-        self._high_cte_steps   = 0
-        self._max_node_seen    = -1   # highest active_node reached this episode
-        self._no_progress_steps = 0  # steps since max_node last increased
+        self.speed_scale        = speed_scale
+        self.window_size        = window_size
+        self.min_efficiency     = min_efficiency
+        self.max_cte            = max_cte
+        self.min_lap_time       = min_lap_time
+        self.max_cte_terminate  = max_cte_terminate
+        self.cte_patience       = cte_patience
+        self.progress_patience  = progress_patience
+        self.efficiency_patience = efficiency_patience
+        self._pos_history       = deque(maxlen=window_size + 1)
+        self._last_lap_count    = 0
+        self._high_cte_steps    = 0
+        self._max_node_seen     = -1
+        self._no_progress_steps = 0
+        self._low_eff_steps     = 0
 
     def reset(self, **kwargs):
         result = self.env.reset(**kwargs)
@@ -120,6 +123,7 @@ class SpeedRewardWrapper(gym.Wrapper):
         self._high_cte_steps    = 0
         self._max_node_seen     = -1
         self._no_progress_steps = 0
+        self._low_eff_steps     = 0
         return result
 
     def step(self, action):
@@ -232,9 +236,17 @@ class SpeedRewardWrapper(gym.Wrapper):
                 return penalty, True
 
         # --- Efficiency gate: detect circular driving ---
+        # Count consecutive steps of low efficiency. After patience steps, terminate.
+        # Previously this just returned 0 reward (no termination) which let circles
+        # run for 20+ seconds. Now we terminate after ~20 steps (~0.7s).
         efficiency = self._compute_efficiency()
         if efficiency < self.min_efficiency:
-            return 0.0, False
+            self._low_eff_steps += 1
+            if self._low_eff_steps >= self.efficiency_patience:
+                return -1.0, True   # circle too long — terminate
+            return 0.0, False       # still accumulating — zero reward
+        else:
+            self._low_eff_steps = 0
 
         # --- CTE quality ---
         cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)