fix: efficiency gate now TERMINATES after 20 consecutive low-efficiency steps (was zero-reward only)

Previously, circling episodes ran for 20+ seconds because the efficiency gate only returned zero reward without terminating. Now, after 20 consecutive steps with efficiency < 0.15 (~0.7 seconds at 27 steps/sec), the episode terminates with a reward of -1.0. Also confirmed from the telemetry diagnostic: CTE does report correctly when the car goes off-track (it rises steadily to 6.2m before the tree collision). The grass exploit runs long only when the open grass area has no obstacles. Efficiency-gate termination is the most reliable catch for both circles and open-grass driving; the one exception is straight-line grass driving, which has high efficiency and therefore passes the gate, but the active_node progress terminator catches that case.
2026-04-19 17:26:38 -04:00 · 2026-04-19 17:26:38 -04:00 · 9ffe1c5d40
parent 813f888502
commit 9ffe1c5d40
1 changed files with 27 additions and 15 deletions
--- a/agent/reward_wrapper.py
+++ b/agent/reward_wrapper.py
@ -96,7 +96,8 @@ class SpeedRewardWrapper(gym.Wrapper):
        min_lap_time: float = 5.0,
        max_cte_terminate: float = 4.0,
        cte_patience: int = 20,
-        progress_patience: int = 60,   # ~3.3s at 18 steps/sec
+        progress_patience: int = 60,
+        efficiency_patience: int = 20,  # steps of low efficiency before termination
    ):
        super().__init__(env)
        self.speed_scale        = speed_scale
@ -107,11 +108,13 @@ class SpeedRewardWrapper(gym.Wrapper):
        self.max_cte_terminate  = max_cte_terminate
        self.cte_patience       = cte_patience
        self.progress_patience  = progress_patience
+        self.efficiency_patience = efficiency_patience
        self._pos_history       = deque(maxlen=window_size + 1)
        self._last_lap_count    = 0
        self._high_cte_steps    = 0
-        self._max_node_seen    = -1   # highest active_node reached this episode
-        self._no_progress_steps = 0  # steps since max_node last increased
+        self._max_node_seen     = -1
+        self._no_progress_steps = 0
+        self._low_eff_steps     = 0

    def reset(self, **kwargs):
        result = self.env.reset(**kwargs)
@ -120,6 +123,7 @@ class SpeedRewardWrapper(gym.Wrapper):
        self._high_cte_steps    = 0
        self._max_node_seen     = -1
        self._no_progress_steps = 0
+        self._low_eff_steps     = 0
        return result

    def step(self, action):
@ -232,9 +236,17 @@ class SpeedRewardWrapper(gym.Wrapper):
                return penalty, True

        # --- Efficiency gate: detect circular driving ---
+        # Count consecutive steps of low efficiency. After patience steps, terminate.
+        # Previously this just returned 0 reward (no termination) which let circles
+        # run for 20+ seconds. Now we terminate after ~20 steps (~0.7s).
        efficiency = self._compute_efficiency()
        if efficiency < self.min_efficiency:
-            return 0.0, False
+            self._low_eff_steps += 1
+            if self._low_eff_steps >= self.efficiency_patience:
+                return -1.0, True   # circle too long — terminate
+            return 0.0, False       # still accumulating — zero reward
+        else:
+            self._low_eff_steps = 0

        # --- CTE quality ---
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)