diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index 14274f6..3563d1a 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -96,22 +96,25 @@ class SpeedRewardWrapper(gym.Wrapper): min_lap_time: float = 5.0, max_cte_terminate: float = 4.0, cte_patience: int = 20, - progress_patience: int = 60, # ~3.3s at 18 steps/sec + progress_patience: int = 60, + efficiency_patience: int = 20, # steps of low efficiency before termination ): super().__init__(env) - self.speed_scale = speed_scale - self.window_size = window_size - self.min_efficiency = min_efficiency - self.max_cte = max_cte - self.min_lap_time = min_lap_time - self.max_cte_terminate = max_cte_terminate - self.cte_patience = cte_patience - self.progress_patience = progress_patience - self._pos_history = deque(maxlen=window_size + 1) - self._last_lap_count = 0 - self._high_cte_steps = 0 - self._max_node_seen = -1 # highest active_node reached this episode - self._no_progress_steps = 0 # steps since max_node last increased + self.speed_scale = speed_scale + self.window_size = window_size + self.min_efficiency = min_efficiency + self.max_cte = max_cte + self.min_lap_time = min_lap_time + self.max_cte_terminate = max_cte_terminate + self.cte_patience = cte_patience + self.progress_patience = progress_patience + self.efficiency_patience = efficiency_patience + self._pos_history = deque(maxlen=window_size + 1) + self._last_lap_count = 0 + self._high_cte_steps = 0 + self._max_node_seen = -1 + self._no_progress_steps = 0 + self._low_eff_steps = 0 def reset(self, **kwargs): result = self.env.reset(**kwargs) @@ -120,6 +123,7 @@ class SpeedRewardWrapper(gym.Wrapper): self._high_cte_steps = 0 self._max_node_seen = -1 self._no_progress_steps = 0 + self._low_eff_steps = 0 return result def step(self, action): @@ -232,9 +236,17 @@ class SpeedRewardWrapper(gym.Wrapper): return penalty, True # --- Efficiency gate: detect circular driving --- + # Count consecutive steps of low efficiency. 
After efficiency_patience steps, terminate. + # Previously this just returned 0 reward (no termination) which let circles + # run for 20+ seconds. Now we terminate after ~20 steps (~1.1s at 18 steps/sec). efficiency = self._compute_efficiency() if efficiency < self.min_efficiency: - return 0.0, False + self._low_eff_steps += 1 + if self._low_eff_steps >= self.efficiency_patience: + return -1.0, True # circle too long — terminate + return 0.0, False # still accumulating — zero reward + else: + self._low_eff_steps = 0 # --- CTE quality --- cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)