""" Speed + Progress Reward Wrapper for DonkeyCar RL — v4 (Full Bypass) ==================================================================== REWARD HACKING HISTORY: v1 additive: speed × (1-cte/max_cte) → boundary oscillation v2 multiplicative: original × (1+speed×scale) → circular driving (on-track) v3 path efficiency: original × (1+speed×eff×scale) → still circling! WHY v3 failed: efficiency killed the SPEED BONUS but not the BASE reward. A spinning car at CTE≈0 still earns 1.0/step × thousands of steps. v4 (THIS VERSION): Completely bypass sim's reward. Multiply base reward by efficiency so circling yields ZERO reward regardless of CTE. ROOT CAUSE OF CIRCLING: The sim's own calc_reward() uses `forward_vel` = dot(car_heading, velocity). A spinning car is ALWAYS moving "forward" relative to its own heading, so forward_vel > 0 always, giving positive reward while circling indefinitely. We bypass this entirely. FORMULA (v4): base = 1.0 - min(abs(cte) / max_cte, 1.0) # CTE quality [0,1] eff = net_displacement / total_path_length # Forward progress [0,1] shaped = base × eff × (1 + speed_scale × speed) # All three must be high On done/crash: shaped = -1.0 PROPERTIES: - Spinning (eff≈0): shaped ≈ 0 (no reward) - On track, slow (eff≈1): shaped ≈ base (CTE reward only) - On track, fast (eff≈1): shaped > base (CTE + speed bonus) - Off track (base≈0): shaped ≈ 0 (penalty via done) - Cannot be gamed: ALL THREE terms must be high simultaneously RESEARCH NOTE (2026-04-13): v3 was insufficient — circling at start gave 1.0/step × 47k steps = 47k reward. v4 makes efficiency a multiplier on the entire reward, not just the speed bonus. See docs/RESEARCH_LOG.md for full hacking history. """ import gymnasium as gym import numpy as np from collections import deque class SpeedRewardWrapper(gym.Wrapper): """ Full reward bypass: base CTE reward × path efficiency × speed bonus. Completely ignores the sim's own reward (which uses forward_vel and is exploitable by circular/spinning motion). 
class SpeedRewardWrapper(gym.Wrapper):
    """
    Full reward bypass: CTE quality × normalised speed (v5).

    Completely ignores the sim's own reward (which uses forward_vel and is
    exploitable by circular/spinning motion).

    Args:
        env: gymnasium environment
        speed_scale: speed bonus multiplier (default 0.1; unused by the v5
            formula, kept for config compatibility)
        window_size: steps for efficiency calculation (default 60; retained
            from v4, see _compute_efficiency)
        min_efficiency: efficiency below which no reward (default 0.05;
            retained from v4)
        max_cte: track half-width for normalization (default 8.0, matches sim)
        min_lap_time: laps faster than this are penalised as exploits
            (default 5.0)
    """

    def __init__(
        self,
        env,
        speed_scale: float = 0.1,
        window_size: int = 60,  # increased from 30 — catches slower circles
        min_efficiency: float = 0.05,
        max_cte: float = 8.0,
        min_lap_time: float = 5.0,  # laps faster than this are penalised as exploits
    ):
        super().__init__(env)
        self.speed_scale = speed_scale
        self.window_size = window_size
        self.min_efficiency = min_efficiency
        self.max_cte = max_cte
        self.min_lap_time = min_lap_time
        # Retained from v4: feeds _compute_efficiency, which the v5 reward
        # no longer uses.
        self._pos_history = deque(maxlen=window_size + 1)
        self._last_lap_count = 0  # track lap completions to detect short-lap exploit

    def reset(self, **kwargs):
        result = self.env.reset(**kwargs)
        self._pos_history.clear()
        self._last_lap_count = 0
        return result

    def step(self, action):
        result = self.env.step(action)
        # Handle both 4-tuple (old gym) and 5-tuple (gymnasium) APIs
        if len(result) == 5:
            obs, _sim_reward, terminated, truncated, info = result
            done = terminated or truncated
        elif len(result) == 4:
            obs, _sim_reward, done, info = result
            terminated = done
            truncated = False
        else:
            raise ValueError(f'Unexpected step() result length: {len(result)}')

        # Completely ignore _sim_reward — compute our own
        shaped = self._compute_reward(done, info)

        if len(result) == 5:
            return obs, shaped, terminated, truncated, info
        return obs, shaped, done, info
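    # Worked example of the short-lap penalty in _compute_reward below,
    # with the default min_lap_time=5.0 (lap times are illustrative):
    #   lap_time = 2.0  → -10.0 * (5.0 / 2.0)          = -25.0
    #   lap_time = 0.02 → clamped: -10.0 * (5.0 / 0.1) = -500.0
    # The penalty scales inversely with lap time, so cutting the track
    # harder is strictly worse for the agent.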
    def _compute_reward(self, done: bool, info: dict) -> float:
        """
        v5: speed × CTE-quality reward.

            reward = speed × (1 - |cte| / max_cte)

        Simpler than v4. Directly incentivises going FAST while staying
        centred. On a hill: car slows → reward drops → clear gradient signal
        to apply more throttle. v4's efficiency term gave zero gradient when
        the car was stuck (all three terms collapsed to zero simultaneously,
        so no direction to improve).

        Exploit protection (unchanged):
            - Short-lap penalty: laps < min_lap_time → large negative reward
            - StuckTerminationWrapper: done=True after 80 steps of <0.5m movement
            - Crash: done=True → -1.0
        """
        # Crash / episode over
        if done:
            return -1.0

        # --- Short-lap exploit detection (unchanged) ---
        try:
            current_lap_count = int(info.get('lap_count', 0) or 0)
        except (TypeError, ValueError):
            current_lap_count = self._last_lap_count
        if current_lap_count > self._last_lap_count:
            self._last_lap_count = current_lap_count
            try:
                lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
            except (TypeError, ValueError):
                lap_time = 999.0
            if lap_time < self.min_lap_time:
                return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
            # Legitimate lap — fall through to normal reward

        # --- CTE quality: how centred is the car? ---
        try:
            cte = float(info.get('cte', 0.0) or 0.0)
        except (TypeError, ValueError):
            cte = 0.0
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)  # 0=off track, 1=centred

        # --- Speed ---
        try:
            speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
        except (TypeError, ValueError):
            speed = 0.0

        # --- v5 reward: speed × CTE quality ---
        # Fast + centred = high reward. Slow (hill) = low reward → gradient
        # pushes policy toward higher throttle. Off-track = near-zero.
        # Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s).
        speed_norm = min(speed / 10.0, 1.0)
        return cte_quality * speed_norm

    def _compute_efficiency(self) -> float:
        """Path efficiency = net_displacement / total_path_length.

        Retained from v4 for diagnostics; the v5 reward does not call it.
        """
        if len(self._pos_history) < 3:
            return 1.0  # Insufficient history — give benefit of doubt
        positions = list(self._pos_history)
        net = np.linalg.norm(positions[-1] - positions[0])
        total = sum(
            np.linalg.norm(positions[i + 1] - positions[i])
            for i in range(len(positions) - 1)
        )
        return float(net / total) if total > 1e-6 else 1.0

    def theoretical_max_per_step(self, max_speed: float = 10.0) -> float:
        """Upper bound on reward/step under v5 (cte_quality=1, speed_norm capped at 1)."""
        return 1.0 * min(max_speed / 10.0, 1.0)
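
# --- Minimal usage sketch ---------------------------------------------------
# Assumption: a stub env stands in for the real DonkeyCar sim, since the real
# env id and construction are project-specific. The stub emits the info keys
# the wrapper reads ('cte', 'speed', 'lap_count', 'last_lap_time') so the
# shaped reward can be checked by hand; swap in the real env to train.
if __name__ == '__main__':

    class _StubEnv(gym.Env):
        """Hypothetical stand-in for the DonkeyCar env (smoke test only)."""

        observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

        def reset(self, **kwargs):
            return np.zeros(1, dtype=np.float32), {}

        def step(self, action):
            info = {'cte': 1.0, 'speed': 5.0, 'lap_count': 0, 'last_lap_time': 999.0}
            return np.zeros(1, dtype=np.float32), 1.0, False, False, info

    env = SpeedRewardWrapper(_StubEnv())
    env.reset()
    _, shaped, *_ = env.step(np.zeros(2, dtype=np.float32))
    # cte_quality = 1 - 1/8 = 0.875, speed_norm = 5/10 = 0.5 → 0.4375
    print(f'shaped reward = {shaped:.4f}')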