diff --git a/agent/experiments/exp11_parallel_envs.py b/agent/experiments/exp11_parallel_envs.py index 077413b..d10e7a0 100644 --- a/agent/experiments/exp11_parallel_envs.py +++ b/agent/experiments/exp11_parallel_envs.py @@ -36,7 +36,7 @@ def make_env(track_id, port): def _init(): raw = gym.make(track_id, conf={'host': HOST, 'port': port}) env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN) - env = StuckTerminationWrapper(env, stuck_steps=80, min_displacement=0.5) + env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5) env = SpeedRewardWrapper(env) return env return _init diff --git a/agent/multitrack_runner.py b/agent/multitrack_runner.py index 4d840fe..a02de6e 100644 --- a/agent/multitrack_runner.py +++ b/agent/multitrack_runner.py @@ -177,7 +177,7 @@ class StuckTerminationWrapper(gym.Wrapper): def wrap_env(raw_env): """Apply standard wrappers: throttle clamp + stuck detection + speed reward.""" env = ThrottleClampWrapper(raw_env, throttle_min=THROTTLE_MIN) - env = StuckTerminationWrapper(env, stuck_steps=80, min_displacement=0.5) + env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5) env = SpeedRewardWrapper(env, speed_scale=SPEED_SCALE) return env diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index 566d925..c50e6c7 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -1,6 +1,6 @@ """ -Speed + Progress Reward Wrapper for DonkeyCar RL — v4 (Full Bypass) -==================================================================== +Speed + Progress Reward Wrapper for DonkeyCar RL — v6 (Speed×CTE + Efficiency Gate) +===================================================================================== REWARD HACKING HISTORY: v1 additive: speed × (1-cte/max_cte) → boundary oscillation @@ -8,9 +8,15 @@ REWARD HACKING HISTORY: v3 path efficiency: original × (1+speed×eff×scale) → still circling! WHY v3 failed: efficiency killed the SPEED BONUS but not the BASE reward. 
A spinning car at CTE≈0 still earns 1.0/step × thousands of steps. - - v4 (THIS VERSION): Completely bypass sim's reward. Multiply base reward by - efficiency so circling yields ZERO reward regardless of CTE. + v4: base × eff × (1 + speed_scale × speed) → zero gradient on hills! + WHY v4 failed on hills: speed≈0 AND eff≈0 AND cte_quality varies → all + three terms near zero simultaneously → no gradient to push ANY term up. + v5: speed × CTE_quality (no efficiency) → circular driving returns! + WHY v5 failed: dropped efficiency entirely. Circular driving at CTE≈0 + with speed>0 earns positive reward indefinitely. Observed in Exp 11. + v6 (THIS VERSION): v5 reward + efficiency GATE. + Keeps v5's gradient properties (non-zero gradient on hills) but adds + a binary efficiency check that zeros reward when car is circling. ROOT CAUSE OF CIRCLING: The sim's own calc_reward() uses `forward_vel` = dot(car_heading, velocity). @@ -18,24 +24,35 @@ ROOT CAUSE OF CIRCLING: so forward_vel > 0 always, giving positive reward while circling indefinitely. We bypass this entirely. -FORMULA (v4): - base = 1.0 - min(abs(cte) / max_cte, 1.0) # CTE quality [0,1] - eff = net_displacement / total_path_length # Forward progress [0,1] - shaped = base × eff × (1 + speed_scale × speed) # All three must be high +FORMULA (v6): + cte_quality = 1.0 - min(|cte| / max_cte, 1.0) # [0,1] centred=1 + speed_norm = min(speed / 10.0, 1.0) # [0,1] normalised + efficiency = net_displacement / total_path # [0,1] straight=1, circle=0 - On done/crash: shaped = -1.0 + if efficiency < min_efficiency: + reward = 0.0 # GATE: circling → zero reward (but not negative) + else: + reward = cte_quality × speed_norm # v5 formula (gradient on hills) + + On done/crash: reward = -1.0 + +WHY GATE NOT MULTIPLIER: + v4 used efficiency as a multiplier: reward = base × eff × speed_bonus. + On a hill: speed≈0, eff≈0, base≈0.5 → reward≈0 and ∂reward/∂speed≈0. + No gradient to push speed up — car stays stuck. 
+ + v6 gate: efficiency is either PASS or FAIL. When efficiency > threshold + (car moving forward at all), reward = speed × CTE_quality. On a hill: + car is stuck but still has eff > 0 (not literally circling), so the gate + passes and the reward = speed × CTE_quality. ∂reward/∂speed > 0 → gradient + pushes toward more throttle. Circle has eff ≈ 0 → gate fails → reward = 0. PROPERTIES: - - Spinning (eff≈0): shaped ≈ 0 (no reward) - - On track, slow (eff≈1): shaped ≈ base (CTE reward only) - - On track, fast (eff≈1): shaped > base (CTE + speed bonus) - - Off track (base≈0): shaped ≈ 0 (penalty via done) - - Cannot be gamed: ALL THREE terms must be high simultaneously - -RESEARCH NOTE (2026-04-13): - v3 was insufficient — circling at start gave 1.0/step × 47k steps = 47k reward. - v4 makes efficiency a multiplier on the entire reward, not just the speed bonus. - See docs/RESEARCH_LOG.md for full hacking history. + - Circling (eff < min_efficiency): reward = 0.0 (gate fails) + - Stuck on hill (eff ≥ min_efficiency): reward = speed × CTE (gradient toward unstuck) + - On track, fast: reward = high (speed + centred) + - Off track: reward ≈ 0 (CTE_quality → 0) + - Crash: reward = -1.0 """ import gymnasium as gym @@ -62,8 +79,8 @@ class SpeedRewardWrapper(gym.Wrapper): self, env, speed_scale: float = 0.1, - window_size: int = 60, # increased from 30 — catches slower circles - min_efficiency: float = 0.05, + window_size: int = 30, # captures 2+ full circles at typical circling speed + min_efficiency: float = 0.15, # gate threshold: circles ≈ 0.13, wobbly straight ≈ 0.98 max_cte: float = 8.0, min_lap_time: float = 5.0, # laps faster than this are penalised as exploits ): @@ -109,26 +126,36 @@ class SpeedRewardWrapper(gym.Wrapper): def _compute_reward_and_done(self, done: bool, info: dict): """ - v5: speed × CTE-quality reward.
- reward = speed × (1 - |cte| / max_cte) + reward = speed_norm × cte_quality (when efficiency >= threshold) + reward = 0.0 (when efficiency < threshold — circling) + reward = -1.0 (on crash/done) - Simpler than v4. Directly incentivises going FAST while staying - centred. On a hill: car slows → reward drops → clear gradient - signal to apply more throttle. v4's efficiency term gave zero - gradient when the car was stuck (all three terms collapsed to zero - simultaneously, so no direction to improve). + The efficiency gate prevents circular driving (eff≈0 for circles) + without killing gradient on hills (eff>0 for a stuck-but-not-circling + car, so the gate passes and speed×CTE gradient pushes toward unstuck). - Exploit protection (unchanged): - - Short-lap penalty: laps < min_lap_time → large negative reward - - StuckTerminationWrapper: done=True after 80 steps of <0.5m movement + Exploit protection: + - Efficiency gate: circles → reward = 0 + - Short-lap penalty: laps < min_lap_time → large negative + terminate + - StuckTerminationWrapper: done=True after stuck_steps of no movement - Crash: done=True → -1.0 """ + # Track position for efficiency calculation + try: + pos = info.get('pos', (0.0, 0.0, 0.0)) + pos_x = float(pos[0]) + pos_z = float(pos[2]) # z is forward in Unity coordinate system + self._pos_history.append(np.array([pos_x, pos_z])) + except (TypeError, ValueError, IndexError): + pass + # Crash / episode over if done: return -1.0, False - # --- Short-lap exploit detection (unchanged) --- + # --- Short-lap exploit detection --- try: current_lap_count = int(info.get('lap_count', 0) or 0) except (TypeError, ValueError): @@ -141,13 +168,16 @@ class SpeedRewardWrapper(gym.Wrapper): except (TypeError, ValueError): lap_time = 999.0 if lap_time < self.min_lap_time: - # Short-lap exploit: penalty AND terminate episode immediately. - # Penalty alone is insufficient — the model stays alive and - # keeps accumulating small rewards between laps. 
- # Termination removes that loophole completely. penalty = -10.0 * (self.min_lap_time / max(lap_time, 0.1)) return penalty, True # (reward, force_terminate) - # Legitimate lap — fall through to normal reward + + # --- Efficiency gate: detect circular driving --- + efficiency = self._compute_efficiency() + if efficiency < self.min_efficiency: + # Car is circling — zero reward but don't terminate. + # Zero (not negative) so there's no perverse incentive to crash + # early to avoid accumulating penalties. + return 0.0, False # --- CTE quality: how centred is the car? --- try: @@ -162,10 +192,7 @@ class SpeedRewardWrapper(gym.Wrapper): except (TypeError, ValueError): speed = 0.0 - # --- v5 reward: speed × CTE quality --- - # Fast + centred = high reward. Slow (hill) = low reward → gradient - # pushes policy toward higher throttle. Off-track = near-zero. - # Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s). + # --- v6 reward: speed × CTE quality (same as v5, but gated) --- speed_norm = min(speed / 10.0, 1.0) return cte_quality * speed_norm, False diff --git a/agent/run_eval.py b/agent/run_eval.py index c345665..4279bc3 100644 --- a/agent/run_eval.py +++ b/agent/run_eval.py @@ -56,7 +56,7 @@ log(f'Log file: {log_path}') def make_env(track_id, throttle_min): raw = gym.make(track_id) env = ThrottleClampWrapper(raw, throttle_min=throttle_min) - env = StuckTerminationWrapper(env, stuck_steps=80, min_displacement=0.5) + env = StuckTerminationWrapper(env, stuck_steps=40, min_displacement=0.5) env = SpeedRewardWrapper(env) return env diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py index a204558..2408866 100644 --- a/tests/test_reward_wrapper.py +++ b/tests/test_reward_wrapper.py @@ -69,20 +69,28 @@ def test_sim_reward_is_completely_ignored(): def test_circling_at_zero_cte_gives_near_zero_reward(): """ - v5: circling protection is handled by lap-time penalty + StuckTermination, - NOT by the reward formula. 
A circling car at CTE=0 with speed CAN earn - reward per step. This test verifies the formula works as designed: - reward = speed_norm * cte_quality. Circling is stopped by other mechanisms. + v6: circling (low efficiency) should yield zero reward via the efficiency gate. + After enough steps of circular motion, the efficiency drops below threshold + and the gate zeros the reward. """ env = MockEnv(speed=3.0, cte=0.0) - wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20) + wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=30, min_efficiency=0.15) wrapped.reset() - # At CTE=0 and speed=3, expected reward = (3/10) * 1.0 = 0.3 - _, r, _, _, _ = wrapped.step(0) - expected = (3.0 / 10.0) * 1.0 - assert abs(r - expected) < 0.05, ( - f"v5: reward at CTE=0, speed=3 should be ~{expected:.2f}, got {r:.4f}") + # Drive in a circle for enough steps to fill the position window + rewards = [] + for i in range(40): + angle = 2 * math.pi * i / 12 # completes circle every 12 steps + env.set_pos([0.5 * math.cos(angle), 0., 0.5 * math.sin(angle)]) + _, r, _, _, _ = wrapped.step(0) + rewards.append(r) + + # After 20+ steps of circular motion, efficiency gate should kick in + # Last few rewards should be 0.0 + assert rewards[-1] == 0.0, ( + f"v6: circular driving should yield 0.0 reward via efficiency gate, got {rewards[-1]:.4f}") + assert sum(1 for r in rewards[-5:] if r == 0.0) >= 3, ( + f"v6: most of last 5 rewards during circle should be 0.0, got {rewards[-5:]}") def test_forward_driving_earns_positive_reward(): @@ -97,23 +105,29 @@ def test_forward_driving_earns_positive_reward(): def test_forward_beats_circling_by_large_margin(): """ - v5: forward driving at moderate CTE should beat driving with high CTE. - The reward directly penalises being off-centre. + v6: forward driving earns positive reward; circular driving earns zero. + The efficiency gate ensures this gap. 
""" - # On track (CTE=1m) at speed=5 - env_on = MockEnv(speed=5.0, cte=1.0) - wrapped_on = SpeedRewardWrapper(env_on, speed_scale=0.1) - wrapped_on.reset() - _, r_on, _, _, _ = wrapped_on.step(0) + # Forward driving at CTE=1m, speed=5 + env_fwd = MockEnv(speed=5.0, cte=1.0) + wrapped_fwd = SpeedRewardWrapper(env_fwd, speed_scale=0.1, window_size=30) + wrapped_fwd.reset() + for i in range(35): + env_fwd.set_pos([i * 0.5, 0., 0.]) # straight line + _, r_fwd, _, _, _ = wrapped_fwd.step(0) - # Off track (CTE=7m) at same speed - env_off = MockEnv(speed=5.0, cte=7.0) - wrapped_off = SpeedRewardWrapper(env_off, speed_scale=0.1) - wrapped_off.reset() - _, r_off, _, _, _ = wrapped_off.step(0) + # Circular driving at CTE=0, speed=5 + env_circ = MockEnv(speed=5.0, cte=0.0) + wrapped_circ = SpeedRewardWrapper(env_circ, speed_scale=0.1, window_size=30) + wrapped_circ.reset() + for i in range(35): + angle = 2 * math.pi * i / 12 + env_circ.set_pos([0.5 * math.cos(angle), 0., 0.5 * math.sin(angle)]) + _, r_circ, _, _, _ = wrapped_circ.step(0) - assert r_on > r_off * 3, ( - f"On-track ({r_on:.2f}) should beat off-track ({r_off:.2f}) by 3x") + assert r_fwd > 0, f"Forward driving should earn positive reward, got {r_fwd}" + assert r_circ == 0.0, f"Circular driving should earn 0 reward, got {r_circ}" + assert r_fwd > r_circ, f"Forward ({r_fwd:.3f}) must beat circling ({r_circ:.3f})" def test_crash_gives_negative_reward():