diff --git a/agent/outerloop-results/autoresearch_phase2_log.txt b/agent/outerloop-results/autoresearch_phase2_log.txt
index ae9fc81..06845fb 100644
--- a/agent/outerloop-results/autoresearch_phase2_log.txt
+++ b/agent/outerloop-results/autoresearch_phase2_log.txt
@@ -762,3 +762,29 @@
 [2026-04-17 22:10:12] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
 [2026-04-17 22:10:12] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
 [2026-04-17 22:10:12] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-18 10:41:08] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-18 10:41:08] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
+[2026-04-18 10:41:08] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
+[2026-04-18 10:41:08] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
+[2026-04-18 10:41:08] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
+[2026-04-18 10:41:08] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-18 10:41:08] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-18 10:41:08] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-18 10:41:59] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-18 10:41:59] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
+[2026-04-18 10:41:59] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
+[2026-04-18 10:41:59] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
+[2026-04-18 10:41:59] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
+[2026-04-18 10:41:59] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-18 10:41:59] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-18 10:41:59] [AutoResearch] Only 1 results — using random proposal.
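Note: the UCB scores in the log above are consistent with the standard GP-UCB acquisition
ucb = mu + kappa * sigma with kappa = 2.0 (e.g. 0.3981 + 2.0 * 0.9563 = 2.3107).
The sketch below shows how such candidates could be ranked with scikit-learn's
GaussianProcessRegressor. Only kappa is inferred from the log; the observed trials,
search bounds, and candidate sampler are illustrative assumptions, not the tool's
actual implementation.

# Sketch: rank hyperparameter candidates by GP-UCB (mu + kappa * sigma).
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

KAPPA = 2.0  # inferred from the logged UCB/mu/sigma triples above
rng = np.random.default_rng(0)

# Observed trials (placeholder values): columns are n_steer, n_throttle,
# learning_rate, timesteps; targets are normalised mean rewards.
X_obs = np.array([[5, 2, 1.0e-3, 20000],
                  [7, 3, 3.0e-4, 25000],
                  [8, 4, 1.0e-4, 15000]], dtype=float)
y_obs = np.array([0.50, 0.80, 0.75])

gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
gp.fit(X_obs, y_obs)  # in practice inputs are usually rescaled to [0, 1] first

# Sample candidates uniformly from assumed search bounds.
n_cand = 1000
cand = np.column_stack([
    rng.integers(2, 10, n_cand),             # n_steer
    rng.integers(2, 5, n_cand),              # n_throttle
    10.0 ** rng.uniform(-4, -2.5, n_cand),   # learning_rate (log-uniform)
    rng.integers(10_000, 30_000, n_cand),    # timesteps
]).astype(float)

mu, sigma = gp.predict(cand, return_std=True)
ucb = mu + KAPPA * sigma
for i in np.argsort(ucb)[::-1][:5]:  # top-5, matching the log format
    print(f'UCB={ucb[i]:.4f} mu={mu[i]:.4f} sigma={sigma[i]:.4f}')

High sigma dominates mu at kappa = 2.0, which is why the top candidates above are
mostly unexplored corners of the space rather than near the current champion.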
diff --git a/agent/outerloop-results/autoresearch_phase3_log.txt b/agent/outerloop-results/autoresearch_phase3_log.txt
index 9b611bd..09e407e 100644
--- a/agent/outerloop-results/autoresearch_phase3_log.txt
+++ b/agent/outerloop-results/autoresearch_phase3_log.txt
@@ -405,3 +405,13 @@
 [2026-04-17 22:10:26] [Wave3] Only 0 results — using random proposal.
 [2026-04-17 22:10:26] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
 [2026-04-17 22:10:26] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
+[2026-04-18 10:41:19] [Wave3] Seed trial 1/2: using hardcoded params.
+[2026-04-18 10:41:19] [Wave3] Seed trial 2/2: using hardcoded params.
+[2026-04-18 10:41:19] [Wave3] Only 0 results — using random proposal.
+[2026-04-18 10:41:19] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
+[2026-04-18 10:41:19] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
+[2026-04-18 10:42:10] [Wave3] Seed trial 1/2: using hardcoded params.
+[2026-04-18 10:42:10] [Wave3] Seed trial 2/2: using hardcoded params.
+[2026-04-18 10:42:10] [Wave3] Only 0 results — using random proposal.
+[2026-04-18 10:42:10] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
+[2026-04-18 10:42:10] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py
index 9513072..566d925 100644
--- a/agent/reward_wrapper.py
+++ b/agent/reward_wrapper.py
@@ -97,14 +97,17 @@ class SpeedRewardWrapper(gym.Wrapper):
             raise ValueError(f'Unexpected step() result length: {len(result)}')
 
         # Completely ignore _sim_reward — compute our own
-        shaped = self._compute_reward(done, info)
+        shaped, force_terminate = self._compute_reward_and_done(done, info)
+        if force_terminate:
+            terminated = True
+            done = True
 
         if len(result) == 5:
             return obs, shaped, terminated, truncated, info
         else:
             return obs, shaped, done, info
 
-    def _compute_reward(self, done: bool, info: dict) -> float:
+    def _compute_reward_and_done(self, done: bool, info: dict):
         """
         v5: speed × CTE-quality reward.
 
@@ -123,7 +126,7 @@
         """
         # Crash / episode over
         if done:
-            return -1.0
+            return -1.0, False
 
         # --- Short-lap exploit detection (unchanged) ---
         try:
@@ -138,7 +141,12 @@
         except (TypeError, ValueError):
             lap_time = 999.0
         if lap_time < self.min_lap_time:
-            return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
+            # Short-lap exploit: penalty AND terminate episode immediately.
+            # Penalty alone is insufficient — the model stays alive and
+            # keeps accumulating small rewards between laps.
+            # Termination removes that loophole completely.
+            penalty = -10.0 * (self.min_lap_time / max(lap_time, 0.1))
+            return penalty, True  # (reward, force_terminate)
         # Legitimate lap — fall through to normal reward
 
         # --- CTE quality: how centred is the car? ---
@@ -159,7 +167,7 @@
         # pushes policy toward higher throttle. Off-track = near-zero.
         # Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s).
         speed_norm = min(speed / 10.0, 1.0)
-        return cte_quality * speed_norm
+        return cte_quality * speed_norm, False
 
     def _compute_efficiency(self) -> float:
         """Path efficiency = net_displacement / total_path_length."""
diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py
index 70fad19..a204558 100644
--- a/tests/test_reward_wrapper.py
+++ b/tests/test_reward_wrapper.py
@@ -224,7 +224,7 @@ def test_short_lap_triggers_penalty():
     # Simulate step where a new lap completes in 1 second (exploit)
     info = {'cte': 0.0, 'speed': 3.0, 'pos': (0.0, 0.0, 0.0),
             'lap_count': 1, 'last_lap_time': 1.0}
-    reward = wrapper._compute_reward(done=False, info=info)
+    reward, _ = wrapper._compute_reward_and_done(done=False, info=info)
 
     assert reward < 0, f'Short lap (1s) should penalise, got reward={reward}'
     assert reward <= -10.0, f'Short lap penalty should be large (<= -10), got {reward}'
@@ -240,12 +240,12 @@ def test_legitimate_lap_not_penalised():
 
     # First step — no lap yet
     info_no_lap = {'cte': 0.0, 'speed': 3.0, 'pos': (0.0, 0.0, 0.0),
                    'lap_count': 0, 'last_lap_time': 0.0}
-    wrapper._compute_reward(done=False, info=info_no_lap)
+    wrapper._compute_reward_and_done(done=False, info=info_no_lap)
 
     # Legitimate lap at 12 seconds
     info = {'cte': 0.2, 'speed': 3.0, 'pos': (1.0, 0.0, 0.0),
             'lap_count': 1, 'last_lap_time': 12.0}
-    reward = wrapper._compute_reward(done=False, info=info)
+    reward, _ = wrapper._compute_reward_and_done(done=False, info=info)
 
     assert reward >= 0, f'Legitimate lap (12s) should not be penalised, got {reward}'
@@ -260,13 +260,13 @@
 
     # Short lap fires on step where lap_count increments
     info_lap = {'cte': 0.0, 'speed': 3.0, 'pos': (0.0, 0.0, 0.0),
                 'lap_count': 1, 'last_lap_time': 1.5}
-    r1 = wrapper._compute_reward(done=False, info=info_lap)
+    r1, _ = wrapper._compute_reward_and_done(done=False, info=info_lap)
     assert r1 < 0
 
     # Next step same lap_count — should get normal reward, not another penalty
    info_next = {'cte': 0.0, 'speed': 3.0, 'pos': (0.1, 0.0, 0.0),
                 'lap_count': 1, 'last_lap_time': 1.5}
-    r2 = wrapper._compute_reward(done=False, info=info_next)
+    r2, _ = wrapper._compute_reward_and_done(done=False, info=info_next)
 
     assert r2 >= 0, f'Penalty should not repeat on same lap_count, got r2={r2}'
@@ -279,7 +279,7 @@
     # Complete a short lap
     info_lap = {'cte': 0.0, 'speed': 3.0, 'pos': (0.0, 0.0, 0.0),
                 'lap_count': 1, 'last_lap_time': 1.0}
-    wrapper._compute_reward(done=False, info=info_lap)
+    wrapper._compute_reward_and_done(done=False, info=info_lap)
     assert wrapper._last_lap_count == 1
 
     # Reset episode — counter must go back to 0
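The updated tests exercise _compute_reward_and_done directly, but the behavioural
change lives in step(): the force_terminate flag must surface as terminated=True.
Below is a minimal sketch of an end-to-end check through the public API, assuming
a Gymnasium 5-tuple env, that SpeedRewardWrapper can be constructed from the env
alone, and that its default min_lap_time exceeds one second; the stub env and test
name are hypothetical, not part of this patch.

# Sketch: a short lap should now both penalise AND terminate the episode.
import gymnasium as gym
import numpy as np

from agent.reward_wrapper import SpeedRewardWrapper


class ShortLapStubEnv(gym.Env):
    """Hypothetical env that reports a 1-second lap (the exploit) on every step."""

    observation_space = gym.spaces.Box(-1.0, 1.0, shape=(4,), dtype=np.float32)
    action_space = gym.spaces.Discrete(3)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return np.zeros(4, dtype=np.float32), {}

    def step(self, action):
        info = {'cte': 0.0, 'speed': 3.0, 'pos': (0.0, 0.0, 0.0),
                'lap_count': 1, 'last_lap_time': 1.0}
        return np.zeros(4, dtype=np.float32), 0.0, False, False, info


def test_short_lap_terminates_episode():
    wrapper = SpeedRewardWrapper(ShortLapStubEnv())
    wrapper.reset()
    obs, reward, terminated, truncated, info = wrapper.step(0)
    assert reward <= -10.0, f'expected large short-lap penalty, got {reward}'
    assert terminated, 'short-lap exploit should force-terminate the episode'

Before this patch the second assertion would fail: the penalty was returned but
terminated stayed False, so the policy could keep stepping and re-accumulating
small positive rewards between laps, which is exactly the loophole the diff's
comment describes.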