diff --git a/agent/analysis_circular_driving.py b/agent/analysis_circular_driving.py new file mode 100644 index 0000000..7ce2769 --- /dev/null +++ b/agent/analysis_circular_driving.py @@ -0,0 +1,46 @@ +""" +== DATA ANALYSIS: Circular Driving Detection (2026-04-13) == + +FINDINGS from Phase 1 data (autoresearch_results_phase1.jsonl): + +Trial mean_rwd std rps cv% verdict + 1 270.56 0.143 0.086 0.1% ⚠️ LOW STD suspicious — possibly circling + 4 627.69 2.35 0.147 0.4% OK — low variance, moderate reward + 5 4582.80 0.485 0.957 0.0% 🚨 CIRCULAR — 74% of theoretical max, cv=0.0% + 6 454.06 2.73 0.092 0.6% OK — consistent, plausible + 10 682.74 420.91 0.153 61.7% ⚠️ UNSTABLE — extremely high variance + 11 404.52 14.47 0.084 3.6% OK — reasonable variance + +KEY SIGNATURES OF CIRCULAR DRIVING: +1. cv (coefficient of variation) < 1% with mean_reward > 200 → very CONSISTENT circling (trials 4 and 6 also fall under 1%, but their reward/step is far below trial 5's; read cv together with reward/step) + - Trial 5: cv=0.0%, mean=4582 → textbook circular motion + - Trial 1: cv=0.1%, mean=270 → likely also circling but slower + +2. reward/step approaching theoretical max → car is getting near-optimal reward continuously + - Trial 5: 0.957/step ≈ 74% of max (speed≈3 m/s) → sustained on-track fast motion + - This is achievable by circling at the starting line! + +3. User visual confirmation → car going left in circles at starting position + +WHY OUR REWARD WRAPPER v2 STILL ALLOWS CIRCLING: + The v2 fix correctly closed the ADDITIVE formula's exploit (speed × f(cte)). + The MULTIPLICATIVE formula prevents off-track hacking. + BUT: a car circling ON-TRACK still gets the full speed bonus! + - Car circles at start (CTE ≈ 0) → original_reward > 0 + - Car has speed 3 → shaped = 1.0 × (1 + 0.1 × 3) = 1.3/step + - Over 4787 steps: max = 6223, actual = 4582 → 74% efficiency (car is on track most of the time!) + +THE FUNDAMENTAL PROBLEM: + Neither CTE nor speed can distinguish FORWARD driving from CIRCULAR driving. + Both have: low CTE (car is centered), positive speed (car is moving). + + We need a reward component that is ZERO for circular motion and POSITIVE for forward progress.
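+
+BACK-OF-ENVELOPE (illustrative geometry, not measured data; see SOLUTION below):
+  Over a 20-step window with ~1 m travelled per step (path_length = 20 m):
+  - straight line: net_displacement = 20 m → efficiency = 20/20 = 1.0
+  - half circle of radius r: net_displacement = 2r, path = πr → efficiency = 2/π ≈ 0.64
+  - full circle: net_displacement ≈ 0 → efficiency ≈ 0.0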
+ +SOLUTION: Path Efficiency Reward + efficiency = net_displacement / path_length (over sliding window) + - Forward driving: efficiency ≈ 1.0 (all movement is productive) + - Circular driving: efficiency ≈ 0.0 (lots of movement, no net advance) + - Shaped reward: original × (1 + speed_scale × speed × efficiency) +""" + +print(__doc__) diff --git a/agent/outerloop-results/autoresearch_phase1_log.txt b/agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_circular_driving.txt similarity index 76% rename from agent/outerloop-results/autoresearch_phase1_log.txt rename to agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_circular_driving.txt index 3c15f24..64278e2 100644 --- a/agent/outerloop-results/autoresearch_phase1_log.txt +++ b/agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_circular_driving.txt @@ -220,3 +220,69 @@ [2026-04-13 13:11:06] mean_reward=627.6915 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0009549126527603771, 'timesteps': 4279, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} [2026-04-13 13:11:06] mean_reward=454.0640 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0005165618383365869, 'timesteps': 4929, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} [2026-04-13 13:11:06] mean_reward=306.1739 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0003097316245852375, 'timesteps': 4938, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:11:07] [AutoResearch] Git push complete after trial 10 +[2026-04-13 13:11:09] +[AutoResearch] ========== Trial 11/50 ========== +[2026-04-13 13:11:09] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 13:11:09] UCB=2.7195 mu=2.5127 sigma=0.1034 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.000557522373554661, 'timesteps': 4805} +[2026-04-13 13:11:09] UCB=2.5925 mu=1.9024 sigma=0.3451 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.00041865775623053806, 'timesteps': 4329} +[2026-04-13 13:11:09] UCB=2.5803 mu=1.1875 sigma=0.6964 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.00058177865639138, 'timesteps': 4419} +[2026-04-13 13:11:09] UCB=2.4298 mu=2.0749 sigma=0.1775 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0009718592685897328, 'timesteps': 4382} +[2026-04-13 13:11:09] UCB=2.2735 mu=1.8243 sigma=0.2246 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0010522226184685407, 'timesteps': 4546} +[2026-04-13 13:11:09] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.000557522373554661, 'timesteps': 4805, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:11:11] [AutoResearch] Launching trial 11: {'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.000557522373554661, 'timesteps': 4805, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:25] [AutoResearch] Trial 11 finished in 313.9s, returncode=0 +[2026-04-13 13:16:25] [AutoResearch] Trial 11: mean_reward=404.5225 std_reward=14.4655 +[2026-04-13 13:16:25] [AutoResearch] === Trial 11 Summary === +[2026-04-13 13:16:25] Total Phase 1 runs: 11 +[2026-04-13 13:16:25] Champion: trial=5 mean_reward=4582.7984 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0006801262090358742, 'timesteps': 4787, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:25] Top 5: +[2026-04-13 13:16:25] mean_reward=4582.7984 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0006801262090358742, 'timesteps': 4787, 'agent': 'ppo', 'eval_episodes': 3, 
'reward_shaping': True} +[2026-04-13 13:16:25] mean_reward=682.7352 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0010464507674264373, 'timesteps': 4450, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:25] mean_reward=627.6915 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0009549126527603771, 'timesteps': 4279, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:25] mean_reward=454.0640 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0005165618383365869, 'timesteps': 4929, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:25] mean_reward=404.5225 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.000557522373554661, 'timesteps': 4805, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:27] +[AutoResearch] ========== Trial 12/50 ========== +[2026-04-13 13:16:27] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 13:16:27] UCB=13.7452 mu=12.5336 sigma=0.6058 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0020405598509922246, 'timesteps': 4862} +[2026-04-13 13:16:27] UCB=10.6142 mu=10.0669 sigma=0.2737 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0015753222508456746, 'timesteps': 4690} +[2026-04-13 13:16:27] UCB=10.1293 mu=9.1255 sigma=0.5019 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0019449351233484984, 'timesteps': 4583} +[2026-04-13 13:16:27] UCB=9.8667 mu=8.7033 sigma=0.5817 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.001937955818890541, 'timesteps': 4781} +[2026-04-13 13:16:27] UCB=8.4705 mu=6.9561 sigma=0.7572 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.00246225593347489, 'timesteps': 4601} +[2026-04-13 13:16:27] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0020405598509922246, 'timesteps': 4862, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:16:29] [AutoResearch] Launching trial 12: {'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0020405598509922246, 'timesteps': 4862, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] [AutoResearch] Trial 12 finished in 440.6s, returncode=0 +[2026-04-13 13:23:50] [AutoResearch] Trial 12: mean_reward=14.6215 std_reward=0.0161 +[2026-04-13 13:23:50] [AutoResearch] === Trial 12 Summary === +[2026-04-13 13:23:50] Total Phase 1 runs: 12 +[2026-04-13 13:23:50] Champion: trial=5 mean_reward=4582.7984 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0006801262090358742, 'timesteps': 4787, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] Top 5: +[2026-04-13 13:23:50] mean_reward=4582.7984 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0006801262090358742, 'timesteps': 4787, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] mean_reward=682.7352 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0010464507674264373, 'timesteps': 4450, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] mean_reward=627.6915 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0009549126527603771, 'timesteps': 4279, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] mean_reward=454.0640 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0005165618383365869, 'timesteps': 4929, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:50] mean_reward=404.5225 params={'n_steer': 7, 
'n_throttle': 3, 'learning_rate': 0.000557522373554661, 'timesteps': 4805, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:52] +[AutoResearch] ========== Trial 13/50 ========== +[2026-04-13 13:23:52] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 13:23:52] UCB=7.4556 mu=6.6123 sigma=0.4217 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.00163041176468028, 'timesteps': 4836} +[2026-04-13 13:23:52] UCB=7.1150 mu=6.5952 sigma=0.2599 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0011607060392442735, 'timesteps': 4643} +[2026-04-13 13:23:52] UCB=4.9263 mu=4.0036 sigma=0.4613 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0015871232867074373, 'timesteps': 4489} +[2026-04-13 13:23:52] UCB=3.6250 mu=1.9044 sigma=0.8603 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0006337063098992063, 'timesteps': 1815} +[2026-04-13 13:23:52] UCB=3.3082 mu=1.7605 sigma=0.7739 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.00018730022865904181, 'timesteps': 2136} +[2026-04-13 13:23:52] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.00163041176468028, 'timesteps': 4836, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:23:54] [AutoResearch] Launching trial 13: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.00163041176468028, 'timesteps': 4836, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 13:35:25] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 13:35:25] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} +[2026-04-13 13:35:25] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} +[2026-04-13 13:35:25] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} +[2026-04-13 13:35:25] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} +[2026-04-13 13:35:25] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 13:35:25] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 13:35:25] [AutoResearch] Only 1 results — using random proposal. 
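A minimal sketch (not part of this commit) of the cv%-based hacking flag proposed in the analysis above, assuming only the JSONL schema shown in the next file (keys: trial, mean_reward, std_reward):

```python
import json

def flag_suspicious_trials(jsonl_path, cv_threshold=0.01, min_mean_reward=200.0):
    """Yield (trial, mean_reward, cv) for high-reward trials that are suspiciously consistent."""
    with open(jsonl_path) as f:
        for line in f:
            rec = json.loads(line)
            mean, std = rec["mean_reward"], rec["std_reward"]
            # Low cv (std/mean) at high reward is the circling signature
            if mean > min_mean_reward and std / mean < cv_threshold:
                yield rec["trial"], mean, std / mean

if __name__ == "__main__":
    path = "agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_circular_driving.jsonl"
    for trial, mean, cv in flag_suspicious_trials(path):
        print(f"trial {trial}: mean_reward={mean:.2f} cv={100 * cv:.2f}% -> possible circling")
```

On the Phase 1 numbers this flags trials 1 and 5, but also trials 4 and 6 (cv 0.4% and 0.6%), which is exactly why cv must be read together with reward/step.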
diff --git a/agent/outerloop-results/autoresearch_results_phase1.jsonl b/agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_circular_driving.jsonl similarity index 83% rename from agent/outerloop-results/autoresearch_results_phase1.jsonl rename to agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_circular_driving.jsonl index 87b03d7..de276bc 100644 --- a/agent/outerloop-results/autoresearch_results_phase1.jsonl +++ b/agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_circular_driving.jsonl @@ -8,3 +8,5 @@ {"trial": 8, "timestamp": "2026-04-13T13:01:28.616838", "params": {"n_steer": 8, "n_throttle": 3, "learning_rate": 0.0003097316245852375, "timesteps": 4938, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 306.1739, "std_reward": 13.6044, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0008/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 303.6810266971588, "reward_hacking_suspected": false} {"trial": 9, "timestamp": "2026-04-13T13:05:16.112705", "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.0014813539623020004, "timesteps": 4054, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 15.5625, "std_reward": 0.0011, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0009/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 223.47979998588562, "reward_hacking_suspected": false} {"trial": 10, "timestamp": "2026-04-13T13:11:06.106880", "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.0010464507674264373, "timesteps": 4450, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 682.7352, "std_reward": 420.9113, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0010/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 345.9794178009033, "reward_hacking_suspected": false} +{"trial": 11, "timestamp": "2026-04-13T13:16:25.498543", "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.000557522373554661, "timesteps": 4805, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 404.5225, "std_reward": 14.4655, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0011/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 313.93063950538635, "reward_hacking_suspected": false} +{"trial": 12, "timestamp": "2026-04-13T13:23:50.091027", "params": {"n_steer": 6, "n_throttle": 3, "learning_rate": 0.0020405598509922246, "timesteps": 4862, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 14.6215, "std_reward": 0.0161, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0012/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 440.5779378414154, "reward_hacking_suspected": false} diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index 6fc0fd7..daa22c6 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -1,56 +1,76 @@ """ -Speed-Aware Reward Wrapper for DonkeyCar RL — v2 (Hack-Proof) -============================================================== +Progress-Based Reward Wrapper for DonkeyCar RL — v3 (Anti-Circular) +==================================================================== -DESIGN PRINCIPLE: Speed should only be rewarded when the car is -genuinely progressing down the track. The original DonkeyCar reward -already correctly signals track presence — we build on top of it. 
+PROBLEM HISTORY: + v1 (additive): speed × (1 - cte/max_cte) + → Hacked by oscillating at track boundary (trials 8+13 in corrupted data) + + v2 (multiplicative): original × (1 + speed_scale × speed) + → Still hacked by circling ON the track (trial 5: cv=0.0%, 4582 reward) + → Circular motion has low CTE + positive speed → full speed bonus + → Neither CTE nor raw speed can distinguish forward vs circular motion + + v3 (path efficiency): original × (1 + speed_scale × speed × path_efficiency) + → Path efficiency = net_displacement / path_length over sliding window + → Forward driving: efficiency ≈ 1.0 (all movement is productive) + → Circular driving: efficiency ≈ 0.0 (movement cancels out, no net advance) + → Speed bonus disappears when circling → car incentivized to go FORWARD FORMULA: - if original_reward > 0 (car is on track and centered): - shaped = original_reward × (1 + speed_scale × speed) - else (car is off track / crashed): - shaped = original_reward (no speed bonus — cannot be hacked) + efficiency = |pos_t - pos_{t-window}| / Σ|pos_i - pos_{i-1}| + = net_displacement / total_path_length -WHY THIS IS HACK-PROOF: - The previous formula (speed × (1 - cte/max_cte)) could be maximized - by oscillating at the track boundary — the model learned this in practice. + shaped_reward = original_reward × (1 + speed_scale × speed × efficiency) - The multiplicative formula is bounded by the original DonkeyCar reward: - - Off track → original_reward ≤ 0 → no speed multiplier possible - - The model CANNOT increase reward by going fast off-track - - Speed bonus only accumulates when genuinely driving on the track + (when original_reward ≤ 0: no bonus, just penalty — same as v2) RESEARCH NOTE (2026-04-13): - The additive formula caused reward hacking in Phase 1 — trials 8 and 13 - achieved mean_reward=1936 and 1139 respectively by oscillating at the - track boundary. This design was developed to prevent that exploit. - See docs/RESEARCH_LOG.md for full details. + Circular driving was discovered in Phase 1 despite the v2 fix. + Trial 5: mean_reward=4582, cv=0.0% over 4787 steps. + User visually confirmed: car circling at start line. + See docs/RESEARCH_LOG.md for full analysis. TUNING: - speed_scale=0.1 means a car going 5 m/s gets a 50% bonus on top of - the base CTE reward. This is a meaningful but not overwhelming incentive. - Increase to 0.3+ to prioritize speed more aggressively (Phase 3). + window_size: how many steps to measure efficiency over (default 30) + - Too small: noisy, sensitive to brief oscillations + - Too large: slow to detect circling, may miss short circular segments + speed_scale: speed bonus multiplier (default 0.1) + min_efficiency: efficiency floor below which the speed bonus is zero (default 0.05) """ import gymnasium as gym import numpy as np +from collections import deque class SpeedRewardWrapper(gym.Wrapper): """ - Hack-proof speed reward: multiplicative bonus ONLY when on track. + Path-efficiency-gated speed reward. + The speed bonus scales with the fraction of the car's recent motion that is NET FORWARD PROGRESS.
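+
+    Usage (sketch; ``make_donkey_env`` is a hypothetical factory for the simulator env):
+
+        env = SpeedRewardWrapper(make_donkey_env(), speed_scale=0.1, window_size=30)
+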
Args: env: gymnasium environment - speed_scale: multiplier for speed bonus (default 0.1) - shaped = original × (1 + speed_scale × speed) when on track - shaped = original when off track + speed_scale: speed bonus multiplier (default 0.1) + window_size: number of steps for efficiency measurement (default 30) + min_efficiency: efficiency floor below which speed bonus is zero (default 0.05) """ - def __init__(self, env, speed_scale: float = 0.1): + def __init__(self, env, speed_scale: float = 0.1, window_size: int = 30, min_efficiency: float = 0.05): super().__init__(env) self.speed_scale = speed_scale + self.window_size = window_size + self.min_efficiency = min_efficiency + + # Sliding window of positions for efficiency calculation + self._pos_history = deque(maxlen=window_size + 1) + + def reset(self, **kwargs): + result = self.env.reset(**kwargs) + self._pos_history.clear() + return result def step(self, action): result = self.env.step(action) @@ -73,30 +93,68 @@ class SpeedRewardWrapper(gym.Wrapper): else: return obs, shaped, done, info + def _get_pos(self, info: dict): + """Extract position from info dict. Returns None if unavailable.""" + pos = info.get('pos', None) + if pos is None: + return None + try: + return np.array(pos[:3], dtype=np.float64) + except (TypeError, IndexError, ValueError): + return None + + def _compute_efficiency(self) -> float: + """ + Compute path efficiency = net displacement / total path length over window. + Returns 1.0 if insufficient history (can't penalize yet). + Returns 1.0 if the car hasn't moved (a stationary car earns no speed bonus anyway). + """ + if len(self._pos_history) < 3: + return 1.0 # Not enough history, give benefit of doubt + + positions = list(self._pos_history) + + # Net displacement: straight-line distance from oldest to newest position + net_displacement = np.linalg.norm(positions[-1] - positions[0]) + + # Total path length: sum of step-by-step distances + total_path = sum( + np.linalg.norm(positions[i+1] - positions[i]) + for i in range(len(positions) - 1) + ) + + if total_path < 1e-6: + return 1.0 # Car not moving at all, don't penalize (will be caught by health check) + + return float(net_displacement / total_path) + def _shape_reward(self, original_reward: float, info: dict) -> float: - """ - Multiplicative speed bonus — only when on track. - Falls back gracefully if speed not in info dict.
- """ + """Apply path-efficiency-gated speed bonus.""" + # Update position history + pos = self._get_pos(info) + if pos is not None: + self._pos_history.append(pos) + # Only apply speed bonus when genuinely on track (positive CTE reward) if original_reward <= 0: return original_reward # Off track / crashed — no speed reward - # Extract speed from info dict + # Extract speed try: - speed = float(info.get('speed', 0.0)) - if speed is None: - return original_reward - speed = max(0.0, speed) # No negative speed bonus + speed = max(0.0, float(info.get('speed', 0.0) or 0.0)) except (TypeError, ValueError): - return original_reward # Graceful fallback + return original_reward - # Multiplicative bonus: reward grows with speed, but only on track - # Hack-proof: cannot increase by going fast off-track - shaped = original_reward * (1.0 + self.speed_scale * speed) + # Compute path efficiency (detects circular motion) + efficiency = self._compute_efficiency() + + # Clamp efficiency: below min_efficiency, no speed bonus + effective_efficiency = max(0.0, (efficiency - self.min_efficiency) / (1.0 - self.min_efficiency)) + + # Multiplicative bonus: fast forward progress → full bonus, circling → zero bonus + shaped = original_reward * (1.0 + self.speed_scale * speed * effective_efficiency) return shaped def theoretical_max_per_step(self, max_speed: float = 10.0) -> float: - """Returns the theoretical max reward per step for bounds checking.""" - # original_reward ≤ 1.0, so shaped ≤ 1.0 × (1 + speed_scale × max_speed) - return 1.0 * (1.0 + self.speed_scale * max_speed) + """Upper bound on reward per step (for hack detection calibration).""" + return 1.0 * (1.0 + self.speed_scale * max_speed * 1.0) # efficiency=1 at best diff --git a/docs/RESEARCH_LOG.md b/docs/RESEARCH_LOG.md index 6635b19..d4e9676 100644 --- a/docs/RESEARCH_LOG.md +++ b/docs/RESEARCH_LOG.md @@ -180,3 +180,70 @@ From this experience, we derived the following principles for DonkeyCar RL rewar 3. **Does the multiplicative reward fix change the optimal hyperparameter region?** Re-run autoresearch with fixed reward and compare top configurations. 4. **Can we detect reward hacking automatically?** A reward-per-step threshold (e.g., flag if mean > 2.0 per step) could auto-detect hacking during training. 5. **What does a genuinely good reward look like?** After completing Phase 1 cleanly, characterize the reward distribution of a car that drives one full lap. + +--- + +## 2026-04-13 — Critical Discovery: Circular Driving Exploit (v2 Reward Still Hackable) + +### Finding: Car Learns to Circle at Starting Line + +**User observation (direct visual):** "The model found a way to rig the reward by going left in circles — it was off the track and then back on track, but detected as failure. Model uses this as best way to maximize reward." 
+ +**Data confirmation:** + +| Trial | mean_reward | std_reward | cv% | r/step | verdict | +|-------|-------------|------------|-------|--------|---------| +| 1 | 270.56 | 0.143 | 0.1% | 0.086 | ⚠️ CIRCULAR (suspiciously low std) | +| 5 | **4582.80** | **0.485** | **0.0%** | **0.957** | 🚨 CIRCULAR (confirmed) | +| 10 | 682.74 | 420.91 | 61.7% | 0.153 | ⚠️ UNSTABLE (sometimes circles, sometimes crashes) | + +**Statistical signature of circular motion:** +- cv (coefficient of variation = std/mean) < 1% with high reward → very consistent behavior +- Circular driving IS very consistent: every circle is the same +- Legitimate driving is stochastic: different obstacles, curves, luck +- Trial 5: cv=0.0% over 3 eval episodes → textbook circling + +**Why v2 reward still allowed this:** +- v2 fix: `reward = original × (1 + speed_scale × speed)` ONLY when on track +- Car circling at the starting line HAS: low CTE (on track centerline) + positive speed +- Result: full speed bonus for circling → 4582 reward over 4787 steps +- CTE and raw speed cannot distinguish forward from circular motion + +### Root Cause: Missing Dimension — Track Progress + +The fundamental issue: **neither CTE nor speed captures PROGRESS along the track.** +- CTE measures: am I near the centerline? (yes for circles) +- Speed measures: am I moving? (yes for circles) +- Progress measures: am I getting anywhere new? (NO for circles) + +### Fix: Path Efficiency Reward (v3) + +**Formula:** +``` +efficiency = net_displacement / total_path_length (over sliding window of 30 steps) +shaped_reward = original_reward × (1 + speed_scale × speed × efficiency) +``` + +**Why this works:** +- Forward driving: `efficiency ≈ 1.0` (all movement is productive) +- Circular driving: `efficiency ≈ 0.0` (lots of steps, car returns to start position) +- The speed bonus disappears when circling → car incentivized to go FORWARD + +**Proof (tests):** +- `test_efficiency_near_zero_for_circular_driving`: efficiency < 0.2 after full circle +- `test_efficiency_near_one_for_straight_driving`: efficiency > 0.90 for straight line +- `test_straight_driving_gets_higher_reward_than_circular`: key guarantee test + +**Data archived:** +- `autoresearch_results_phase1_CORRUPTED_circular_driving.jsonl` (12 records, circular) +- `models/ARCHIVED_circular_driving/` (trial-0001 through trial-0013) + +### Lesson: cv% is a Reward Hacking Indicator + +| cv% | Interpretation | +|------|----------------| +| < 1% + high reward | Likely reward hacking (very consistent exploit) | +| 1-10% | Normal RL variance | +| > 50% | Unstable policy, inconsistent behavior | + +This metric will be added to the autoresearch result logging and summary. diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py index c9b0e13..f6eca8a 100644 --- a/tests/test_reward_wrapper.py +++ b/tests/test_reward_wrapper.py @@ -1,185 +1,240 @@ """ -Tests for reward_wrapper.py v2 (hack-proof multiplicative formula) — no simulator required. +Tests for reward_wrapper.py v3 (path efficiency / anti-circular) — no simulator required. 
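+
+Positions are injected via a mock env's info dict; the geometry helpers below simulate
+straight-line and circular paths to exercise the path-efficiency gate.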
""" import sys import os +import math import pytest import numpy as np import gymnasium as gym +from collections import deque sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'agent')) from reward_wrapper import SpeedRewardWrapper -class MockStepEnv(gym.Env): - """Mock gymnasium.Env for testing SpeedRewardWrapper.""" - metadata = {'render_modes': []} +def make_env_with_pos(speed=2.0, original_reward=1.0, done=False, pos=(0.0, 0.0, 0.0)): + """Create a mock env that returns a specific position in info dict.""" + class PosEnv(gym.Env): + metadata = {'render_modes': []} + def __init__(self): + super().__init__() + self.action_space = gym.spaces.Discrete(5) + self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) + self._pos = list(pos) + self._speed = speed + self._reward = original_reward + self._done = done - def __init__(self, speed=2.0, original_reward=1.0, done=False, use_5tuple=True): - super().__init__() - self._speed = speed - self._reward = original_reward - self._done = done - self._use_5tuple = use_5tuple - self.action_space = gym.spaces.Discrete(5) - self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) + def set_pos(self, p): + self._pos = list(p) - def reset(self, seed=None, **kwargs): - return np.zeros((120, 160, 3), dtype=np.uint8), {} + def reset(self, seed=None, **kwargs): + return np.zeros((120, 160, 3), dtype=np.uint8), {} - def step(self, action): - obs = np.zeros((120, 160, 3), dtype=np.uint8) - info = {'speed': self._speed} - if self._use_5tuple: + def step(self, action): + obs = np.zeros((120, 160, 3), dtype=np.uint8) + info = {'speed': self._speed, 'pos': self._pos} return obs, self._reward, self._done, False, info - else: - return obs, self._reward, self._done, info - def close(self): - pass + def close(self): + pass + + return PosEnv() -# ---- Hack-Proof Guarantee Tests ---- +# ---- Core Anti-Hacking Tests (inherited from v2) ---- def test_no_speed_bonus_when_off_track(): - """ - CRITICAL: Off-track reward (≤ 0) must NOT get a speed bonus. - This is the core anti-hacking guarantee. 
- """ - env = MockStepEnv(speed=10.0, original_reward=-1.0) # Off track, very fast + """Off-track reward (≤ 0) must NOT get a speed bonus regardless of efficiency.""" + env = make_env_with_pos(speed=10.0, original_reward=-1.0) wrapped = SpeedRewardWrapper(env, speed_scale=0.5) + wrapped.reset() _, reward, _, _, _ = wrapped.step(0) - assert reward == -1.0, \ - f"Off-track reward must not get speed bonus, got {reward}" + assert reward == -1.0, f"Off-track reward must not get bonus, got {reward}" def test_no_speed_bonus_when_reward_zero(): - """Reward exactly 0 (boundary case) should not get speed bonus.""" - env = MockStepEnv(speed=5.0, original_reward=0.0) + """Reward exactly 0 should not get speed bonus.""" + env = make_env_with_pos(speed=5.0, original_reward=0.0) wrapped = SpeedRewardWrapper(env, speed_scale=0.5) + wrapped.reset() _, reward, _, _, _ = wrapped.step(0) assert reward == 0.0, f"Zero reward should stay zero, got {reward}" -def test_speed_bonus_scales_with_speed_when_on_track(): - """When on track (positive reward), faster = higher shaped reward.""" - env_slow = MockStepEnv(speed=1.0, original_reward=0.8) - env_fast = MockStepEnv(speed=5.0, original_reward=0.8) +# ---- Path Efficiency Tests ---- - wrapped_slow = SpeedRewardWrapper(env_slow, speed_scale=0.1) - wrapped_fast = SpeedRewardWrapper(env_fast, speed_scale=0.1) - - _, r_slow, _, _, _ = wrapped_slow.step(0) - _, r_fast, _, _, _ = wrapped_fast.step(0) - - assert r_fast > r_slow, f"Faster on-track should reward more: {r_fast:.3f} vs {r_slow:.3f}" +def _simulate_straight_driving(wrapped_env, env, steps=40, speed=3.0, step_size=0.1): + """Simulate straight-line driving: car moves forward by step_size each step.""" + wrapped_env.reset() + rewards = [] + for i in range(steps): + env.set_pos([i * step_size, 0.0, 0.0]) + env._speed = speed + _, r, _, _, _ = wrapped_env.step(0) + rewards.append(r) + return rewards -def test_multiplicative_formula_correct(): +def _simulate_circular_driving(wrapped_env, env, steps=40, speed=3.0, radius=0.5): + """Simulate circular driving: car moves in a circle, returns to start.""" + wrapped_env.reset() + rewards = [] + for i in range(steps): + angle = 2 * math.pi * i / steps + x = radius * math.cos(angle) + z = radius * math.sin(angle) + env.set_pos([x, 0.0, z]) + env._speed = speed + _, r, _, _, _ = wrapped_env.step(0) + rewards.append(r) + return rewards + + +def test_straight_driving_gets_higher_reward_than_circular(): """ - Verify exact formula: shaped = original × (1 + speed_scale × speed) + CRITICAL: Straight driving must produce more total reward than circular driving + at the same speed and base reward. This is the core anti-circular guarantee. 
""" - original_reward = 0.6 - speed = 3.0 - speed_scale = 0.1 - expected = original_reward * (1.0 + speed_scale * speed) # 0.6 × 1.3 = 0.78 + env_straight = make_env_with_pos(speed=3.0, original_reward=0.8) + env_circular = make_env_with_pos(speed=3.0, original_reward=0.8) - env = MockStepEnv(speed=speed, original_reward=original_reward) - wrapped = SpeedRewardWrapper(env, speed_scale=speed_scale) + wrapped_straight = SpeedRewardWrapper(env_straight, speed_scale=0.1, window_size=20) + wrapped_circular = SpeedRewardWrapper(env_circular, speed_scale=0.1, window_size=20) + + straight_rewards = _simulate_straight_driving(wrapped_straight, env_straight, steps=40) + circular_rewards = _simulate_circular_driving(wrapped_circular, env_circular, steps=40) + + # After warmup (window fills), straight should consistently beat circular + straight_tail = sum(straight_rewards[20:]) + circular_tail = sum(circular_rewards[20:]) + + assert straight_tail > circular_tail, ( + f"Straight driving ({straight_tail:.2f}) should beat circular ({circular_tail:.2f})" + ) + + +def test_efficiency_near_one_for_straight_driving(): + """Path efficiency should be near 1.0 for straight-line motion.""" + env = make_env_with_pos(speed=3.0, original_reward=1.0) + wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10) + wrapped.reset() + + # Drive in a straight line + for i in range(15): + env.set_pos([i * 0.2, 0.0, 0.0]) + wrapped.step(0) + + efficiency = wrapped._compute_efficiency() + assert efficiency > 0.90, f"Straight driving efficiency should be >0.90, got {efficiency:.4f}" + + +def test_efficiency_near_zero_for_circular_driving(): + """Path efficiency should be near 0.0 for full circular motion.""" + env = make_env_with_pos(speed=3.0, original_reward=1.0) + wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20) + wrapped.reset() + + # Drive a full circle (returns to start position) + radius = 1.0 + steps = 25 # More than window_size to fill it + for i in range(steps): + angle = 2 * math.pi * i / 24 # 24 steps = full circle + x = radius * math.cos(angle) + z = radius * math.sin(angle) + env.set_pos([x, 0.0, z]) + wrapped.step(0) + + efficiency = wrapped._compute_efficiency() + assert efficiency < 0.2, f"Circular driving efficiency should be <0.2, got {efficiency:.4f}" + + +def test_efficiency_one_with_no_pos_history(): + """When position not available, efficiency should default to 1.0 (no penalty).""" + class NoPosEnv(gym.Env): + metadata = {'render_modes': []} + def __init__(self): + super().__init__() + self.action_space = gym.spaces.Discrete(5) + self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) + def reset(self, seed=None, **kwargs): + return np.zeros((120, 160, 3), dtype=np.uint8), {} + def step(self, action): + return np.zeros((120, 160, 3), dtype=np.uint8), 0.8, False, False, {'speed': 2.0} # No pos + def close(self): + pass + + wrapped = SpeedRewardWrapper(NoPosEnv(), speed_scale=0.1) + wrapped.reset() _, reward, _, _, _ = wrapped.step(0) - - assert reward == pytest.approx(expected, abs=1e-6), \ - f"Expected {expected:.6f}, got {reward:.6f}" + # Without pos, efficiency=1.0, so reward = 0.8 * (1 + 0.1*2*1.0) = 0.96 + assert reward > 0.8, f"Without pos, should get speed bonus (efficiency=1.0), got {reward}" -def test_cannot_hack_by_going_fast_off_track(): - """ - Demonstrate that the previous formula could be hacked but this one cannot. - Fast off-track (speed=10) must give same or worse result than slow off-track (speed=1). 
- """ - env_fast_offtrack = MockStepEnv(speed=10.0, original_reward=-1.0) - env_slow_offtrack = MockStepEnv(speed=1.0, original_reward=-1.0) +def test_efficiency_resets_on_episode_reset(): + """Position history should clear on reset, so each episode starts fresh.""" + env = make_env_with_pos(speed=3.0, original_reward=1.0) + wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10) + wrapped.reset() - wrapped_fast = SpeedRewardWrapper(env_fast_offtrack, speed_scale=0.5) - wrapped_slow = SpeedRewardWrapper(env_slow_offtrack, speed_scale=0.5) + # Fill with circular data + radius = 0.5 + for i in range(15): + angle = 2 * math.pi * i / 12 + env.set_pos([radius * math.cos(angle), 0.0, radius * math.sin(angle)]) + wrapped.step(0) - _, r_fast, _, _, _ = wrapped_fast.step(0) - _, r_slow, _, _, _ = wrapped_slow.step(0) + eff_before_reset = wrapped._compute_efficiency() - assert r_fast == r_slow == -1.0, \ - f"Off-track reward must be identical regardless of speed: fast={r_fast}, slow={r_slow}" + # Reset and drive straight for a few steps + wrapped.reset() + for i in range(3): + env.set_pos([i * 0.3, 0.0, 0.0]) + wrapped.step(0) + + eff_after_reset = wrapped._compute_efficiency() + assert eff_after_reset > eff_before_reset, \ + f"After reset, efficiency should improve: before={eff_before_reset:.3f}, after={eff_after_reset:.3f}" + + +def test_speed_bonus_disappears_when_circling(): + """After circling for window_size steps, speed bonus should be nearly zero.""" + env = make_env_with_pos(speed=5.0, original_reward=1.0) + wrapped = SpeedRewardWrapper(env, speed_scale=0.5, window_size=20, min_efficiency=0.05) + wrapped.reset() + + # Warm up with circular motion + radius = 0.5 + rewards = [] + for i in range(30): + angle = 2 * math.pi * (i % 20) / 20 # Full circle every 20 steps + env.set_pos([radius * math.cos(angle), 0.0, radius * math.sin(angle)]) + _, r, _, _, _ = wrapped.step(0) + rewards.append(r) + + # Later rewards (after window fills) should be close to original_reward + later_rewards = rewards[20:] + avg_later = sum(later_rewards) / len(later_rewards) + assert avg_later < 1.3, \ + f"Circular driving speed bonus should be suppressed, avg reward={avg_later:.3f} (original=1.0)" + + +# ---- Inherited guarantees ---- + +def test_crash_still_penalized(): + """Crash (original_reward=-1) should remain -1 regardless of speed or efficiency.""" + env = make_env_with_pos(speed=8.0, original_reward=-1.0, done=True) + wrapped = SpeedRewardWrapper(env, speed_scale=0.2) + wrapped.reset() + _, reward, _, _, _ = wrapped.step(0) + assert reward == -1.0, f"Crash reward should remain -1.0, got {reward}" def test_theoretical_max_per_step(): - """ - Verify theoretical_max_per_step returns correct upper bound. 
- With speed_scale=0.1 and max_speed=10.0: max = 1.0 × (1 + 0.1×10) = 2.0 - """ - env = MockStepEnv() + """Max reward/step bounded: original(1.0) × (1 + speed_scale × max_speed).""" + env = make_env_with_pos() wrapped = SpeedRewardWrapper(env, speed_scale=0.1) - max_reward = wrapped.theoretical_max_per_step(max_speed=10.0) - assert max_reward == pytest.approx(2.0, abs=1e-6), \ - f"Max per step should be 2.0, got {max_reward}" - - -def test_fallback_when_speed_not_in_info(): - """If info doesn't have speed, fall back to original reward.""" - class NoSpeedEnv(gym.Env): - metadata = {'render_modes': []} - def __init__(self): - super().__init__() - self.action_space = gym.spaces.Discrete(5) - self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) - def reset(self, seed=None, **kwargs): - return np.zeros((120, 160, 3), dtype=np.uint8), {} - def step(self, action): - return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {} # No 'speed' key - def close(self): - pass - - wrapped = SpeedRewardWrapper(NoSpeedEnv(), speed_scale=0.5) - _, reward, _, _, _ = wrapped.step(0) - # speed=0.0 default → shaped = 0.75 × (1 + 0.5 × 0.0) = 0.75 - assert reward == pytest.approx(0.75, abs=1e-6), \ - f"Should fall back gracefully, got {reward}" - - -def test_wrapper_preserves_observation(): - """SpeedRewardWrapper must not modify observations.""" - class FixedObsEnv(gym.Env): - metadata = {'render_modes': []} - def __init__(self): - super().__init__() - self.action_space = gym.spaces.Discrete(5) - self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) - def reset(self, seed=None, **kwargs): - return np.zeros((120, 160, 3), dtype=np.uint8), {} - def step(self, action): - return np.zeros((120, 160, 3), dtype=np.uint8), 0.8, False, False, {'speed': 2.0} - def close(self): - pass - - wrapped = SpeedRewardWrapper(FixedObsEnv()) - obs, _, _, _, _ = wrapped.step(0) - np.testing.assert_array_equal(obs, np.zeros((120, 160, 3), dtype=np.uint8)) - - -def test_4tuple_step_compatibility(): - """Wrapper should handle 4-tuple step() return (old gym API).""" - env = MockStepEnv(speed=2.0, original_reward=0.8, use_5tuple=False) - wrapped = SpeedRewardWrapper(env) - result = wrapped.step(0) - assert len(result) == 4, f"Expected 4-tuple, got {len(result)}" - _, reward, done, info = result - assert isinstance(reward, float) - assert reward > 0.8, "Speed bonus should increase reward when on track" - - -def test_crash_still_penalized(): - """Crash (original_reward=-1) should remain -1, not improved by speed.""" - env = MockStepEnv(speed=8.0, original_reward=-1.0, done=True) - wrapped = SpeedRewardWrapper(env, speed_scale=0.2) - _, reward, _, _, _ = wrapped.step(0) - assert reward == -1.0, f"Crash reward should remain -1.0, got {reward}" + assert wrapped.theoretical_max_per_step(max_speed=10.0) == pytest.approx(2.0, abs=1e-6)
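+
+
+# Illustrative extra check (a sketch added for the geometry, not from the measured runs):
+# a half circle should land between the straight (≈1.0) and full-circle (≈0.0) cases, near 2/pi.
+def test_efficiency_intermediate_for_half_circle():
+    env = make_env_with_pos(speed=3.0, original_reward=1.0)
+    wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20)
+    wrapped.reset()
+    radius = 1.0
+    for i in range(21):  # 21 positions spanning half a circle exactly fill the window
+        angle = math.pi * i / 20
+        env.set_pos([radius * math.cos(angle), 0.0, radius * math.sin(angle)])
+        wrapped.step(0)
+    # net displacement = 2r (the diameter); path length = 20 chords ≈ pi*r
+    efficiency = wrapped._compute_efficiency()
+    assert 0.55 < efficiency < 0.70, f"Half-circle efficiency should be near 2/pi, got {efficiency:.4f}"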