From 5e93dae316afc33a8194e59319e55180e94a347f Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Mon, 13 Apr 2026 12:27:48 -0400 Subject: [PATCH] fix: hack-proof reward shaping + reward hacking detection + research log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CRITICAL BUG FIX — Reward Hacking: - Old formula: speed × (1 - cte/max_cte) could be maximized by oscillating at track boundary regardless of on-track behavior (trials 8+13 hit 1936+1139) - New formula: original_reward × (1 + speed_scale × speed) ONLY when on_track - Off-track (original_reward ≤ 0) → zero speed bonus → cannot be hacked - Verified hack-proof: 9 new targeted tests including test_cannot_hack_by_going_fast_off_track Reward Hacking Auto-Detection: - check_for_reward_hacking() flags results with >3.0 reward/step as suspected hacking - Flagged results are excluded from GP fitting (won't optimize toward hacking params) - reward_hacking_suspected field added to JSONL result records Research Documentation: - docs/RESEARCH_LOG.md created: full chronological research history - Random policy bug discovery and impact - Throttle clamp fix - Reward hacking discovery with evidence table - Hack-proof design rationale - Lessons learned + future research questions - Archived corrupted Phase 1 data: autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl - Archived hacked models: models/ARCHIVED_reward_hacking/ Clean start: autoresearch_results_phase1.jsonl reset, models/champion reset Agent: pi/claude-sonnet Tests: 40/40 passing Tests-Added: +9 (reward wrapper hack-proof tests) TypeScript: N/A --- agent/autoresearch_controller.py | 45 ++- .../champion_hacked}/manifest.json | 0 .../autoresearch_phase1_log.txt | 329 ++------------- ...ch_phase1_log_CORRUPTED_reward_hacking.txt | 377 ++++++++++++++++++ ...lts_phase1_CORRUPTED_reward_hacking.jsonl} | 3 + agent/reward_wrapper.py | 97 +++-- docs/RESEARCH_LOG.md | 182 +++++++++ tests/test_reward_wrapper.py | 149 
++++--- 8 files changed, 788 insertions(+), 394 deletions(-) rename agent/models/{champion => ARCHIVED_reward_hacking/champion_hacked}/manifest.json (100%) create mode 100644 agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_reward_hacking.txt rename agent/outerloop-results/{autoresearch_results_phase1.jsonl => autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl} (83%) create mode 100644 docs/RESEARCH_LOG.md diff --git a/agent/autoresearch_controller.py b/agent/autoresearch_controller.py index c2d7d27..2c0288e 100644 --- a/agent/autoresearch_controller.py +++ b/agent/autoresearch_controller.py @@ -83,6 +83,29 @@ def log(msg): with open(PHASE1_LOG, 'a') as f: f.write(line + '\n') + +# ---- Reward Sanity / Hacking Detection ---- +# SpeedRewardWrapper v2 theoretical max: +# max_original_reward ≈ 1.0, max_speed ≈ 10.0, speed_scale=0.1 +# max_per_step = 1.0 × (1 + 0.1 × 10) = 2.0 +# Flag anything above 3.0 reward/step as suspected hacking. +REWARD_PER_STEP_HACK_THRESHOLD = 3.0 + + +def check_for_reward_hacking(mean_reward, params): + """Detect reward hacking from physically impossible reward-per-step values.""" + if mean_reward is None: + return False + timesteps = params.get('timesteps', 3000) + reward_per_step = mean_reward / max(timesteps, 1) + if reward_per_step > REWARD_PER_STEP_HACK_THRESHOLD: + log(f'[AutoResearch] ⚠️ REWARD HACKING SUSPECTED: ' + f'mean_reward={mean_reward:.1f} over {timesteps} steps ' + f'= {reward_per_step:.3f}/step > threshold {REWARD_PER_STEP_HACK_THRESHOLD}. ' + f'Result EXCLUDED from GP fitting. 
See docs/RESEARCH_LOG.md.') + return True + return False + # ---- Parameter Encoding ---- def encode_params(params): vec = [] @@ -304,7 +327,7 @@ def launch_job(params, trial_num): return mean_reward, std_reward, model_zip, output, status, elapsed, save_dir # ---- Result Saving ---- -def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed): +def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed, hacked=False): rec = { 'trial': trial, 'timestamp': datetime.now().isoformat(), @@ -315,6 +338,7 @@ def save_result(trial, params, mean_reward, std_reward, model_path, champion, st 'champion': champion, 'run_status': status, 'elapsed_sec': elapsed, + 'reward_hacking_suspected': hacked, } with open(PHASE1_RESULTS, 'a') as f: f.write(json.dumps(rec) + '\n') @@ -373,15 +397,22 @@ def run_autoresearch(max_trials=50, kappa=UCB_KAPPA, push_every=10): # 3. Launch real training job mean_reward, std_reward, model_zip, output, status, elapsed, save_dir = launch_job(full_params, trial) - # 4. Update champion - is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial) + # 4. Check for reward hacking before updating champion + hacked = check_for_reward_hacking(mean_reward, full_params) - # 5. Save result - save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed) + # 5. Update champion (only if not hacking) + is_champion = False + if not hacked: + is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial) - # 6. Add to GP data (only successful runs with valid reward) - if mean_reward is not None: + # 6. Save result (flag hacked results) + save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed, hacked=hacked) + + # 7. 
Add to GP data (ONLY if not hacking and valid reward) + if mean_reward is not None and not hacked: results.append({'params': full_params, 'mean_reward': mean_reward}) + elif hacked: + log(f'[AutoResearch] Hacked result excluded from GP — GP will not optimize toward this region.') # 7. Print summary print_summary(results, champion, trial) diff --git a/agent/models/champion/manifest.json b/agent/models/ARCHIVED_reward_hacking/champion_hacked/manifest.json similarity index 100% rename from agent/models/champion/manifest.json rename to agent/models/ARCHIVED_reward_hacking/champion_hacked/manifest.json diff --git a/agent/outerloop-results/autoresearch_phase1_log.txt b/agent/outerloop-results/autoresearch_phase1_log.txt index c103eb2..d08fdb0 100644 --- a/agent/outerloop-results/autoresearch_phase1_log.txt +++ b/agent/outerloop-results/autoresearch_phase1_log.txt @@ -1,303 +1,26 @@ -[2026-04-13 10:00:54] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 10:00:54] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} -[2026-04-13 10:00:54] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} -[2026-04-13 10:00:54] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} -[2026-04-13 10:00:54] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} -[2026-04-13 10:00:54] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! 
Trial 0: mean_reward=50.0000 params={'r': 50} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} -[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} -[2026-04-13 10:00:54] [AutoResearch] Only 1 results — using random proposal. -[2026-04-13 10:02:55] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 10:02:55] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} -[2026-04-13 10:02:55] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} -[2026-04-13 10:02:55] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} -[2026-04-13 10:02:55] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} -[2026-04-13 10:02:55] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} -[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} -[2026-04-13 10:02:55] [AutoResearch] Only 1 results — using random proposal. 
-[2026-04-13 10:03:22] ============================================================ -[2026-04-13 10:03:22] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization -[2026-04-13 10:03:22] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10 -[2026-04-13 10:03:22] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl -[2026-04-13 10:03:22] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion -[2026-04-13 10:03:22] ============================================================ -[2026-04-13 10:03:22] [AutoResearch] Loaded 0 existing Phase 1 results. -[2026-04-13 10:03:22] [AutoResearch] No champion yet. -[2026-04-13 10:03:22] -[AutoResearch] ========== Trial 1/50 ========== -[2026-04-13 10:03:22] [AutoResearch] Only 0 results — using random proposal. -[2026-04-13 10:03:22] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:03:24] [AutoResearch] Launching trial 1: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:13:24] [AutoResearch] Trial 1 TIMED OUT after 600.2s -[2026-04-13 10:13:24] [AutoResearch] Trial 1: mean_reward=None std_reward=None -[2026-04-13 10:13:26] -[AutoResearch] ========== Trial 2/50 ========== -[2026-04-13 10:13:26] [AutoResearch] Only 0 results — using random proposal. 
-[2026-04-13 10:13:26] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:13:28] [AutoResearch] Launching trial 2: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:23:28] [AutoResearch] Trial 2 TIMED OUT after 600.0s -[2026-04-13 10:23:28] [AutoResearch] Trial 2: mean_reward=None std_reward=None -[2026-04-13 10:23:30] -[AutoResearch] ========== Trial 3/50 ========== -[2026-04-13 10:23:30] [AutoResearch] Only 0 results — using random proposal. -[2026-04-13 10:23:30] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:23:32] [AutoResearch] Launching trial 3: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:33:32] [AutoResearch] Trial 3 TIMED OUT after 600.1s -[2026-04-13 10:33:32] [AutoResearch] Trial 3: mean_reward=None std_reward=None -[2026-04-13 10:33:34] -[AutoResearch] ========== Trial 4/50 ========== -[2026-04-13 10:33:34] [AutoResearch] Only 0 results — using random proposal. 
-[2026-04-13 10:33:34] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:33:36] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:43:36] [AutoResearch] Trial 4 TIMED OUT after 600.1s -[2026-04-13 10:43:36] [AutoResearch] Trial 4: mean_reward=None std_reward=None -[2026-04-13 10:43:39] -[AutoResearch] ========== Trial 5/50 ========== -[2026-04-13 10:43:39] [AutoResearch] Only 0 results — using random proposal. -[2026-04-13 10:43:39] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:43:41] [AutoResearch] Launching trial 5: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:53:41] [AutoResearch] Trial 5 TIMED OUT after 600.1s -[2026-04-13 10:53:41] [AutoResearch] Trial 5: mean_reward=None std_reward=None -[2026-04-13 10:53:43] -[AutoResearch] ========== Trial 6/50 ========== -[2026-04-13 10:53:43] [AutoResearch] Only 0 results — using random proposal. 
-[2026-04-13 10:53:43] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 10:53:45] [AutoResearch] Launching trial 6: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 11:03:45] [AutoResearch] Trial 6 TIMED OUT after 600.1s -[2026-04-13 11:03:45] [AutoResearch] Trial 6: mean_reward=None std_reward=None -[2026-04-13 11:03:47] -[AutoResearch] ========== Trial 7/50 ========== -[2026-04-13 11:03:47] [AutoResearch] Only 0 results — using random proposal. -[2026-04-13 11:03:47] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 11:03:49] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} -[2026-04-13 11:16:34] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:16:34] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} -[2026-04-13 11:16:34] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} -[2026-04-13 11:16:34] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} -[2026-04-13 11:16:34] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} -[2026-04-13 11:16:34] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! 
Trial 1: mean_reward=50.0000 params={'n_steer': 5} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} -[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} -[2026-04-13 11:16:34] [AutoResearch] Only 1 results — using random proposal. -[2026-04-13 11:16:53] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:16:53] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} -[2026-04-13 11:16:53] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} -[2026-04-13 11:16:53] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} -[2026-04-13 11:16:53] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} -[2026-04-13 11:16:53] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} -[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! 
Trial 5: mean_reward=75.0000 params={'n_steer': 8} -[2026-04-13 11:16:53] [AutoResearch] Only 1 results — using random proposal. -[2026-04-13 11:17:15] ============================================================ -[2026-04-13 11:17:15] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization -[2026-04-13 11:17:15] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10 -[2026-04-13 11:17:15] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl -[2026-04-13 11:17:15] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion -[2026-04-13 11:17:15] ============================================================ -[2026-04-13 11:17:15] [AutoResearch] Loaded 0 existing Phase 1 results. -[2026-04-13 11:17:15] [AutoResearch] No champion yet. -[2026-04-13 11:17:15] -[AutoResearch] ========== Trial 1/50 ========== -[2026-04-13 11:17:15] [AutoResearch] Only 0 results — using random proposal. -[2026-04-13 11:17:15] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:17:17] [AutoResearch] Launching trial 1: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:20:53] [AutoResearch] Trial 1 finished in 215.5s, returncode=0 -[2026-04-13 11:20:53] [AutoResearch] Trial 1: mean_reward=5.7246 std_reward=0.027 -[2026-04-13 11:20:53] [Champion] 🏆 NEW BEST! 
Trial 1: mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:20:53] [AutoResearch] === Trial 1 Summary === -[2026-04-13 11:20:53] Total Phase 1 runs: 1 -[2026-04-13 11:20:53] Champion: trial=1 mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:20:53] Top 5: -[2026-04-13 11:20:53] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:20:55] -[AutoResearch] ========== Trial 2/50 ========== -[2026-04-13 11:20:55] [AutoResearch] Only 1 results — using random proposal. -[2026-04-13 11:20:55] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:20:57] [AutoResearch] Launching trial 2: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:32] [AutoResearch] Trial 2 finished in 215.5s, returncode=0 -[2026-04-13 11:24:32] [AutoResearch] Trial 2: mean_reward=398.8564 std_reward=1.1786 -[2026-04-13 11:24:33] [Champion] 🏆 NEW BEST! 
Trial 2: mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:33] [AutoResearch] === Trial 2 Summary === -[2026-04-13 11:24:33] Total Phase 1 runs: 2 -[2026-04-13 11:24:33] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:33] Top 5: -[2026-04-13 11:24:33] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:33] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:35] -[AutoResearch] ========== Trial 3/50 ========== -[2026-04-13 11:24:35] [AutoResearch] Only 2 results — using random proposal. 
-[2026-04-13 11:24:35] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:24:37] [AutoResearch] Launching trial 3: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:06] [AutoResearch] Trial 3 finished in 269.6s, returncode=0 -[2026-04-13 11:29:06] [AutoResearch] Trial 3: mean_reward=5.9776 std_reward=0.0252 -[2026-04-13 11:29:06] [AutoResearch] === Trial 3 Summary === -[2026-04-13 11:29:06] Total Phase 1 runs: 3 -[2026-04-13 11:29:06] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:06] Top 5: -[2026-04-13 11:29:06] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:06] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:06] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:08] -[AutoResearch] ========== Trial 4/50 ========== -[2026-04-13 11:29:08] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:29:08] UCB=2.4615 mu=0.8615 sigma=0.8000 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084} -[2026-04-13 11:29:08] UCB=2.4548 mu=0.9032 sigma=0.7758 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0009758380297094257, 'timesteps': 3202} -[2026-04-13 11:29:08] 
UCB=2.4540 mu=0.7444 sigma=0.8548 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0006970147905651335, 'timesteps': 3351} -[2026-04-13 11:29:08] UCB=2.4479 mu=0.7051 sigma=0.8714 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0009997388594600006, 'timesteps': 4139} -[2026-04-13 11:29:08] UCB=2.4443 mu=0.9374 sigma=0.7535 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001158537723428793, 'timesteps': 3743} -[2026-04-13 11:29:08] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:29:10] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:24] [AutoResearch] Trial 4 finished in 194.0s, returncode=0 -[2026-04-13 11:32:24] [AutoResearch] Trial 4: mean_reward=22.8241 std_reward=0.1918 -[2026-04-13 11:32:24] [AutoResearch] === Trial 4 Summary === -[2026-04-13 11:32:24] Total Phase 1 runs: 4 -[2026-04-13 11:32:24] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:24] Top 5: -[2026-04-13 11:32:24] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:24] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:24] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:24] 
mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:26] -[AutoResearch] ========== Trial 5/50 ========== -[2026-04-13 11:32:26] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:32:26] UCB=2.9797 mu=1.4209 sigma=0.7794 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626} -[2026-04-13 11:32:26] UCB=2.9360 mu=1.6516 sigma=0.6422 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.003483580964392729, 'timesteps': 3613} -[2026-04-13 11:32:26] UCB=2.8856 mu=1.1888 sigma=0.8484 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.002515064142571671, 'timesteps': 4267} -[2026-04-13 11:32:26] UCB=2.8582 mu=1.5163 sigma=0.6709 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0029159438252188284, 'timesteps': 3730} -[2026-04-13 11:32:26] UCB=2.8422 mu=1.5296 sigma=0.6563 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0033924318546103937, 'timesteps': 3346} -[2026-04-13 11:32:26] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:32:28] [AutoResearch] Launching trial 5: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:52] [AutoResearch] Trial 5 finished in 264.3s, returncode=0 -[2026-04-13 11:36:52] [AutoResearch] Trial 5: mean_reward=5.9913 std_reward=0.0246 -[2026-04-13 11:36:52] [AutoResearch] === Trial 5 Summary === -[2026-04-13 11:36:52] Total Phase 1 runs: 5 -[2026-04-13 11:36:52] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 
11:36:52] Top 5: -[2026-04-13 11:36:52] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:52] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:52] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:52] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:52] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:54] -[AutoResearch] ========== Trial 6/50 ========== -[2026-04-13 11:36:54] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:36:54] UCB=2.8622 mu=1.4083 sigma=0.7270 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387} -[2026-04-13 11:36:54] UCB=2.7841 mu=1.0518 sigma=0.8661 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002782960062629981, 'timesteps': 4433} -[2026-04-13 11:36:54] UCB=2.7380 mu=1.5849 sigma=0.5765 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001906612836342622, 'timesteps': 3714} -[2026-04-13 11:36:54] UCB=2.7029 mu=0.9236 sigma=0.8897 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002301914433902658, 'timesteps': 4751} -[2026-04-13 11:36:54] UCB=2.6924 mu=1.1628 sigma=0.7648 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019575950790335435, 'timesteps': 2548} -[2026-04-13 11:36:54] [AutoResearch] Proposed: {'n_steer': 7, 
'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:36:56] [AutoResearch] Launching trial 6: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] [AutoResearch] Trial 6 TIMED OUT after 480.1s -[2026-04-13 11:44:57] [AutoResearch] Trial 6: mean_reward=None std_reward=None -[2026-04-13 11:44:57] [AutoResearch] === Trial 6 Summary === -[2026-04-13 11:44:57] Total Phase 1 runs: 5 -[2026-04-13 11:44:57] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] Top 5: -[2026-04-13 11:44:57] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:57] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:44:59] -[AutoResearch] ========== Trial 7/50 ========== -[2026-04-13 11:44:59] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 
11:44:59] UCB=2.7677 mu=1.3945 sigma=0.6866 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160} -[2026-04-13 11:44:59] UCB=2.6401 mu=0.8590 sigma=0.8906 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012329470317109907, 'timesteps': 4404} -[2026-04-13 11:44:59] UCB=2.6346 mu=0.8897 sigma=0.8725 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002824572687677801, 'timesteps': 2173} -[2026-04-13 11:44:59] UCB=2.6197 mu=1.1406 sigma=0.7395 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002264095441698803, 'timesteps': 3610} -[2026-04-13 11:44:59] UCB=2.6013 mu=0.7257 sigma=0.9378 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.001986372556075669, 'timesteps': 4899} -[2026-04-13 11:44:59] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:45:01] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] [AutoResearch] Trial 7 finished in 211.3s, returncode=0 -[2026-04-13 11:48:32] [AutoResearch] Trial 7: mean_reward=5.7529 std_reward=0.0318 -[2026-04-13 11:48:32] [AutoResearch] === Trial 7 Summary === -[2026-04-13 11:48:32] Total Phase 1 runs: 6 -[2026-04-13 11:48:32] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] Top 5: -[2026-04-13 11:48:32] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 
0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:32] mean_reward=5.7529 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:34] -[AutoResearch] ========== Trial 8/50 ========== -[2026-04-13 11:48:34] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:48:34] UCB=2.9928 mu=1.4031 sigma=0.7948 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429} -[2026-04-13 11:48:34] UCB=2.9102 mu=1.2105 sigma=0.8499 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0013337121696681005, 'timesteps': 4384} -[2026-04-13 11:48:34] UCB=2.9095 mu=1.2362 sigma=0.8366 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0016866172466465327, 'timesteps': 4586} -[2026-04-13 11:48:34] UCB=2.7220 mu=1.0017 sigma=0.8601 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012033218829906316, 'timesteps': 4239} -[2026-04-13 11:48:34] UCB=2.6586 mu=0.8020 sigma=0.9283 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0014425382569831862, 'timesteps': 4488} -[2026-04-13 11:48:34] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:48:36] [AutoResearch] Launching trial 8: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 
'reward_shaping': True} -[2026-04-13 11:52:48] [AutoResearch] Trial 8 finished in 252.2s, returncode=0 -[2026-04-13 11:52:48] [AutoResearch] Trial 8: mean_reward=1936.8533 std_reward=34.0067 -[2026-04-13 11:52:48] [Champion] 🏆 NEW BEST! Trial 8: mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] [AutoResearch] === Trial 8 Summary === -[2026-04-13 11:52:48] Total Phase 1 runs: 7 -[2026-04-13 11:52:48] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] Top 5: -[2026-04-13 11:52:48] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:48] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:50] -[AutoResearch] ========== Trial 9/50 ========== -[2026-04-13 11:52:50] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:52:50] UCB=3.6446 mu=2.2362 sigma=0.7042 params={'n_steer': 4, 
'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961} -[2026-04-13 11:52:50] UCB=3.6253 mu=2.3605 sigma=0.6324 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0014035502090066865, 'timesteps': 2985} -[2026-04-13 11:52:50] UCB=3.5079 mu=2.3661 sigma=0.5709 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0016891347290568105, 'timesteps': 3422} -[2026-04-13 11:52:50] UCB=3.4169 mu=2.2243 sigma=0.5963 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0011351472472032882, 'timesteps': 4191} -[2026-04-13 11:52:50] UCB=3.3399 mu=1.6131 sigma=0.8634 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.00114114991914373, 'timesteps': 3413} -[2026-04-13 11:52:50] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:52:52] [AutoResearch] Launching trial 9: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:28] [AutoResearch] Trial 9 finished in 215.5s, returncode=0 -[2026-04-13 11:56:28] [AutoResearch] Trial 9: mean_reward=237.9115 std_reward=1.4136 -[2026-04-13 11:56:28] [AutoResearch] === Trial 9 Summary === -[2026-04-13 11:56:28] Total Phase 1 runs: 8 -[2026-04-13 11:56:28] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:28] Top 5: -[2026-04-13 11:56:28] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:28] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 
3, 'reward_shaping': True} -[2026-04-13 11:56:28] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:28] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:28] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:30] -[AutoResearch] ========== Trial 10/50 ========== -[2026-04-13 11:56:30] [AutoResearch] GP UCB top-5 candidates: -[2026-04-13 11:56:30] UCB=3.6513 mu=2.0026 sigma=0.8243 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691} -[2026-04-13 11:56:30] UCB=3.2438 mu=1.9644 sigma=0.6397 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0013292739097783752, 'timesteps': 3897} -[2026-04-13 11:56:30] UCB=3.1815 mu=1.2984 sigma=0.9415 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0004768119261475519, 'timesteps': 4914} -[2026-04-13 11:56:30] UCB=3.0779 mu=1.4273 sigma=0.8253 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001854044179957165, 'timesteps': 3308} -[2026-04-13 11:56:30] UCB=2.9649 mu=1.2760 sigma=0.8444 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001236030774697938, 'timesteps': 3010} -[2026-04-13 11:56:30] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 11:56:32] [AutoResearch] Launching trial 10: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] [AutoResearch] Trial 10 
finished in 285.4s, returncode=0 -[2026-04-13 12:01:17] [AutoResearch] Trial 10: mean_reward=7.6595 std_reward=0.1051 -[2026-04-13 12:01:17] [AutoResearch] === Trial 10 Summary === -[2026-04-13 12:01:17] Total Phase 1 runs: 9 -[2026-04-13 12:01:17] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] Top 5: -[2026-04-13 12:01:17] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} -[2026-04-13 12:01:17] mean_reward=7.6595 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:26:21] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:26:21] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} +[2026-04-13 12:26:21] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} +[2026-04-13 12:26:21] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 
4774} +[2026-04-13 12:26:21] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} +[2026-04-13 12:26:21] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 12:26:21] [AutoResearch] Only 1 results — using random proposal. +[2026-04-13 12:27:28] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:27:28] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} +[2026-04-13 12:27:28] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} +[2026-04-13 12:27:28] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} +[2026-04-13 12:27:28] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} +[2026-04-13 12:27:28] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! 
Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 12:27:28] [AutoResearch] Only 1 results — using random proposal. diff --git a/agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_reward_hacking.txt b/agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_reward_hacking.txt new file mode 100644 index 0000000..a41c2f6 --- /dev/null +++ b/agent/outerloop-results/autoresearch_phase1_log_CORRUPTED_reward_hacking.txt @@ -0,0 +1,377 @@ +[2026-04-13 10:00:54] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 10:00:54] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} +[2026-04-13 10:00:54] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} +[2026-04-13 10:00:54] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} +[2026-04-13 10:00:54] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} +[2026-04-13 10:00:54] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! 
Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 10:00:54] [AutoResearch] Only 1 results — using random proposal. +[2026-04-13 10:02:55] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 10:02:55] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316} +[2026-04-13 10:02:55] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654} +[2026-04-13 10:02:55] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730} +[2026-04-13 10:02:55] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413} +[2026-04-13 10:02:55] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 10:02:55] [AutoResearch] Only 1 results — using random proposal. 
+[2026-04-13 10:03:22] ============================================================ +[2026-04-13 10:03:22] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization +[2026-04-13 10:03:22] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10 +[2026-04-13 10:03:22] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl +[2026-04-13 10:03:22] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion +[2026-04-13 10:03:22] ============================================================ +[2026-04-13 10:03:22] [AutoResearch] Loaded 0 existing Phase 1 results. +[2026-04-13 10:03:22] [AutoResearch] No champion yet. +[2026-04-13 10:03:22] +[AutoResearch] ========== Trial 1/50 ========== +[2026-04-13 10:03:22] [AutoResearch] Only 0 results — using random proposal. +[2026-04-13 10:03:22] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:03:24] [AutoResearch] Launching trial 1: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:13:24] [AutoResearch] Trial 1 TIMED OUT after 600.2s +[2026-04-13 10:13:24] [AutoResearch] Trial 1: mean_reward=None std_reward=None +[2026-04-13 10:13:26] +[AutoResearch] ========== Trial 2/50 ========== +[2026-04-13 10:13:26] [AutoResearch] Only 0 results — using random proposal. 
+[2026-04-13 10:13:26] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:13:28] [AutoResearch] Launching trial 2: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:23:28] [AutoResearch] Trial 2 TIMED OUT after 600.0s +[2026-04-13 10:23:28] [AutoResearch] Trial 2: mean_reward=None std_reward=None +[2026-04-13 10:23:30] +[AutoResearch] ========== Trial 3/50 ========== +[2026-04-13 10:23:30] [AutoResearch] Only 0 results — using random proposal. +[2026-04-13 10:23:30] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:23:32] [AutoResearch] Launching trial 3: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:33:32] [AutoResearch] Trial 3 TIMED OUT after 600.1s +[2026-04-13 10:33:32] [AutoResearch] Trial 3: mean_reward=None std_reward=None +[2026-04-13 10:33:34] +[AutoResearch] ========== Trial 4/50 ========== +[2026-04-13 10:33:34] [AutoResearch] Only 0 results — using random proposal. 
+[2026-04-13 10:33:34] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:33:36] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:43:36] [AutoResearch] Trial 4 TIMED OUT after 600.1s +[2026-04-13 10:43:36] [AutoResearch] Trial 4: mean_reward=None std_reward=None +[2026-04-13 10:43:39] +[AutoResearch] ========== Trial 5/50 ========== +[2026-04-13 10:43:39] [AutoResearch] Only 0 results — using random proposal. +[2026-04-13 10:43:39] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:43:41] [AutoResearch] Launching trial 5: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:53:41] [AutoResearch] Trial 5 TIMED OUT after 600.1s +[2026-04-13 10:53:41] [AutoResearch] Trial 5: mean_reward=None std_reward=None +[2026-04-13 10:53:43] +[AutoResearch] ========== Trial 6/50 ========== +[2026-04-13 10:53:43] [AutoResearch] Only 0 results — using random proposal. 
+[2026-04-13 10:53:43] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 10:53:45] [AutoResearch] Launching trial 6: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 11:03:45] [AutoResearch] Trial 6 TIMED OUT after 600.1s +[2026-04-13 11:03:45] [AutoResearch] Trial 6: mean_reward=None std_reward=None +[2026-04-13 11:03:47] +[AutoResearch] ========== Trial 7/50 ========== +[2026-04-13 11:03:47] [AutoResearch] Only 0 results — using random proposal. +[2026-04-13 11:03:47] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 11:03:49] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True} +[2026-04-13 11:16:34] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:16:34] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} +[2026-04-13 11:16:34] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} +[2026-04-13 11:16:34] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} +[2026-04-13 11:16:34] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} +[2026-04-13 11:16:34] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! 
Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 11:16:34] [AutoResearch] Only 1 results — using random proposal. +[2026-04-13 11:16:53] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:16:53] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888} +[2026-04-13 11:16:53] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033} +[2026-04-13 11:16:53] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774} +[2026-04-13 11:16:53] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022} +[2026-04-13 11:16:53] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! 
Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-13 11:16:53] [AutoResearch] Only 1 results — using random proposal. +[2026-04-13 11:17:15] ============================================================ +[2026-04-13 11:17:15] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization +[2026-04-13 11:17:15] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10 +[2026-04-13 11:17:15] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl +[2026-04-13 11:17:15] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion +[2026-04-13 11:17:15] ============================================================ +[2026-04-13 11:17:15] [AutoResearch] Loaded 0 existing Phase 1 results. +[2026-04-13 11:17:15] [AutoResearch] No champion yet. +[2026-04-13 11:17:15] +[AutoResearch] ========== Trial 1/50 ========== +[2026-04-13 11:17:15] [AutoResearch] Only 0 results — using random proposal. +[2026-04-13 11:17:15] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:17:17] [AutoResearch] Launching trial 1: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:20:53] [AutoResearch] Trial 1 finished in 215.5s, returncode=0 +[2026-04-13 11:20:53] [AutoResearch] Trial 1: mean_reward=5.7246 std_reward=0.027 +[2026-04-13 11:20:53] [Champion] 🏆 NEW BEST! 
Trial 1: mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:20:53] [AutoResearch] === Trial 1 Summary === +[2026-04-13 11:20:53] Total Phase 1 runs: 1 +[2026-04-13 11:20:53] Champion: trial=1 mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:20:53] Top 5: +[2026-04-13 11:20:53] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:20:55] +[AutoResearch] ========== Trial 2/50 ========== +[2026-04-13 11:20:55] [AutoResearch] Only 1 results — using random proposal. +[2026-04-13 11:20:55] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:20:57] [AutoResearch] Launching trial 2: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:32] [AutoResearch] Trial 2 finished in 215.5s, returncode=0 +[2026-04-13 11:24:32] [AutoResearch] Trial 2: mean_reward=398.8564 std_reward=1.1786 +[2026-04-13 11:24:33] [Champion] 🏆 NEW BEST! 
Trial 2: mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:33] [AutoResearch] === Trial 2 Summary === +[2026-04-13 11:24:33] Total Phase 1 runs: 2 +[2026-04-13 11:24:33] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:33] Top 5: +[2026-04-13 11:24:33] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:33] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:35] +[AutoResearch] ========== Trial 3/50 ========== +[2026-04-13 11:24:35] [AutoResearch] Only 2 results — using random proposal. 
+[2026-04-13 11:24:35] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:24:37] [AutoResearch] Launching trial 3: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:06] [AutoResearch] Trial 3 finished in 269.6s, returncode=0 +[2026-04-13 11:29:06] [AutoResearch] Trial 3: mean_reward=5.9776 std_reward=0.0252 +[2026-04-13 11:29:06] [AutoResearch] === Trial 3 Summary === +[2026-04-13 11:29:06] Total Phase 1 runs: 3 +[2026-04-13 11:29:06] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:06] Top 5: +[2026-04-13 11:29:06] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:06] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:06] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:08] +[AutoResearch] ========== Trial 4/50 ========== +[2026-04-13 11:29:08] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:29:08] UCB=2.4615 mu=0.8615 sigma=0.8000 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084} +[2026-04-13 11:29:08] UCB=2.4548 mu=0.9032 sigma=0.7758 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0009758380297094257, 'timesteps': 3202} +[2026-04-13 11:29:08] 
UCB=2.4540 mu=0.7444 sigma=0.8548 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0006970147905651335, 'timesteps': 3351} +[2026-04-13 11:29:08] UCB=2.4479 mu=0.7051 sigma=0.8714 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0009997388594600006, 'timesteps': 4139} +[2026-04-13 11:29:08] UCB=2.4443 mu=0.9374 sigma=0.7535 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001158537723428793, 'timesteps': 3743} +[2026-04-13 11:29:08] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:29:10] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:24] [AutoResearch] Trial 4 finished in 194.0s, returncode=0 +[2026-04-13 11:32:24] [AutoResearch] Trial 4: mean_reward=22.8241 std_reward=0.1918 +[2026-04-13 11:32:24] [AutoResearch] === Trial 4 Summary === +[2026-04-13 11:32:24] Total Phase 1 runs: 4 +[2026-04-13 11:32:24] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:24] Top 5: +[2026-04-13 11:32:24] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:24] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:24] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:24] 
mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:26] +[AutoResearch] ========== Trial 5/50 ========== +[2026-04-13 11:32:26] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:32:26] UCB=2.9797 mu=1.4209 sigma=0.7794 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626} +[2026-04-13 11:32:26] UCB=2.9360 mu=1.6516 sigma=0.6422 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.003483580964392729, 'timesteps': 3613} +[2026-04-13 11:32:26] UCB=2.8856 mu=1.1888 sigma=0.8484 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.002515064142571671, 'timesteps': 4267} +[2026-04-13 11:32:26] UCB=2.8582 mu=1.5163 sigma=0.6709 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0029159438252188284, 'timesteps': 3730} +[2026-04-13 11:32:26] UCB=2.8422 mu=1.5296 sigma=0.6563 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0033924318546103937, 'timesteps': 3346} +[2026-04-13 11:32:26] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:32:28] [AutoResearch] Launching trial 5: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:52] [AutoResearch] Trial 5 finished in 264.3s, returncode=0 +[2026-04-13 11:36:52] [AutoResearch] Trial 5: mean_reward=5.9913 std_reward=0.0246 +[2026-04-13 11:36:52] [AutoResearch] === Trial 5 Summary === +[2026-04-13 11:36:52] Total Phase 1 runs: 5 +[2026-04-13 11:36:52] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 
11:36:52] Top 5: +[2026-04-13 11:36:52] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:52] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:52] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:52] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:52] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:54] +[AutoResearch] ========== Trial 6/50 ========== +[2026-04-13 11:36:54] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:36:54] UCB=2.8622 mu=1.4083 sigma=0.7270 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387} +[2026-04-13 11:36:54] UCB=2.7841 mu=1.0518 sigma=0.8661 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002782960062629981, 'timesteps': 4433} +[2026-04-13 11:36:54] UCB=2.7380 mu=1.5849 sigma=0.5765 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001906612836342622, 'timesteps': 3714} +[2026-04-13 11:36:54] UCB=2.7029 mu=0.9236 sigma=0.8897 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002301914433902658, 'timesteps': 4751} +[2026-04-13 11:36:54] UCB=2.6924 mu=1.1628 sigma=0.7648 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019575950790335435, 'timesteps': 2548} +[2026-04-13 11:36:54] [AutoResearch] Proposed: {'n_steer': 7, 
'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:36:56] [AutoResearch] Launching trial 6: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] [AutoResearch] Trial 6 TIMED OUT after 480.1s +[2026-04-13 11:44:57] [AutoResearch] Trial 6: mean_reward=None std_reward=None +[2026-04-13 11:44:57] [AutoResearch] === Trial 6 Summary === +[2026-04-13 11:44:57] Total Phase 1 runs: 5 +[2026-04-13 11:44:57] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] Top 5: +[2026-04-13 11:44:57] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:57] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:44:59] +[AutoResearch] ========== Trial 7/50 ========== +[2026-04-13 11:44:59] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 
11:44:59] UCB=2.7677 mu=1.3945 sigma=0.6866 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160} +[2026-04-13 11:44:59] UCB=2.6401 mu=0.8590 sigma=0.8906 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012329470317109907, 'timesteps': 4404} +[2026-04-13 11:44:59] UCB=2.6346 mu=0.8897 sigma=0.8725 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002824572687677801, 'timesteps': 2173} +[2026-04-13 11:44:59] UCB=2.6197 mu=1.1406 sigma=0.7395 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002264095441698803, 'timesteps': 3610} +[2026-04-13 11:44:59] UCB=2.6013 mu=0.7257 sigma=0.9378 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.001986372556075669, 'timesteps': 4899} +[2026-04-13 11:44:59] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:45:01] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] [AutoResearch] Trial 7 finished in 211.3s, returncode=0 +[2026-04-13 11:48:32] [AutoResearch] Trial 7: mean_reward=5.7529 std_reward=0.0318 +[2026-04-13 11:48:32] [AutoResearch] === Trial 7 Summary === +[2026-04-13 11:48:32] Total Phase 1 runs: 6 +[2026-04-13 11:48:32] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] Top 5: +[2026-04-13 11:48:32] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 
0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:32] mean_reward=5.7529 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:34] +[AutoResearch] ========== Trial 8/50 ========== +[2026-04-13 11:48:34] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:48:34] UCB=2.9928 mu=1.4031 sigma=0.7948 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429} +[2026-04-13 11:48:34] UCB=2.9102 mu=1.2105 sigma=0.8499 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0013337121696681005, 'timesteps': 4384} +[2026-04-13 11:48:34] UCB=2.9095 mu=1.2362 sigma=0.8366 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0016866172466465327, 'timesteps': 4586} +[2026-04-13 11:48:34] UCB=2.7220 mu=1.0017 sigma=0.8601 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012033218829906316, 'timesteps': 4239} +[2026-04-13 11:48:34] UCB=2.6586 mu=0.8020 sigma=0.9283 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0014425382569831862, 'timesteps': 4488} +[2026-04-13 11:48:34] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:48:36] [AutoResearch] Launching trial 8: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 
'reward_shaping': True} +[2026-04-13 11:52:48] [AutoResearch] Trial 8 finished in 252.2s, returncode=0 +[2026-04-13 11:52:48] [AutoResearch] Trial 8: mean_reward=1936.8533 std_reward=34.0067 +[2026-04-13 11:52:48] [Champion] 🏆 NEW BEST! Trial 8: mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] [AutoResearch] === Trial 8 Summary === +[2026-04-13 11:52:48] Total Phase 1 runs: 7 +[2026-04-13 11:52:48] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] Top 5: +[2026-04-13 11:52:48] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:48] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:50] +[AutoResearch] ========== Trial 9/50 ========== +[2026-04-13 11:52:50] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:52:50] UCB=3.6446 mu=2.2362 sigma=0.7042 params={'n_steer': 4, 
'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961} +[2026-04-13 11:52:50] UCB=3.6253 mu=2.3605 sigma=0.6324 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0014035502090066865, 'timesteps': 2985} +[2026-04-13 11:52:50] UCB=3.5079 mu=2.3661 sigma=0.5709 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0016891347290568105, 'timesteps': 3422} +[2026-04-13 11:52:50] UCB=3.4169 mu=2.2243 sigma=0.5963 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0011351472472032882, 'timesteps': 4191} +[2026-04-13 11:52:50] UCB=3.3399 mu=1.6131 sigma=0.8634 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.00114114991914373, 'timesteps': 3413} +[2026-04-13 11:52:50] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:52:52] [AutoResearch] Launching trial 9: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:28] [AutoResearch] Trial 9 finished in 215.5s, returncode=0 +[2026-04-13 11:56:28] [AutoResearch] Trial 9: mean_reward=237.9115 std_reward=1.4136 +[2026-04-13 11:56:28] [AutoResearch] === Trial 9 Summary === +[2026-04-13 11:56:28] Total Phase 1 runs: 8 +[2026-04-13 11:56:28] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:28] Top 5: +[2026-04-13 11:56:28] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:28] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 
3, 'reward_shaping': True} +[2026-04-13 11:56:28] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:28] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:28] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:30] +[AutoResearch] ========== Trial 10/50 ========== +[2026-04-13 11:56:30] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 11:56:30] UCB=3.6513 mu=2.0026 sigma=0.8243 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691} +[2026-04-13 11:56:30] UCB=3.2438 mu=1.9644 sigma=0.6397 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0013292739097783752, 'timesteps': 3897} +[2026-04-13 11:56:30] UCB=3.1815 mu=1.2984 sigma=0.9415 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0004768119261475519, 'timesteps': 4914} +[2026-04-13 11:56:30] UCB=3.0779 mu=1.4273 sigma=0.8253 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001854044179957165, 'timesteps': 3308} +[2026-04-13 11:56:30] UCB=2.9649 mu=1.2760 sigma=0.8444 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001236030774697938, 'timesteps': 3010} +[2026-04-13 11:56:30] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 11:56:32] [AutoResearch] Launching trial 10: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] [AutoResearch] Trial 10 
finished in 285.4s, returncode=0 +[2026-04-13 12:01:17] [AutoResearch] Trial 10: mean_reward=7.6595 std_reward=0.1051 +[2026-04-13 12:01:17] [AutoResearch] === Trial 10 Summary === +[2026-04-13 12:01:17] Total Phase 1 runs: 9 +[2026-04-13 12:01:17] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] Top 5: +[2026-04-13 12:01:17] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:17] mean_reward=7.6595 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:19] [AutoResearch] Git push complete after trial 10 +[2026-04-13 12:01:21] +[AutoResearch] ========== Trial 11/50 ========== +[2026-04-13 12:01:21] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:01:21] UCB=3.1424 mu=1.5222 sigma=0.8101 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548} +[2026-04-13 12:01:21] UCB=3.1149 mu=1.7370 sigma=0.6890 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001459419353524927, 'timesteps': 2410} 
+[2026-04-13 12:01:21] UCB=2.7824 mu=1.5507 sigma=0.6159 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0017876960785136527, 'timesteps': 3884} +[2026-04-13 12:01:21] UCB=2.7343 mu=1.2928 sigma=0.7207 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019938475892844754, 'timesteps': 2452} +[2026-04-13 12:01:21] UCB=2.7199 mu=1.3608 sigma=0.6795 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0010871407527655017, 'timesteps': 2371} +[2026-04-13 12:01:21] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:01:23] [AutoResearch] Launching trial 11: {'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:55] [AutoResearch] Trial 11 finished in 211.8s, returncode=0 +[2026-04-13 12:04:55] [AutoResearch] Trial 11: mean_reward=439.8991 std_reward=2.2951 +[2026-04-13 12:04:55] [AutoResearch] === Trial 11 Summary === +[2026-04-13 12:04:55] Total Phase 1 runs: 10 +[2026-04-13 12:04:55] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:55] Top 5: +[2026-04-13 12:04:55] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:55] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:55] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 
'reward_shaping': True} +[2026-04-13 12:04:55] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:55] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:57] +[AutoResearch] ========== Trial 12/50 ========== +[2026-04-13 12:04:57] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:04:57] UCB=2.7238 mu=2.2403 sigma=0.2418 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460} +[2026-04-13 12:04:57] UCB=2.5207 mu=1.4162 sigma=0.5522 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0019602238083129895, 'timesteps': 3653} +[2026-04-13 12:04:57] UCB=2.4574 mu=1.4037 sigma=0.5268 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0007010382162706215, 'timesteps': 3309} +[2026-04-13 12:04:57] UCB=2.3988 mu=0.5967 sigma=0.9011 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0013450109997306151, 'timesteps': 1954} +[2026-04-13 12:04:57] UCB=2.3760 mu=0.7624 sigma=0.8068 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0011051791427736288, 'timesteps': 1984} +[2026-04-13 12:04:57] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:04:59] [AutoResearch] Launching trial 12: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] [AutoResearch] Trial 12 finished in 191.1s, returncode=0 +[2026-04-13 12:08:10] [AutoResearch] Trial 12: mean_reward=6.446 std_reward=0.0024 +[2026-04-13 12:08:10] [AutoResearch] === Trial 12 Summary === +[2026-04-13 12:08:10] 
Total Phase 1 runs: 11 +[2026-04-13 12:08:10] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] Top 5: +[2026-04-13 12:08:10] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:10] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:12] +[AutoResearch] ========== Trial 13/50 ========== +[2026-04-13 12:08:12] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:08:12] UCB=7.7182 mu=7.0518 sigma=0.3332 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686} +[2026-04-13 12:08:12] UCB=7.5060 mu=6.3573 sigma=0.5743 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0006674348206081718, 'timesteps': 2600} +[2026-04-13 12:08:12] UCB=7.2501 mu=6.6046 sigma=0.3227 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0007355516271507972, 'timesteps': 3206} +[2026-04-13 12:08:12] UCB=6.7989 mu=5.8906 sigma=0.4542 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 
0.00023989918210819933, 'timesteps': 3143} +[2026-04-13 12:08:12] UCB=6.4551 mu=5.6895 sigma=0.3828 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0008766782176390233, 'timesteps': 3774} +[2026-04-13 12:08:12] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:08:14] [AutoResearch] Launching trial 13: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] [AutoResearch] Trial 13 finished in 212.8s, returncode=0 +[2026-04-13 12:11:47] [AutoResearch] Trial 13: mean_reward=1139.4415 std_reward=1.9558 +[2026-04-13 12:11:47] [AutoResearch] === Trial 13 Summary === +[2026-04-13 12:11:47] Total Phase 1 runs: 12 +[2026-04-13 12:11:47] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] Top 5: +[2026-04-13 12:11:47] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] mean_reward=1139.4415 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:47] mean_reward=237.9115 
params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:49] +[AutoResearch] ========== Trial 14/50 ========== +[2026-04-13 12:11:49] [AutoResearch] GP UCB top-5 candidates: +[2026-04-13 12:11:49] UCB=6.5039 mu=4.9135 sigma=0.7952 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527} +[2026-04-13 12:11:49] UCB=6.4956 mu=5.4779 sigma=0.5088 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0042217867035675835, 'timesteps': 3617} +[2026-04-13 12:11:49] UCB=6.2232 mu=4.7772 sigma=0.7230 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.004423012325506047, 'timesteps': 4273} +[2026-04-13 12:11:49] UCB=6.1472 mu=4.5372 sigma=0.8050 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.004498330263353934, 'timesteps': 2879} +[2026-04-13 12:11:49] UCB=6.0219 mu=4.2216 sigma=0.9001 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.00012156867129133186, 'timesteps': 1887} +[2026-04-13 12:11:49] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} +[2026-04-13 12:11:51] [AutoResearch] Launching trial 14: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True} diff --git a/agent/outerloop-results/autoresearch_results_phase1.jsonl b/agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl similarity index 83% rename from agent/outerloop-results/autoresearch_results_phase1.jsonl rename to agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl index 56a5326..0df1c2c 100644 --- a/agent/outerloop-results/autoresearch_results_phase1.jsonl +++ b/agent/outerloop-results/autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl @@ 
-14,3 +14,6 @@ {"trial": 8, "timestamp": "2026-04-13T11:52:48.821996", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.001449588903551847, "timesteps": 3429, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 1936.8533, "std_reward": 34.0067, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0008/model.zip", "champion": true, "run_status": "ok", "elapsed_sec": 252.2464599609375} {"trial": 9, "timestamp": "2026-04-13T11:56:28.296244", "params": {"n_steer": 4, "n_throttle": 2, "learning_rate": 0.0012562469886511318, "timesteps": 2961, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 237.9115, "std_reward": 1.4136, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0009/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 215.46081495285034} {"trial": 10, "timestamp": "2026-04-13T12:01:17.700485", "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0012074041487018196, "timesteps": 4691, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 7.6595, "std_reward": 0.1051, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0010/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 285.3893370628357} +{"trial": 11, "timestamp": "2026-04-13T12:04:55.096100", "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.00047930749995235496, "timesteps": 3548, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 439.8991, "std_reward": 2.2951, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0011/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 211.77687573432922} +{"trial": 12, "timestamp": "2026-04-13T12:08:10.184572", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.0018881008842323835, "timesteps": 3460, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 
6.446, "std_reward": 0.0024, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0012/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 191.07323956489563} +{"trial": 13, "timestamp": "2026-04-13T12:11:47.012459", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.000577795506052323, "timesteps": 3686, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 1139.4415, "std_reward": 1.9558, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0013/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 212.81442260742188} diff --git a/agent/reward_wrapper.py b/agent/reward_wrapper.py index 27295ea..6fc0fd7 100644 --- a/agent/reward_wrapper.py +++ b/agent/reward_wrapper.py @@ -1,10 +1,36 @@ """ -Speed-Aware Reward Wrapper for DonkeyCar RL -============================================ -Replaces the default CTE-only reward with: - reward = speed * (1.0 - min(abs(cte) / max_cte, 1.0)) +Speed-Aware Reward Wrapper for DonkeyCar RL — v2 (Hack-Proof) +============================================================== -Falls back to original reward if speed/cte not available in info dict. +DESIGN PRINCIPLE: Speed should only be rewarded when the car is +genuinely progressing down the track. The original DonkeyCar reward +already correctly signals track presence — we build on top of it. + +FORMULA: + if original_reward > 0 (car is on track and centered): + shaped = original_reward × (1 + speed_scale × speed) + else (car is off track / crashed): + shaped = original_reward (no speed bonus — cannot be hacked) + +WHY THIS IS HACK-PROOF: + The previous formula (speed × (1 - cte/max_cte)) could be maximized + by oscillating at the track boundary — the model learned this in practice. 
+ + The multiplicative formula is bounded by the original DonkeyCar reward: + - Off track → original_reward ≤ 0 → no speed multiplier possible + - The model CANNOT increase reward by going fast off-track + - Speed bonus only accumulates when genuinely driving on the track + +RESEARCH NOTE (2026-04-13): + The additive formula caused reward hacking in Phase 1 — trials 8 and 13 + achieved mean_reward=1936 and 1139 respectively by oscillating at the + track boundary. This design was developed to prevent that exploit. + See docs/RESEARCH_LOG.md for full details. + +TUNING: + speed_scale=0.1 means a car going 5 m/s gets a 50% bonus on top of + the base CTE reward. This is a meaningful but not overwhelming incentive. + Increase to 0.3+ to prioritize speed more aggressively (Phase 3). """ import gymnasium as gym @@ -13,18 +39,18 @@ import numpy as np class SpeedRewardWrapper(gym.Wrapper): """ - Replace DonkeyCar's default reward with a speed-aware version. + Hack-proof speed reward: multiplicative bonus ONLY when on track. 
- Reward = speed * (1 - |cte| / max_cte) - - Maximum when car is fast AND centred on the track - - Zero when car is at max cross-track error - - Negative (crash penalty) preserved from original reward when episode ends with failure + Args: + env: gymnasium environment + speed_scale: multiplier for speed bonus (default 0.1) + shaped = original × (1 + speed_scale × speed) when on track + shaped = original when off track """ - def __init__(self, env, max_cte: float = 8.0, crash_penalty: float = -10.0): + def __init__(self, env, speed_scale: float = 0.1): super().__init__(env) - self.max_cte = max_cte - self.crash_penalty = crash_penalty + self.speed_scale = speed_scale def step(self, action): result = self.env.step(action) @@ -40,32 +66,37 @@ class SpeedRewardWrapper(gym.Wrapper): else: raise ValueError(f'Unexpected step() result length: {len(result)}') - # Shape the reward using speed and CTE from info - shaped = self._shape_reward(reward, done, info) + shaped = self._shape_reward(reward, info) if len(result) == 5: return obs, shaped, terminated, truncated, info else: return obs, shaped, done, info - def _shape_reward(self, original_reward: float, done: bool, info: dict) -> float: - """Compute speed-aware reward, falling back to original if info is unavailable.""" + def _shape_reward(self, original_reward: float, info: dict) -> float: + """ + Multiplicative speed bonus — only when on track. + Falls back gracefully if speed not in info dict. 
+ """ + # Only apply speed bonus when genuinely on track (positive CTE reward) + if original_reward <= 0: + return original_reward # Off track / crashed — no speed reward + + # Extract speed from info dict try: - speed = float(info.get('speed', None)) - cte = float(info.get('cte', None)) - - if speed is None or cte is None: + speed = float(info.get('speed', 0.0)) + if speed is None: return original_reward - - # Positive driving reward: fast + centred - shaped = speed * (1.0 - min(abs(cte) / self.max_cte, 1.0)) - - # Preserve crash penalty (original reward is -1 on crash in DonkeyCar) - if done and original_reward < 0: - shaped += self.crash_penalty - - return shaped - + speed = max(0.0, speed) # No negative speed bonus except (TypeError, ValueError): - # info dict doesn't have speed/cte — fall back gracefully - return original_reward + return original_reward # Graceful fallback + + # Multiplicative bonus: reward grows with speed, but only on track + # Hack-proof: cannot increase by going fast off-track + shaped = original_reward * (1.0 + self.speed_scale * speed) + return shaped + + def theoretical_max_per_step(self, max_speed: float = 10.0) -> float: + """Returns the theoretical max reward per step for bounds checking.""" + # original_reward ≤ 1.0, so shaped ≤ 1.0 × (1 + speed_scale × max_speed) + return 1.0 * (1.0 + self.speed_scale * max_speed) diff --git a/docs/RESEARCH_LOG.md b/docs/RESEARCH_LOG.md new file mode 100644 index 0000000..6635b19 --- /dev/null +++ b/docs/RESEARCH_LOG.md @@ -0,0 +1,182 @@ +# Research Log — DonkeyCar RL Autoresearch + +> Chronological research findings, discoveries, bugs, and decisions. +> Every significant observation is recorded here for scientific reproducibility and future reference. +> Format: date, finding, evidence, action taken. 
+ +--- + +## 2026-04-12 — Project Kickoff and Initial Infrastructure + +### Finding: Grid Sweep as Research Baseline + +**Observation:** Before any autoresearch, we ran an 18-config grid sweep across: +- `n_steer`: [3, 5, 7] +- `n_throttle`: [2, 3] +- `learning_rate`: [0.001, 0.0005, 0.0001] +- 3 repeats each + +**Important caveat discovered later:** This sweep used a **random action policy** (bug — model training code had been removed). The rewards reflect how well a random policy can stumble through different action discretizations. + +**Valid insight from this data:** Action discretization matters even for random policy. +`n_steer=7, n_throttle=2` outperformed `n_steer=3, n_throttle=2` with random actions — more steering granularity helps even without learning. + +**Data location:** `outerloop-results/clean_sweep_results.jsonl` (18 records) + +--- + +## 2026-04-12 — Discovery: Random Policy Bug (Critical) + +### Finding: Inner Loop Was Never Training + +**Observation:** The `donkeycar_sb3_runner.py` was calling `env.action_space.sample()` instead of `model.learn()`. This was introduced when we removed the broken `model.save()` call that caused `NameError: name 'model' is not defined`. + +**Root cause:** Legacy code path removal was too aggressive — removed training along with the broken save call. + +**Impact:** +- All 300 autoresearch trials (two overnight runs) used random policy +- `learning_rate` parameter was passed but completely ignored +- `mean_reward` values reflect random-walk quality, not RL training quality +- The GP+UCB found the best *action space for random walking*, not the best *hyperparameters for learning* + +**Valid salvage:** The `n_steer=8, n_throttle=5` finding is valid as a discretization insight. +**Invalid:** All learning_rate optimization in the 300-trial autoresearch runs. + +**Fix:** Completely rebuilt runner with real `PPO.learn()` + `evaluate_policy()` + `model.save()`. 
+
+**Decision record:** ADR-005 — Never call model.save() before model is defined.
+
+---
+
+## 2026-04-12 — Autoresearch Infrastructure Proven
+
+### Finding: GP+UCB Autoresearch Works Correctly
+
+**Observation:** The GP+UCB meta-controller correctly:
+- Loads prior results and fits a Gaussian Process
+- Uses UCB acquisition to balance exploration/exploitation
+- Proposes parameters outside the original grid (e.g., `n_steer=6` was never in grid)
+- Converges toward higher-reward regions with each trial
+
+**Evidence:** After 300 trials, the top-5 consistently clustered around `n_steer=7-9, n_throttle=4-5, lr≈0.002` — a coherent high-reward region.
+
+**Conclusion:** The infrastructure is sound. The data was from wrong experiments, but the meta-loop works exactly as designed.
+
+---
+
+## 2026-04-13 — Phase 1 Launch: First Real Training Attempt
+
+### Finding: Timeout — PPO+CNN is Too Slow on CPU for Large Timesteps
+
+**Observation:** First Phase 1 run with real PPO training proposed 20k-30k timesteps.
+At ~5-10 steps/sec (PPO+CNN training on CPU), this requires 2000-6000 seconds per trial — far exceeding the 600-second timeout.
+
+**Evidence:** Trials 1-6 all timed out at exactly 600 seconds.
+
+**Fix:** Reduced timestep search space from [5000, 30000] to [1000, 5000].
+At ~15-30 steps/sec (DonkeyCar sim speed), 5000 steps ≈ 170-330 seconds. Fits within 480s timeout.
+
+**Lesson:** Always calibrate timeout to actual sim + training speed before launching sweeps.
+
+---
+
+## 2026-04-13 — Discovery: Car Not Moving (PPO Throttle Problem)
+
+**Observation:** During early Phase 1 training, the car's steering values changed but the car did not move.
+
+**Root cause:** PPO with continuous action space outputs actions in `[-1, 1]` for all dimensions.
+DonkeyCar expects `throttle ∈ [0, 1]`. When PPO's random initial policy outputs throttle ≈ -0.5, it gets clipped to 0 — the car sits still.
+
+**Fix:** Added `ThrottleClampWrapper` that ensures throttle ∈ [0.2, 1.0].
+This guarantees the car always moves forward, even before any learning. + +**Impact:** Without this fix, the car never moves and the health check detects it as a stuck sim, prematurely killing training. + +--- + +## 2026-04-13 — Critical Discovery: Reward Hacking via SpeedRewardWrapper 🚨 + +### Finding: Model Learned to Exploit Speed Reward by Oscillating at Track Boundary + +**Observation:** After fixing throttle and timestep issues, Phase 1 trials ran successfully. +Some trials produced suspiciously high rewards: + +| Trial | mean_reward | n_throttle | lr | verdict | +|-------|-------------|------------|--------|---------| +| 8 | **1936.9** | 2 | 0.00145 | 🚨 HACKED | +| 13 | **1139.4** | 2 | 0.00058 | 🚨 HACKED | +| 11 | 439.9 | 3 | 0.00048 | ⚠️ Suspicious | +| 2 | 398.9 | 2 | 0.00236 | ⚠️ Suspicious | + +**Root cause:** The `SpeedRewardWrapper` computed: +``` +reward = speed × (1 - abs(cte) / max_cte) +``` + +The model discovered a policy that **maximizes this formula without genuine track driving**: +1. Drive fast toward the track boundary +2. Return to track center (momentarily low CTE = high reward) +3. Repeat — "oscillation farming" + +The crash penalty (`-10`) was insufficient to deter this because thousands of oscillation steps accumulate far more positive reward. + +**Physical impossibility check:** A car driving at max speed (≈5 m/s) perfectly centered for 3429 steps would accumulate ≈ `5.0 × 1.0 × 3429 = 17,145`. Observed max was 1937 — so technically possible but the high variance (`std_reward=34`) across only 3 eval episodes and the user's direct observation confirm hacking. + +**User observation (direct visual confirmation):** "The model found a way to rig the reward by just going left — it was off the track and then back on the track." + +**Impact:** The entire Phase 1 dataset with `reward_shaping=True` is corrupted. +The GP fitted on these rewards was optimizing for hacking parameters, not driving parameters. 
+
+**Action taken:**
+- Archived all Phase 1 results: `autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl`
+- Archived hacked models: `models/ARCHIVED_reward_hacking/`
+- Redesigned reward function entirely
+
+---
+
+## 2026-04-13 — Fix: Hack-Proof Reward Shaping Design
+
+### Finding: Multiplicative Speed Bonus Prevents Reward Hacking
+
+**Problem with the old replacement formula:** `reward = speed × f(cte)` discards the original reward entirely, so it can be maximized by maximizing speed independently of f(cte).
+
+**Solution — multiplicative on-track bonus:**
+```python
+if original_reward > 0:
+    shaped = original_reward × (1 + speed_scale × speed)
+else:
+    shaped = original_reward  # No speed bonus when off track
+```
+
+**Why this is hack-proof:**
+- `original_reward > 0` is ONLY true when the car is on track AND centered (DonkeyCar's own CTE signal)
+- When off track, `original_reward ≤ 0` — no speed reward possible
+- The model cannot increase reward by going fast off-track
+- The formula is bounded: `shaped ≤ original_reward × (1 + speed_scale × max_speed)`
+
+**Author's insight:** "Speed should only be rewarded if you are progressing down the track."
+
+**Implementation:** `agent/reward_wrapper.py` — `SpeedRewardWrapper` v2.
+
+---
+
+## 2026-04-13 — Lesson: Reward Function Design Principles
+
+From this experience, we derived the following principles for DonkeyCar RL reward shaping:
+
+1. **Never reward speed unconditionally.** Speed reward must be gated on track presence.
+2. **The original DonkeyCar reward is the ground truth.** Any shaping must respect it, not replace it.
+3. **Bonuses multiplied into the original reward are safer than standalone replacement rewards.** They can't be maximized independently of the base on-track signal.
+4. **High variance in eval reward is a red flag.** `std_reward=34` on 3 episodes suggests instability.
+5. **Physically impossible reward values signal hacking.** Establish theoretical reward bounds before training.
+6. 
**Low `n_throttle` (=2) may enable hacking.** With only 2 throttle values, the model may discover degenerate oscillation policies more easily. Investigate. + +--- + +## Next Research Questions + +1. **Does `n_throttle=2` uniquely enable hacking?** The hacked models all had `n_throttle=2`. With only 2 throttle states (stop/full-throttle), oscillation may be easier to exploit. +2. **What is the minimum timestep for genuine learning?** The low-reward trials (5-22) may not have trained long enough. Is 3000 steps sufficient for any real driving behavior? +3. **Does the multiplicative reward fix change the optimal hyperparameter region?** Re-run autoresearch with fixed reward and compare top configurations. +4. **Can we detect reward hacking automatically?** A reward-per-step threshold (e.g., flag if mean > 2.0 per step) could auto-detect hacking during training. +5. **What does a genuinely good reward look like?** After completing Phase 1 cleanly, characterize the reward distribution of a car that drives one full lap. diff --git a/tests/test_reward_wrapper.py b/tests/test_reward_wrapper.py index 9fa3c48..c9b0e13 100644 --- a/tests/test_reward_wrapper.py +++ b/tests/test_reward_wrapper.py @@ -1,5 +1,5 @@ """ -Tests for reward_wrapper.py — no simulator required. +Tests for reward_wrapper.py v2 (hack-proof multiplicative formula) — no simulator required. 
""" import sys @@ -17,10 +17,9 @@ class MockStepEnv(gym.Env): """Mock gymnasium.Env for testing SpeedRewardWrapper.""" metadata = {'render_modes': []} - def __init__(self, speed=2.0, cte=0.5, original_reward=1.0, done=False, use_5tuple=True): + def __init__(self, speed=2.0, original_reward=1.0, done=False, use_5tuple=True): super().__init__() self._speed = speed - self._cte = cte self._reward = original_reward self._done = done self._use_5tuple = use_5tuple @@ -32,7 +31,7 @@ class MockStepEnv(gym.Env): def step(self, action): obs = np.zeros((120, 160, 3), dtype=np.uint8) - info = {'speed': self._speed, 'cte': self._cte} + info = {'speed': self._speed} if self._use_5tuple: return obs, self._reward, self._done, False, info else: @@ -41,53 +40,93 @@ class MockStepEnv(gym.Env): def close(self): pass - def close(self): - pass +# ---- Hack-Proof Guarantee Tests ---- -def test_speed_reward_higher_when_fast_and_centered(): - """Reward should be higher when car is fast and centered (low CTE).""" - env_fast_centered = MockStepEnv(speed=5.0, cte=0.1, original_reward=1.0) - env_slow_offset = MockStepEnv(speed=1.0, cte=3.0, original_reward=1.0) - - wrapped_fast = SpeedRewardWrapper(env_fast_centered) - wrapped_slow = SpeedRewardWrapper(env_slow_offset) - - _, reward_fast, _, _, _ = wrapped_fast.step(0) - _, reward_slow, _, _, _ = wrapped_slow.step(0) - - assert reward_fast > reward_slow, \ - f"Fast+centered should reward more: {reward_fast:.3f} vs {reward_slow:.3f}" - - -def test_speed_reward_zero_at_max_cte(): - """Reward should be ~0 when CTE = max_cte (on the edge of the road).""" - env = MockStepEnv(speed=5.0, cte=8.0, original_reward=1.0) - wrapped = SpeedRewardWrapper(env, max_cte=8.0) +def test_no_speed_bonus_when_off_track(): + """ + CRITICAL: Off-track reward (≤ 0) must NOT get a speed bonus. + This is the core anti-hacking guarantee. 
+ """ + env = MockStepEnv(speed=10.0, original_reward=-1.0) # Off track, very fast + wrapped = SpeedRewardWrapper(env, speed_scale=0.5) _, reward, _, _, _ = wrapped.step(0) - assert reward == pytest.approx(0.0, abs=0.01), \ - f"Reward at max CTE should be ~0, got {reward}" + assert reward == -1.0, \ + f"Off-track reward must not get speed bonus, got {reward}" -def test_speed_reward_positive_when_on_track(): - """Reward should be positive when car is on track at any speed > 0.""" - env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0) - wrapped = SpeedRewardWrapper(env, max_cte=8.0) +def test_no_speed_bonus_when_reward_zero(): + """Reward exactly 0 (boundary case) should not get speed bonus.""" + env = MockStepEnv(speed=5.0, original_reward=0.0) + wrapped = SpeedRewardWrapper(env, speed_scale=0.5) _, reward, _, _, _ = wrapped.step(0) - assert reward > 0, f"On-track reward should be positive, got {reward}" + assert reward == 0.0, f"Zero reward should stay zero, got {reward}" -def test_crash_penalty_applied_on_done(): - """Crash penalty should be added when episode ends with negative reward.""" - env = MockStepEnv(speed=0.0, cte=9.0, original_reward=-1.0, done=True) - wrapped = SpeedRewardWrapper(env, max_cte=8.0, crash_penalty=-10.0) - _, reward, terminated, truncated, _ = wrapped.step(0) - assert reward < -5.0, f"Crash penalty should make reward very negative, got {reward}" +def test_speed_bonus_scales_with_speed_when_on_track(): + """When on track (positive reward), faster = higher shaped reward.""" + env_slow = MockStepEnv(speed=1.0, original_reward=0.8) + env_fast = MockStepEnv(speed=5.0, original_reward=0.8) + + wrapped_slow = SpeedRewardWrapper(env_slow, speed_scale=0.1) + wrapped_fast = SpeedRewardWrapper(env_fast, speed_scale=0.1) + + _, r_slow, _, _, _ = wrapped_slow.step(0) + _, r_fast, _, _, _ = wrapped_fast.step(0) + + assert r_fast > r_slow, f"Faster on-track should reward more: {r_fast:.3f} vs {r_slow:.3f}" -def 
test_fallback_to_original_reward_when_info_missing(): - """If info doesn't have speed/cte, should fall back to original reward.""" - class NoInfoEnv(gym.Env): +def test_multiplicative_formula_correct(): + """ + Verify exact formula: shaped = original × (1 + speed_scale × speed) + """ + original_reward = 0.6 + speed = 3.0 + speed_scale = 0.1 + expected = original_reward * (1.0 + speed_scale * speed) # 0.6 × 1.3 = 0.78 + + env = MockStepEnv(speed=speed, original_reward=original_reward) + wrapped = SpeedRewardWrapper(env, speed_scale=speed_scale) + _, reward, _, _, _ = wrapped.step(0) + + assert reward == pytest.approx(expected, abs=1e-6), \ + f"Expected {expected:.6f}, got {reward:.6f}" + + +def test_cannot_hack_by_going_fast_off_track(): + """ + Demonstrate that the previous formula could be hacked but this one cannot. + Fast off-track (speed=10) must give same or worse result than slow off-track (speed=1). + """ + env_fast_offtrack = MockStepEnv(speed=10.0, original_reward=-1.0) + env_slow_offtrack = MockStepEnv(speed=1.0, original_reward=-1.0) + + wrapped_fast = SpeedRewardWrapper(env_fast_offtrack, speed_scale=0.5) + wrapped_slow = SpeedRewardWrapper(env_slow_offtrack, speed_scale=0.5) + + _, r_fast, _, _, _ = wrapped_fast.step(0) + _, r_slow, _, _, _ = wrapped_slow.step(0) + + assert r_fast == r_slow == -1.0, \ + f"Off-track reward must be identical regardless of speed: fast={r_fast}, slow={r_slow}" + + +def test_theoretical_max_per_step(): + """ + Verify theoretical_max_per_step returns correct upper bound. 
+ With speed_scale=0.1 and max_speed=10.0: max = 1.0 × (1 + 0.1×10) = 2.0 + """ + env = MockStepEnv() + wrapped = SpeedRewardWrapper(env, speed_scale=0.1) + max_reward = wrapped.theoretical_max_per_step(max_speed=10.0) + assert max_reward == pytest.approx(2.0, abs=1e-6), \ + f"Max per step should be 2.0, got {max_reward}" + + +def test_fallback_when_speed_not_in_info(): + """If info doesn't have speed, fall back to original reward.""" + class NoSpeedEnv(gym.Env): metadata = {'render_modes': []} def __init__(self): super().__init__() @@ -96,20 +135,19 @@ def test_fallback_to_original_reward_when_info_missing(): def reset(self, seed=None, **kwargs): return np.zeros((120, 160, 3), dtype=np.uint8), {} def step(self, action): - return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {} + return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {} # No 'speed' key def close(self): pass - wrapped = SpeedRewardWrapper(NoInfoEnv()) + wrapped = SpeedRewardWrapper(NoSpeedEnv(), speed_scale=0.5) _, reward, _, _, _ = wrapped.step(0) + # speed=0.0 default → shaped = 0.75 × (1 + 0.5 × 0.0) = 0.75 assert reward == pytest.approx(0.75, abs=1e-6), \ - f"Should fall back to original reward 0.75, got {reward}" + f"Should fall back gracefully, got {reward}" def test_wrapper_preserves_observation(): - """SpeedRewardWrapper should not modify observations.""" - obs_data = np.zeros((120, 160, 3), dtype=np.uint8) - + """SpeedRewardWrapper must not modify observations.""" class FixedObsEnv(gym.Env): metadata = {'render_modes': []} def __init__(self): @@ -117,22 +155,31 @@ def test_wrapper_preserves_observation(): self.action_space = gym.spaces.Discrete(5) self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8) def reset(self, seed=None, **kwargs): - return obs_data.copy(), {} + return np.zeros((120, 160, 3), dtype=np.uint8), {} def step(self, action): - return obs_data.copy(), 1.0, False, False, {'speed': 2.0, 'cte': 0.5} + return 
np.zeros((120, 160, 3), dtype=np.uint8), 0.8, False, False, {'speed': 2.0} def close(self): pass wrapped = SpeedRewardWrapper(FixedObsEnv()) obs, _, _, _, _ = wrapped.step(0) - np.testing.assert_array_almost_equal(obs, obs_data) + np.testing.assert_array_equal(obs, np.zeros((120, 160, 3), dtype=np.uint8)) def test_4tuple_step_compatibility(): """Wrapper should handle 4-tuple step() return (old gym API).""" - env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0, use_5tuple=False) + env = MockStepEnv(speed=2.0, original_reward=0.8, use_5tuple=False) wrapped = SpeedRewardWrapper(env) result = wrapped.step(0) assert len(result) == 4, f"Expected 4-tuple, got {len(result)}" _, reward, done, info = result assert isinstance(reward, float) + assert reward > 0.8, "Speed bonus should increase reward when on track" + + +def test_crash_still_penalized(): + """Crash (original_reward=-1) should remain -1, not improved by speed.""" + env = MockStepEnv(speed=8.0, original_reward=-1.0, done=True) + wrapped = SpeedRewardWrapper(env, speed_scale=0.2) + _, reward, _, _, _ = wrapped.step(0) + assert reward == -1.0, f"Crash reward should remain -1.0, got {reward}"