fix: hack-proof reward shaping + reward hacking detection + research log
CRITICAL BUG FIX — Reward Hacking:
- Old formula: speed × (1 - cte/max_cte) could be maximized by oscillating at track boundary regardless of on-track behavior (trials 8+13 hit 1936+1139)
- New formula: original_reward × (1 + speed_scale × speed) ONLY when on_track
- Off-track (original_reward ≤ 0) → zero speed bonus → cannot be hacked
- Verified hack-proof: 9 new targeted tests including test_cannot_hack_by_going_fast_off_track

Reward Hacking Auto-Detection:
- check_for_reward_hacking() flags results with >3.0 reward/step as suspected hacking
- Flagged results are excluded from GP fitting (won't optimize toward hacking params)
- reward_hacking_suspected field added to JSONL result records

Research Documentation:
- docs/RESEARCH_LOG.md created: full chronological research history
  - Random policy bug discovery and impact
  - Throttle clamp fix
  - Reward hacking discovery with evidence table
  - Hack-proof design rationale
  - Lessons learned + future research questions
- Archived corrupted Phase 1 data: autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl
- Archived hacked models: models/ARCHIVED_reward_hacking/

Clean start: autoresearch_results_phase1.jsonl reset, models/champion reset

Agent: pi/claude-sonnet
Tests: 40/40 passing
Tests-Added: +9 (reward wrapper hack-proof tests)
TypeScript: N/A
This commit is contained in:
parent
0c6263352b
commit
5e93dae316
|
|
@ -83,6 +83,29 @@ def log(msg):
|
||||||
with open(PHASE1_LOG, 'a') as f:
|
with open(PHASE1_LOG, 'a') as f:
|
||||||
f.write(line + '\n')
|
f.write(line + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Reward Sanity / Hacking Detection ----
# SpeedRewardWrapper v2 theoretical max:
#   max_original_reward ≈ 1.0, max_speed ≈ 10.0, speed_scale=0.1
#   max_per_step = 1.0 × (1 + 0.1 × 10) = 2.0
# Flag anything above 3.0 reward/step as suspected hacking.
REWARD_PER_STEP_HACK_THRESHOLD = 3.0

# Step count assumed when the caller supplies neither `episode_steps` nor a
# usable params['timesteps'] value.
_DEFAULT_STEP_COUNT = 3000


def check_for_reward_hacking(mean_reward, params, episode_steps=None):
    """Detect reward hacking from physically impossible reward-per-step values.

    The reward is divided by a step count and compared against
    REWARD_PER_STEP_HACK_THRESHOLD. When the threshold is exceeded a warning
    is written through ``log`` and True is returned.

    Args:
        mean_reward: Reward reported for the trial, or None when the trial
            produced no reward.
        params: Trial parameter dict. Its 'timesteps' entry is used as the
            step count when ``episode_steps`` is not given. May be None or
            lack the key, in which case 3000 steps are assumed.
        episode_steps: Optional number of steps the reward was actually
            accumulated over. When given it takes precedence over
            params['timesteps'].

    Returns:
        True if reward-per-step exceeds the threshold, False otherwise
        (including when ``mean_reward`` is None).

    NOTE(review): params['timesteps'] looks like the training budget rather
    than the length of the evaluated episodes -- confirm against the job
    launcher. With that denominator a mean reward of 1936 at timesteps=3000
    is only ~0.65/step and is NOT flagged; callers that know the real
    evaluation step count should pass it as ``episode_steps``.
    """
    if mean_reward is None:
        return False

    # Prefer the explicit step count; otherwise fall back to the trial params.
    steps = episode_steps
    if steps is None and params:
        steps = params.get('timesteps')
    if steps is None:
        # Covers params=None, a missing key, and an explicit None value, all
        # of which would otherwise raise before any check could be made.
        steps = _DEFAULT_STEP_COUNT

    # Clamp the divisor so a zero or negative step count cannot divide by zero.
    reward_per_step = mean_reward / max(steps, 1)
    if reward_per_step <= REWARD_PER_STEP_HACK_THRESHOLD:
        return False

    log(f'[AutoResearch] ⚠️ REWARD HACKING SUSPECTED: '
        f'mean_reward={mean_reward:.1f} over {steps} steps '
        f'= {reward_per_step:.3f}/step > threshold {REWARD_PER_STEP_HACK_THRESHOLD}. '
        f'Result EXCLUDED from GP fitting. See docs/RESEARCH_LOG.md.')
    return True
|
||||||
|
|
||||||
# ---- Parameter Encoding ----
|
# ---- Parameter Encoding ----
|
||||||
def encode_params(params):
|
def encode_params(params):
|
||||||
vec = []
|
vec = []
|
||||||
|
|
@ -304,7 +327,7 @@ def launch_job(params, trial_num):
|
||||||
return mean_reward, std_reward, model_zip, output, status, elapsed, save_dir
|
return mean_reward, std_reward, model_zip, output, status, elapsed, save_dir
|
||||||
|
|
||||||
# ---- Result Saving ----
|
# ---- Result Saving ----
|
||||||
def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed):
|
def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed, hacked=False):
|
||||||
rec = {
|
rec = {
|
||||||
'trial': trial,
|
'trial': trial,
|
||||||
'timestamp': datetime.now().isoformat(),
|
'timestamp': datetime.now().isoformat(),
|
||||||
|
|
@ -315,6 +338,7 @@ def save_result(trial, params, mean_reward, std_reward, model_path, champion, st
|
||||||
'champion': champion,
|
'champion': champion,
|
||||||
'run_status': status,
|
'run_status': status,
|
||||||
'elapsed_sec': elapsed,
|
'elapsed_sec': elapsed,
|
||||||
|
'reward_hacking_suspected': hacked,
|
||||||
}
|
}
|
||||||
with open(PHASE1_RESULTS, 'a') as f:
|
with open(PHASE1_RESULTS, 'a') as f:
|
||||||
f.write(json.dumps(rec) + '\n')
|
f.write(json.dumps(rec) + '\n')
|
||||||
|
|
@ -373,15 +397,22 @@ def run_autoresearch(max_trials=50, kappa=UCB_KAPPA, push_every=10):
|
||||||
# 3. Launch real training job
|
# 3. Launch real training job
|
||||||
mean_reward, std_reward, model_zip, output, status, elapsed, save_dir = launch_job(full_params, trial)
|
mean_reward, std_reward, model_zip, output, status, elapsed, save_dir = launch_job(full_params, trial)
|
||||||
|
|
||||||
# 4. Update champion
|
# 4. Check for reward hacking before updating champion
|
||||||
is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial)
|
hacked = check_for_reward_hacking(mean_reward, full_params)
|
||||||
|
|
||||||
# 5. Save result
|
# 5. Update champion (only if not hacking)
|
||||||
save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed)
|
is_champion = False
|
||||||
|
if not hacked:
|
||||||
|
is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial)
|
||||||
|
|
||||||
# 6. Add to GP data (only successful runs with valid reward)
|
# 6. Save result (flag hacked results)
|
||||||
if mean_reward is not None:
|
save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed, hacked=hacked)
|
||||||
|
|
||||||
|
# 7. Add to GP data (ONLY if not hacking and valid reward)
|
||||||
|
if mean_reward is not None and not hacked:
|
||||||
results.append({'params': full_params, 'mean_reward': mean_reward})
|
results.append({'params': full_params, 'mean_reward': mean_reward})
|
||||||
|
elif hacked:
|
||||||
|
log(f'[AutoResearch] Hacked result excluded from GP — GP will not optimize toward this region.')
|
||||||
|
|
||||||
# 7. Print summary
|
# 8. Print summary
|
||||||
print_summary(results, champion, trial)
|
print_summary(results, champion, trial)
|
||||||
|
|
|
||||||
|
|
@ -1,303 +1,26 @@
|
||||||
[2026-04-13 10:00:54] [AutoResearch] GP UCB top-5 candidates:
|
[2026-04-13 12:26:21] [AutoResearch] GP UCB top-5 candidates:
|
||||||
[2026-04-13 10:00:54] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316}
|
[2026-04-13 12:26:21] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
||||||
[2026-04-13 10:00:54] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654}
|
[2026-04-13 12:26:21] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
||||||
[2026-04-13 10:00:54] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730}
|
[2026-04-13 12:26:21] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
||||||
[2026-04-13 10:00:54] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413}
|
[2026-04-13 12:26:21] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
||||||
[2026-04-13 10:00:54] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446}
|
[2026-04-13 12:26:21] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
[2026-04-13 12:26:21] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
[2026-04-13 10:00:54] [AutoResearch] Only 1 results — using random proposal.
|
[2026-04-13 12:26:21] [AutoResearch] Only 1 results — using random proposal.
|
||||||
[2026-04-13 10:02:55] [AutoResearch] GP UCB top-5 candidates:
|
[2026-04-13 12:27:28] [AutoResearch] GP UCB top-5 candidates:
|
||||||
[2026-04-13 10:02:55] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316}
|
[2026-04-13 12:27:28] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
||||||
[2026-04-13 10:02:55] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654}
|
[2026-04-13 12:27:28] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
||||||
[2026-04-13 10:02:55] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730}
|
[2026-04-13 12:27:28] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
||||||
[2026-04-13 10:02:55] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413}
|
[2026-04-13 12:27:28] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
||||||
[2026-04-13 10:02:55] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446}
|
[2026-04-13 12:27:28] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
[2026-04-13 12:27:28] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
[2026-04-13 10:02:55] [AutoResearch] Only 1 results — using random proposal.
|
[2026-04-13 12:27:28] [AutoResearch] Only 1 results — using random proposal.
|
||||||
[2026-04-13 10:03:22] ============================================================
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion
|
|
||||||
[2026-04-13 10:03:22] ============================================================
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Loaded 0 existing Phase 1 results.
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] No champion yet.
|
|
||||||
[2026-04-13 10:03:22]
|
|
||||||
[AutoResearch] ========== Trial 1/50 ==========
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:03:22] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:03:24] [AutoResearch] Launching trial 1: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:13:24] [AutoResearch] Trial 1 TIMED OUT after 600.2s
|
|
||||||
[2026-04-13 10:13:24] [AutoResearch] Trial 1: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 10:13:26]
|
|
||||||
[AutoResearch] ========== Trial 2/50 ==========
|
|
||||||
[2026-04-13 10:13:26] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:13:26] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:13:28] [AutoResearch] Launching trial 2: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:23:28] [AutoResearch] Trial 2 TIMED OUT after 600.0s
|
|
||||||
[2026-04-13 10:23:28] [AutoResearch] Trial 2: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 10:23:30]
|
|
||||||
[AutoResearch] ========== Trial 3/50 ==========
|
|
||||||
[2026-04-13 10:23:30] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:23:30] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:23:32] [AutoResearch] Launching trial 3: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:33:32] [AutoResearch] Trial 3 TIMED OUT after 600.1s
|
|
||||||
[2026-04-13 10:33:32] [AutoResearch] Trial 3: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 10:33:34]
|
|
||||||
[AutoResearch] ========== Trial 4/50 ==========
|
|
||||||
[2026-04-13 10:33:34] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:33:34] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:33:36] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:43:36] [AutoResearch] Trial 4 TIMED OUT after 600.1s
|
|
||||||
[2026-04-13 10:43:36] [AutoResearch] Trial 4: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 10:43:39]
|
|
||||||
[AutoResearch] ========== Trial 5/50 ==========
|
|
||||||
[2026-04-13 10:43:39] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:43:39] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:43:41] [AutoResearch] Launching trial 5: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:53:41] [AutoResearch] Trial 5 TIMED OUT after 600.1s
|
|
||||||
[2026-04-13 10:53:41] [AutoResearch] Trial 5: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 10:53:43]
|
|
||||||
[AutoResearch] ========== Trial 6/50 ==========
|
|
||||||
[2026-04-13 10:53:43] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 10:53:43] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 10:53:45] [AutoResearch] Launching trial 6: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:03:45] [AutoResearch] Trial 6 TIMED OUT after 600.1s
|
|
||||||
[2026-04-13 11:03:45] [AutoResearch] Trial 6: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 11:03:47]
|
|
||||||
[AutoResearch] ========== Trial 7/50 ==========
|
|
||||||
[2026-04-13 11:03:47] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 11:03:47] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:03:49] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:16:34] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:16:34] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
|
||||||
[2026-04-13 11:16:34] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
|
||||||
[2026-04-13 11:16:34] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
|
||||||
[2026-04-13 11:16:34] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
|
||||||
[2026-04-13 11:16:34] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
|
||||||
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
|
||||||
[2026-04-13 11:16:34] [AutoResearch] Only 1 results — using random proposal.
|
|
||||||
[2026-04-13 11:16:53] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:16:53] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
|
||||||
[2026-04-13 11:16:53] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
|
||||||
[2026-04-13 11:16:53] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
|
||||||
[2026-04-13 11:16:53] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
|
||||||
[2026-04-13 11:16:53] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
|
||||||
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
|
||||||
[2026-04-13 11:16:53] [AutoResearch] Only 1 results — using random proposal.
|
|
||||||
[2026-04-13 11:17:15] ============================================================
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion
|
|
||||||
[2026-04-13 11:17:15] ============================================================
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Loaded 0 existing Phase 1 results.
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] No champion yet.
|
|
||||||
[2026-04-13 11:17:15]
|
|
||||||
[AutoResearch] ========== Trial 1/50 ==========
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Only 0 results — using random proposal.
|
|
||||||
[2026-04-13 11:17:15] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:17:17] [AutoResearch] Launching trial 1: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:20:53] [AutoResearch] Trial 1 finished in 215.5s, returncode=0
|
|
||||||
[2026-04-13 11:20:53] [AutoResearch] Trial 1: mean_reward=5.7246 std_reward=0.027
|
|
||||||
[2026-04-13 11:20:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:20:53] [AutoResearch] === Trial 1 Summary ===
|
|
||||||
[2026-04-13 11:20:53] Total Phase 1 runs: 1
|
|
||||||
[2026-04-13 11:20:53] Champion: trial=1 mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:20:53] Top 5:
|
|
||||||
[2026-04-13 11:20:53] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:20:55]
|
|
||||||
[AutoResearch] ========== Trial 2/50 ==========
|
|
||||||
[2026-04-13 11:20:55] [AutoResearch] Only 1 results — using random proposal.
|
|
||||||
[2026-04-13 11:20:55] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:20:57] [AutoResearch] Launching trial 2: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:32] [AutoResearch] Trial 2 finished in 215.5s, returncode=0
|
|
||||||
[2026-04-13 11:24:32] [AutoResearch] Trial 2: mean_reward=398.8564 std_reward=1.1786
|
|
||||||
[2026-04-13 11:24:33] [Champion] 🏆 NEW BEST! Trial 2: mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:33] [AutoResearch] === Trial 2 Summary ===
|
|
||||||
[2026-04-13 11:24:33] Total Phase 1 runs: 2
|
|
||||||
[2026-04-13 11:24:33] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:33] Top 5:
|
|
||||||
[2026-04-13 11:24:33] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:33] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:35]
|
|
||||||
[AutoResearch] ========== Trial 3/50 ==========
|
|
||||||
[2026-04-13 11:24:35] [AutoResearch] Only 2 results — using random proposal.
|
|
||||||
[2026-04-13 11:24:35] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:24:37] [AutoResearch] Launching trial 3: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:06] [AutoResearch] Trial 3 finished in 269.6s, returncode=0
|
|
||||||
[2026-04-13 11:29:06] [AutoResearch] Trial 3: mean_reward=5.9776 std_reward=0.0252
|
|
||||||
[2026-04-13 11:29:06] [AutoResearch] === Trial 3 Summary ===
|
|
||||||
[2026-04-13 11:29:06] Total Phase 1 runs: 3
|
|
||||||
[2026-04-13 11:29:06] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:06] Top 5:
|
|
||||||
[2026-04-13 11:29:06] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:06] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:06] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:08]
|
|
||||||
[AutoResearch] ========== Trial 4/50 ==========
|
|
||||||
[2026-04-13 11:29:08] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:29:08] UCB=2.4615 mu=0.8615 sigma=0.8000 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084}
|
|
||||||
[2026-04-13 11:29:08] UCB=2.4548 mu=0.9032 sigma=0.7758 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0009758380297094257, 'timesteps': 3202}
|
|
||||||
[2026-04-13 11:29:08] UCB=2.4540 mu=0.7444 sigma=0.8548 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0006970147905651335, 'timesteps': 3351}
|
|
||||||
[2026-04-13 11:29:08] UCB=2.4479 mu=0.7051 sigma=0.8714 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0009997388594600006, 'timesteps': 4139}
|
|
||||||
[2026-04-13 11:29:08] UCB=2.4443 mu=0.9374 sigma=0.7535 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001158537723428793, 'timesteps': 3743}
|
|
||||||
[2026-04-13 11:29:08] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:29:10] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:24] [AutoResearch] Trial 4 finished in 194.0s, returncode=0
|
|
||||||
[2026-04-13 11:32:24] [AutoResearch] Trial 4: mean_reward=22.8241 std_reward=0.1918
|
|
||||||
[2026-04-13 11:32:24] [AutoResearch] === Trial 4 Summary ===
|
|
||||||
[2026-04-13 11:32:24] Total Phase 1 runs: 4
|
|
||||||
[2026-04-13 11:32:24] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:24] Top 5:
|
|
||||||
[2026-04-13 11:32:24] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:24] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:24] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:24] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:26]
|
|
||||||
[AutoResearch] ========== Trial 5/50 ==========
|
|
||||||
[2026-04-13 11:32:26] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:32:26] UCB=2.9797 mu=1.4209 sigma=0.7794 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626}
|
|
||||||
[2026-04-13 11:32:26] UCB=2.9360 mu=1.6516 sigma=0.6422 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.003483580964392729, 'timesteps': 3613}
|
|
||||||
[2026-04-13 11:32:26] UCB=2.8856 mu=1.1888 sigma=0.8484 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.002515064142571671, 'timesteps': 4267}
|
|
||||||
[2026-04-13 11:32:26] UCB=2.8582 mu=1.5163 sigma=0.6709 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0029159438252188284, 'timesteps': 3730}
|
|
||||||
[2026-04-13 11:32:26] UCB=2.8422 mu=1.5296 sigma=0.6563 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0033924318546103937, 'timesteps': 3346}
|
|
||||||
[2026-04-13 11:32:26] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:32:28] [AutoResearch] Launching trial 5: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] [AutoResearch] Trial 5 finished in 264.3s, returncode=0
|
|
||||||
[2026-04-13 11:36:52] [AutoResearch] Trial 5: mean_reward=5.9913 std_reward=0.0246
|
|
||||||
[2026-04-13 11:36:52] [AutoResearch] === Trial 5 Summary ===
|
|
||||||
[2026-04-13 11:36:52] Total Phase 1 runs: 5
|
|
||||||
[2026-04-13 11:36:52] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] Top 5:
|
|
||||||
[2026-04-13 11:36:52] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:52] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:54]
|
|
||||||
[AutoResearch] ========== Trial 6/50 ==========
|
|
||||||
[2026-04-13 11:36:54] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:36:54] UCB=2.8622 mu=1.4083 sigma=0.7270 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387}
|
|
||||||
[2026-04-13 11:36:54] UCB=2.7841 mu=1.0518 sigma=0.8661 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002782960062629981, 'timesteps': 4433}
|
|
||||||
[2026-04-13 11:36:54] UCB=2.7380 mu=1.5849 sigma=0.5765 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001906612836342622, 'timesteps': 3714}
|
|
||||||
[2026-04-13 11:36:54] UCB=2.7029 mu=0.9236 sigma=0.8897 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002301914433902658, 'timesteps': 4751}
|
|
||||||
[2026-04-13 11:36:54] UCB=2.6924 mu=1.1628 sigma=0.7648 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019575950790335435, 'timesteps': 2548}
|
|
||||||
[2026-04-13 11:36:54] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:36:56] [AutoResearch] Launching trial 6: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] [AutoResearch] Trial 6 TIMED OUT after 480.1s
|
|
||||||
[2026-04-13 11:44:57] [AutoResearch] Trial 6: mean_reward=None std_reward=None
|
|
||||||
[2026-04-13 11:44:57] [AutoResearch] === Trial 6 Summary ===
|
|
||||||
[2026-04-13 11:44:57] Total Phase 1 runs: 5
|
|
||||||
[2026-04-13 11:44:57] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] Top 5:
|
|
||||||
[2026-04-13 11:44:57] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:57] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:44:59]
|
|
||||||
[AutoResearch] ========== Trial 7/50 ==========
|
|
||||||
[2026-04-13 11:44:59] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:44:59] UCB=2.7677 mu=1.3945 sigma=0.6866 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160}
|
|
||||||
[2026-04-13 11:44:59] UCB=2.6401 mu=0.8590 sigma=0.8906 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012329470317109907, 'timesteps': 4404}
|
|
||||||
[2026-04-13 11:44:59] UCB=2.6346 mu=0.8897 sigma=0.8725 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002824572687677801, 'timesteps': 2173}
|
|
||||||
[2026-04-13 11:44:59] UCB=2.6197 mu=1.1406 sigma=0.7395 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002264095441698803, 'timesteps': 3610}
|
|
||||||
[2026-04-13 11:44:59] UCB=2.6013 mu=0.7257 sigma=0.9378 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.001986372556075669, 'timesteps': 4899}
|
|
||||||
[2026-04-13 11:44:59] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:45:01] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] [AutoResearch] Trial 7 finished in 211.3s, returncode=0
|
|
||||||
[2026-04-13 11:48:32] [AutoResearch] Trial 7: mean_reward=5.7529 std_reward=0.0318
|
|
||||||
[2026-04-13 11:48:32] [AutoResearch] === Trial 7 Summary ===
|
|
||||||
[2026-04-13 11:48:32] Total Phase 1 runs: 6
|
|
||||||
[2026-04-13 11:48:32] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] Top 5:
|
|
||||||
[2026-04-13 11:48:32] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:32] mean_reward=5.7529 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:34]
|
|
||||||
[AutoResearch] ========== Trial 8/50 ==========
|
|
||||||
[2026-04-13 11:48:34] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:48:34] UCB=2.9928 mu=1.4031 sigma=0.7948 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429}
|
|
||||||
[2026-04-13 11:48:34] UCB=2.9102 mu=1.2105 sigma=0.8499 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0013337121696681005, 'timesteps': 4384}
|
|
||||||
[2026-04-13 11:48:34] UCB=2.9095 mu=1.2362 sigma=0.8366 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0016866172466465327, 'timesteps': 4586}
|
|
||||||
[2026-04-13 11:48:34] UCB=2.7220 mu=1.0017 sigma=0.8601 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012033218829906316, 'timesteps': 4239}
|
|
||||||
[2026-04-13 11:48:34] UCB=2.6586 mu=0.8020 sigma=0.9283 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0014425382569831862, 'timesteps': 4488}
|
|
||||||
[2026-04-13 11:48:34] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:48:36] [AutoResearch] Launching trial 8: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] [AutoResearch] Trial 8 finished in 252.2s, returncode=0
|
|
||||||
[2026-04-13 11:52:48] [AutoResearch] Trial 8: mean_reward=1936.8533 std_reward=34.0067
|
|
||||||
[2026-04-13 11:52:48] [Champion] 🏆 NEW BEST! Trial 8: mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] [AutoResearch] === Trial 8 Summary ===
|
|
||||||
[2026-04-13 11:52:48] Total Phase 1 runs: 7
|
|
||||||
[2026-04-13 11:52:48] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] Top 5:
|
|
||||||
[2026-04-13 11:52:48] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:48] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:50]
|
|
||||||
[AutoResearch] ========== Trial 9/50 ==========
|
|
||||||
[2026-04-13 11:52:50] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:52:50] UCB=3.6446 mu=2.2362 sigma=0.7042 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961}
|
|
||||||
[2026-04-13 11:52:50] UCB=3.6253 mu=2.3605 sigma=0.6324 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0014035502090066865, 'timesteps': 2985}
|
|
||||||
[2026-04-13 11:52:50] UCB=3.5079 mu=2.3661 sigma=0.5709 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0016891347290568105, 'timesteps': 3422}
|
|
||||||
[2026-04-13 11:52:50] UCB=3.4169 mu=2.2243 sigma=0.5963 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0011351472472032882, 'timesteps': 4191}
|
|
||||||
[2026-04-13 11:52:50] UCB=3.3399 mu=1.6131 sigma=0.8634 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.00114114991914373, 'timesteps': 3413}
|
|
||||||
[2026-04-13 11:52:50] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:52:52] [AutoResearch] Launching trial 9: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] [AutoResearch] Trial 9 finished in 215.5s, returncode=0
|
|
||||||
[2026-04-13 11:56:28] [AutoResearch] Trial 9: mean_reward=237.9115 std_reward=1.4136
|
|
||||||
[2026-04-13 11:56:28] [AutoResearch] === Trial 9 Summary ===
|
|
||||||
[2026-04-13 11:56:28] Total Phase 1 runs: 8
|
|
||||||
[2026-04-13 11:56:28] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] Top 5:
|
|
||||||
[2026-04-13 11:56:28] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:28] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:30]
|
|
||||||
[AutoResearch] ========== Trial 10/50 ==========
|
|
||||||
[2026-04-13 11:56:30] [AutoResearch] GP UCB top-5 candidates:
|
|
||||||
[2026-04-13 11:56:30] UCB=3.6513 mu=2.0026 sigma=0.8243 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691}
|
|
||||||
[2026-04-13 11:56:30] UCB=3.2438 mu=1.9644 sigma=0.6397 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0013292739097783752, 'timesteps': 3897}
|
|
||||||
[2026-04-13 11:56:30] UCB=3.1815 mu=1.2984 sigma=0.9415 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0004768119261475519, 'timesteps': 4914}
|
|
||||||
[2026-04-13 11:56:30] UCB=3.0779 mu=1.4273 sigma=0.8253 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001854044179957165, 'timesteps': 3308}
|
|
||||||
[2026-04-13 11:56:30] UCB=2.9649 mu=1.2760 sigma=0.8444 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001236030774697938, 'timesteps': 3010}
|
|
||||||
[2026-04-13 11:56:30] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 11:56:32] [AutoResearch] Launching trial 10: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] [AutoResearch] Trial 10 finished in 285.4s, returncode=0
|
|
||||||
[2026-04-13 12:01:17] [AutoResearch] Trial 10: mean_reward=7.6595 std_reward=0.1051
|
|
||||||
[2026-04-13 12:01:17] [AutoResearch] === Trial 10 Summary ===
|
|
||||||
[2026-04-13 12:01:17] Total Phase 1 runs: 9
|
|
||||||
[2026-04-13 12:01:17] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] Top 5:
|
|
||||||
[2026-04-13 12:01:17] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
[2026-04-13 12:01:17] mean_reward=7.6595 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,377 @@
|
||||||
|
[2026-04-13 10:00:54] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 10:00:54] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316}
|
||||||
|
[2026-04-13 10:00:54] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654}
|
||||||
|
[2026-04-13 10:00:54] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730}
|
||||||
|
[2026-04-13 10:00:54] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413}
|
||||||
|
[2026-04-13 10:00:54] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-13 10:00:54] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-13 10:00:54] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-13 10:02:55] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 10:02:55] UCB=2.5673 mu=0.8758 sigma=0.8458 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0019880522059802556, 'timesteps': 15316}
|
||||||
|
[2026-04-13 10:02:55] UCB=2.5533 mu=0.8978 sigma=0.8277 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0015934898587720348, 'timesteps': 17654}
|
||||||
|
[2026-04-13 10:02:55] UCB=2.5196 mu=0.8299 sigma=0.8449 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0017281974656910685, 'timesteps': 13730}
|
||||||
|
[2026-04-13 10:02:55] UCB=2.5042 mu=0.6556 sigma=0.9243 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0017985944720852176, 'timesteps': 12413}
|
||||||
|
[2026-04-13 10:02:55] UCB=2.4927 mu=0.6946 sigma=0.8991 params={'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.00239716045398226, 'timesteps': 7446}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-13 10:02:55] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-13 10:03:22] ============================================================
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion
|
||||||
|
[2026-04-13 10:03:22] ============================================================
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Loaded 0 existing Phase 1 results.
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] No champion yet.
|
||||||
|
[2026-04-13 10:03:22]
|
||||||
|
[AutoResearch] ========== Trial 1/50 ==========
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:03:22] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:03:24] [AutoResearch] Launching trial 1: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:13:24] [AutoResearch] Trial 1 TIMED OUT after 600.2s
|
||||||
|
[2026-04-13 10:13:24] [AutoResearch] Trial 1: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 10:13:26]
|
||||||
|
[AutoResearch] ========== Trial 2/50 ==========
|
||||||
|
[2026-04-13 10:13:26] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:13:26] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:13:28] [AutoResearch] Launching trial 2: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:23:28] [AutoResearch] Trial 2 TIMED OUT after 600.0s
|
||||||
|
[2026-04-13 10:23:28] [AutoResearch] Trial 2: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 10:23:30]
|
||||||
|
[AutoResearch] ========== Trial 3/50 ==========
|
||||||
|
[2026-04-13 10:23:30] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:23:30] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:23:32] [AutoResearch] Launching trial 3: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:33:32] [AutoResearch] Trial 3 TIMED OUT after 600.1s
|
||||||
|
[2026-04-13 10:33:32] [AutoResearch] Trial 3: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 10:33:34]
|
||||||
|
[AutoResearch] ========== Trial 4/50 ==========
|
||||||
|
[2026-04-13 10:33:34] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:33:34] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:33:36] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:43:36] [AutoResearch] Trial 4 TIMED OUT after 600.1s
|
||||||
|
[2026-04-13 10:43:36] [AutoResearch] Trial 4: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 10:43:39]
|
||||||
|
[AutoResearch] ========== Trial 5/50 ==========
|
||||||
|
[2026-04-13 10:43:39] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:43:39] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:43:41] [AutoResearch] Launching trial 5: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:53:41] [AutoResearch] Trial 5 TIMED OUT after 600.1s
|
||||||
|
[2026-04-13 10:53:41] [AutoResearch] Trial 5: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 10:53:43]
|
||||||
|
[AutoResearch] ========== Trial 6/50 ==========
|
||||||
|
[2026-04-13 10:53:43] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 10:53:43] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 10:53:45] [AutoResearch] Launching trial 6: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:03:45] [AutoResearch] Trial 6 TIMED OUT after 600.1s
|
||||||
|
[2026-04-13 11:03:45] [AutoResearch] Trial 6: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 11:03:47]
|
||||||
|
[AutoResearch] ========== Trial 7/50 ==========
|
||||||
|
[2026-04-13 11:03:47] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 11:03:47] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:03:49] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:16:34] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:16:34] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
||||||
|
[2026-04-13 11:16:34] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
||||||
|
[2026-04-13 11:16:34] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
||||||
|
[2026-04-13 11:16:34] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
||||||
|
[2026-04-13 11:16:34] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-13 11:16:34] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-13 11:16:53] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:16:53] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
|
||||||
|
[2026-04-13 11:16:53] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
|
||||||
|
[2026-04-13 11:16:53] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
|
||||||
|
[2026-04-13 11:16:53] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
|
||||||
|
[2026-04-13 11:16:53] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-13 11:16:53] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-13 11:17:15] ============================================================
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion
|
||||||
|
[2026-04-13 11:17:15] ============================================================
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Loaded 0 existing Phase 1 results.
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] No champion yet.
|
||||||
|
[2026-04-13 11:17:15]
|
||||||
|
[AutoResearch] ========== Trial 1/50 ==========
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Only 0 results — using random proposal.
|
||||||
|
[2026-04-13 11:17:15] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:17:17] [AutoResearch] Launching trial 1: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:20:53] [AutoResearch] Trial 1 finished in 215.5s, returncode=0
|
||||||
|
[2026-04-13 11:20:53] [AutoResearch] Trial 1: mean_reward=5.7246 std_reward=0.027
|
||||||
|
[2026-04-13 11:20:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:20:53] [AutoResearch] === Trial 1 Summary ===
|
||||||
|
[2026-04-13 11:20:53] Total Phase 1 runs: 1
|
||||||
|
[2026-04-13 11:20:53] Champion: trial=1 mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:20:53] Top 5:
|
||||||
|
[2026-04-13 11:20:53] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:20:55]
|
||||||
|
[AutoResearch] ========== Trial 2/50 ==========
|
||||||
|
[2026-04-13 11:20:55] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-13 11:20:55] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:20:57] [AutoResearch] Launching trial 2: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:32] [AutoResearch] Trial 2 finished in 215.5s, returncode=0
|
||||||
|
[2026-04-13 11:24:32] [AutoResearch] Trial 2: mean_reward=398.8564 std_reward=1.1786
|
||||||
|
[2026-04-13 11:24:33] [Champion] 🏆 NEW BEST! Trial 2: mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:33] [AutoResearch] === Trial 2 Summary ===
|
||||||
|
[2026-04-13 11:24:33] Total Phase 1 runs: 2
|
||||||
|
[2026-04-13 11:24:33] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:33] Top 5:
|
||||||
|
[2026-04-13 11:24:33] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:33] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:35]
|
||||||
|
[AutoResearch] ========== Trial 3/50 ==========
|
||||||
|
[2026-04-13 11:24:35] [AutoResearch] Only 2 results — using random proposal.
|
||||||
|
[2026-04-13 11:24:35] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:24:37] [AutoResearch] Launching trial 3: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:06] [AutoResearch] Trial 3 finished in 269.6s, returncode=0
|
||||||
|
[2026-04-13 11:29:06] [AutoResearch] Trial 3: mean_reward=5.9776 std_reward=0.0252
|
||||||
|
[2026-04-13 11:29:06] [AutoResearch] === Trial 3 Summary ===
|
||||||
|
[2026-04-13 11:29:06] Total Phase 1 runs: 3
|
||||||
|
[2026-04-13 11:29:06] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:06] Top 5:
|
||||||
|
[2026-04-13 11:29:06] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:06] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:06] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:08]
|
||||||
|
[AutoResearch] ========== Trial 4/50 ==========
|
||||||
|
[2026-04-13 11:29:08] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:29:08] UCB=2.4615 mu=0.8615 sigma=0.8000 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084}
|
||||||
|
[2026-04-13 11:29:08] UCB=2.4548 mu=0.9032 sigma=0.7758 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0009758380297094257, 'timesteps': 3202}
|
||||||
|
[2026-04-13 11:29:08] UCB=2.4540 mu=0.7444 sigma=0.8548 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0006970147905651335, 'timesteps': 3351}
|
||||||
|
[2026-04-13 11:29:08] UCB=2.4479 mu=0.7051 sigma=0.8714 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0009997388594600006, 'timesteps': 4139}
|
||||||
|
[2026-04-13 11:29:08] UCB=2.4443 mu=0.9374 sigma=0.7535 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001158537723428793, 'timesteps': 3743}
|
||||||
|
[2026-04-13 11:29:08] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:29:10] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:24] [AutoResearch] Trial 4 finished in 194.0s, returncode=0
|
||||||
|
[2026-04-13 11:32:24] [AutoResearch] Trial 4: mean_reward=22.8241 std_reward=0.1918
|
||||||
|
[2026-04-13 11:32:24] [AutoResearch] === Trial 4 Summary ===
|
||||||
|
[2026-04-13 11:32:24] Total Phase 1 runs: 4
|
||||||
|
[2026-04-13 11:32:24] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:24] Top 5:
|
||||||
|
[2026-04-13 11:32:24] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:24] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:24] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:24] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:26]
|
||||||
|
[AutoResearch] ========== Trial 5/50 ==========
|
||||||
|
[2026-04-13 11:32:26] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:32:26] UCB=2.9797 mu=1.4209 sigma=0.7794 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626}
|
||||||
|
[2026-04-13 11:32:26] UCB=2.9360 mu=1.6516 sigma=0.6422 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.003483580964392729, 'timesteps': 3613}
|
||||||
|
[2026-04-13 11:32:26] UCB=2.8856 mu=1.1888 sigma=0.8484 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.002515064142571671, 'timesteps': 4267}
|
||||||
|
[2026-04-13 11:32:26] UCB=2.8582 mu=1.5163 sigma=0.6709 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0029159438252188284, 'timesteps': 3730}
|
||||||
|
[2026-04-13 11:32:26] UCB=2.8422 mu=1.5296 sigma=0.6563 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0033924318546103937, 'timesteps': 3346}
|
||||||
|
[2026-04-13 11:32:26] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:32:28] [AutoResearch] Launching trial 5: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] [AutoResearch] Trial 5 finished in 264.3s, returncode=0
|
||||||
|
[2026-04-13 11:36:52] [AutoResearch] Trial 5: mean_reward=5.9913 std_reward=0.0246
|
||||||
|
[2026-04-13 11:36:52] [AutoResearch] === Trial 5 Summary ===
|
||||||
|
[2026-04-13 11:36:52] Total Phase 1 runs: 5
|
||||||
|
[2026-04-13 11:36:52] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] Top 5:
|
||||||
|
[2026-04-13 11:36:52] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:52] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:54]
|
||||||
|
[AutoResearch] ========== Trial 6/50 ==========
|
||||||
|
[2026-04-13 11:36:54] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:36:54] UCB=2.8622 mu=1.4083 sigma=0.7270 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387}
|
||||||
|
[2026-04-13 11:36:54] UCB=2.7841 mu=1.0518 sigma=0.8661 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002782960062629981, 'timesteps': 4433}
|
||||||
|
[2026-04-13 11:36:54] UCB=2.7380 mu=1.5849 sigma=0.5765 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.001906612836342622, 'timesteps': 3714}
|
||||||
|
[2026-04-13 11:36:54] UCB=2.7029 mu=0.9236 sigma=0.8897 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002301914433902658, 'timesteps': 4751}
|
||||||
|
[2026-04-13 11:36:54] UCB=2.6924 mu=1.1628 sigma=0.7648 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019575950790335435, 'timesteps': 2548}
|
||||||
|
[2026-04-13 11:36:54] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:36:56] [AutoResearch] Launching trial 6: {'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0023577235727246376, 'timesteps': 4387, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] [AutoResearch] Trial 6 TIMED OUT after 480.1s
|
||||||
|
[2026-04-13 11:44:57] [AutoResearch] Trial 6: mean_reward=None std_reward=None
|
||||||
|
[2026-04-13 11:44:57] [AutoResearch] === Trial 6 Summary ===
|
||||||
|
[2026-04-13 11:44:57] Total Phase 1 runs: 5
|
||||||
|
[2026-04-13 11:44:57] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] Top 5:
|
||||||
|
[2026-04-13 11:44:57] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:57] mean_reward=5.7246 params={'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0033894181299375602, 'timesteps': 2116, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:44:59]
|
||||||
|
[AutoResearch] ========== Trial 7/50 ==========
|
||||||
|
[2026-04-13 11:44:59] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:44:59] UCB=2.7677 mu=1.3945 sigma=0.6866 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160}
|
||||||
|
[2026-04-13 11:44:59] UCB=2.6401 mu=0.8590 sigma=0.8906 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012329470317109907, 'timesteps': 4404}
|
||||||
|
[2026-04-13 11:44:59] UCB=2.6346 mu=0.8897 sigma=0.8725 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.002824572687677801, 'timesteps': 2173}
|
||||||
|
[2026-04-13 11:44:59] UCB=2.6197 mu=1.1406 sigma=0.7395 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002264095441698803, 'timesteps': 3610}
|
||||||
|
[2026-04-13 11:44:59] UCB=2.6013 mu=0.7257 sigma=0.9378 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.001986372556075669, 'timesteps': 4899}
|
||||||
|
[2026-04-13 11:44:59] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:45:01] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] [AutoResearch] Trial 7 finished in 211.3s, returncode=0
|
||||||
|
[2026-04-13 11:48:32] [AutoResearch] Trial 7: mean_reward=5.7529 std_reward=0.0318
|
||||||
|
[2026-04-13 11:48:32] [AutoResearch] === Trial 7 Summary ===
|
||||||
|
[2026-04-13 11:48:32] Total Phase 1 runs: 6
|
||||||
|
[2026-04-13 11:48:32] Champion: trial=2 mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] Top 5:
|
||||||
|
[2026-04-13 11:48:32] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:32] mean_reward=5.7529 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.002636854645150246, 'timesteps': 3160, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:34]
|
||||||
|
[AutoResearch] ========== Trial 8/50 ==========
|
||||||
|
[2026-04-13 11:48:34] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:48:34] UCB=2.9928 mu=1.4031 sigma=0.7948 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429}
|
||||||
|
[2026-04-13 11:48:34] UCB=2.9102 mu=1.2105 sigma=0.8499 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0013337121696681005, 'timesteps': 4384}
|
||||||
|
[2026-04-13 11:48:34] UCB=2.9095 mu=1.2362 sigma=0.8366 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0016866172466465327, 'timesteps': 4586}
|
||||||
|
[2026-04-13 11:48:34] UCB=2.7220 mu=1.0017 sigma=0.8601 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0012033218829906316, 'timesteps': 4239}
|
||||||
|
[2026-04-13 11:48:34] UCB=2.6586 mu=0.8020 sigma=0.9283 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0014425382569831862, 'timesteps': 4488}
|
||||||
|
[2026-04-13 11:48:34] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:48:36] [AutoResearch] Launching trial 8: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] [AutoResearch] Trial 8 finished in 252.2s, returncode=0
|
||||||
|
[2026-04-13 11:52:48] [AutoResearch] Trial 8: mean_reward=1936.8533 std_reward=34.0067
|
||||||
|
[2026-04-13 11:52:48] [Champion] 🏆 NEW BEST! Trial 8: mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] [AutoResearch] === Trial 8 Summary ===
|
||||||
|
[2026-04-13 11:52:48] Total Phase 1 runs: 7
|
||||||
|
[2026-04-13 11:52:48] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] Top 5:
|
||||||
|
[2026-04-13 11:52:48] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:48] mean_reward=5.9776 params={'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.003869851995924804, 'timesteps': 2985, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:50]
|
||||||
|
[AutoResearch] ========== Trial 9/50 ==========
|
||||||
|
[2026-04-13 11:52:50] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:52:50] UCB=3.6446 mu=2.2362 sigma=0.7042 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961}
|
||||||
|
[2026-04-13 11:52:50] UCB=3.6253 mu=2.3605 sigma=0.6324 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0014035502090066865, 'timesteps': 2985}
|
||||||
|
[2026-04-13 11:52:50] UCB=3.5079 mu=2.3661 sigma=0.5709 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0016891347290568105, 'timesteps': 3422}
|
||||||
|
[2026-04-13 11:52:50] UCB=3.4169 mu=2.2243 sigma=0.5963 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0011351472472032882, 'timesteps': 4191}
|
||||||
|
[2026-04-13 11:52:50] UCB=3.3399 mu=1.6131 sigma=0.8634 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.00114114991914373, 'timesteps': 3413}
|
||||||
|
[2026-04-13 11:52:50] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:52:52] [AutoResearch] Launching trial 9: {'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] [AutoResearch] Trial 9 finished in 215.5s, returncode=0
|
||||||
|
[2026-04-13 11:56:28] [AutoResearch] Trial 9: mean_reward=237.9115 std_reward=1.4136
|
||||||
|
[2026-04-13 11:56:28] [AutoResearch] === Trial 9 Summary ===
|
||||||
|
[2026-04-13 11:56:28] Total Phase 1 runs: 8
|
||||||
|
[2026-04-13 11:56:28] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] Top 5:
|
||||||
|
[2026-04-13 11:56:28] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:28] mean_reward=5.9913 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.003486801052946445, 'timesteps': 3626, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:30]
|
||||||
|
[AutoResearch] ========== Trial 10/50 ==========
|
||||||
|
[2026-04-13 11:56:30] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 11:56:30] UCB=3.6513 mu=2.0026 sigma=0.8243 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691}
|
||||||
|
[2026-04-13 11:56:30] UCB=3.2438 mu=1.9644 sigma=0.6397 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0013292739097783752, 'timesteps': 3897}
|
||||||
|
[2026-04-13 11:56:30] UCB=3.1815 mu=1.2984 sigma=0.9415 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0004768119261475519, 'timesteps': 4914}
|
||||||
|
[2026-04-13 11:56:30] UCB=3.0779 mu=1.4273 sigma=0.8253 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001854044179957165, 'timesteps': 3308}
|
||||||
|
[2026-04-13 11:56:30] UCB=2.9649 mu=1.2760 sigma=0.8444 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0001236030774697938, 'timesteps': 3010}
|
||||||
|
[2026-04-13 11:56:30] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 11:56:32] [AutoResearch] Launching trial 10: {'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] [AutoResearch] Trial 10 finished in 285.4s, returncode=0
|
||||||
|
[2026-04-13 12:01:17] [AutoResearch] Trial 10: mean_reward=7.6595 std_reward=0.1051
|
||||||
|
[2026-04-13 12:01:17] [AutoResearch] === Trial 10 Summary ===
|
||||||
|
[2026-04-13 12:01:17] Total Phase 1 runs: 9
|
||||||
|
[2026-04-13 12:01:17] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] Top 5:
|
||||||
|
[2026-04-13 12:01:17] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:17] mean_reward=7.6595 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0012074041487018196, 'timesteps': 4691, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:19] [AutoResearch] Git push complete after trial 10
|
||||||
|
[2026-04-13 12:01:21]
|
||||||
|
[AutoResearch] ========== Trial 11/50 ==========
|
||||||
|
[2026-04-13 12:01:21] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 12:01:21] UCB=3.1424 mu=1.5222 sigma=0.8101 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548}
|
||||||
|
[2026-04-13 12:01:21] UCB=3.1149 mu=1.7370 sigma=0.6890 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001459419353524927, 'timesteps': 2410}
|
||||||
|
[2026-04-13 12:01:21] UCB=2.7824 mu=1.5507 sigma=0.6159 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0017876960785136527, 'timesteps': 3884}
|
||||||
|
[2026-04-13 12:01:21] UCB=2.7343 mu=1.2928 sigma=0.7207 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0019938475892844754, 'timesteps': 2452}
|
||||||
|
[2026-04-13 12:01:21] UCB=2.7199 mu=1.3608 sigma=0.6795 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0010871407527655017, 'timesteps': 2371}
|
||||||
|
[2026-04-13 12:01:21] [AutoResearch] Proposed: {'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:01:23] [AutoResearch] Launching trial 11: {'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] [AutoResearch] Trial 11 finished in 211.8s, returncode=0
|
||||||
|
[2026-04-13 12:04:55] [AutoResearch] Trial 11: mean_reward=439.8991 std_reward=2.2951
|
||||||
|
[2026-04-13 12:04:55] [AutoResearch] === Trial 11 Summary ===
|
||||||
|
[2026-04-13 12:04:55] Total Phase 1 runs: 10
|
||||||
|
[2026-04-13 12:04:55] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] Top 5:
|
||||||
|
[2026-04-13 12:04:55] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:55] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:57]
|
||||||
|
[AutoResearch] ========== Trial 12/50 ==========
|
||||||
|
[2026-04-13 12:04:57] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 12:04:57] UCB=2.7238 mu=2.2403 sigma=0.2418 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460}
|
||||||
|
[2026-04-13 12:04:57] UCB=2.5207 mu=1.4162 sigma=0.5522 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0019602238083129895, 'timesteps': 3653}
|
||||||
|
[2026-04-13 12:04:57] UCB=2.4574 mu=1.4037 sigma=0.5268 params={'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.0007010382162706215, 'timesteps': 3309}
|
||||||
|
[2026-04-13 12:04:57] UCB=2.3988 mu=0.5967 sigma=0.9011 params={'n_steer': 6, 'n_throttle': 3, 'learning_rate': 0.0013450109997306151, 'timesteps': 1954}
|
||||||
|
[2026-04-13 12:04:57] UCB=2.3760 mu=0.7624 sigma=0.8068 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.0011051791427736288, 'timesteps': 1984}
|
||||||
|
[2026-04-13 12:04:57] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:04:59] [AutoResearch] Launching trial 12: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0018881008842323835, 'timesteps': 3460, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] [AutoResearch] Trial 12 finished in 191.1s, returncode=0
|
||||||
|
[2026-04-13 12:08:10] [AutoResearch] Trial 12: mean_reward=6.446 std_reward=0.0024
|
||||||
|
[2026-04-13 12:08:10] [AutoResearch] === Trial 12 Summary ===
|
||||||
|
[2026-04-13 12:08:10] Total Phase 1 runs: 11
|
||||||
|
[2026-04-13 12:08:10] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] Top 5:
|
||||||
|
[2026-04-13 12:08:10] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:10] mean_reward=22.8241 params={'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0010468048869752956, 'timesteps': 3084, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:12]
|
||||||
|
[AutoResearch] ========== Trial 13/50 ==========
|
||||||
|
[2026-04-13 12:08:12] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 12:08:12] UCB=7.7182 mu=7.0518 sigma=0.3332 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686}
|
||||||
|
[2026-04-13 12:08:12] UCB=7.5060 mu=6.3573 sigma=0.5743 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0006674348206081718, 'timesteps': 2600}
|
||||||
|
[2026-04-13 12:08:12] UCB=7.2501 mu=6.6046 sigma=0.3227 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.0007355516271507972, 'timesteps': 3206}
|
||||||
|
[2026-04-13 12:08:12] UCB=6.7989 mu=5.8906 sigma=0.4542 params={'n_steer': 5, 'n_throttle': 2, 'learning_rate': 0.00023989918210819933, 'timesteps': 3143}
|
||||||
|
[2026-04-13 12:08:12] UCB=6.4551 mu=5.6895 sigma=0.3828 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0008766782176390233, 'timesteps': 3774}
|
||||||
|
[2026-04-13 12:08:12] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:08:14] [AutoResearch] Launching trial 13: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] [AutoResearch] Trial 13 finished in 212.8s, returncode=0
|
||||||
|
[2026-04-13 12:11:47] [AutoResearch] Trial 13: mean_reward=1139.4415 std_reward=1.9558
|
||||||
|
[2026-04-13 12:11:47] [AutoResearch] === Trial 13 Summary ===
|
||||||
|
[2026-04-13 12:11:47] Total Phase 1 runs: 12
|
||||||
|
[2026-04-13 12:11:47] Champion: trial=8 mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] Top 5:
|
||||||
|
[2026-04-13 12:11:47] mean_reward=1936.8533 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.001449588903551847, 'timesteps': 3429, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] mean_reward=1139.4415 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.000577795506052323, 'timesteps': 3686, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] mean_reward=439.8991 params={'n_steer': 5, 'n_throttle': 3, 'learning_rate': 0.00047930749995235496, 'timesteps': 3548, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] mean_reward=398.8564 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.002359934949698355, 'timesteps': 3386, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:47] mean_reward=237.9115 params={'n_steer': 4, 'n_throttle': 2, 'learning_rate': 0.0012562469886511318, 'timesteps': 2961, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:49]
|
||||||
|
[AutoResearch] ========== Trial 14/50 ==========
|
||||||
|
[2026-04-13 12:11:49] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-13 12:11:49] UCB=6.5039 mu=4.9135 sigma=0.7952 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527}
|
||||||
|
[2026-04-13 12:11:49] UCB=6.4956 mu=5.4779 sigma=0.5088 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.0042217867035675835, 'timesteps': 3617}
|
||||||
|
[2026-04-13 12:11:49] UCB=6.2232 mu=4.7772 sigma=0.7230 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.004423012325506047, 'timesteps': 4273}
|
||||||
|
[2026-04-13 12:11:49] UCB=6.1472 mu=4.5372 sigma=0.8050 params={'n_steer': 7, 'n_throttle': 2, 'learning_rate': 0.004498330263353934, 'timesteps': 2879}
|
||||||
|
[2026-04-13 12:11:49] UCB=6.0219 mu=4.2216 sigma=0.9001 params={'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.00012156867129133186, 'timesteps': 1887}
|
||||||
|
[2026-04-13 12:11:49] [AutoResearch] Proposed: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
[2026-04-13 12:11:51] [AutoResearch] Launching trial 14: {'n_steer': 6, 'n_throttle': 2, 'learning_rate': 0.004830816552588123, 'timesteps': 4527, 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True}
|
||||||
|
|
@ -14,3 +14,6 @@
|
||||||
{"trial": 8, "timestamp": "2026-04-13T11:52:48.821996", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.001449588903551847, "timesteps": 3429, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 1936.8533, "std_reward": 34.0067, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0008/model.zip", "champion": true, "run_status": "ok", "elapsed_sec": 252.2464599609375}
|
{"trial": 8, "timestamp": "2026-04-13T11:52:48.821996", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.001449588903551847, "timesteps": 3429, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 1936.8533, "std_reward": 34.0067, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0008/model.zip", "champion": true, "run_status": "ok", "elapsed_sec": 252.2464599609375}
|
||||||
{"trial": 9, "timestamp": "2026-04-13T11:56:28.296244", "params": {"n_steer": 4, "n_throttle": 2, "learning_rate": 0.0012562469886511318, "timesteps": 2961, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 237.9115, "std_reward": 1.4136, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0009/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 215.46081495285034}
|
{"trial": 9, "timestamp": "2026-04-13T11:56:28.296244", "params": {"n_steer": 4, "n_throttle": 2, "learning_rate": 0.0012562469886511318, "timesteps": 2961, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 237.9115, "std_reward": 1.4136, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0009/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 215.46081495285034}
|
||||||
{"trial": 10, "timestamp": "2026-04-13T12:01:17.700485", "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0012074041487018196, "timesteps": 4691, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 7.6595, "std_reward": 0.1051, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0010/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 285.3893370628357}
|
{"trial": 10, "timestamp": "2026-04-13T12:01:17.700485", "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0012074041487018196, "timesteps": 4691, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 7.6595, "std_reward": 0.1051, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0010/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 285.3893370628357}
|
||||||
|
{"trial": 11, "timestamp": "2026-04-13T12:04:55.096100", "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.00047930749995235496, "timesteps": 3548, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 439.8991, "std_reward": 2.2951, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0011/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 211.77687573432922}
|
||||||
|
{"trial": 12, "timestamp": "2026-04-13T12:08:10.184572", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.0018881008842323835, "timesteps": 3460, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 6.446, "std_reward": 0.0024, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0012/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 191.07323956489563}
|
||||||
|
{"trial": 13, "timestamp": "2026-04-13T12:11:47.012459", "params": {"n_steer": 6, "n_throttle": 2, "learning_rate": 0.000577795506052323, "timesteps": 3686, "agent": "ppo", "eval_episodes": 3, "reward_shaping": true}, "mean_reward": 1139.4415, "std_reward": 1.9558, "model_path": "/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/trial-0013/model.zip", "champion": false, "run_status": "ok", "elapsed_sec": 212.81442260742188}
|
||||||
|
|
@ -1,10 +1,36 @@
|
||||||
"""
|
"""
|
||||||
Speed-Aware Reward Wrapper for DonkeyCar RL
|
Speed-Aware Reward Wrapper for DonkeyCar RL — v2 (Hack-Proof)
|
||||||
============================================
|
==============================================================
|
||||||
Replaces the default CTE-only reward with:
|
|
||||||
reward = speed * (1.0 - min(abs(cte) / max_cte, 1.0))
|
|
||||||
|
|
||||||
Falls back to original reward if speed/cte not available in info dict.
|
DESIGN PRINCIPLE: Speed should only be rewarded when the car is
|
||||||
|
genuinely progressing down the track. The original DonkeyCar reward
|
||||||
|
already correctly signals track presence — we build on top of it.
|
||||||
|
|
||||||
|
FORMULA:
|
||||||
|
if original_reward > 0 (car is on track and centered):
|
||||||
|
shaped = original_reward × (1 + speed_scale × speed)
|
||||||
|
else (car is off track / crashed):
|
||||||
|
shaped = original_reward (no speed bonus — cannot be hacked)
|
||||||
|
|
||||||
|
WHY THIS IS HACK-PROOF:
|
||||||
|
The previous formula (speed × (1 - cte/max_cte)) could be maximized
|
||||||
|
by oscillating at the track boundary — the model learned this in practice.
|
||||||
|
|
||||||
|
The multiplicative formula is bounded by the original DonkeyCar reward:
|
||||||
|
- Off track → original_reward ≤ 0 → no speed multiplier possible
|
||||||
|
- The model CANNOT increase reward by going fast off-track
|
||||||
|
- Speed bonus only accumulates when genuinely driving on the track
|
||||||
|
|
||||||
|
RESEARCH NOTE (2026-04-13):
|
||||||
|
The previous formula caused reward hacking in Phase 1 — trials 8 and 13
|
||||||
|
achieved mean_reward=1936 and 1139 respectively by oscillating at the
|
||||||
|
track boundary. This design was developed to prevent that exploit.
|
||||||
|
See docs/RESEARCH_LOG.md for full details.
|
||||||
|
|
||||||
|
TUNING:
|
||||||
|
speed_scale=0.1 means a car going 5 m/s gets a 50% bonus on top of
|
||||||
|
the base CTE reward. This is a meaningful but not overwhelming incentive.
|
||||||
|
Increase to 0.3+ to prioritize speed more aggressively (Phase 3).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import gymnasium as gym
|
import gymnasium as gym
|
||||||
|
|
@ -13,18 +39,18 @@ import numpy as np
|
||||||
|
|
||||||
class SpeedRewardWrapper(gym.Wrapper):
|
class SpeedRewardWrapper(gym.Wrapper):
|
||||||
"""
|
"""
|
||||||
Replace DonkeyCar's default reward with a speed-aware version.
|
Hack-proof speed reward: multiplicative bonus ONLY when on track.
|
||||||
|
|
||||||
Reward = speed * (1 - |cte| / max_cte)
|
Args:
|
||||||
- Maximum when car is fast AND centred on the track
|
env: gymnasium environment
|
||||||
- Zero when car is at max cross-track error
|
speed_scale: multiplier for speed bonus (default 0.1)
|
||||||
- Negative (crash penalty) preserved from original reward when episode ends with failure
|
shaped = original × (1 + speed_scale × speed) when on track
|
||||||
|
shaped = original when off track
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, env, max_cte: float = 8.0, crash_penalty: float = -10.0):
|
def __init__(self, env, speed_scale: float = 0.1):
|
||||||
super().__init__(env)
|
super().__init__(env)
|
||||||
self.max_cte = max_cte
|
self.speed_scale = speed_scale
|
||||||
self.crash_penalty = crash_penalty
|
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
result = self.env.step(action)
|
result = self.env.step(action)
|
||||||
|
|
@ -40,32 +66,37 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'Unexpected step() result length: {len(result)}')
|
raise ValueError(f'Unexpected step() result length: {len(result)}')
|
||||||
|
|
||||||
# Shape the reward using speed and CTE from info
|
shaped = self._shape_reward(reward, info)
|
||||||
shaped = self._shape_reward(reward, done, info)
|
|
||||||
|
|
||||||
if len(result) == 5:
|
if len(result) == 5:
|
||||||
return obs, shaped, terminated, truncated, info
|
return obs, shaped, terminated, truncated, info
|
||||||
else:
|
else:
|
||||||
return obs, shaped, done, info
|
return obs, shaped, done, info
|
||||||
|
|
||||||
def _shape_reward(self, original_reward: float, done: bool, info: dict) -> float:
|
def _shape_reward(self, original_reward: float, info: dict) -> float:
|
||||||
"""Compute speed-aware reward, falling back to original if info is unavailable."""
|
"""
|
||||||
|
Multiplicative speed bonus — only when on track.
|
||||||
|
Falls back gracefully if speed not in info dict.
|
||||||
|
"""
|
||||||
|
# Only apply speed bonus when genuinely on track (positive CTE reward)
|
||||||
|
if original_reward <= 0:
|
||||||
|
return original_reward # Off track / crashed — no speed reward
|
||||||
|
|
||||||
|
# Extract speed from info dict
|
||||||
try:
|
try:
|
||||||
speed = float(info.get('speed', None))
|
speed = float(info.get('speed', 0.0))
|
||||||
cte = float(info.get('cte', None))
|
if speed is None:
|
||||||
|
|
||||||
if speed is None or cte is None:
|
|
||||||
return original_reward
|
return original_reward
|
||||||
|
speed = max(0.0, speed) # No negative speed bonus
|
||||||
# Positive driving reward: fast + centred
|
|
||||||
shaped = speed * (1.0 - min(abs(cte) / self.max_cte, 1.0))
|
|
||||||
|
|
||||||
# Preserve crash penalty (original reward is -1 on crash in DonkeyCar)
|
|
||||||
if done and original_reward < 0:
|
|
||||||
shaped += self.crash_penalty
|
|
||||||
|
|
||||||
return shaped
|
|
||||||
|
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
# info dict doesn't have speed/cte — fall back gracefully
|
return original_reward # Graceful fallback
|
||||||
return original_reward
|
|
||||||
|
# Multiplicative bonus: reward grows with speed, but only on track
|
||||||
|
# Hack-proof: cannot increase by going fast off-track
|
||||||
|
shaped = original_reward * (1.0 + self.speed_scale * speed)
|
||||||
|
return shaped
|
||||||
|
|
||||||
|
def theoretical_max_per_step(self, max_speed: float = 10.0) -> float:
|
||||||
|
"""Returns the theoretical max reward per step for bounds checking."""
|
||||||
|
# original_reward ≤ 1.0, so shaped ≤ 1.0 × (1 + speed_scale × max_speed)
|
||||||
|
return 1.0 * (1.0 + self.speed_scale * max_speed)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,182 @@
|
||||||
|
# Research Log — DonkeyCar RL Autoresearch
|
||||||
|
|
||||||
|
> Chronological research findings, discoveries, bugs, and decisions.
|
||||||
|
> Every significant observation is recorded here for scientific reproducibility and future reference.
|
||||||
|
> Format: date, finding, evidence, action taken.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-12 — Project Kickoff and Initial Infrastructure
|
||||||
|
|
||||||
|
### Finding: Grid Sweep as Research Baseline
|
||||||
|
|
||||||
|
**Observation:** Before any autoresearch, we ran an 18-config grid sweep across:
|
||||||
|
- `n_steer`: [3, 5, 7]
|
||||||
|
- `n_throttle`: [2, 3]
|
||||||
|
- `learning_rate`: [0.001, 0.0005, 0.0001]
|
||||||
|
- 3 repeats each
|
||||||
|
|
||||||
|
**Important caveat discovered later:** This sweep used a **random action policy** (bug — model training code had been removed). The rewards reflect how well a random policy can stumble through different action discretizations.
|
||||||
|
|
||||||
|
**Valid insight from this data:** Action discretization matters even for random policy.
|
||||||
|
`n_steer=7, n_throttle=2` outperformed `n_steer=3, n_throttle=2` with random actions — more steering granularity helps even without learning.
|
||||||
|
|
||||||
|
**Data location:** `outerloop-results/clean_sweep_results.jsonl` (18 records)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-12 — Discovery: Random Policy Bug (Critical)
|
||||||
|
|
||||||
|
### Finding: Inner Loop Was Never Training
|
||||||
|
|
||||||
|
**Observation:** The `donkeycar_sb3_runner.py` was calling `env.action_space.sample()` instead of `model.learn()`. This was introduced when we removed the broken `model.save()` call that caused `NameError: name 'model' is not defined`.
|
||||||
|
|
||||||
|
**Root cause:** Legacy code path removal was too aggressive — removed training along with the broken save call.
|
||||||
|
|
||||||
|
**Impact:**
|
||||||
|
- All 300 autoresearch trials (two overnight runs) used random policy
|
||||||
|
- `learning_rate` parameter was passed but completely ignored
|
||||||
|
- `mean_reward` values reflect random-walk quality, not RL training quality
|
||||||
|
- The GP+UCB found the best *action space for random walking*, not the best *hyperparameters for learning*
|
||||||
|
|
||||||
|
**Valid salvage:** The `n_steer=8, n_throttle=5` finding is valid as a discretization insight.
|
||||||
|
**Invalid:** All learning_rate optimization in the 300-trial autoresearch runs.
|
||||||
|
|
||||||
|
**Fix:** Completely rebuilt runner with real `PPO.learn()` + `evaluate_policy()` + `model.save()`.
|
||||||
|
|
||||||
|
**Decision record:** ADR-005 — Never call model.save() before model is defined.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-12 — Autoresearch Infrastructure Proven
|
||||||
|
|
||||||
|
### Finding: GP+UCB Autoresearch Works Correctly
|
||||||
|
|
||||||
|
**Observation:** The GP+UCB meta-controller correctly:
|
||||||
|
- Loads prior results and fits a Gaussian Process
|
||||||
|
- Uses UCB acquisition to balance exploration/exploitation
|
||||||
|
- Proposes parameters outside the original grid (e.g., `n_steer=6` was never in grid)
|
||||||
|
- Converges toward higher-reward regions with each trial
|
||||||
|
|
||||||
|
**Evidence:** After 300 trials, the top-5 consistently clustered around `n_steer=7-9, n_throttle=4-5, lr≈0.002` — a coherent high-reward region.
|
||||||
|
|
||||||
|
**Conclusion:** The infrastructure is sound. The data was from wrong experiments, but the meta-loop works exactly as designed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-13 — Phase 1 Launch: First Real Training Attempt
|
||||||
|
|
||||||
|
### Finding: Timeout — PPO+CNN is Too Slow on CPU for Large Timesteps
|
||||||
|
|
||||||
|
**Observation:** First Phase 1 run with real PPO training proposed 20k-30k timesteps.
|
||||||
|
At ~5-10 steps/sec (PPO+CNN on CPU), this requires 2000-6000 seconds per trial — far exceeding the 600-second timeout.
|
||||||
|
|
||||||
|
**Evidence:** Trials 1-6 all timed out at exactly 600 seconds.
|
||||||
|
|
||||||
|
**Fix:** Reduced timestep search space from [5000, 30000] to [1000, 5000].
|
||||||
|
At ~15-30 steps/sec (DonkeyCar sim speed), 5000 steps ≈ 170-330 seconds. Fits within 480s timeout.
|
||||||
|
|
||||||
|
**Lesson:** Always calibrate timeout to actual sim + training speed before launching sweeps.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-13 — Discovery: Car Not Moving (PPO Throttle Problem)
|
||||||
|
|
||||||
|
**Observation:** During early Phase 1 training, the car's steering values changed but the car did not move.
|
||||||
|
|
||||||
|
**Root cause:** PPO with continuous action space outputs actions in `[-1, 1]` for all dimensions.
|
||||||
|
DonkeyCar expects `throttle ∈ [0, 1]`. When PPO's random initial policy outputs throttle ≈ -0.5, it gets clipped to 0 — the car sits still.
|
||||||
|
|
||||||
|
**Fix:** Added `ThrottleClampWrapper` that ensures throttle ∈ [0.2, 1.0].
|
||||||
|
This guarantees the car always moves forward, even before any learning.
|
||||||
|
|
||||||
|
**Impact:** Without this fix, the car never moves and the health check detects it as a stuck sim, prematurely killing training.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-13 — Critical Discovery: Reward Hacking via SpeedRewardWrapper 🚨
|
||||||
|
|
||||||
|
### Finding: Model Learned to Exploit Speed Reward by Oscillating at Track Boundary
|
||||||
|
|
||||||
|
**Observation:** After fixing throttle and timestep issues, Phase 1 trials ran successfully.
|
||||||
|
Some trials produced suspiciously high rewards:
|
||||||
|
|
||||||
|
| Trial | mean_reward | n_throttle | lr | verdict |
|
||||||
|
|-------|-------------|------------|--------|---------|
|
||||||
|
| 8 | **1936.9** | 2 | 0.00145 | 🚨 HACKED |
|
||||||
|
| 13 | **1139.4** | 2 | 0.00058 | 🚨 HACKED |
|
||||||
|
| 11 | 439.9 | 3 | 0.00048 | ⚠️ Suspicious |
|
||||||
|
| 2 | 398.9 | 2 | 0.00236 | ⚠️ Suspicious |
|
||||||
|
|
||||||
|
**Root cause:** The `SpeedRewardWrapper` computed:
|
||||||
|
```
|
||||||
|
reward = speed × (1 - abs(cte) / max_cte)
|
||||||
|
```
|
||||||
|
|
||||||
|
The model discovered a policy that **maximizes this formula without genuine track driving**:
|
||||||
|
1. Drive fast toward the track boundary
|
||||||
|
2. Return to track center (momentarily low CTE = high reward)
|
||||||
|
3. Repeat — "oscillation farming"
|
||||||
|
|
||||||
|
The crash penalty (`-10`) was insufficient to deter this because thousands of oscillation steps accumulate far more positive reward.
|
||||||
|
|
||||||
|
**Plausibility check:** A car driving at max speed (≈5 m/s) perfectly centered for 3429 steps would accumulate ≈ `5.0 × 1.0 × 3429 = 17,145`. The observed maximum was 1937, so the value is not physically impossible on its own; the hacking verdict rests on the high variance (`std_reward=34`) across only 3 eval episodes and on the user's direct observation.
|
||||||
|
|
||||||
|
**User observation (direct visual confirmation):** "The model found a way to rig the reward by just going left — it was off the track and then back on the track."
|
||||||
|
|
||||||
|
**Impact:** The entire Phase 1 dataset with `reward_shaping=True` is corrupted.
|
||||||
|
The GP fitted on these rewards was optimizing for hacking parameters, not driving parameters.
|
||||||
|
|
||||||
|
**Action taken:**
|
||||||
|
- Archived all Phase 1 results: `autoresearch_results_phase1_CORRUPTED_reward_hacking.jsonl`
|
||||||
|
- Archived hacked models: `models/ARCHIVED_reward_hacking/`
|
||||||
|
- Redesigned reward function entirely
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-13 — Fix: Hack-Proof Reward Shaping Design
|
||||||
|
|
||||||
|
### Finding: Multiplicative Speed Bonus Prevents Reward Hacking
|
||||||
|
|
||||||
|
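**Problem with the previous formula:** `reward = speed × f(cte)` replaced DonkeyCar's own reward instead of building on it, so high speed was rewarded whenever CTE was momentarily low — even while the car was oscillating across the track boundary.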
|
||||||
|
|
||||||
|
**Solution — multiplicative on-track bonus:**
|
||||||
|
```python
|
||||||
|
if original_reward > 0:
|
||||||
|
shaped = original_reward × (1 + speed_scale × speed)
|
||||||
|
else:
|
||||||
|
shaped = original_reward # No speed bonus when off track
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why this is hack-proof:**
|
||||||
|
- `original_reward > 0` is ONLY true when the car is on track AND centered (DonkeyCar's own CTE signal)
|
||||||
|
- When off track, `original_reward ≤ 0` — no speed reward possible
|
||||||
|
- The model cannot increase reward by going fast off-track
|
||||||
|
- The formula is bounded: `shaped ≤ original_reward × (1 + speed_scale × max_speed)`
|
||||||
|
|
||||||
|
**Author's insight:** "Speed should only be rewarded if you are progressing down the track."
|
||||||
|
|
||||||
|
**Implementation:** `agent/reward_wrapper.py` — `SpeedRewardWrapper` v2.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-04-13 — Lesson: Reward Function Design Principles
|
||||||
|
|
||||||
|
From this experience, we derived the following principles for DonkeyCar RL reward shaping:
|
||||||
|
|
||||||
|
1. **Never reward speed unconditionally.** Speed reward must be gated on track presence.
|
||||||
|
2. **The original DonkeyCar reward is the ground truth.** Any shaping must respect it, not replace it.
|
||||||
|
3. **Multiplicative bonuses are safer than additive.** They can't be maximized independently.
|
||||||
|
4. **High variance in eval reward is a red flag.** `std_reward=34` on 3 episodes suggests instability.
|
||||||
|
5. **Physically impossible reward values signal hacking.** Establish theoretical reward bounds before training.
|
||||||
|
6. **Low `n_throttle` (=2) may enable hacking.** With only 2 throttle values, the model may discover degenerate oscillation policies more easily. Investigate.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Research Questions
|
||||||
|
|
||||||
|
1. **Does `n_throttle=2` uniquely enable hacking?** The hacked models all had `n_throttle=2`. With only 2 throttle states (stop/full-throttle), oscillation may be easier to exploit.
|
||||||
|
2. **What is the minimum timestep for genuine learning?** The low-reward trials (mean_reward ≈ 5-22) may not have trained long enough. Is 3000 steps sufficient for any real driving behavior?
|
||||||
|
3. **Does the multiplicative reward fix change the optimal hyperparameter region?** Re-run autoresearch with fixed reward and compare top configurations.
|
||||||
|
4. **Can we detect reward hacking automatically?** A reward-per-step threshold (e.g., flag if mean > 2.0 per step) could auto-detect hacking during training.
|
||||||
|
5. **What does a genuinely good reward look like?** After completing Phase 1 cleanly, characterize the reward distribution of a car that drives one full lap.
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
Tests for reward_wrapper.py — no simulator required.
|
Tests for reward_wrapper.py v2 (hack-proof multiplicative formula) — no simulator required.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -17,10 +17,9 @@ class MockStepEnv(gym.Env):
|
||||||
"""Mock gymnasium.Env for testing SpeedRewardWrapper."""
|
"""Mock gymnasium.Env for testing SpeedRewardWrapper."""
|
||||||
metadata = {'render_modes': []}
|
metadata = {'render_modes': []}
|
||||||
|
|
||||||
def __init__(self, speed=2.0, cte=0.5, original_reward=1.0, done=False, use_5tuple=True):
|
def __init__(self, speed=2.0, original_reward=1.0, done=False, use_5tuple=True):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._speed = speed
|
self._speed = speed
|
||||||
self._cte = cte
|
|
||||||
self._reward = original_reward
|
self._reward = original_reward
|
||||||
self._done = done
|
self._done = done
|
||||||
self._use_5tuple = use_5tuple
|
self._use_5tuple = use_5tuple
|
||||||
|
|
@ -32,7 +31,7 @@ class MockStepEnv(gym.Env):
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
obs = np.zeros((120, 160, 3), dtype=np.uint8)
|
obs = np.zeros((120, 160, 3), dtype=np.uint8)
|
||||||
info = {'speed': self._speed, 'cte': self._cte}
|
info = {'speed': self._speed}
|
||||||
if self._use_5tuple:
|
if self._use_5tuple:
|
||||||
return obs, self._reward, self._done, False, info
|
return obs, self._reward, self._done, False, info
|
||||||
else:
|
else:
|
||||||
|
|
@ -41,53 +40,93 @@ class MockStepEnv(gym.Env):
|
||||||
def close(self):
|
def close(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def close(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
# ---- Hack-Proof Guarantee Tests ----
|
||||||
|
|
||||||
def test_speed_reward_higher_when_fast_and_centered():
|
def test_no_speed_bonus_when_off_track():
|
||||||
"""Reward should be higher when car is fast and centered (low CTE)."""
|
"""
|
||||||
env_fast_centered = MockStepEnv(speed=5.0, cte=0.1, original_reward=1.0)
|
CRITICAL: Off-track reward (≤ 0) must NOT get a speed bonus.
|
||||||
env_slow_offset = MockStepEnv(speed=1.0, cte=3.0, original_reward=1.0)
|
This is the core anti-hacking guarantee.
|
||||||
|
"""
|
||||||
wrapped_fast = SpeedRewardWrapper(env_fast_centered)
|
env = MockStepEnv(speed=10.0, original_reward=-1.0) # Off track, very fast
|
||||||
wrapped_slow = SpeedRewardWrapper(env_slow_offset)
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.5)
|
||||||
|
|
||||||
_, reward_fast, _, _, _ = wrapped_fast.step(0)
|
|
||||||
_, reward_slow, _, _, _ = wrapped_slow.step(0)
|
|
||||||
|
|
||||||
assert reward_fast > reward_slow, \
|
|
||||||
f"Fast+centered should reward more: {reward_fast:.3f} vs {reward_slow:.3f}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_speed_reward_zero_at_max_cte():
|
|
||||||
"""Reward should be ~0 when CTE = max_cte (on the edge of the road)."""
|
|
||||||
env = MockStepEnv(speed=5.0, cte=8.0, original_reward=1.0)
|
|
||||||
wrapped = SpeedRewardWrapper(env, max_cte=8.0)
|
|
||||||
_, reward, _, _, _ = wrapped.step(0)
|
_, reward, _, _, _ = wrapped.step(0)
|
||||||
assert reward == pytest.approx(0.0, abs=0.01), \
|
assert reward == -1.0, \
|
||||||
f"Reward at max CTE should be ~0, got {reward}"
|
f"Off-track reward must not get speed bonus, got {reward}"
|
||||||
|
|
||||||
|
|
||||||
def test_speed_reward_positive_when_on_track():
|
def test_no_speed_bonus_when_reward_zero():
|
||||||
"""Reward should be positive when car is on track at any speed > 0."""
|
"""Reward exactly 0 (boundary case) should not get speed bonus."""
|
||||||
env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0)
|
env = MockStepEnv(speed=5.0, original_reward=0.0)
|
||||||
wrapped = SpeedRewardWrapper(env, max_cte=8.0)
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.5)
|
||||||
_, reward, _, _, _ = wrapped.step(0)
|
_, reward, _, _, _ = wrapped.step(0)
|
||||||
assert reward > 0, f"On-track reward should be positive, got {reward}"
|
assert reward == 0.0, f"Zero reward should stay zero, got {reward}"
|
||||||
|
|
||||||
|
|
||||||
def test_crash_penalty_applied_on_done():
|
def test_speed_bonus_scales_with_speed_when_on_track():
|
||||||
"""Crash penalty should be added when episode ends with negative reward."""
|
"""When on track (positive reward), faster = higher shaped reward."""
|
||||||
env = MockStepEnv(speed=0.0, cte=9.0, original_reward=-1.0, done=True)
|
env_slow = MockStepEnv(speed=1.0, original_reward=0.8)
|
||||||
wrapped = SpeedRewardWrapper(env, max_cte=8.0, crash_penalty=-10.0)
|
env_fast = MockStepEnv(speed=5.0, original_reward=0.8)
|
||||||
_, reward, terminated, truncated, _ = wrapped.step(0)
|
|
||||||
assert reward < -5.0, f"Crash penalty should make reward very negative, got {reward}"
|
wrapped_slow = SpeedRewardWrapper(env_slow, speed_scale=0.1)
|
||||||
|
wrapped_fast = SpeedRewardWrapper(env_fast, speed_scale=0.1)
|
||||||
|
|
||||||
|
_, r_slow, _, _, _ = wrapped_slow.step(0)
|
||||||
|
_, r_fast, _, _, _ = wrapped_fast.step(0)
|
||||||
|
|
||||||
|
assert r_fast > r_slow, f"Faster on-track should reward more: {r_fast:.3f} vs {r_slow:.3f}"
|
||||||
|
|
||||||
|
|
||||||
def test_fallback_to_original_reward_when_info_missing():
|
def test_multiplicative_formula_correct():
|
||||||
"""If info doesn't have speed/cte, should fall back to original reward."""
|
"""
|
||||||
class NoInfoEnv(gym.Env):
|
Verify exact formula: shaped = original × (1 + speed_scale × speed)
|
||||||
|
"""
|
||||||
|
original_reward = 0.6
|
||||||
|
speed = 3.0
|
||||||
|
speed_scale = 0.1
|
||||||
|
expected = original_reward * (1.0 + speed_scale * speed) # 0.6 × 1.3 = 0.78
|
||||||
|
|
||||||
|
env = MockStepEnv(speed=speed, original_reward=original_reward)
|
||||||
|
wrapped = SpeedRewardWrapper(env, speed_scale=speed_scale)
|
||||||
|
_, reward, _, _, _ = wrapped.step(0)
|
||||||
|
|
||||||
|
assert reward == pytest.approx(expected, abs=1e-6), \
|
||||||
|
f"Expected {expected:.6f}, got {reward:.6f}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_cannot_hack_by_going_fast_off_track():
|
||||||
|
"""
|
||||||
|
Demonstrate that the previous formula could be hacked but this one cannot.
|
||||||
|
Fast off-track (speed=10) must give same or worse result than slow off-track (speed=1).
|
||||||
|
"""
|
||||||
|
env_fast_offtrack = MockStepEnv(speed=10.0, original_reward=-1.0)
|
||||||
|
env_slow_offtrack = MockStepEnv(speed=1.0, original_reward=-1.0)
|
||||||
|
|
||||||
|
wrapped_fast = SpeedRewardWrapper(env_fast_offtrack, speed_scale=0.5)
|
||||||
|
wrapped_slow = SpeedRewardWrapper(env_slow_offtrack, speed_scale=0.5)
|
||||||
|
|
||||||
|
_, r_fast, _, _, _ = wrapped_fast.step(0)
|
||||||
|
_, r_slow, _, _, _ = wrapped_slow.step(0)
|
||||||
|
|
||||||
|
assert r_fast == r_slow == -1.0, \
|
||||||
|
f"Off-track reward must be identical regardless of speed: fast={r_fast}, slow={r_slow}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_theoretical_max_per_step():
|
||||||
|
"""
|
||||||
|
Verify theoretical_max_per_step returns correct upper bound.
|
||||||
|
With speed_scale=0.1 and max_speed=10.0: max = 1.0 × (1 + 0.1×10) = 2.0
|
||||||
|
"""
|
||||||
|
env = MockStepEnv()
|
||||||
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.1)
|
||||||
|
max_reward = wrapped.theoretical_max_per_step(max_speed=10.0)
|
||||||
|
assert max_reward == pytest.approx(2.0, abs=1e-6), \
|
||||||
|
f"Max per step should be 2.0, got {max_reward}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fallback_when_speed_not_in_info():
|
||||||
|
"""If info doesn't have speed, fall back to original reward."""
|
||||||
|
class NoSpeedEnv(gym.Env):
|
||||||
metadata = {'render_modes': []}
|
metadata = {'render_modes': []}
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
@ -96,20 +135,19 @@ def test_fallback_to_original_reward_when_info_missing():
|
||||||
def reset(self, seed=None, **kwargs):
|
def reset(self, seed=None, **kwargs):
|
||||||
return np.zeros((120, 160, 3), dtype=np.uint8), {}
|
return np.zeros((120, 160, 3), dtype=np.uint8), {}
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {}
|
return np.zeros((120, 160, 3), dtype=np.uint8), 0.75, False, False, {} # No 'speed' key
|
||||||
def close(self):
|
def close(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
wrapped = SpeedRewardWrapper(NoInfoEnv())
|
wrapped = SpeedRewardWrapper(NoSpeedEnv(), speed_scale=0.5)
|
||||||
_, reward, _, _, _ = wrapped.step(0)
|
_, reward, _, _, _ = wrapped.step(0)
|
||||||
|
# speed=0.0 default → shaped = 0.75 × (1 + 0.5 × 0.0) = 0.75
|
||||||
assert reward == pytest.approx(0.75, abs=1e-6), \
|
assert reward == pytest.approx(0.75, abs=1e-6), \
|
||||||
f"Should fall back to original reward 0.75, got {reward}"
|
f"Should fall back gracefully, got {reward}"
|
||||||
|
|
||||||
|
|
||||||
def test_wrapper_preserves_observation():
|
def test_wrapper_preserves_observation():
|
||||||
"""SpeedRewardWrapper should not modify observations."""
|
"""SpeedRewardWrapper must not modify observations."""
|
||||||
obs_data = np.zeros((120, 160, 3), dtype=np.uint8)
|
|
||||||
|
|
||||||
class FixedObsEnv(gym.Env):
|
class FixedObsEnv(gym.Env):
|
||||||
metadata = {'render_modes': []}
|
metadata = {'render_modes': []}
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -117,22 +155,31 @@ def test_wrapper_preserves_observation():
|
||||||
self.action_space = gym.spaces.Discrete(5)
|
self.action_space = gym.spaces.Discrete(5)
|
||||||
self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8)
|
self.observation_space = gym.spaces.Box(low=0, high=255, shape=(120, 160, 3), dtype=np.uint8)
|
||||||
def reset(self, seed=None, **kwargs):
|
def reset(self, seed=None, **kwargs):
|
||||||
return obs_data.copy(), {}
|
return np.zeros((120, 160, 3), dtype=np.uint8), {}
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
return obs_data.copy(), 1.0, False, False, {'speed': 2.0, 'cte': 0.5}
|
return np.zeros((120, 160, 3), dtype=np.uint8), 0.8, False, False, {'speed': 2.0}
|
||||||
def close(self):
|
def close(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
wrapped = SpeedRewardWrapper(FixedObsEnv())
|
wrapped = SpeedRewardWrapper(FixedObsEnv())
|
||||||
obs, _, _, _, _ = wrapped.step(0)
|
obs, _, _, _, _ = wrapped.step(0)
|
||||||
np.testing.assert_array_almost_equal(obs, obs_data)
|
np.testing.assert_array_equal(obs, np.zeros((120, 160, 3), dtype=np.uint8))
|
||||||
|
|
||||||
|
|
||||||
def test_4tuple_step_compatibility():
|
def test_4tuple_step_compatibility():
|
||||||
"""Wrapper should handle 4-tuple step() return (old gym API)."""
|
"""Wrapper should handle 4-tuple step() return (old gym API)."""
|
||||||
env = MockStepEnv(speed=2.0, cte=1.0, original_reward=1.0, use_5tuple=False)
|
env = MockStepEnv(speed=2.0, original_reward=0.8, use_5tuple=False)
|
||||||
wrapped = SpeedRewardWrapper(env)
|
wrapped = SpeedRewardWrapper(env)
|
||||||
result = wrapped.step(0)
|
result = wrapped.step(0)
|
||||||
assert len(result) == 4, f"Expected 4-tuple, got {len(result)}"
|
assert len(result) == 4, f"Expected 4-tuple, got {len(result)}"
|
||||||
_, reward, done, info = result
|
_, reward, done, info = result
|
||||||
assert isinstance(reward, float)
|
assert isinstance(reward, float)
|
||||||
|
assert reward > 0.8, "Speed bonus should increase reward when on track"
|
||||||
|
|
||||||
|
|
||||||
|
def test_crash_still_penalized():
|
||||||
|
"""Crash (original_reward=-1) should remain -1, not improved by speed."""
|
||||||
|
env = MockStepEnv(speed=8.0, original_reward=-1.0, done=True)
|
||||||
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.2)
|
||||||
|
_, reward, _, _, _ = wrapped.step(0)
|
||||||
|
assert reward == -1.0, f"Crash reward should remain -1.0, got {reward}"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue