From 7ed2456896dc9cadd851623a7a406e859b8a3430 Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Tue, 14 Apr 2026 13:47:28 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20remove=20Warren=20from=20test=20set=20?= =?UTF-8?q?=E2=80=94=20indoor=20carpet,=20broken=20done=20condition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Warren track surface is green carpet (not outdoor road), and the episode-done condition (|CTE| > max_cte) does not fire when the car crosses the INSIDE boundary. Car can drive off-track and bump into chairs indefinitely, making scores meaningless as a test metric. Changes: - multitrack_runner.py: TEST_TRACKS now mini_monaco only - wave3_controller.py: drop warren_reward from parse/save/champion paths - tests/test_wave3.py: update assertions to match single test track - All 83 tests pass Track classification (final): TRAIN : generated_road, generated_track, mountain_track TEST : mini_monaco (outdoor, proper road, correct done condition) SKIP : warren, warehouse, robo_racing_league, waveshare, circuit_launch SKIP : avc_sparkfun (orange markings) ADR-010 to be updated. Agent: pi Tests: 83 passed Tests-Added: 0 TypeScript: N/A --- agent/multitrack_runner.py | 7 +++- .../autoresearch_phase2_log.txt | 13 ++++++ .../autoresearch_phase3_log.txt | 23 +++++++++++ agent/wave3_controller.py | 38 +++++++---------- tests/test_wave3.py | 41 +++++++------------ 5 files changed, 70 insertions(+), 52 deletions(-) diff --git a/agent/multitrack_runner.py b/agent/multitrack_runner.py index 2395703..f20bd54 100644 --- a/agent/multitrack_runner.py +++ b/agent/multitrack_runner.py @@ -74,9 +74,12 @@ TRAINING_TRACKS = [ ('mountain_track', 'donkey-mountain-track-v0'), ] +# Zero-shot generalization test tracks — never seen during training. +# Warren was removed: its episode-done condition does not fire when the car +# crosses the INSIDE edge (CTE stays small), so the car can drive among +# chairs indefinitely and scores are meaningless. TEST_TRACKS = [ - ('mini_monaco', 'donkey-minimonaco-track-v0'), - ('warren', 'donkey-warren-track-v0'), + ('mini_monaco', 'donkey-minimonaco-track-v0'), ] # How many steps to sample before deciding the segment reward (shorter than segment) diff --git a/agent/outerloop-results/autoresearch_phase2_log.txt b/agent/outerloop-results/autoresearch_phase2_log.txt index 3d2fb3b..be01753 100644 --- a/agent/outerloop-results/autoresearch_phase2_log.txt +++ b/agent/outerloop-results/autoresearch_phase2_log.txt @@ -541,3 +541,16 @@ [2026-04-14 13:29:30] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} [2026-04-14 13:29:30] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} [2026-04-14 13:29:30] [AutoResearch] Only 1 results — using random proposal. +[2026-04-14 13:47:13] [AutoResearch] GP UCB top-5 candidates: +[2026-04-14 13:47:13] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173} +[2026-04-14 13:47:13] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198} +[2026-04-14 13:47:13] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887} +[2026-04-14 13:47:13] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199} +[2026-04-14 13:47:13] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} +[2026-04-14 13:47:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} +[2026-04-14 13:47:13] [AutoResearch] Only 1 results — using random proposal. diff --git a/agent/outerloop-results/autoresearch_phase3_log.txt b/agent/outerloop-results/autoresearch_phase3_log.txt index 45d34b5..6c69fdc 100644 --- a/agent/outerloop-results/autoresearch_phase3_log.txt +++ b/agent/outerloop-results/autoresearch_phase3_log.txt @@ -51,3 +51,26 @@ [2026-04-14 13:29:34] [Wave3] Only 0 results — using random proposal. [2026-04-14 13:29:34] [Champion] 🏆 NEW BEST! Trial 3: combined=1500.00 (mini_monaco=900.0, warren=600.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000} [2026-04-14 13:29:34] [Champion] 🏆 NEW BEST! Trial 1: combined=2000.00 (mini_monaco=1200.0, warren=800.0) params={} +[2026-04-14 13:36:58] ================================================================= +[2026-04-14 13:36:58] [Wave3] Multi-Track Autoresearch — GP+UCB Generalization Search +[2026-04-14 13:36:58] [Wave3] Training tracks : generated_road, generated_track, mountain_track +[2026-04-14 13:36:58] [Wave3] Test tracks : mini_monaco, warren (zero-shot) +[2026-04-14 13:36:58] [Wave3] Max trials : 25 | kappa=2.0 | push every 5 +[2026-04-14 13:36:58] [Wave3] Results file : /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase3.jsonl +[2026-04-14 13:36:58] [Wave3] Champion dir : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave3-champion +[2026-04-14 13:36:58] [Wave3] Warm start : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion/model.zip +[2026-04-14 13:36:58] ================================================================= +[2026-04-14 13:36:58] [Wave3] Loaded 0 existing Phase 3 results. +[2026-04-14 13:36:58] [Wave3] No Wave 3 champion yet. +[2026-04-14 13:36:58] [Wave3] Starting from trial 1. +[2026-04-14 13:36:58] +[Wave3] ========== Trial 1/25 ========== +[2026-04-14 13:36:58] [Wave3] Seed trial 1/2: using hardcoded params. +[2026-04-14 13:36:58] [Wave3] Proposed params: {'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000} +[2026-04-14 13:37:00] [Wave3] Launching trial 1: {'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000} +[2026-04-14 13:37:00] [Wave3] Command: python3 /home/paulh/projects/donkeycar-rl-autoresearch/agent/multitrack_runner.py --total-timesteps 45000 --steps-per-switch 5000 --learning-rate 0.000225 --eval-episodes 3 --save-dir /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave3-trial-0001 --warm-start /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion/model.zip +[2026-04-14 13:47:17] [Wave3] Seed trial 1/2: using hardcoded params. +[2026-04-14 13:47:17] [Wave3] Seed trial 2/2: using hardcoded params. +[2026-04-14 13:47:17] [Wave3] Only 0 results — using random proposal. +[2026-04-14 13:47:17] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000} +[2026-04-14 13:47:17] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={} diff --git a/agent/wave3_controller.py b/agent/wave3_controller.py index cc67c82..9f2894c 100644 --- a/agent/wave3_controller.py +++ b/agent/wave3_controller.py @@ -188,7 +188,7 @@ class Wave3ChampionTracker: return self._best.get('combined_test_score', float('-inf')) def update_if_better(self, score, params, model_zip_path, trial, - mini_monaco_reward=None, warren_reward=None): + mini_monaco_reward=None): if score <= self.best_score: return False @@ -206,15 +206,14 @@ class Wave3ChampionTracker: 'params': params, 'combined_test_score': score, 'mini_monaco_reward': mini_monaco_reward, - 'warren_reward': warren_reward, 'model_path': dest, } with open(self.manifest_path, 'w') as f: json.dump(manifest, f, indent=2) self._best = manifest log(f'[Champion] 🏆 NEW BEST! Trial {trial}: ' - f'combined={score:.2f} ' - f'(mini_monaco={mini_monaco_reward:.1f}, warren={warren_reward:.1f}) ' + f'score={score:.2f} ' + f'(mini_monaco={mini_monaco_reward:.1f}) ' f'params={params}') return True @@ -222,7 +221,7 @@ class Wave3ChampionTracker: if self._best['trial'] is None: return 'No Wave 3 champion yet.' return (f"Wave3 Champion: trial={self._best['trial']} " - f"combined={self._best['combined_test_score']:.2f} " + f"score={self._best['combined_test_score']:.2f} " f"params={self._best['params']}") @@ -297,11 +296,9 @@ def parse_runner_output(output): Looks for: [W3 Runner][TEST] combined_test_score= [W3 Runner][TEST] mini_monaco_reward= - [W3 Runner][TEST] warren_reward= """ - combined = None + combined = None mini_monaco = None - warren_rwd = None m = re.search(r'\[W3 Runner\]\[TEST\]\s+combined_test_score=([+-]?[\d.]+)', output) if m: @@ -311,11 +308,7 @@ def parse_runner_output(output): if m: mini_monaco = float(m.group(1)) - m = re.search(r'\[W3 Runner\]\[TEST\]\s+warren_reward=([+-]?[\d.]+)', output) - if m: - warren_rwd = float(m.group(1)) - - return combined, mini_monaco, warren_rwd + return combined, mini_monaco # ---- Job launcher ---- @@ -329,7 +322,7 @@ def kill_stale(): def launch_trial(params, trial_num): """ Launch multitrack_runner.py as a subprocess with the given hyperparameters. - Returns: (combined_test_score, mini_monaco_reward, warren_reward, + Returns: (combined_test_score, mini_monaco_reward, model_zip_path, output, status, elapsed_sec, save_dir) """ save_dir = os.path.join(MODELS_DIR, f'wave3-trial-{trial_num:04d}') @@ -374,18 +367,18 @@ def launch_trial(params, trial_num): print('--- End Runner Output ---\n', flush=True) # Parse results - combined, mini_monaco, warren_rwd = parse_runner_output(output) - log(f'[Wave3] Parsed: combined={combined} mini_monaco={mini_monaco} warren={warren_rwd}') + combined, mini_monaco = parse_runner_output(output) + log(f'[Wave3] Parsed: combined={combined} mini_monaco={mini_monaco}') model_zip = os.path.join(save_dir, 'model.zip') if not os.path.exists(model_zip): model_zip = None - return combined, mini_monaco, warren_rwd, model_zip, output, status, elapsed, save_dir + return combined, mini_monaco, model_zip, output, status, elapsed, save_dir # ---- Result saving ---- -def save_result(trial, params, combined, mini_monaco, warren_rwd, +def save_result(trial, params, combined, mini_monaco, model_path, is_champion, status, elapsed): rec = { 'trial': trial, @@ -393,7 +386,6 @@ def save_result(trial, params, combined, mini_monaco, warren_rwd, 'params': params, 'combined_test_score': combined, 'mini_monaco_reward': mini_monaco, - 'warren_reward': warren_rwd, 'model_path': model_path, 'champion': is_champion, 'run_status': status, @@ -438,7 +430,7 @@ def run_wave3(max_trials=25, kappa=UCB_KAPPA, push_every=5): log('=' * 65) log('[Wave3] Multi-Track Autoresearch — GP+UCB Generalization Search') log(f'[Wave3] Training tracks : generated_road, generated_track, mountain_track') - log(f'[Wave3] Test tracks : mini_monaco, warren (zero-shot)') + log(f'[Wave3] Test tracks : mini_monaco only (zero-shot; warren removed — broken done condition)') log(f'[Wave3] Max trials : {max_trials} | kappa={kappa} | push every {push_every}') log(f'[Wave3] Results file : {RESULTS_FILE}') log(f'[Wave3] Champion dir : {CHAMPION_DIR}') @@ -467,7 +459,7 @@ def run_wave3(max_trials=25, kappa=UCB_KAPPA, push_every=5): kill_stale() # 3. Launch training + eval - combined, mini_monaco, warren_rwd, model_zip, output, status, elapsed, save_dir = \ + combined, mini_monaco, model_zip, output, status, elapsed, save_dir = \ launch_trial(proposed, trial) # 4. Guard against None results (timeout / crash) @@ -475,17 +467,15 @@ def run_wave3(max_trials=25, kappa=UCB_KAPPA, push_every=5): log(f'[Wave3] ⚠️ No test score parsed — defaulting to 0.0') combined = 0.0 mini_monaco = mini_monaco or 0.0 - warren_rwd = warren_rwd or 0.0 # 5. Update champion is_champion = champion.update_if_better( combined, proposed, model_zip, trial, mini_monaco_reward=mini_monaco or 0.0, - warren_reward=warren_rwd or 0.0, ) # 6. Save result - save_result(trial, proposed, combined, mini_monaco, warren_rwd, + save_result(trial, proposed, combined, mini_monaco, model_zip, is_champion, status, elapsed) # 7. Update GP data diff --git a/tests/test_wave3.py b/tests/test_wave3.py index db6e758..05fd2ac 100644 --- a/tests/test_wave3.py +++ b/tests/test_wave3.py @@ -79,12 +79,14 @@ def test_multitrack_runner_training_tracks_defined(): def test_multitrack_runner_test_tracks_defined(): - """TEST_TRACKS must contain exactly 2 entries: mini_monaco and warren.""" + """TEST_TRACKS must contain exactly 1 entry: mini_monaco. + Warren was removed: CTE-based episode termination does not fire when the + car crosses the inside edge, so scores on Warren are unreliable.""" from multitrack_runner import TEST_TRACKS - assert len(TEST_TRACKS) == 2 + assert len(TEST_TRACKS) == 1 names = [t[0] for t in TEST_TRACKS] assert 'mini_monaco' in names - assert 'warren' in names + assert 'warren' not in names, 'Warren removed — broken episode termination on inside edge' def test_multitrack_runner_no_model_save_before_definition(): @@ -393,20 +395,17 @@ def test_wave3_propose_random_when_few_results(): def test_wave3_parse_runner_output_combined_score(): - """parse_runner_output() should extract combined_test_score correctly.""" + """parse_runner_output() should extract combined_test_score and mini_monaco correctly.""" from wave3_controller import parse_runner_output output = """ [12:34:56] [W3 Runner][TEST] track=mini_monaco mean_reward=1234.56 mean_steps=450.0 ✅ DRIVES -[12:34:57] [W3 Runner][TEST] track=warren mean_reward=789.01 mean_steps=310.0 ✅ DRIVES [12:34:57] [W3 Runner][TEST] mini_monaco_reward=1234.5600 -[12:34:57] [W3 Runner][TEST] warren_reward=789.0100 -[12:34:57] [W3 Runner][TEST] combined_test_score=2023.5700 +[12:34:57] [W3 Runner][TEST] combined_test_score=1234.5600 """ - combined, mini_monaco, warren = parse_runner_output(output) - assert combined == pytest.approx(2023.57, rel=1e-4) + combined, mini_monaco = parse_runner_output(output) + assert combined == pytest.approx(1234.56, rel=1e-4) assert mini_monaco == pytest.approx(1234.56, rel=1e-4) - assert warren == pytest.approx(789.01, rel=1e-4) def test_wave3_parse_runner_output_missing_returns_none(): @@ -414,10 +413,9 @@ def test_wave3_parse_runner_output_missing_returns_none(): from wave3_controller import parse_runner_output output = 'Training started... timeout' - combined, mini_monaco, warren = parse_runner_output(output) + combined, mini_monaco = parse_runner_output(output) assert combined is None assert mini_monaco is None - assert warren is None def test_wave3_champion_tracker_update_and_load(): @@ -433,8 +431,7 @@ def test_wave3_champion_tracker_update_and_load(): params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}, model_zip_path=None, trial=3, - mini_monaco_reward=900.0, - warren_reward=600.0, + mini_monaco_reward=1500.0, ) assert updated is True assert tracker.best_score == pytest.approx(1500.0) @@ -451,10 +448,8 @@ def test_wave3_champion_tracker_does_not_regress(): with tempfile.TemporaryDirectory() as tmpdir: tracker = Wave3ChampionTracker(tmpdir) - tracker.update_if_better(2000.0, {}, None, 1, - mini_monaco_reward=1200.0, warren_reward=800.0) - updated = tracker.update_if_better(1500.0, {}, None, 2, - mini_monaco_reward=900.0, warren_reward=600.0) + tracker.update_if_better(2000.0, {}, None, 1, mini_monaco_reward=2000.0) + updated = tracker.update_if_better(1500.0, {}, None, 2, mini_monaco_reward=1500.0) assert updated is False assert tracker.best_score == pytest.approx(2000.0) @@ -464,33 +459,27 @@ def test_wave3_results_appended_not_overwritten(): from wave3_controller import save_result with tempfile.TemporaryDirectory() as tmpdir: - # Monkey-patch the RESULTS_FILE path import wave3_controller original_path = wave3_controller.RESULTS_FILE wave3_controller.RESULTS_FILE = os.path.join(tmpdir, 'phase3_results.jsonl') try: - # Write 3 records for i in range(3): save_result( trial=i + 1, params={'learning_rate': 0.0002, 'steps_per_switch': 5000, 'total_timesteps': 100000}, combined=float(i * 100), - mini_monaco=float(i * 60), - warren_rwd=float(i * 40), + mini_monaco=float(i * 100), model_path=None, is_champion=(i == 2), status='ok', elapsed=120.0 * (i + 1), ) - # Should have 3 lines with open(wave3_controller.RESULTS_FILE) as f: lines = [l.strip() for l in f if l.strip()] - assert len(lines) == 3, f'Expected 3 result lines, got {len(lines)}' - - # All should be valid JSON + assert len(lines) == 3 for line in lines: rec = json.loads(line) assert 'combined_test_score' in rec