diff --git a/agent/multitrack_runner.py b/agent/multitrack_runner.py
index e324512..49d6d3a 100644
--- a/agent/multitrack_runner.py
+++ b/agent/multitrack_runner.py
@@ -294,27 +294,25 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
                      save_dir=None):
     """
     Train PPO across training tracks by round-robin switching every
     steps_per_switch steps.
+    Saves BOTH the latest checkpoint AND the best model seen during training.
-    Args:
-        model: PPO model (already set to first_env)
-        first_env: The first wrapped training env (already connected)
-        total_timesteps: Total training budget across all tracks
-        steps_per_switch: Steps per track segment before switching
-
-    Returns:
-        env: The last env used (caller must close it)
-        segment_rewards: List of (track_name, reward) for each completed segment
+    The best model is saved to save_dir/best_model.zip whenever a new high
+    segment reward is achieved. At the end, the best model weights are
+    reloaded so the returned model is the best seen, not just the final one.
     """
     env = first_env
     steps_done = 0
-    track_idx = 0  # Start on generated_road (first in TRAINING_TRACKS)
+    track_idx = 0
     segment_rewards = []
     health_cb = HealthCheckCallback()
+    best_segment_reward = float('-inf')
+    best_model_path = os.path.join(save_dir, 'best_model') if save_dir else None
 
     log(f'[W3 Runner] Starting multi-track training:')
     log(f'  Total timesteps : {total_timesteps:,}')
     log(f'  Steps per switch: {steps_per_switch:,}')
     log(f'  Training tracks : {[t[0] for t in TRAINING_TRACKS]}')
+    log(f'  Best model saved: {best_model_path}.zip')
     log(f'  Rotations       : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles')
 
     while steps_done < total_timesteps:
@@ -328,14 +326,12 @@
         # Train segment
         model.learn(
             total_timesteps=segment_steps,
-            reset_num_timesteps=False,  # Continuous timestep counter across segments
+            reset_num_timesteps=False,
             callback=health_cb,
         )
         steps_done += segment_steps
 
-        # --- Checkpoint after every segment ---
-        # If the trial is killed (timeout/crash) the latest model is always
-        # on disk so results are never completely lost.
+        # --- Save latest checkpoint (crash recovery) ---
         if save_dir:
             try:
                 os.makedirs(save_dir, exist_ok=True)
@@ -361,6 +357,18 @@
                     seg_reward = ep_reward
                 log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}')
                 segment_rewards.append((track_name, float(seg_reward)))
+
+                # --- Save BEST model if this segment beat the previous best ---
+                if seg_reward > best_segment_reward and best_model_path:
+                    best_segment_reward = seg_reward
+                    try:
+                        model.save(best_model_path)
+                        log(f'[W3 Runner] ⭐ NEW BEST model saved! '
+                            f'step={steps_done:,} reward={seg_reward:.2f} '
+                            f'track={track_name}')
+                    except Exception as e:
+                        log(f'[W3 Runner] WARNING: best model save failed: {e}')
+
             except Exception as e:
                 log(f'[W3 Runner][TRAIN] Segment eval failed: {e}')
                 segment_rewards.append((track_name, 0.0))
@@ -392,6 +400,19 @@
 
     log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across '
         f'{len(segment_rewards)} segments.')
+    log(f'[W3 Runner] Best segment reward during training: {best_segment_reward:.2f}')
+
+    # --- Reload the BEST model weights before returning ---
+    # The final model may have drifted from the best policy found mid-training.
+    # Always return the best checkpoint, not the last one.
+    if best_model_path and os.path.exists(best_model_path + '.zip'):
+        try:
+            log(f'[W3 Runner] Reloading best model from {best_model_path}.zip')
+            model = PPO.load(best_model_path, env=env, device='auto')
+            log(f'[W3 Runner] ✅ Best model reloaded (reward={best_segment_reward:.2f})')
+        except Exception as e:
+            log(f'[W3 Runner] WARNING: could not reload best model: {e}. Using final model.')
+
     return env, segment_rewards
 
diff --git a/agent/outerloop-results/autoresearch_phase2_log.txt b/agent/outerloop-results/autoresearch_phase2_log.txt
index 367a5d5..9c76714 100644
--- a/agent/outerloop-results/autoresearch_phase2_log.txt
+++ b/agent/outerloop-results/autoresearch_phase2_log.txt
@@ -736,3 +736,16 @@
 [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
 [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
 [2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-17 14:45:13] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-17 14:45:13]   UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
+[2026-04-17 14:45:13]   UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
+[2026-04-17 14:45:13]   UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
+[2026-04-17 14:45:13]   UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
+[2026-04-17 14:45:13]   UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-17 14:45:13] [AutoResearch] Only 1 results — using random proposal.
diff --git a/agent/outerloop-results/autoresearch_phase3_log.txt b/agent/outerloop-results/autoresearch_phase3_log.txt
index 21c4ac2..1ea86fb 100644
--- a/agent/outerloop-results/autoresearch_phase3_log.txt
+++ b/agent/outerloop-results/autoresearch_phase3_log.txt
@@ -395,3 +395,8 @@
 [2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
 [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
 [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
+[2026-04-17 14:45:26] [Wave3] Seed trial 1/2: using hardcoded params.
+[2026-04-17 14:45:26] [Wave3] Seed trial 2/2: using hardcoded params.
+[2026-04-17 14:45:26] [Wave3] Only 0 results — using random proposal.
+[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
+[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
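
For reviewers who want to exercise the save-best/reload-best pattern outside the runner, here is a minimal sketch. It assumes stable-baselines3 and gymnasium are installed and uses CartPole-v1 as a stand-in for the track envs; the segment count, step budget, and paths are illustrative, not the runner's actual values.

import os

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

save_dir = 'checkpoints'                       # illustrative path
os.makedirs(save_dir, exist_ok=True)
best_path = os.path.join(save_dir, 'best_model')
latest_path = os.path.join(save_dir, 'latest_model')

env = gym.make('CartPole-v1')                  # stand-in for a track env
model = PPO('MlpPolicy', env, verbose=0)

best_reward = float('-inf')
for segment in range(5):                       # stands in for the track rotation
    # Keep one continuous timestep counter across segments, as the runner does.
    model.learn(total_timesteps=2048, reset_num_timesteps=False)
    model.save(latest_path)                    # crash-recovery checkpoint: every segment
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=3)
    if mean_reward > best_reward:              # best checkpoint: only on improvement
        best_reward = mean_reward
        model.save(best_path)

# Reload the best weights so the policy returned is the best seen, not the last.
if os.path.exists(best_path + '.zip'):         # model.save() appends '.zip'
    model = PPO.load(best_path, env=env, device='auto')

The SB3 calls here (model.save, PPO.load with env= and device=, reset_num_timesteps=False, evaluate_policy) are the same ones the diff relies on; everything else is scaffolding.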