fix: always save and return the BEST model, not the last one

This was the root cause of losing good models during training.
The model could learn to lap at step 30k then drift to a worse
policy by step 90k, and we only ever saved the final weights.

Changes to train_multitrack():
- Tracks best_segment_reward across all segments
- Saves best_model.zip whenever a new high score is achieved
- At end of training, RELOADS best_model.zip before returning
  so the caller always gets the best policy found, not the
  drifted final weights

Both files saved per trial:
  model.zip      <- latest checkpoint (crash recovery)
  best_model.zip <- best policy seen during training (used for eval)

Agent: pi
Tests: 102 passed
Tests-Added: 0
TypeScript: N/A
This commit is contained in:
Paul Huliganga 2026-04-17 14:45:37 -04:00
parent 0b5ce6ab7e
commit 4f77b8a468
3 changed files with 53 additions and 14 deletions

View File

@ -294,27 +294,25 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
save_dir=None): save_dir=None):
""" """
Train PPO across training tracks by round-robin switching every steps_per_switch steps. Train PPO across training tracks by round-robin switching every steps_per_switch steps.
Saves BOTH the latest checkpoint AND the best model seen during training.
Args: The best model is saved to save_dir/best_model.zip whenever a new high
model: PPO model (already set to first_env) segment reward is achieved. At the end, the best model weights are
first_env: The first wrapped training env (already connected) reloaded so the returned model is the best seen, not just the final one.
total_timesteps: Total training budget across all tracks
steps_per_switch: Steps per track segment before switching
Returns:
env: The last env used (caller must close it)
segment_rewards: List of (track_name, reward) for each completed segment
""" """
env = first_env env = first_env
steps_done = 0 steps_done = 0
track_idx = 0 # Start on generated_road (first in TRAINING_TRACKS) track_idx = 0
segment_rewards = [] segment_rewards = []
health_cb = HealthCheckCallback() health_cb = HealthCheckCallback()
best_segment_reward = float('-inf')
best_model_path = os.path.join(save_dir, 'best_model') if save_dir else None
log(f'[W3 Runner] Starting multi-track training:') log(f'[W3 Runner] Starting multi-track training:')
log(f' Total timesteps : {total_timesteps:,}') log(f' Total timesteps : {total_timesteps:,}')
log(f' Steps per switch: {steps_per_switch:,}') log(f' Steps per switch: {steps_per_switch:,}')
log(f' Training tracks : {[t[0] for t in TRAINING_TRACKS]}') log(f' Training tracks : {[t[0] for t in TRAINING_TRACKS]}')
log(f' Best model saved: {best_model_path}.zip')
log(f' Rotations : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles') log(f' Rotations : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles')
while steps_done < total_timesteps: while steps_done < total_timesteps:
@ -328,14 +326,12 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
# Train segment # Train segment
model.learn( model.learn(
total_timesteps=segment_steps, total_timesteps=segment_steps,
reset_num_timesteps=False, # Continuous timestep counter across segments reset_num_timesteps=False,
callback=health_cb, callback=health_cb,
) )
steps_done += segment_steps steps_done += segment_steps
# --- Checkpoint after every segment --- # --- Save latest checkpoint (crash recovery) ---
# If the trial is killed (timeout/crash) the latest model is always
# on disk so results are never completely lost.
if save_dir: if save_dir:
try: try:
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
@ -361,6 +357,18 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
seg_reward = ep_reward seg_reward = ep_reward
log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}') log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}')
segment_rewards.append((track_name, float(seg_reward))) segment_rewards.append((track_name, float(seg_reward)))
# --- Save BEST model if this segment beat the previous best ---
if seg_reward > best_segment_reward and best_model_path:
best_segment_reward = seg_reward
try:
model.save(best_model_path)
log(f'[W3 Runner] ⭐ NEW BEST model saved! '
f'step={steps_done:,} reward={seg_reward:.2f} '
f'track={track_name}')
except Exception as e:
log(f'[W3 Runner] WARNING: best model save failed: {e}')
except Exception as e: except Exception as e:
log(f'[W3 Runner][TRAIN] Segment eval failed: {e}') log(f'[W3 Runner][TRAIN] Segment eval failed: {e}')
segment_rewards.append((track_name, 0.0)) segment_rewards.append((track_name, 0.0))
@ -392,6 +400,19 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across ' log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across '
f'{len(segment_rewards)} segments.') f'{len(segment_rewards)} segments.')
log(f'[W3 Runner] Best segment reward during training: {best_segment_reward:.2f}')
# --- Reload the BEST model weights before returning ---
# The final model may have drifted from the best policy found mid-training.
# Always return the best checkpoint, not the last one.
if best_model_path and os.path.exists(best_model_path + '.zip'):
try:
log(f'[W3 Runner] Reloading best model from {best_model_path}.zip')
model = PPO.load(best_model_path, env=env, device='auto')
log(f'[W3 Runner] ✅ Best model reloaded (reward={best_segment_reward:.2f})')
except Exception as e:
log(f'[W3 Runner] WARNING: could not reload best model: {e}. Using final model.')
return env, segment_rewards return env, segment_rewards

View File

@ -736,3 +736,16 @@
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
[2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal. [2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.
[2026-04-17 14:45:13] [AutoResearch] GP UCB top-5 candidates:
[2026-04-17 14:45:13] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
[2026-04-17 14:45:13] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
[2026-04-17 14:45:13] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
[2026-04-17 14:45:13] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
[2026-04-17 14:45:13] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
[2026-04-17 14:45:13] [AutoResearch] Only 1 results — using random proposal.

View File

@ -395,3 +395,8 @@
[2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal. [2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000} [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={} [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
[2026-04-17 14:45:26] [Wave3] Seed trial 1/2: using hardcoded params.
[2026-04-17 14:45:26] [Wave3] Seed trial 2/2: using hardcoded params.
[2026-04-17 14:45:26] [Wave3] Only 0 results — using random proposal.
[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}