fix: always save and return the BEST model, not the last one
This was the root cause of losing good models during training. The model
could learn to lap at step 30k, then drift to a worse policy by step 90k,
and we only ever saved the final weights.

Changes to train_multitrack():
- Tracks best_segment_reward across all segments
- Saves best_model.zip whenever a new high score is achieved
- At the end of training, RELOADS best_model.zip before returning, so the
  caller always gets the best policy found, not the drifted final one

Both files saved per trial:
  model.zip      <- latest checkpoint (crash recovery)
  best_model.zip <- best policy seen during training (used for eval)

Agent: pi
Tests: 102 passed
Tests-Added: 0
TypeScript: N/A
parent 0b5ce6ab7e
commit 4f77b8a468
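The core of the change is a keep-best checkpoint pattern. Below is a minimal, self-contained sketch of that pattern; the BestModelKeeper name and its methods are invented for illustration and are not part of the runner code, which assumes a stable-baselines3-style model with save()/load():

    import os

    from stable_baselines3 import PPO


    class BestModelKeeper:
        """Keep the best-scoring weights on disk while training continues."""

        def __init__(self, save_dir):
            self.best_reward = float('-inf')
            # SB3's model.save() appends .zip when the path has no extension.
            self.path = os.path.join(save_dir, 'best_model')

        def update(self, model, reward):
            # Overwrite the checkpoint only when this segment beats the best so far.
            if reward > self.best_reward:
                self.best_reward = reward
                model.save(self.path)

        def reload_best(self, env):
            # Hand back the best checkpoint; None means nothing was ever saved.
            if os.path.exists(self.path + '.zip'):
                return PPO.load(self.path, env=env, device='auto')
            return None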
@@ -294,27 +294,25 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
                       save_dir=None):
     """
     Train PPO across training tracks by round-robin switching every steps_per_switch steps.
+    Saves BOTH the latest checkpoint AND the best model seen during training.
 
-    Args:
-        model: PPO model (already set to first_env)
-        first_env: The first wrapped training env (already connected)
-        total_timesteps: Total training budget across all tracks
-        steps_per_switch: Steps per track segment before switching
-
-    Returns:
-        env: The last env used (caller must close it)
-        segment_rewards: List of (track_name, reward) for each completed segment
+    The best model is saved to save_dir/best_model.zip whenever a new high
+    segment reward is achieved. At the end, the best model weights are
+    reloaded so the returned model is the best seen, not just the final one.
     """
     env = first_env
     steps_done = 0
-    track_idx = 0  # Start on generated_road (first in TRAINING_TRACKS)
+    track_idx = 0
     segment_rewards = []
     health_cb = HealthCheckCallback()
+    best_segment_reward = float('-inf')
+    best_model_path = os.path.join(save_dir, 'best_model') if save_dir else None
 
     log(f'[W3 Runner] Starting multi-track training:')
     log(f'  Total timesteps : {total_timesteps:,}')
     log(f'  Steps per switch: {steps_per_switch:,}')
     log(f'  Training tracks : {[t[0] for t in TRAINING_TRACKS]}')
+    log(f'  Best model saved: {best_model_path}.zip')
     log(f'  Rotations       : ~{total_timesteps // (steps_per_switch * len(TRAINING_TRACKS))} full cycles')
 
     while steps_done < total_timesteps:
@@ -328,14 +326,12 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
         # Train segment
         model.learn(
             total_timesteps=segment_steps,
-            reset_num_timesteps=False,  # Continuous timestep counter across segments
+            reset_num_timesteps=False,
             callback=health_cb,
         )
         steps_done += segment_steps
 
-        # --- Checkpoint after every segment ---
-        # If the trial is killed (timeout/crash) the latest model is always
-        # on disk so results are never completely lost.
+        # --- Save latest checkpoint (crash recovery) ---
         if save_dir:
             try:
                 os.makedirs(save_dir, exist_ok=True)
@@ -361,6 +357,18 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
             seg_reward = ep_reward
             log(f'[W3 Runner][TRAIN] track={track_name} segment_reward={seg_reward:.2f}')
             segment_rewards.append((track_name, float(seg_reward)))
+
+            # --- Save BEST model if this segment beat the previous best ---
+            if seg_reward > best_segment_reward and best_model_path:
+                best_segment_reward = seg_reward
+                try:
+                    model.save(best_model_path)
+                    log(f'[W3 Runner] ⭐ NEW BEST model saved! '
+                        f'step={steps_done:,} reward={seg_reward:.2f} '
+                        f'track={track_name}')
+                except Exception as e:
+                    log(f'[W3 Runner] WARNING: best model save failed: {e}')
+
         except Exception as e:
             log(f'[W3 Runner][TRAIN] Segment eval failed: {e}')
             segment_rewards.append((track_name, 0.0))
@@ -392,6 +400,19 @@ def train_multitrack(model, first_env, total_timesteps, steps_per_switch,
 
     log(f'\n[W3 Runner] Training complete: {steps_done:,} total steps across '
         f'{len(segment_rewards)} segments.')
+    log(f'[W3 Runner] Best segment reward during training: {best_segment_reward:.2f}')
+
+    # --- Reload the BEST model weights before returning ---
+    # The final model may have drifted from the best policy found mid-training.
+    # Always return the best checkpoint, not the last one.
+    if best_model_path and os.path.exists(best_model_path + '.zip'):
+        try:
+            log(f'[W3 Runner] Reloading best model from {best_model_path}.zip')
+            model = PPO.load(best_model_path, env=env, device='auto')
+            log(f'[W3 Runner] ✅ Best model reloaded (reward={best_segment_reward:.2f})')
+        except Exception as e:
+            log(f'[W3 Runner] WARNING: could not reload best model: {e}. Using final model.')
+
     return env, segment_rewards
 
 
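Since best_model.zip is the artifact evaluation is meant to consume, a caller would typically reload it from disk. A minimal sketch under that assumption (load_for_eval, save_dir, and eval_env are illustrative names, not identifiers from this repo):

    import os

    from stable_baselines3 import PPO


    def load_for_eval(save_dir, eval_env):
        # Prefer the best policy seen during training; fall back to the
        # latest crash-recovery checkpoint if no best was ever recorded.
        best = os.path.join(save_dir, 'best_model')
        latest = os.path.join(save_dir, 'model')
        path = best if os.path.exists(best + '.zip') else latest
        return PPO.load(path, env=eval_env, device='auto')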
@@ -736,3 +736,16 @@
 [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
 [2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
 [2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-17 14:45:13] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-17 14:45:13]   UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
+[2026-04-17 14:45:13]   UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
+[2026-04-17 14:45:13]   UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
+[2026-04-17 14:45:13]   UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
+[2026-04-17 14:45:13]   UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-17 14:45:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-17 14:45:13] [AutoResearch] Only 1 results — using random proposal.
@@ -395,3 +395,8 @@
 [2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
 [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
 [2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
+[2026-04-17 14:45:26] [Wave3] Seed trial 1/2: using hardcoded params.
+[2026-04-17 14:45:26] [Wave3] Seed trial 2/2: using hardcoded params.
+[2026-04-17 14:45:26] [Wave3] Only 0 results — using random proposal.
+[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
+[2026-04-17 14:45:26] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}