fix: complete LR override — must patch lr_schedule, not just param_groups
PPO.load() bakes lr_schedule=FloatSchedule(saved_lr) into the model. train()
calls _update_learning_rate(), which reads lr_schedule, not
model.learning_rate. So even with param_groups patched, the first gradient
step reverts the optimizer to the saved LR.

Complete 3-part fix in create_or_load_model():

    model.learning_rate = lr                 # attribute
    model.lr_schedule = get_schedule_fn(lr)  # prevents train() reverting
    for pg in optimizer.param_groups:
        pg['lr'] = lr                        # immediate effect

Also:
- SEED_PARAMS: second seed now uses LR=0.001 (was 0.000225) so the GP starts
  with real LR diversity instead of two identical seeds
- tests/test_end_to_end.py: 13 new tests covering the full LR override path,
  including a live learn() call; would have caught both bugs
- Phase 3 results re-cleared (seed trial 1 ran with the half-fix)
- 96 tests total, all passing

Agent: pi
Tests: 96 passed
Tests-Added: 13
TypeScript: N/A
This commit is contained in:
parent
298cd1790a
commit
650f893d2d
|
|
@ -56,6 +56,7 @@ from datetime import datetime
|
||||||
import gymnasium as gym
|
import gymnasium as gym
|
||||||
import gym_donkeycar
|
import gym_donkeycar
|
||||||
from stable_baselines3 import PPO
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.utils import get_schedule_fn
|
||||||
from stable_baselines3.common.evaluation import evaluate_policy
|
from stable_baselines3.common.evaluation import evaluate_policy
|
||||||
from stable_baselines3.common.callbacks import BaseCallback
|
from stable_baselines3.common.callbacks import BaseCallback
|
||||||
|
|
||||||
|
|
@ -202,16 +203,19 @@ def create_or_load_model(env, learning_rate, warm_start_path=None, seed=None):
|
||||||
log(f'[W3 Runner] Loading warm-start model from {warm_start_path}')
|
log(f'[W3 Runner] Loading warm-start model from {warm_start_path}')
|
||||||
try:
|
try:
|
||||||
model = PPO.load(warm_start_path, env=env, device='auto')
|
model = PPO.load(warm_start_path, env=env, device='auto')
|
||||||
# Override learning rate — set BOTH the SB3 attribute AND the
|
# Three-part LR override required after PPO.load():
|
||||||
# optimizer param groups. PPO.load() restores the saved optimizer
|
# 1. model.learning_rate — Python attribute (used to recreate lr_schedule)
|
||||||
# state (lr=0.000225 from Phase 2), so just setting model.learning_rate
|
# 2. model.lr_schedule — FloatSchedule used by _update_learning_rate()
|
||||||
# is not enough — the optimizer ignores it until _update_learning_rate
|
# during every train() call. Without this,
|
||||||
# is called, and even then only if it reads the attribute correctly.
|
# _update_learning_rate() reverts the optimizer
|
||||||
|
# back to the saved LR on the first gradient step.
|
||||||
|
# 3. optimizer param_groups — immediate effect before first train()
|
||||||
model.learning_rate = learning_rate
|
model.learning_rate = learning_rate
|
||||||
|
model.lr_schedule = get_schedule_fn(learning_rate)
|
||||||
for pg in model.policy.optimizer.param_groups:
|
for pg in model.policy.optimizer.param_groups:
|
||||||
pg['lr'] = learning_rate
|
pg['lr'] = learning_rate
|
||||||
log(f'[W3 Runner] ✅ Warm start loaded. LR overridden to {learning_rate:.6f} '
|
log(f'[W3 Runner] ✅ Warm start loaded. LR overridden to {learning_rate:.6f} '
|
||||||
f'(set on model + {len(model.policy.optimizer.param_groups)} optimizer param group(s))')
|
f'(model + lr_schedule + {len(model.policy.optimizer.param_groups)} optimizer param group(s))')
|
||||||
return model
|
return model
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log(f'[W3 Runner] ⚠️ Warm start failed ({e}), training from scratch.')
|
log(f'[W3 Runner] ⚠️ Warm start failed ({e}), training from scratch.')
|
||||||
|
|
|
||||||
|
|
@ -567,3 +567,16 @@
|
||||||
[2026-04-14 20:37:35] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
[2026-04-14 20:37:35] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
[2026-04-14 20:37:35] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
[2026-04-14 20:37:35] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
[2026-04-14 20:37:35] [AutoResearch] Only 1 results — using random proposal.
|
[2026-04-14 20:37:35] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-14 21:27:08] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-14 21:27:08] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
|
||||||
|
[2026-04-14 21:27:08] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
|
||||||
|
[2026-04-14 21:27:08] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
|
||||||
|
[2026-04-14 21:27:08] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
|
||||||
|
[2026-04-14 21:27:08] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-14 21:27:08] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-14 21:27:08] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
|
|
||||||
|
|
@ -281,3 +281,26 @@
|
||||||
[2026-04-14 20:37:40] [Wave3] Only 0 results — using random proposal.
|
[2026-04-14 20:37:40] [Wave3] Only 0 results — using random proposal.
|
||||||
[2026-04-14 20:37:40] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
[2026-04-14 20:37:40] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
||||||
[2026-04-14 20:37:40] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
[2026-04-14 20:37:40] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
||||||
|
[2026-04-14 20:37:55] =================================================================
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Multi-Track Autoresearch — GP+UCB Generalization Search
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Training tracks : generated_road, generated_track, mountain_track
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Test tracks : mini_monaco only (zero-shot; warren removed — broken done condition)
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Max trials : 25 | kappa=2.0 | push every 5
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Results file : /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase3.jsonl
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Champion dir : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave3-champion
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Warm start : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion/model.zip
|
||||||
|
[2026-04-14 20:37:55] =================================================================
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Loaded 0 existing Phase 3 results.
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Wave3 Champion: trial=5 score=137.58 params={'learning_rate': 0.0008293130840877947, 'steps_per_switch': 7847, 'total_timesteps': 31625}
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Starting from trial 1.
|
||||||
|
[2026-04-14 20:37:55]
|
||||||
|
[Wave3] ========== Trial 1/25 ==========
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Seed trial 1/2: using hardcoded params.
|
||||||
|
[2026-04-14 20:37:55] [Wave3] Proposed params: {'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000}
|
||||||
|
[2026-04-14 20:37:57] [Wave3] Launching trial 1: {'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000}
|
||||||
|
[2026-04-14 20:37:57] [Wave3] Command: python3 /home/paulh/projects/donkeycar-rl-autoresearch/agent/multitrack_runner.py --total-timesteps 45000 --steps-per-switch 5000 --learning-rate 0.000225 --eval-episodes 3 --save-dir /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave3-trial-0001 --warm-start /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion/model.zip
|
||||||
|
[2026-04-14 21:27:21] [Wave3] Seed trial 1/2: using hardcoded params.
|
||||||
|
[2026-04-14 21:27:21] [Wave3] Seed trial 2/2: using hardcoded params.
|
||||||
|
[2026-04-14 21:27:21] [Wave3] Only 0 results — using random proposal.
|
||||||
|
[2026-04-14 21:27:21] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
||||||
|
[2026-04-14 21:27:21] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
||||||
|
|
|
||||||
|
|
@ -81,10 +81,10 @@ JOB_TIMEOUT = 7200 # 2h — 400k steps on CPU may need time
|
||||||
# ---- Seed trials near Phase 2 champion ----
|
# ---- Seed trials near Phase 2 champion ----
|
||||||
# GP warm-up: first 2 trials use known-good parameters so GP has real prior data
|
# GP warm-up: first 2 trials use known-good parameters so GP has real prior data
|
||||||
SEED_PARAMS = [
|
SEED_PARAMS = [
|
||||||
# 3 full rotations through all 3 training tracks (~35 min per trial)
|
# Low LR (same as Phase 2 champion) — baseline, ~35 min per trial
|
||||||
{'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000},
|
{'learning_rate': 0.000225, 'steps_per_switch': 5000, 'total_timesteps': 45000},
|
||||||
# Slower switching, more time per track (~45 min per trial)
|
# High LR — tests whether faster adaptation generalises better, ~35 min
|
||||||
{'learning_rate': 0.000225, 'steps_per_switch': 10000, 'total_timesteps': 90000},
|
{'learning_rate': 0.001000, 'steps_per_switch': 5000, 'total_timesteps': 45000},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,426 @@
|
||||||
|
"""
|
||||||
|
End-to-end pipeline tests — no live simulator required.
|
||||||
|
|
||||||
|
These tests exist to catch integration-level bugs that unit tests miss.
|
||||||
|
The LR-override bug (Wave 3: all trials silently ran at LR=0.000225) was
|
||||||
|
not caught because we had no test that verified the optimizer's actual LR
|
||||||
|
after PPO.load(). Every test in this file targets a real failure that
|
||||||
|
already burned training time.
|
||||||
|
|
||||||
|
Test categories
|
||||||
|
---------------
|
||||||
|
1. LR override — PPO.load() + param_group update
|
||||||
|
2. create_or_load_model — the function that wraps PPO.load in multitrack_runner
|
||||||
|
3. Training step LR — a short real PPO.learn() to confirm the log LR matches
|
||||||
|
4. Output parsing — parse_runner_output extracts correct metrics
|
||||||
|
5. Results round-trip — save → load → GP uses correct data
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Path setup — tests run from repo root or tests/ dir
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
AGENT_DIR = os.path.join(os.path.dirname(__file__), '..', 'agent')
|
||||||
|
if AGENT_DIR not in sys.path:
|
||||||
|
sys.path.insert(0, AGENT_DIR)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Shared helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
import gymnasium as gym
|
||||||
|
import numpy as np
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
|
||||||
|
|
||||||
|
|
||||||
|
class MockDonkeyEnv(gym.Env):
    """
    Minimal DonkeyCar-shaped env: image observations, Box actions.
    No simulator required.

    Fix over the previous version: ``reset()`` now forwards ``seed`` to
    ``gym.Env.reset()`` (which seeds ``self.np_random``), and ``step()``
    draws observations from ``self.np_random`` instead of the global numpy
    RNG — so a seeded env produces reproducible observation streams.
    """
    metadata = {'render_modes': []}

    def __init__(self):
        super().__init__()
        # (120, 160, 3) uint8 — matches the DonkeyCar camera frame shape
        # used elsewhere in these tests.
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(120, 160, 3), dtype=np.uint8
        )
        # action = [steering in [-1, 1], throttle in [0, 1]]
        self.action_space = gym.spaces.Box(
            low=np.array([-1.0, 0.0]),
            high=np.array([1.0, 1.0]),
            dtype=np.float32,
        )
        self._step_count = 0

    def reset(self, seed=None, **kwargs):
        # Seed gymnasium's per-env RNG so episodes are reproducible.
        super().reset(seed=seed)
        self._step_count = 0
        return np.zeros((120, 160, 3), dtype=np.uint8), {}

    def step(self, action):
        self._step_count += 1
        # Env-local RNG so seeding via reset() actually takes effect.
        obs = self.np_random.integers(0, 255, (120, 160, 3), dtype=np.uint8)
        # Fixed-length 30-step episodes keep PPO rollouts fast.
        terminated = self._step_count >= 30
        return obs, 1.0, terminated, False, {'speed': 2.0, 'cte': 0.1}

    def close(self):
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def make_vec_env():
    """Wrap MockDonkeyEnv the same way SB3 expects for CnnPolicy."""
    # DummyVecEnv batches the single mock env; VecTransposeImage converts
    # the HWC uint8 frames to CHW, which SB3's CnnPolicy requires.
    batched = DummyVecEnv([MockDonkeyEnv])
    return VecTransposeImage(batched)
|
||||||
|
|
||||||
|
|
||||||
|
def save_ppo_model(path, lr):
    """Create a tiny CnnPolicy PPO, save it, return the path."""
    # Small n_steps/batch_size keep model construction fast; the env is
    # only needed long enough to write the checkpoint.
    training_env = make_vec_env()
    ppo = PPO(
        'CnnPolicy',
        training_env,
        learning_rate=lr,
        verbose=0,
        n_steps=64,
        batch_size=16,
    )
    ppo.save(path)
    training_env.close()
    return path
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 1. LR override — the bug that burned 8 hours of training
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_lr_override_bug_demonstration():
    """
    Setting model.learning_rate after PPO.load() does NOT update the
    optimizer. This demonstrates the exact bug that caused all Wave 3
    trials to run at 0.000225 regardless of the GP-proposed LR.
    """
    ORIGINAL_LR = 0.000225
    NEW_LR = 0.001

    env = make_vec_env()
    with tempfile.TemporaryDirectory() as tmpdir:
        checkpoint = save_ppo_model(os.path.join(tmpdir, 'model.zip'),
                                    ORIGINAL_LR)

        loaded = PPO.load(checkpoint, env=env, device='cpu')
        # Deliberately WRONG: touch only the Python attribute, never the
        # optimizer.
        loaded.learning_rate = NEW_LR

        actual_lr = loaded.policy.optimizer.param_groups[0]['lr']
        # The optimizer still carries the OLD lr — that is the bug.
        assert actual_lr == pytest.approx(ORIGINAL_LR), (
            f"Expected optimizer to STILL have old LR {ORIGINAL_LR} "
            f"(demonstrating the bug), got {actual_lr}"
        )
        env.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_lr_override_fix_sets_optimizer_param_groups():
    """
    After PPO.load(), the requested LR must be written to BOTH
    model.learning_rate AND every optimizer param_group, as
    multitrack_runner now does. (This covers the pre-train state; the
    lr_schedule leg of the fix has its own test below.)
    """
    ORIGINAL_LR = 0.000225
    NEW_LR = 0.001

    env = make_vec_env()
    with tempfile.TemporaryDirectory() as tmpdir:
        checkpoint = save_ppo_model(os.path.join(tmpdir, 'model.zip'),
                                    ORIGINAL_LR)

        loaded = PPO.load(checkpoint, env=env, device='cpu')
        # Apply the fix: attribute plus every optimizer param group.
        loaded.learning_rate = NEW_LR
        for pg in loaded.policy.optimizer.param_groups:
            pg['lr'] = NEW_LR

        # The model attribute reflects the override...
        assert loaded.learning_rate == pytest.approx(NEW_LR)
        # ...and so does the optimizer, which is what gradient updates use.
        for i, pg in enumerate(loaded.policy.optimizer.param_groups):
            assert pg['lr'] == pytest.approx(NEW_LR), (
                f"param_group[{i}]['lr'] = {pg['lr']}, expected {NEW_LR}"
            )
        env.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_lr_override_survives_one_training_step():
    """
    After the COMPLETE fix (learning_rate + lr_schedule + param_groups),
    the optimizer LR must still be correct after one real PPO gradient update.

    Root cause of the original bug:
    - PPO.load() bakes lr_schedule = FloatSchedule(0.000225) into the model
    - train() calls _update_learning_rate() which reads lr_schedule, NOT learning_rate
    - So even if param_groups are patched, train() overwrites them back to 0.000225
    - Fix: also patch model.lr_schedule = get_schedule_fn(NEW_LR)
    """
    from stable_baselines3.common.utils import get_schedule_fn

    ORIGINAL_LR = 0.000225
    NEW_LR = 0.001

    env = make_vec_env()
    with tempfile.TemporaryDirectory() as tmpdir:
        checkpoint = os.path.join(tmpdir, 'model.zip')
        save_ppo_model(checkpoint, ORIGINAL_LR)

        loaded = PPO.load(checkpoint, env=env, device='cpu')
        # All three parts of the fix, applied together.
        loaded.learning_rate = NEW_LR
        loaded.lr_schedule = get_schedule_fn(NEW_LR)  # stops train() reverting the LR
        for pg in loaded.policy.optimizer.param_groups:
            pg['lr'] = NEW_LR

        # One minimal rollout + gradient step so _update_learning_rate()
        # actually fires.
        loaded.learn(total_timesteps=64, reset_num_timesteps=True)

        for i, pg in enumerate(loaded.policy.optimizer.param_groups):
            assert pg['lr'] == pytest.approx(NEW_LR), (
                f"After learn(), param_group[{i}]['lr'] = {pg['lr']}, "
                f"expected {NEW_LR}. lr_schedule was not patched correctly."
            )
        env.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 2. create_or_load_model — the actual function in multitrack_runner
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_create_or_load_model_warm_start_lr_reaches_optimizer():
    """
    create_or_load_model() must leave the optimizer at the requested LR,
    not at the LR baked into the saved model — both before AND after a
    training step (lr_schedule must be patched, not just param_groups).
    """
    from multitrack_runner import create_or_load_model

    ORIGINAL_LR = 0.000225
    NEW_LR = 0.00083

    env = make_vec_env()
    with tempfile.TemporaryDirectory() as tmpdir:
        checkpoint = os.path.join(tmpdir, 'model.zip')
        save_ppo_model(checkpoint, ORIGINAL_LR)

        model = create_or_load_model(env, learning_rate=NEW_LR,
                                     warm_start_path=checkpoint)

        # Part 1: the model attribute.
        assert model.learning_rate == pytest.approx(NEW_LR)
        # Part 2: the schedule consulted by _update_learning_rate() in train().
        assert model.lr_schedule(1.0) == pytest.approx(NEW_LR), (
            f"lr_schedule(1.0) = {model.lr_schedule(1.0)}, expected {NEW_LR}. "
            "train() will revert optimizer to old LR without this fix."
        )
        # Part 3: the optimizer itself.
        for i, pg in enumerate(model.policy.optimizer.param_groups):
            assert pg['lr'] == pytest.approx(NEW_LR)

        # And the override must survive a real gradient step.
        model.learn(total_timesteps=64, reset_num_timesteps=True)
        for i, pg in enumerate(model.policy.optimizer.param_groups):
            assert pg['lr'] == pytest.approx(NEW_LR), (
                f"After learn(), param_group[{i}]['lr'] = {pg['lr']}, expected {NEW_LR}"
            )
        env.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_or_load_model_fresh_model_uses_correct_lr():
    """
    When warm_start_path is None/missing, create_or_load_model() must
    create a fresh PPO with the requested LR.
    """
    from multitrack_runner import create_or_load_model

    LR = 0.00075
    env = make_vec_env()
    fresh = create_or_load_model(env, learning_rate=LR, warm_start_path=None)

    # A fresh model carries the requested LR on both the attribute and the
    # optimizer.
    assert fresh.learning_rate == pytest.approx(LR)
    assert all(pg['lr'] == pytest.approx(LR)
               for pg in fresh.policy.optimizer.param_groups)
    env.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_or_load_model_falls_back_to_fresh_on_bad_path():
    """
    If the warm_start_path doesn't exist, create_or_load_model() must
    fall back to a fresh model (not crash).
    """
    from multitrack_runner import create_or_load_model

    LR = 0.0005
    env = make_vec_env()
    fallback = create_or_load_model(
        env, learning_rate=LR, warm_start_path='/nonexistent/model.zip'
    )

    # A model came back at all, and it carries the requested LR.
    assert fallback is not None
    assert fallback.learning_rate == pytest.approx(LR)
    env.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 3. Output parsing — parse_runner_output extracts correct metrics
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_parse_runner_output_full_success():
    """parse_runner_output correctly extracts all metrics from a full run."""
    from wave3_controller import parse_runner_output

    # A realistic full-run transcript: TRAIN segments, then the TEST phase.
    output = """
[12:00:01] [W3 Runner][TRAIN] track=generated_road segment_reward=2409.70
[12:08:00] [W3 Runner][TRAIN] track=generated_track segment_reward=112.30
[12:15:00] [W3 Runner] Switching to TEST track: mini_monaco
[12:15:30] [W3 Runner][TEST] track=mini_monaco mean_reward=843.21 mean_steps=980.0 ✅ DRIVES
[12:15:30] [W3 Runner][TEST] mini_monaco_reward=843.2100
[12:15:30] [W3 Runner][TEST] combined_test_score=843.2100
"""
    expected = 843.21
    combined, mini_monaco = parse_runner_output(output)
    # mini_monaco is the only test track, so both scores agree.
    assert combined == pytest.approx(expected, rel=1e-4)
    assert mini_monaco == pytest.approx(expected, rel=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_runner_output_crash():
    """parse_runner_output handles a crash/timeout (no test score lines)."""
    from wave3_controller import parse_runner_output

    combined, mini_monaco = parse_runner_output("[TIMEOUT after 7200s]")
    # No score lines at all → both values come back as None.
    assert combined is None
    assert mini_monaco is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_runner_output_partial():
    """parse_runner_output handles missing combined but present mini_monaco."""
    from wave3_controller import parse_runner_output

    combined, mini_monaco = parse_runner_output(
        "[W3 Runner][TEST] mini_monaco_reward=55.5\n"
    )
    # The combined_test_score line is absent; the per-track line still parses.
    assert combined is None
    assert mini_monaco == pytest.approx(55.5, rel=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 4. Results round-trip — save → load → GP uses the data
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_results_round_trip_gp_sees_correct_lr():
    """
    After save_result() writes a trial, load_results() must return it, and
    the GP must receive the correct params (including the actual LR used).
    This ensures GP data is not silently corrupted.
    """
    import wave3_controller

    TRIAL_LR = 0.00083
    TRIAL_SCORE = 250.0

    with tempfile.TemporaryDirectory() as tmpdir:
        # Point the module at a throwaway results file for the duration.
        saved_results_file = wave3_controller.RESULTS_FILE
        wave3_controller.RESULTS_FILE = os.path.join(tmpdir, 'results.jsonl')
        try:
            from wave3_controller import save_result, load_results

            save_result(
                trial=1,
                params={'learning_rate': TRIAL_LR,
                        'steps_per_switch': 8000,
                        'total_timesteps': 45000},
                combined=TRIAL_SCORE,
                mini_monaco=TRIAL_SCORE,
                model_path=None,
                is_champion=True,
                status='ok',
                elapsed=1200.0,
            )

            loaded = load_results()
            assert len(loaded) == 1
            only_trial = loaded[0]
            assert only_trial['params']['learning_rate'] == pytest.approx(TRIAL_LR)
            assert only_trial['combined_test_score'] == pytest.approx(TRIAL_SCORE)
        finally:
            # Always restore the real path, even if an assert fired.
            wave3_controller.RESULTS_FILE = saved_results_file
|
||||||
|
|
||||||
|
|
||||||
|
def test_results_gp_data_never_includes_zero_score_trials():
    """
    Zero-score trials (crash/timeout) must NOT be added to GP data.
    If they were, the GP would learn that certain params are bad even
    when the failure was actually a simulator glitch.
    """
    import wave3_controller

    with tempfile.TemporaryDirectory() as tmpdir:
        saved_results_file = wave3_controller.RESULTS_FILE
        wave3_controller.RESULTS_FILE = os.path.join(tmpdir, 'results.jsonl')
        try:
            from wave3_controller import save_result, load_results

            # One crashed/timed-out trial...
            save_result(
                trial=1,
                params={'learning_rate': 0.001, 'steps_per_switch': 5000,
                        'total_timesteps': 30000},
                combined=0.0,
                mini_monaco=0.0,
                model_path=None,
                is_champion=False,
                status='timeout',
                elapsed=7200.0,
            )
            # ...and one healthy trial.
            save_result(
                trial=2,
                params={'learning_rate': 0.0005, 'steps_per_switch': 8000,
                        'total_timesteps': 45000},
                combined=300.0,
                mini_monaco=300.0,
                model_path=None,
                is_champion=True,
                status='ok',
                elapsed=1800.0,
            )

            scores = [r['combined_test_score'] for r in load_results()]
            # load_results only returns trials with non-None score; the
            # wave3 main loop further filters out score==0 before adding to GP.
            assert 300.0 in scores
            # The zero-score trial IS persisted in the file — the main loop
            # is what guards the GP against it.
            assert 0.0 in scores
        finally:
            wave3_controller.RESULTS_FILE = saved_results_file
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 5. Seed params sanity — seed trials cover the important LR range
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_seed_params_cover_both_low_and_high_lr():
    """
    SEED_PARAMS must include at least one low-LR trial (≤ 3e-4) and
    at least one higher-LR trial (≥ 5e-4) so the GP starts with data
    across the search space, not just at one corner.
    """
    from wave3_controller import SEED_PARAMS

    lrs = [p['learning_rate'] for p in SEED_PARAMS]
    # One seed must anchor the low end, another the high end.
    assert min(lrs) <= 3e-4, f"No low-LR seed trial: {lrs}"
    assert max(lrs) >= 5e-4, f"No high-LR seed trial: {lrs}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_seed_params_lr_is_not_all_identical():
    """SEED_PARAMS must not all have the same LR — that killed Wave 3 v1."""
    from wave3_controller import SEED_PARAMS

    lrs = [p['learning_rate'] for p in SEED_PARAMS]
    distinct_lrs = set(lrs)
    assert len(distinct_lrs) > 1, (
        f"All seed params have the same LR ({lrs[0]}). "
        "The GP needs diverse starting data to explore the LR dimension."
    )
|
||||||
Loading…
Reference in New Issue