From 41d12dede27122c34a1ba59b6cea9bbefb784bca Mon Sep 17 00:00:00 2001
From: Paul Huliganga
Date: Sun, 19 Apr 2026 20:09:08 -0400
Subject: [PATCH] fix: load warm-start with original action space (throttle_min=0.2), then switch env for phase1 throttle

---
Review notes (commentary only; not part of the commit message):
- The training env (env0) is built once, before the warm-start check, as in
  the pre-image. The extra throttle_min=0.2 env is now created only when a
  checkpoint exists AND phase 1 uses a different throttle, and it is closed
  as soon as the model has been switched to env0. Before this revision the
  0.2 env was constructed unconditionally, even when no checkpoint existed.
- NOTE(review): if make_env(0.2) and make_env(first_throttle) expose different
  action-space bounds, model.set_env(env0) may refuse env0 -- confirm against
  the stable-baselines3 space checks before relying on the switch.
- The post-image blob id on the index line is carried over from the original
  revision of this patch and was not regenerated.

 agent/experiments/exp14_finetune_v5.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/agent/experiments/exp14_finetune_v5.py b/agent/experiments/exp14_finetune_v5.py
index 549b2e8..d9cb899 100644
--- a/agent/experiments/exp14_finetune_v5.py
+++ b/agent/experiments/exp14_finetune_v5.py
@@ -134,19 +134,30 @@ def log(s):
 phase_defs = [ (PH1_STEPS, 0.4), (PH2_STEPS, 0.2) ]
 
 # create initial env and model (warm start)
 first_throttle = phase_defs[0][1]
 env0 = VecTransposeImage(DummyVecEnv([make_env(first_throttle)]))
 if os.path.exists(WARM_PATH):
-    log(f'Loading warm-start model from {WARM_PATH}')
-    model = PPO.load(WARM_PATH, env=env0, device='cpu')
+    # Important: load the warm-start model using the SAME action space it was trained with
+    # (throttle_min=0.2) so we can then switch envs for phase 1 if needed.
+    if first_throttle == 0.2:
+        loaded_env = env0
+    else:
+        loaded_env = VecTransposeImage(DummyVecEnv([make_env(0.2)]))
+    log(f'Loading warm-start model from {WARM_PATH} using throttle_min=0.2 env')
+    model = PPO.load(WARM_PATH, env=loaded_env, device='cpu')
     # override lr and schedules
     model.learning_rate = LR
     model.lr_schedule = model.get_schedule_fn(LR) if hasattr(model,'get_schedule_fn') else None
     for pg in getattr(getattr(model.policy,'optimizer',None) or [], 'param_groups', []):
         pg['lr'] = LR
+    if loaded_env is not env0:
+        log(f'Switching model to env with throttle_min={first_throttle}')
+        model.set_env(env0)
+        # The load-time env is no longer attached to the model; release it now.
+        loaded_env.close()
 else:
-    log('No warm-start found')
+    log('No warm-start found — creating fresh model with first throttle')
     model = PPO('CnnPolicy', env0, learning_rate=LR, verbose=1, device='cpu')
 
 steps_done = 0
 best_reward = float('-inf')