exp14 finetune: warm-start mountain champion, throttle schedule 0.4->0.2, LR=2e-4, checkpoints and evals
This commit is contained in:
parent
b1ec14e3cb
commit
bc23a316e0
|
|
@ -0,0 +1,206 @@
|
|||
"""
|
||||
Exp 14b: Fine-tune mountain champion (v5 reward) — throttle schedule.
|
||||
|
||||
- Warm-start from: agent/models/exp14-mountain-v5/best_model.zip
|
||||
- Phase 1: throttle_min=0.40 for 30k steps
|
||||
- Phase 2: throttle_min=0.20 for 90k steps
|
||||
- LR: 2e-4 (fine-tune)
|
||||
- Checkpoint every 6k steps, eval every checkpoint (3 deterministic episodes)
|
||||
- Save best model to: agent/models/exp14-mountain-v5-finetune/best_model.zip
|
||||
|
||||
This script is intentionally conservative: small LR, frequent evals, and
|
||||
saves so we don't lose progress.
|
||||
"""
|
||||
import json
import os
import sys
import time
from datetime import datetime

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

# Make the parent directory importable before pulling in the project-local runner.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from donkeycar_sb3_runner import ThrottleClampWrapper
|
||||
|
||||
# --- Simulator connection ---
HOST = '10.0.0.55'
PORT = 9091

# --- Training schedule ---
PH1_STEPS = 30_000          # phase 1 length in timesteps
PH2_STEPS = 90_000          # phase 2 length in timesteps
TOTAL_STEPS = PH1_STEPS + PH2_STEPS
CHECKPOINT_EVERY = 6_000    # timesteps between checkpoint + eval
LR = 2e-4                   # fine-tune learning rate

# --- Evaluation ---
EVAL_EPISODES = 3           # episodes per evaluation
MAX_EVAL_STEPS = 2_000      # step cap for a single evaluation episode

# --- Filesystem (all paths are relative to the current working directory) ---
WARM_PATH = os.path.join('models', 'exp14-mountain-v5', 'best_model.zip')
SAVE_DIR = os.path.join('models', 'exp14-mountain-v5-finetune')
LOG_PATH = os.path.join('outerloop-results', 'exp14_finetune_results.jsonl')

# Checkpoints are written into SAVE_DIR, so make sure it exists up front.
os.makedirs(SAVE_DIR, exist_ok=True)
|
||||
|
||||
# Reward wrapper (v5) — minimal
|
||||
class V5RewardWrapper(gym.Wrapper):
    """Replace the wrapped env's reward with the "v5" shaped reward.

    The per-step reward is ``cte_quality * speed_norm`` where

    * ``cte_quality = 1 - min(|cte| / max_cte, 1)``
    * ``speed_norm  = min(max(speed, 0) / 10, 1)``

    and ``cte`` / ``speed`` are read from the ``info`` dict of the wrapped
    env. The reward returned by the wrapped env itself is discarded.

    When ``info['lap_count']`` increases and ``info['last_lap_time']`` is
    below ``min_lap_time``, the step reward becomes a negative penalty and
    the episode is flagged as terminated.

    Both the 5-tuple ``(obs, reward, terminated, truncated, info)`` and the
    4-tuple ``(obs, reward, done, info)`` step formats are passed through in
    the same shape they were received.

    Args:
        env: Environment to wrap.
        max_cte: Cross-track error at (and beyond) which ``cte_quality`` is 0.
        min_lap_time: Laps reported faster than this are penalised.
    """

    def __init__(self, env, max_cte=8.0, min_lap_time=5.0):
        super().__init__(env)
        self.max_cte = max_cte
        self.min_lap_time = min_lap_time
        # Highest lap count seen so far in the current episode.
        self._last_lc = 0

    @staticmethod
    def _info_float(info, key, default):
        """Return ``info[key]`` as a float; ``default`` if missing, falsy or unparsable."""
        try:
            return float(info.get(key, default) or default)
        except (TypeError, ValueError, OverflowError):
            return default

    def _lap_count(self, info):
        """Return ``info['lap_count']`` as an int; the last known count if unparsable."""
        try:
            return int(info.get('lap_count', 0) or 0)
        except (TypeError, ValueError, OverflowError):
            return self._last_lc

    def reset(self, **kwargs):
        """Reset the lap tracker and delegate to the wrapped env."""
        self._last_lc = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and substitute the v5 reward.

        Returns:
            A tuple of the same length as the wrapped env's step result, with
            the reward replaced and the terminated/done flag forced to True on
            a too-short lap.
        """
        result = self.env.step(action)
        five_tuple = len(result) == 5
        if five_tuple:
            obs, _sim_reward, terminated, truncated, info = result
        else:
            obs, _sim_reward, terminated, info = result
            truncated = False

        # Shaped reward: stay near the centre line while moving fast.
        cte = self._info_float(info, 'cte', 0.0)
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)
        speed = max(0.0, self._info_float(info, 'speed', 0.0))
        speed_norm = min(speed / 10.0, 1.0)
        reward = cte_quality * speed_norm

        # Short-lap penalty / termination: only evaluated on the step where
        # the reported lap count increases.
        force_terminate = False
        current_lc = self._lap_count(info)
        if current_lc > self._last_lc:
            self._last_lc = current_lc
            lap_time = self._info_float(info, 'last_lap_time', 999.0)
            if lap_time < self.min_lap_time:
                # Penalty grows as the lap gets shorter; the 0.1 floor keeps
                # the division bounded.
                reward = -10.0 * (self.min_lap_time / max(lap_time, 0.1))
                force_terminate = True

        ended = terminated or force_terminate
        if five_tuple:
            return obs, reward, ended, truncated, info
        return obs, reward, ended, info
|
||||
|
||||
# env factory
|
||||
def make_env(throttle_min):
    """Return a zero-argument factory for one wrapped mountain-track env.

    Calling the returned factory creates the ``donkey-mountain-track-v0`` env
    against ``HOST``/``PORT``, then wraps it in ``ThrottleClampWrapper`` (with
    the given ``throttle_min``) and ``V5RewardWrapper``, in that order.
    Nothing is created until the factory is called.
    """
    def _build():
        sim_conf = {'host': HOST, 'port': PORT}
        clamped = ThrottleClampWrapper(
            gym.make('donkey-mountain-track-v0', conf=sim_conf),
            throttle_min=throttle_min,
        )
        return V5RewardWrapper(clamped)

    return _build
|
||||
|
||||
# small helper: evaluate model on deterministic episodes and return per-episode steps, laps and lap times
|
||||
from stable_baselines3.common.utils import set_random_seed
|
||||
|
||||
def evaluate_model(model, throttle_min, sets=EVAL_EPISODES):
    """Run deterministic evaluation episodes and collect lap statistics.

    A fresh single-env vectorised environment is built for the evaluation and
    is always closed again, even if prediction or stepping raises.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)``.
        throttle_min: Throttle floor passed to ``make_env``.
        sets: Number of episodes to run.

    Returns:
        One dict per episode with keys ``'steps'`` (steps taken, capped at
        ``MAX_EVAL_STEPS``), ``'laps'`` (highest lap count seen) and
        ``'lap_times'`` (the ``last_lap_time`` reported each time the lap
        count increased).
    """
    env = VecTransposeImage(DummyVecEnv([make_env(throttle_min)]))
    results = []
    try:
        for _ in range(sets):
            obs = env.reset()
            steps = 0
            laps = 0
            lap_times = []
            while steps < MAX_EVAL_STEPS:
                action, _state = model.predict(obs, deterministic=True)
                obs, _reward, dones, infos = env.step(action)
                # The vectorised env returns one info per sub-env; there is only one.
                info = infos[0] if isinstance(infos, (list, tuple)) else infos
                steps += 1
                lap_count = int(info.get('lap_count', 0) or 0)
                if lap_count > laps:
                    lap_times.append(float(info.get('last_lap_time', 0) or 0))
                    laps = lap_count
                if bool(dones[0]):
                    break
            results.append({'steps': steps, 'laps': laps, 'lap_times': lap_times})
    finally:
        # Release the simulator connection even when an episode fails.
        env.close()
    return results
|
||||
|
||||
# training loop: two phases
|
||||
# Human-readable run log: appended to, line-buffered, and mirrored to stdout.
# The directory is created first so a fresh checkout does not fail on open(),
# and the encoding is pinned so non-ASCII log text is written the same way
# regardless of the platform's locale.
os.makedirs('outerloop-results', exist_ok=True)
logf = open(
    os.path.join('outerloop-results', 'exp14_finetune_log.txt'),
    'a',
    buffering=1,
    encoding='utf-8',
)


def log(s):
    """Print ``s`` and append it to the run log with a UTC ISO timestamp prefix."""
    print(s)
    logf.write(f'{datetime.utcnow().isoformat()} {s}\n')
|
||||
|
||||
# Phase schedule, run in order: (timesteps, throttle_min) per phase.
phase_defs = [(PH1_STEPS, 0.4), (PH2_STEPS, 0.2)]

# Create the initial env and the model (warm start when a checkpoint exists).
first_throttle = phase_defs[0][1]
env0 = VecTransposeImage(DummyVecEnv([make_env(first_throttle)]))
if os.path.exists(WARM_PATH):
    log(f'Loading warm-start model from {WARM_PATH}')
    # Override the saved learning rate at load time. The previous approach
    # assigned model.lr_schedule by hand and fell back to None when the model
    # had no `get_schedule_fn` attribute, leaving a non-callable schedule.
    # Passing the values through custom_objects lets the loader build the
    # schedule itself.
    model = PPO.load(
        WARM_PATH,
        env=env0,
        device='cpu',
        custom_objects={
            'learning_rate': LR,
            'lr_schedule': lambda _progress_remaining: LR,
        },
    )
    # Also apply the new rate to the restored optimizer right away.
    optimizer = getattr(model.policy, 'optimizer', None)
    if optimizer is not None:
        for pg in optimizer.param_groups:
            pg['lr'] = LR
else:
    log('No warm-start found')
    model = PPO('CnnPolicy', env0, learning_rate=LR, verbose=1, device='cpu')
|
||||
|
||||
# Timesteps requested from model.learn() so far, summed over all segments.
# NOTE(review): the algorithm may round a segment up to whole rollouts, so the
# model's own timestep counter can be slightly higher — confirm if it matters.
steps_done = 0
# Lowest mean lap time seen so far (lower is better). Starting at +inf means
# the first valid evaluation always becomes the best.
best_reward = float('inf')

try:
    for phase_steps, throttle_min in phase_defs:
        # Every phase after the first gets a fresh env with its own throttle floor.
        if steps_done > 0:
            log(f'Switching env to throttle_min={throttle_min}')
            try:
                env0.close()
            except Exception as close_err:
                # Best effort: a failed close must not stop the run, but record it.
                log(f'Ignoring error while closing previous env: {close_err}')
            env0 = VecTransposeImage(DummyVecEnv([make_env(throttle_min)]))
            model.set_env(env0)

        remaining = phase_steps
        while remaining > 0:
            seg = min(CHECKPOINT_EVERY, remaining)
            # reset_num_timesteps=False continues the running timestep count
            # instead of restarting it for every segment.
            model.learn(total_timesteps=seg, reset_num_timesteps=False)
            steps_done += seg
            remaining -= seg

            # Save a numbered checkpoint plus a rolling "latest" copy.
            ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
            model.save(ckpt)
            model.save(os.path.join(SAVE_DIR, 'model'))
            log(f'[{steps_done}/{TOTAL_STEPS}] Checkpoint saved: {ckpt}.zip')

            # Evaluate with the throttle floor of the current phase.
            res = evaluate_model(model, throttle_min, sets=EVAL_EPISODES)
            # Mean over every completed lap across all evaluation episodes;
            # None when no episode completed a lap.
            lap_times = [lt for r in res for lt in r['lap_times']]
            mean_lap = sum(lap_times) / len(lap_times) if lap_times else None
            mean_steps = sum(r['steps'] for r in res) / len(res)
            summary = {
                'steps_done': steps_done,
                'throttle_min': throttle_min,
                'mean_steps': mean_steps,
                'mean_lap_time': mean_lap,
                'per_set': res,
            }
            with open(LOG_PATH, 'a') as lf:
                lf.write(json.dumps(summary) + '\n')
            log(f' Eval @ {steps_done}: mean_steps={mean_steps:.1f} mean_lap={mean_lap}')

            # Update best: needs at least one completed lap and a positive
            # mean (a non-positive mean lap time is treated as invalid).
            if mean_lap is not None and 0 < mean_lap < best_reward:
                best_reward = mean_lap
                model.save(os.path.join(SAVE_DIR, 'best_model'))
                log(f' ⭐ NEW BEST (mean lap {mean_lap:.2f}s) saved')

except Exception as e:
    # Top-level boundary: record the failure (console and log file) and fall
    # through to the final save instead of losing the trained weights.
    log(f'ERROR during fine-tune: {e}')
    import traceback
    traceback.print_exc()
    traceback.print_exc(file=logf)
finally:
    try:
        env0.close()
    except Exception as close_err:
        log(f'Ignoring error while closing env: {close_err}')
    try:
        model.save(os.path.join(SAVE_DIR, 'model'))
        log(f'Fine-tune complete. steps_done={steps_done}')
    finally:
        # Close the log file even if the final save fails.
        logf.close()
|
||||
Loading…
Reference in New Issue