# agent/experiments/exp14_finetune_v5.py
"""
Exp 14b: Fine-tune mountain champion (v5 reward) — throttle schedule.

- Warm-start from: agent/models/exp14-mountain-v5/best_model.zip
- Phase 1: throttle_min=0.40 for 30k steps
- Phase 2: throttle_min=0.20 for 90k steps
- LR: 2e-4 (fine-tune)
- Checkpoint every 6k steps, eval every checkpoint (3 deterministic episodes)
- Save best model to: agent/models/exp14-mountain-v5-finetune/best_model.zip

This script is intentionally conservative: small LR, frequent evals, and
saves so we don't lose progress.

All model/log paths below are relative to the current working directory.
"""
import json
import os
import sys
import time
import traceback
from datetime import datetime

# Make the parent directory importable so ``donkeycar_sb3_runner`` resolves.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

from donkeycar_sb3_runner import ThrottleClampWrapper

# Paths / params
HOST = '10.0.0.55'
PORT = 9091
WARM_PATH = os.path.join('models', 'exp14-mountain-v5', 'best_model.zip')
SAVE_DIR = os.path.join('models', 'exp14-mountain-v5-finetune')
os.makedirs(SAVE_DIR, exist_ok=True)
LOG_DIR = 'outerloop-results'
# Both log files live here; create it up front so open() cannot fail on a
# fresh checkout.
os.makedirs(LOG_DIR, exist_ok=True)
LOG_PATH = os.path.join(LOG_DIR, 'exp14_finetune_results.jsonl')

PH1_STEPS = 30000
PH2_STEPS = 90000
TOTAL_STEPS = PH1_STEPS + PH2_STEPS
CHECKPOINT_EVERY = 6000
LR = 2e-4
EVAL_EPISODES = 3
MAX_EVAL_STEPS = 2000


def _info_float(info, key, default):
    """Return ``info[key]`` as a float, or *default* if missing/falsy/invalid."""
    try:
        return float(info.get(key, default) or default)
    except (TypeError, ValueError):
        return default


# Reward wrapper (v5) — minimal
class V5RewardWrapper(gym.Wrapper):
    """Replace the simulator reward with the v5 shaped reward.

    Per step the reward is ``cte_quality * speed_norm`` where
    ``cte_quality = 1 - min(|cte| / max_cte, 1)`` and
    ``speed_norm = min(max(speed, 0) / 10, 1)``.

    When ``info['lap_count']`` increases and the reported
    ``info['last_lap_time']`` is below ``min_lap_time``, the step reward is
    replaced by a negative penalty and the episode is terminated.

    Args:
        env: Environment to wrap.
        max_cte: Cross-track error at which ``cte_quality`` reaches 0.
        min_lap_time: Laps faster than this (same unit as
            ``info['last_lap_time']``) are penalised and end the episode.
    """

    def __init__(self, env, max_cte=8.0, min_lap_time=5.0):
        super().__init__(env)
        self.max_cte = max_cte
        self.min_lap_time = min_lap_time
        self._last_lc = 0  # highest lap count seen in the current episode

    def reset(self, **kwargs):
        """Reset the lap counter and the wrapped environment."""
        self._last_lc = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and return the v5 reward in place of its own.

        Supports both the 5-tuple (obs, reward, terminated, truncated, info)
        and the legacy 4-tuple (obs, reward, done, info) step APIs; the
        result has the same arity as what the wrapped env returned.
        """
        result = self.env.step(action)
        five_tuple = len(result) == 5
        if five_tuple:
            obs, _sim_reward, terminated, truncated, info = result
        else:
            obs, _sim_reward, terminated, info = result
            truncated = False

        cte = _info_float(info, 'cte', 0.0)
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)
        speed = max(0.0, _info_float(info, 'speed', 0.0))
        speed_norm = min(speed / 10.0, 1.0)
        reward = cte_quality * speed_norm

        # Short-lap penalty / termination.
        try:
            current_lc = int(info.get('lap_count', 0) or 0)
        except (TypeError, ValueError):
            current_lc = self._last_lc
        force_terminate = False
        if current_lc > self._last_lc:
            self._last_lc = current_lc
            lap_time = _info_float(info, 'last_lap_time', 999.0)
            if lap_time < self.min_lap_time:
                # Penalty grows as the lap gets shorter; the 0.1 floor keeps
                # the division bounded.
                reward = -10.0 * (self.min_lap_time / max(lap_time, 0.1))
                force_terminate = True

        if five_tuple:
            return obs, reward, terminated or force_terminate, truncated, info
        return obs, reward, terminated or force_terminate, info


# env factory
def make_env(throttle_min):
    """Return a thunk that builds the wrapped mountain-track env.

    Args:
        throttle_min: Forwarded to ``ThrottleClampWrapper``.
    """
    def _init():
        # NOTE(review): assumes 'donkey-mountain-track-v0' is registered as a
        # side effect of an earlier import — confirm.
        raw = gym.make('donkey-mountain-track-v0', conf={'host': HOST, 'port': PORT})
        env = ThrottleClampWrapper(raw, throttle_min=throttle_min)
        env = V5RewardWrapper(env)
        return env
    return _init


def evaluate_model(model, throttle_min, sets=EVAL_EPISODES):
    """Run *sets* deterministic episodes and collect per-episode lap stats.

    A fresh vectorised env is created for the evaluation and always closed
    afterwards, even if an episode raises.

    NOTE(review): this connects a second client to the simulator while the
    training env is still open — confirm the simulator supports that.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)``.
        throttle_min: Throttle floor used to build the evaluation env.
        sets: Number of episodes to run.

    Returns:
        A list with one dict per episode:
        ``{'steps': int, 'laps': int, 'lap_times': list[float]}``.
    """
    env = VecTransposeImage(DummyVecEnv([make_env(throttle_min)]))
    results = []
    try:
        for _ in range(sets):
            obs = env.reset()
            steps = 0
            laps = 0
            prev_lc = 0
            lap_times = []
            while steps < MAX_EVAL_STEPS:
                action, _ = model.predict(obs, deterministic=True)
                obs, _reward, dones, info = env.step(action)
                inf = info[0] if isinstance(info, (list, tuple)) else info
                steps += 1
                lc = int(inf.get('lap_count', 0) or 0)
                if lc > prev_lc:
                    lap_times.append(float(inf.get('last_lap_time', 0) or 0))
                    prev_lc = lc
                    laps = lc
                if bool(dones[0]):
                    break
            results.append({'steps': steps, 'laps': laps, 'lap_times': lap_times})
    finally:
        env.close()
    return results


# Human-readable log, line-buffered so progress survives a crash.
logf = open(os.path.join(LOG_DIR, 'exp14_finetune_log.txt'), 'a', buffering=1)


def log(s):
    """Print *s* and append it, timestamped (UTC), to the text log."""
    print(s)
    logf.write(f'{datetime.utcnow().isoformat()} {s}\n')


# (steps, throttle_min) per phase, run in order.
phase_defs = [(PH1_STEPS, 0.4), (PH2_STEPS, 0.2)]

# create initial env and model (warm start)
first_throttle = phase_defs[0][1]
env0 = VecTransposeImage(DummyVecEnv([make_env(first_throttle)]))
if os.path.exists(WARM_PATH):
    log(f'Loading warm-start model from {WARM_PATH}')
    # Override the stored learning rate at load time so the model rebuilds
    # its LR schedule from LR instead of the saved value.
    model = PPO.load(
        WARM_PATH,
        env=env0,
        device='cpu',
        custom_objects={'learning_rate': LR},
    )
    model.learning_rate = LR
    # The optimizer state restored from the checkpoint still carries the old
    # LR; align it so the value is right before the first update.
    optimizer = getattr(model.policy, 'optimizer', None)
    for pg in getattr(optimizer, 'param_groups', []):
        pg['lr'] = LR
else:
    log('No warm-start found')
    model = PPO('CnnPolicy', env0, learning_rate=LR, verbose=1, device='cpu')

steps_done = 0
best_mean_lap = None  # lowest mean lap time seen so far (lower is better)

try:
    for phase_steps, throttle_min in phase_defs:
        # If not the first phase, switch env
        if steps_done > 0:
            log(f'Switching env to throttle_min={throttle_min}')
            try:
                env0.close()
            except Exception as close_err:
                # Best effort: a failed close must not abort the run.
                log(f'WARNING: closing previous env failed: {close_err}')
            env0 = VecTransposeImage(DummyVecEnv([make_env(throttle_min)]))
            model.set_env(env0)

        remaining = phase_steps
        while remaining > 0:
            seg = min(CHECKPOINT_EVERY, remaining)
            model.learn(total_timesteps=seg, reset_num_timesteps=False)
            steps_done += seg
            remaining -= seg

            # Save checkpoint (numbered copy plus a rolling 'model').
            ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
            model.save(ckpt)
            model.save(os.path.join(SAVE_DIR, 'model'))
            log(f'[{steps_done}/{TOTAL_STEPS}] Checkpoint saved: {ckpt}.zip')

            # Eval
            res = evaluate_model(model, throttle_min, sets=EVAL_EPISODES)
            # Mean over every lap recorded in any episode; None if no laps.
            lap_times = [lt for r in res for lt in r['lap_times']]
            mean_lap = sum(lap_times) / len(lap_times) if lap_times else None
            mean_steps = sum(r['steps'] for r in res) / len(res)
            summary = {
                'steps_done': steps_done,
                'throttle_min': throttle_min,
                'mean_steps': mean_steps,
                'mean_lap_time': mean_lap,
                'per_set': res,
            }
            with open(LOG_PATH, 'a') as lf:
                lf.write(json.dumps(summary) + '\n')
            log(f' Eval @ {steps_done}: mean_steps={mean_steps:.1f} mean_lap={mean_lap}')

            # Update best: needs a non-zero mean lap time that beats the
            # previous best (a mean of 0 means no usable lap time was seen).
            if mean_lap and (best_mean_lap is None or mean_lap < best_mean_lap):
                best_mean_lap = mean_lap
                model.save(os.path.join(SAVE_DIR, 'best_model'))
                log(f' ⭐ NEW BEST (mean lap {mean_lap:.2f}s) saved')

except Exception as e:
    # Top-level boundary: record the failure (with traceback) and fall
    # through to the final save instead of losing the run.
    log(f'ERROR during fine-tune: {e}')
    log(traceback.format_exc())
    traceback.print_exc()
finally:
    try:
        env0.close()
    except Exception as close_err:
        log(f'WARNING: closing env failed: {close_err}')
    try:
        model.save(os.path.join(SAVE_DIR, 'model'))
        log(f'Fine-tune complete. steps_done={steps_done}')
    finally:
        logf.close()