251 lines
9.8 KiB
Python
251 lines
9.8 KiB
Python
"""
|
|
Exp 14b: Fine-tune mountain champion (v5 reward) — throttle schedule.
|
|
|
|
- Warm-start from: agent/models/exp14-mountain-v5/best_model.zip
|
|
- Phase 1: throttle_min=0.40 for 30k steps
|
|
- Phase 2: throttle_min=0.20 for 90k steps
|
|
- LR: 2e-4 (fine-tune)
|
|
- Checkpoint every 6k steps, eval every checkpoint (3 deterministic episodes)
|
|
- Save best model to: agent/models/exp14-mountain-v5-finetune/best_model.zip
|
|
|
|
This script is intentionally conservative: small LR, frequent evals, and
|
|
saves so we don't lose progress.
|
|
"""
|
|
import json
import os
import sys
import time
from datetime import datetime

# Put the parent of this script's directory on the import path before the
# local import below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.utils import get_schedule_fn
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

from donkeycar_sb3_runner import ThrottleClampWrapper
|
|
|
|
# Paths / params
|
|
# Simulator endpoint passed to the gym environment through its ``conf`` dict.
HOST = '10.0.0.55'
PORT = 9091

# Checkpoint to warm-start from, and where this run writes its outputs.
# All paths are relative to the current working directory.
WARM_PATH = os.path.join('models', 'exp14-mountain-v5', 'best_model.zip')
SAVE_DIR = os.path.join('models', 'exp14-mountain-v5-finetune')
LOG_PATH = os.path.join('outerloop-results', 'exp14_finetune_results.jsonl')

# Create both output directories up front. Without the second call the first
# append to LOG_PATH fails with FileNotFoundError on a fresh checkout.
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

# Timestep budget for each phase of the throttle schedule.
PH1_STEPS = 30000
PH2_STEPS = 90000
TOTAL_STEPS = PH1_STEPS + PH2_STEPS

# Timesteps requested between checkpoints (an evaluation follows each one).
CHECKPOINT_EVERY = 6000
# Fine-tuning learning rate.
LR = 2e-4
# Number of evaluation episodes per checkpoint, and the step cap per episode.
EVAL_EPISODES = 3
MAX_EVAL_STEPS = 2000
|
|
|
|
# Reward wrapper (v5) — minimal
|
|
class V5RewardWrapper(gym.Wrapper):
    """Swap the wrapped env's reward for the "v5" shaped reward.

    Each step is rewarded with ``cte_quality * speed_norm``, both read from
    the step's ``info`` dict. When ``info['lap_count']`` increases and the
    reported ``info['last_lap_time']`` is below ``min_lap_time``, the step
    gets a negative reward instead and the episode is terminated.

    Both 5-tuple and 4-tuple ``step`` results are accepted, and the result is
    returned in the same shape it arrived in.
    """

    def __init__(self, env, max_cte=8.0, min_lap_time=5.0):
        """Store the reward parameters.

        Args:
            env: environment to wrap.
            max_cte: absolute cross-track error at which ``cte_quality``
                reaches zero.
            min_lap_time: laps reported faster than this are penalised.
        """
        super().__init__(env)
        self.max_cte = max_cte
        self.min_lap_time = min_lap_time
        # Highest lap count seen so far in the current episode.
        self._last_lc = 0

    def reset(self, **kwargs):
        """Clear the lap counter and reset the wrapped env."""
        self._last_lc = 0
        return self.env.reset(**kwargs)

    def _shaped_reward(self, info):
        """Return ``cte_quality * speed_norm`` computed from ``info``."""
        try:
            cte = float(info.get('cte', 0.0) or 0.0)
        except (TypeError, ValueError):
            cte = 0.0
        try:
            speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
        except (TypeError, ValueError):
            speed = 0.0
        # 1.0 on the centre line, falling linearly to 0.0 at max_cte.
        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)
        # Speed is scaled by 10.0 and capped at 1.0.
        speed_norm = min(speed / 10.0, 1.0)
        return cte_quality * speed_norm

    def _short_lap_penalty(self, info):
        """Return the penalty for a just-finished short lap, else ``None``.

        Updates ``self._last_lc`` whenever the lap count has increased.
        """
        try:
            lap_count = int(info.get('lap_count', 0) or 0)
        except Exception:
            lap_count = self._last_lc
        if lap_count <= self._last_lc:
            return None

        self._last_lc = lap_count
        try:
            lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
        except Exception:
            lap_time = 999.0
        if lap_time >= self.min_lap_time:
            return None
        # The shorter the lap, the larger the penalty; the lap time is
        # clamped to 0.1 so the ratio stays bounded.
        return -10.0 * (self.min_lap_time / max(lap_time, 0.1))

    def step(self, action):
        """Step the wrapped env and return its result with the v5 reward."""
        outcome = self.env.step(action)
        five_tuple = len(outcome) == 5
        if five_tuple:
            obs, _, terminated, truncated, info = outcome
        else:
            obs, _, terminated, info = outcome
            truncated = False

        reward = self._shaped_reward(info)
        force_terminate = False
        penalty = self._short_lap_penalty(info)
        if penalty is not None:
            reward = penalty
            force_terminate = True

        ended = terminated or force_terminate
        if five_tuple:
            return obs, reward, ended, truncated, info
        return obs, reward, ended, info
|
|
|
|
# env factory
|
|
class _ThrottleFloorWrapper(gym.Wrapper):
    """Raise the second action component (throttle) to at least ``floor``.

    The action space is left untouched, so a policy saved against the
    unwrapped action space still loads.
    """

    def __init__(self, env, floor):
        super().__init__(env)
        self.floor = floor

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)

    def step(self, action):
        # Work on a copy so the caller's array is never mutated.
        act = np.array(action)
        # An integer-typed action would silently truncate a fractional floor
        # to 0 on assignment, so promote it first.
        if not np.issubdtype(act.dtype, np.floating):
            act = act.astype(np.float32)
        try:
            # action is [steer, throttle]
            # NOTE(review): the floor is applied in the units of the action
            # handed to this wrapper, upstream of ThrottleClampWrapper. This
            # assumes throttle is in [0, 1] at this point; confirm.
            act[1] = max(act[1], self.floor)
        except (IndexError, TypeError, ValueError):
            # Best effort: an action without a usable scalar throttle at
            # index 1 is forwarded unchanged.
            pass
        return self.env.step(act)


def make_env_base(base_throttle=0.2, throttle_floor=None):
    """Return a zero-argument factory that builds the wrapped simulator env.

    The env is ``donkey-mountain-track-v0`` wrapped in
    ``ThrottleClampWrapper(throttle_min=base_throttle)``, then optionally in a
    runtime throttle-floor wrapper, then in ``V5RewardWrapper``.

    Args:
        base_throttle: ``throttle_min`` passed to ``ThrottleClampWrapper``.
            It must match the value the saved model was trained with.
        throttle_floor: if not ``None``, minimum throttle enforced on every
            action at step time without changing the action space.

    Returns:
        A callable suitable for ``DummyVecEnv([...])``.
    """
    def _init():
        raw = gym.make('donkey-mountain-track-v0', conf={'host': HOST, 'port': PORT})
        env = ThrottleClampWrapper(raw, throttle_min=base_throttle)
        if throttle_floor is not None:
            env = _ThrottleFloorWrapper(env, throttle_floor)
        return V5RewardWrapper(env)

    return _init
|
|
|
|
# small helper: evaluate model on deterministic episodes and return mean lap time
|
|
from stable_baselines3.common.utils import set_random_seed
|
|
|
|
def evaluate_model(model, throttle_min, sets=EVAL_EPISODES):
    """Run deterministic evaluation episodes on a fresh env.

    Args:
        model: trained model exposing ``predict(obs, deterministic=True)``.
        throttle_min: runtime throttle floor for the evaluation env.
        sets: number of episodes to run.

    Returns:
        One dict per episode with keys ``'steps'`` (steps taken), ``'laps'``
        (highest ``lap_count`` reported) and ``'lap_times'`` (the
        ``last_lap_time`` recorded each time the lap count rose).
    """
    # Build the env like the training env: base action space of 0.2 so the
    # policy's action space matches, with the throttle floor applied at
    # runtime. The original call went to an undefined ``make_env``.
    env = VecTransposeImage(
        DummyVecEnv([make_env_base(0.2, throttle_floor=throttle_min)])
    )
    results = []
    try:
        for _ in range(sets):
            obs = env.reset()
            steps = 0
            laps = 0
            prev_lc = 0
            lap_times = []
            # Cap each episode at MAX_EVAL_STEPS.
            while steps < MAX_EVAL_STEPS:
                action, _state = model.predict(obs, deterministic=True)
                obs, _reward, dones, infos = env.step(action)
                info = infos[0] if isinstance(infos, (list, tuple)) else infos
                steps += 1
                lc = int(info.get('lap_count', 0) or 0)
                if lc > prev_lc:
                    lap_times.append(float(info.get('last_lap_time', 0) or 0))
                    prev_lc = lc
                    laps = lc
                if bool(dones[0]):
                    break
            results.append({'steps': steps, 'laps': laps, 'lap_times': lap_times})
    finally:
        # Always close the env, even if an episode raised, so no simulator
        # connection is left open.
        env.close()
    return results
|
|
|
|
# training loop: two phases
|
|
# Human-readable run log. The directory is created first so a fresh checkout
# does not fail on open(). UTF-8 is forced because log messages contain
# non-ASCII characters. buffering=1 selects line buffering.
_RUN_LOG_PATH = os.path.join('outerloop-results', 'exp14_finetune_log.txt')
os.makedirs(os.path.dirname(_RUN_LOG_PATH), exist_ok=True)
logf = open(_RUN_LOG_PATH, 'a', buffering=1, encoding='utf-8')


def log(s):
    """Print ``s`` and append it to the run log with a UTC ISO timestamp."""
    print(s)
    logf.write(f'{datetime.utcnow().isoformat()} {s}\n')
|
|
|
|
# Throttle schedule as (timesteps, runtime throttle floor) pairs, run in order:
# phase 1 uses a floor of 0.4 for PH1_STEPS, phase 2 a floor of 0.2 for PH2_STEPS.
phase_defs = [(PH1_STEPS, 0.4), (PH2_STEPS, 0.2)]

# Build the phase-1 training env once and use it for both loading and
# training. The throttle floor is enforced at step time and does not alter
# the action space, so the saved model loads against this env directly.
# Using one env avoids a second, throw-away simulator connection, which would
# create an extra vehicle in the simulator.
env0 = VecTransposeImage(
    DummyVecEnv([make_env_base(0.2, throttle_floor=phase_defs[0][1])])
)

if os.path.exists(WARM_PATH):
    log(f'Loading warm-start model from {WARM_PATH} using base throttle_min=0.2 env')
    model = PPO.load(WARM_PATH, env=env0, device='cpu')
    # Override the saved learning rate with the fine-tuning rate. The schedule
    # must remain callable because it is evaluated during learn().
    model.learning_rate = LR
    model.lr_schedule = get_schedule_fn(LR)
    # Apply the new rate to the existing optimizer immediately.
    for pg in model.policy.optimizer.param_groups:
        pg['lr'] = LR
else:
    log('No warm-start found — creating fresh model with base throttle_min=0.2')
    model = PPO('CnnPolicy', env0, learning_rate=LR, verbose=1, device='cpu')
|
|
|
|
# Timesteps requested from model.learn() so far (nominal count).
steps_done = 0
# Lowest mean evaluation lap time seen so far. Lower is better, so start from
# +inf rather than juggling a -inf sentinel.
best_mean_lap = float('inf')

try:
    for phase_steps, throttle_min in phase_defs:
        # The first phase trains on the env the model was set up with. Later
        # phases need a fresh env carrying the new throttle floor.
        if steps_done > 0:
            log(f'Switching env to throttle_min={throttle_min}')
            try:
                env0.close()
            except Exception as close_err:
                # Best effort: a failed close must not abort the run.
                log(f'Warning: closing previous env failed: {close_err}')
            # Keep the base action space at 0.2 so it still matches the
            # policy; only the runtime floor changes. The original call went
            # to an undefined ``make_env``.
            env0 = VecTransposeImage(
                DummyVecEnv([make_env_base(0.2, throttle_floor=throttle_min)])
            )
            model.set_env(env0)

        remaining = phase_steps
        while remaining > 0:
            seg = min(CHECKPOINT_EVERY, remaining)
            model.learn(total_timesteps=seg, reset_num_timesteps=False)
            steps_done += seg
            remaining -= seg

            # Save a numbered checkpoint plus a rolling "latest" copy.
            ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
            model.save(ckpt)
            model.save(os.path.join(SAVE_DIR, 'model'))
            log(f'[{steps_done}/{TOTAL_STEPS}] Checkpoint saved: {ckpt}.zip')

            # Evaluate and append one JSON line per checkpoint.
            res = evaluate_model(model, throttle_min, sets=EVAL_EPISODES)
            lap_times = [lt for r in res for lt in r['lap_times']]
            # None when no evaluation episode completed a lap.
            mean_lap = sum(lap_times) / len(lap_times) if lap_times else None
            mean_steps = sum(r['steps'] for r in res) / len(res)
            summary = {
                'steps_done': steps_done,
                'throttle_min': throttle_min,
                'mean_steps': mean_steps,
                'mean_lap_time': mean_lap,
                'per_set': res,
            }
            with open(LOG_PATH, 'a') as lf:
                lf.write(json.dumps(summary) + '\n')
            log(f' Eval @ {steps_done}: mean_steps={mean_steps:.1f} mean_lap={mean_lap}')

            # Keep the model with the lowest non-zero mean lap time.
            # NOTE(review): lap times below the reward wrapper's minimum lap
            # time are still counted here, so they can win "best". Confirm
            # whether they should be filtered out.
            if mean_lap and mean_lap < best_mean_lap:
                best_mean_lap = mean_lap
                model.save(os.path.join(SAVE_DIR, 'best_model'))
                log(f' ⭐ NEW BEST (mean lap {mean_lap:.2f}s) saved')

except Exception as e:
    log(f'ERROR during fine-tune: {e}')
    import traceback
    traceback.print_exc()
    # Also record the traceback in the run log, not only on stderr.
    logf.write(traceback.format_exc())
finally:
    try:
        env0.close()
    except Exception:
        # Best effort: the final save below must still run.
        pass
    # Always leave a final copy of the model behind, even after an error.
    model.save(os.path.join(SAVE_DIR, 'model'))
    log(f'Fine-tune complete. steps_done={steps_done}')
    logf.close()
|