donkeycar-rl-autoresearch/agent/experiments/exp26_generated_road_warmst...

268 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 26: Warm-start from exp25 best_model — extended training on generated_road.
What changed from exp25:
- Warm start: loads exp25 best_model (381r @ 80k) instead of fresh weights.
This skips the early exploration phase and pushes the policy further.
- 300k total steps (vs 200k) — more time to improve beyond the exp25 plateau.
- Python-side hit check now active: multitrack_runner.py checks info['hit'] != 'none'
as the FIRST termination condition (added late in exp25 session, not loaded then).
- Everything else identical to exp25: discrete(7) steering, wheel OverlapSphere Unity fix,
road regen every 10k steps, LR=0.0003.
"""
import os
import sys
import time
from datetime import datetime
# Make the project's agent modules importable when this script is run directly.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

# Output directory, single-instance PID file, and the exp25 checkpoint to warm-start from.
_SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp26-warmstart'
_PIDFILE = os.path.join(_SAVE_DIR, 'current.pid')
_WARM_MODEL = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp25-wheel-fix/best_model.zip'
os.makedirs(_SAVE_DIR, exist_ok=True)


def _read_pidfile(path):
    """Return the PID stored in *path*, or None if it is missing or malformed."""
    try:
        # The context manager closes the handle even if parsing fails.
        with open(path) as fh:
            pid = int(fh.read().strip())
    except (OSError, ValueError):
        return None
    # Non-positive values address process groups in kill(2), not a single process.
    return pid if pid > 0 else None


def _pid_is_alive(pid):
    """Return True if a process with *pid* currently exists.

    Signal 0 performs the existence/permission check without delivering a signal.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but is owned by another user: still "running".
        return True
    except OSError:
        return False
    return True


# Single-instance guard: exit if the PID file names a different, live process.
# A stale or unreadable PID file is ignored.
_old = _read_pidfile(_PIDFILE)
if _old is not None and _old != os.getpid() and _pid_is_alive(_old):
    print(f'[exp26] Another instance already running (PID {_old}). Exiting.', flush=True)
    sys.exit(1)
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from discretize_action import DiscretizedActionWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper
# --- Simulator connection (used by make_env / connect_env) ---
HOST = 'localhost'
PORT = 9091
TRACK_ID = 'donkey-generated-roads-v0'

# --- Action shaping ---
# THROTTLE_MIN is passed to ThrottleClampWrapper; N_STEER / N_THROTTLE to
# DiscretizedActionWrapper.
THROTTLE_MIN = 0.2
N_STEER = 7
N_THROTTLE = 1

# --- Training schedule ---
LR = 0.0003
TOTAL_STEPS = 300_000
CHECKPOINT_EVERY = 10_000  # segment length between checkpoint + reconnect
SCENE_RELOAD_WAIT = 5.0    # seconds slept before (re)connecting to the sim

# --- SpeedRewardWrapper parameters ---
EFFICIENCY_WINDOW = 30
MIN_EFFICIENCY = 0.15
MAX_CTE = 8.0
MIN_LAP_TIME = 12.0
PROGRESS_PATIENCE = 100

# --- StuckTerminationWrapper parameters ---
# Note: MAX_CTE_TERMINATION (not MAX_CTE) is what this wrapper receives as max_cte.
MAX_STUCK_SECONDS = 5.0
MAX_EPISODE_SECONDS = 30.0
LOW_SPEED_THRESHOLD = 1.0
MAX_LOW_SPEED_SECONDS = 1.5
MAX_CTE_TERMINATION = 3.0
MAX_HIGH_CTE_SECONDS = 1.0
def log(msg):
    """Print *msg* to stdout with an ``[HH:MM:SS]`` prefix, flushing immediately."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'[{stamp}] {msg}', flush=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds the fully wrapped sim env.

    The factory creates ``track_id`` via ``gym.make`` against ``HOST:port`` and
    applies, innermost first: ThrottleClampWrapper, DiscretizedActionWrapper,
    StuckTerminationWrapper and SpeedRewardWrapper. Nothing is connected until
    the returned callable is invoked.
    """
    def _init():
        # Module constants are read here, at build time, not when make_env is called.
        stuck_kwargs = {
            'stuck_steps': 40,
            'min_displacement': 0.5,
            'max_stuck_seconds': MAX_STUCK_SECONDS,
            'max_episode_seconds': MAX_EPISODE_SECONDS,
            'low_speed_threshold': LOW_SPEED_THRESHOLD,
            'max_low_speed_seconds': MAX_LOW_SPEED_SECONDS,
            'max_cte': MAX_CTE_TERMINATION,
            'max_high_cte_seconds': MAX_HIGH_CTE_SECONDS,
        }
        reward_kwargs = {
            'window_size': EFFICIENCY_WINDOW,
            'min_efficiency': MIN_EFFICIENCY,
            'max_cte': MAX_CTE,
            'min_lap_time': MIN_LAP_TIME,
            'progress_patience': PROGRESS_PATIENCE,
        }

        wrapped = gym.make(track_id, conf={'host': HOST, 'port': port})
        wrapped = ThrottleClampWrapper(wrapped, throttle_min=THROTTLE_MIN)
        wrapped = DiscretizedActionWrapper(wrapped, n_steer=N_STEER, n_throttle=N_THROTTLE)
        wrapped = StuckTerminationWrapper(wrapped, **stuck_kwargs)
        return SpeedRewardWrapper(wrapped, **reward_kwargs)

    return _init
def connect_env():
    """Build a single-env ``DummyVecEnv`` for ``TRACK_ID``/``PORT`` wrapped in ``VecTransposeImage``."""
    factories = [make_env(TRACK_ID, PORT)]
    return VecTransposeImage(DummyVecEnv(factories))
def reconnect_env(old_env):
    """Close *old_env*, wait ``SCENE_RELOAD_WAIT`` seconds, and return a new env.

    A failure while closing is logged and otherwise ignored so that the
    reconnect still proceeds.
    """
    try:
        old_env.close()
    except Exception as e:
        # Best-effort close: report it, then carry on with the reconnect.
        log(f' env.close() warning: {e}')

    time.sleep(SCENE_RELOAD_WAIT)
    fresh_env = connect_env()
    return fresh_env
# --- Startup banner: echo the effective configuration ---
log('=' * 60)
log('Exp 26: generated_road — warm start from exp25 best_model')
log(f' Sim: {HOST}:{PORT} -> {TRACK_ID}')
log(f' Warm model: {_WARM_MODEL}')
log(f' Discrete steering: {N_STEER} bins, throttle fixed at {THROTTLE_MIN}')
log(f' LR={LR}, total={TOTAL_STEPS:,}, checkpoint every {CHECKPOINT_EVERY:,}')
log(f' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)')
log(f' Stuck: speed<{LOW_SPEED_THRESHOLD}/{MAX_LOW_SPEED_SECONDS}s OR hit!=none OR CTE>{MAX_CTE_TERMINATION}/{MAX_HIGH_CTE_SECONDS}s')
log(f' Episode cap: {MAX_EPISODE_SECONDS}s | Road regen: every {CHECKPOINT_EVERY:,} steps')
log('=' * 60)

log('Connecting to sim...')
env = connect_env()
log(f' obs={env.observation_space.shape}, action={env.action_space}')

log('Loading warm-start model from exp25...')
# Override the learning rate at load time. Assigning model.learning_rate after
# load() would not rebuild the lr_schedule that the optimizer actually follows,
# so the override is injected via custom_objects, which replaces the saved
# attribute before the model is set up.
model = PPO.load(
    _WARM_MODEL,
    env=env,
    device='cpu',
    custom_objects={'learning_rate': LR},
)
log(f' Warm model loaded. LR={LR}')

# Record our PID so that a second launch can detect this running instance.
with open(_PIDFILE, 'w') as f:
    f.write(str(os.getpid()))
# Best-so-far evaluation result (compared after every checkpoint) and progress counter.
best_total_steps = float('-inf')
best_total_reward = float('-inf')
steps_done = 0

# Per-run log file and the path the best checkpoint is written to.
run_tag = f"{datetime.now():%Y-%m-%d_%H%M%S}" + '_warmstart'
log_path = os.path.join(_SAVE_DIR, 'run_' + run_tag + '.log')
best_model_path = os.path.join(_SAVE_DIR, 'best_model.zip')

import logging

# Dedicated 'exp26' logger writing bare messages to both the run log and stdout.
# Propagation is disabled so records do not also reach the root logger.
file_log = logging.getLogger('exp26')
file_log.setLevel(logging.INFO)
file_log.propagate = False
_file_handler = logging.FileHandler(log_path)
_stream_handler = logging.StreamHandler(sys.stdout)
for _handler in (_file_handler, _stream_handler):
    _handler.setFormatter(logging.Formatter('%(message)s'))
    file_log.addHandler(_handler)
def flog(msg):
    """Send *msg* to the ``file_log`` logger at INFO level with an ``[HH:MM:SS]`` prefix."""
    line = f'[{datetime.now().strftime("%H:%M:%S")}] {msg}'
    file_log.info(line)
flog('=' * 60)
flog(f'Exp 26 started — PID {os.getpid()}')
flog(f'Log: {log_path}')
flog(f'Warm start: exp25 best_model (381r @ 80k)')
flog('=' * 60)

# Train in segments of at most CHECKPOINT_EVERY steps. After each segment:
# save a checkpoint, reconnect to the sim, and run one deterministic evaluation
# that decides whether the current weights become the new best_model.
while steps_done < TOTAL_STEPS:
    remaining = TOTAL_STEPS - steps_done
    seg_steps = min(CHECKPOINT_EVERY, remaining)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    steps_done += seg_steps

    # Numbered checkpoint plus a rolling 'model' copy of the latest weights.
    ckpt = os.path.join(_SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(_SAVE_DIR, 'model'))
    flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

    flog(f' Reconnecting for fresh road...')
    env = reconnect_env(env)
    model.set_env(env)
    flog(f' Connected (new road)')

    try:
        obs = env.reset()
        n_envs = env.num_envs
        ep_rewards = np.zeros(n_envs)
        ep_steps = np.zeros(n_envs)
        done_mask = np.zeros(n_envs, dtype=bool)

        # Roll out the first episode of every sub-env, capped at 2000 steps.
        # NOTE(review): make_env caps episodes at MAX_EPISODE_SECONDS, so the
        # 2000-step "success" threshold below may be unreachable — confirm.
        for _ in range(2000):
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            for idx, (reward, finished) in enumerate(zip(rewards, dones)):
                if done_mask[idx]:
                    # This sub-env already finished its first episode; ignore it.
                    continue
                ep_rewards[idx] += reward
                ep_steps[idx] += 1
                if finished:
                    done_mask[idx] = True
            if done_mask.all():
                break

        total_steps_eval = int(ep_steps.sum())
        total_reward_eval = float(ep_rewards.sum())
        status = f'❌@{int(ep_steps[0])}' if ep_steps[0] < 2000 else ''
        flog(f' Eval: gen_road={total_reward_eval:.1f}r/{int(ep_steps[0])}s {status}')

        # Rank by steps survived first; total reward only breaks ties.
        more_steps = total_steps_eval > best_total_steps
        same_steps_more_reward = (
            total_steps_eval == best_total_steps
            and total_reward_eval > best_total_reward
        )
        if more_steps or same_steps_more_reward:
            best_total_steps = total_steps_eval
            best_total_reward = total_reward_eval
            model.save(best_model_path)
            flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
    except Exception as e:
        # An evaluation failure is logged and training continues.
        flog(f' Eval error: {e}')

# Training is finished; release the training env before the final evaluation.
env.close()
flog('=' * 60)
flog('FINAL EVALUATION: best_model on generated_road (3 fresh roads)')
flog('=' * 60)

EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
steps_list = []
reward_list = []

# Evaluate the saved best model on EVAL_SETS fresh connections, one episode
# each, capped at EVAL_MAX_STEPS steps.
for s in range(1, EVAL_SETS + 1):
    eval_env = None
    try:
        time.sleep(SCENE_RELOAD_WAIT)
        eval_env = connect_env()
        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')
        obs = eval_env.reset()
        done = False
        total_s = 0
        total_r = 0.0
        while not done and total_s < EVAL_MAX_STEPS:
            action, _ = eval_model.predict(obs, deterministic=True)
            # connect_env() returns a vectorized env with a single sub-env, so
            # step() yields per-env arrays; index 0 is the only episode.
            obs, rewards, dones, _infos = eval_env.step(action)
            done = bool(dones[0])
            total_r += float(rewards[0])
            total_s += 1
        status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
        flog(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
        steps_list.append(total_s)
        reward_list.append(total_r)
    except Exception as e:
        # A failed set is logged and skipped; the remaining sets still run.
        flog(f' Set {s} error: {e}')
    finally:
        # Always release the sim connection, including when the set failed
        # part-way, so a failed set does not leak its env.
        if eval_env is not None:
            try:
                eval_env.close()
            except Exception as e:
                flog(f' env.close() warning: {e}')

if steps_list:
    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
flog('Exp 26 complete.')

# Best-effort removal of the PID file so that a later process which happens to
# reuse this PID is not mistaken for a still-running instance.
try:
    os.remove(_PIDFILE)
except OSError:
    pass