donkeycar-rl-autoresearch/agent/experiments/exp23_generated_road_clean.py

237 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 23: Clean slate — generated_road, solid barriers, simple reward.
What changed from exp22:
- Single track: generated_road on port 9091 only (diagnose one track first)
- Simulator now uses BoxCollider barriers + CCD on the car Rigidbody.
The car physically cannot escape. No Python-side exploit patches needed.
- Reward wrapper v7: speed × CTE_quality + efficiency gate + no-progress kill.
Removed: CTE-patience termination, solid_hit detection, wedge detection,
MAX_EPISODE_SECONDS hard cap.
- StuckTerminationWrapper: max_episode_seconds raised to 120s (genuine safety
net only — physics handles the actual containment).
- No warm-start: fresh PPO weights. Previous warm-starts were trained under
broken reward/barrier conditions and add more noise than signal.
- Total steps: 200k (more room to learn with clean signal).
"""
import os
import sys
import time
from datetime import datetime
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper
# Simulator connection and training hyper-parameters for this experiment.
HOST = 'localhost'  # simulator host handed to gym.make via conf['host']
THROTTLE_MIN = 0.2  # forwarded to ThrottleClampWrapper as throttle_min
LR = 0.0003  # PPO learning_rate
TOTAL_STEPS = 200_000  # training budget tracked by the checkpoint loop
CHECKPOINT_EVERY = 10_000  # steps requested per learn() segment between checkpoints
# Checkpoints, the best model, the PID file and the run log are all written here.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp23-generated-road-clean'
os.makedirs(SAVE_DIR, exist_ok=True)
# Reward wrapper v7 params — clean and minimal
# Each value is forwarded to SpeedRewardWrapper in make_env(); the exact
# semantics are defined in reward_wrapper, not in this file.
EFFICIENCY_WINDOW = 30  # -> window_size
MIN_EFFICIENCY = 0.15  # -> min_efficiency
MAX_CTE = 8.0  # -> max_cte
MIN_LAP_TIME = 12.0  # -> min_lap_time
PROGRESS_PATIENCE = 100 # steps without new waypoint → terminate
# StuckTerminationWrapper — generous limit, physics does the real work now
MAX_STUCK_SECONDS = 5.0  # -> max_stuck_seconds
MAX_EPISODE_SECONDS = 120.0 # safety net only
def log(msg):
    """Print *msg* to stdout prefixed with the current ``[HH:MM:SS]`` time.

    Output is flushed immediately so progress is visible when stdout is
    redirected or piped.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'[{stamp}] {msg}', flush=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds the wrapped simulator env.

    The factory is what ``DummyVecEnv`` expects: nothing connects to the
    simulator until the returned callable is invoked.

    Args:
        track_id: gym environment id passed to ``gym.make``.
        port: simulator port placed in the ``conf`` dict next to ``HOST``.

    Returns:
        A callable that creates the base env and wraps it, innermost first,
        in ThrottleClampWrapper -> StuckTerminationWrapper -> SpeedRewardWrapper.
    """
    def _thunk():
        sim_conf = {'host': HOST, 'port': port}
        base = gym.make(track_id, conf=sim_conf)
        clamped = ThrottleClampWrapper(base, throttle_min=THROTTLE_MIN)
        guarded = StuckTerminationWrapper(
            clamped,
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=MAX_STUCK_SECONDS,
            max_episode_seconds=MAX_EPISODE_SECONDS,
        )
        # The reward wrapper is outermost, so it is the layer the agent sees.
        return SpeedRewardWrapper(
            guarded,
            window_size=EFFICIENCY_WINDOW,
            min_efficiency=MIN_EFFICIENCY,
            max_cte=MAX_CTE,
            min_lap_time=MIN_LAP_TIME,
            progress_patience=PROGRESS_PATIENCE,
        )

    return _thunk
def make_eval_env(track_id, port):
    """Build a single-env, image-transposed VecEnv for evaluation.

    The wrapped env is created eagerly via ``make_env`` and then handed to
    ``DummyVecEnv`` through a factory that simply returns that instance.

    Args:
        track_id: gym environment id forwarded to ``make_env``.
        port: simulator port forwarded to ``make_env``.

    Returns:
        ``VecTransposeImage`` around a one-element ``DummyVecEnv``.
    """
    wrapped = make_env(track_id, port)()

    def _existing_env(e=wrapped):
        return e

    return VecTransposeImage(DummyVecEnv([_existing_env]))
# Startup banner: one log() call per line, framed by separator rules, followed
# by the notice that the training environment is about to be created.
_banner_rule = '=' * 60
for _banner_line in (
    _banner_rule,
    'Exp 23: generated_road — clean barriers, clean reward',
    f' Sim: {HOST}:9091 -> generated_road',
    f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}',
    ' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)',
    f' Max stuck: {MAX_STUCK_SECONDS}s, episode cap: {MAX_EPISODE_SECONDS}s (safety net)',
    f' Progress patience: {PROGRESS_PATIENCE} steps',
    f' Checkpoints every {CHECKPOINT_EVERY:,} steps',
    _banner_rule,
    'Creating DummyVecEnv on generated_road...',
):
    log(_banner_line)

# Single-track training env on port 9091, transposed for the CNN policy.
env = VecTransposeImage(
    DummyVecEnv([make_env('donkey-generated-roads-v0', 9091)])
)
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')
# Fresh PPO agent (no warm-start) with a CNN policy, trained on CPU.
ppo_hyperparams = {
    'learning_rate': LR,
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 0.01,
    'verbose': 1,
    'device': 'cpu',
}
model = PPO('CnnPolicy', env, **ppo_hyperparams)
# Write PID for external monitoring
pid_path = os.path.join(SAVE_DIR, 'current.pid')
with open(pid_path, 'w') as pid_file:
    pid_file.write(f'{os.getpid()}')
log('Fresh PPO model created. Starting training...')

# Best-so-far tracking for the mid-training evals; -inf guarantees that the
# first successful eval is recorded as the best.
best_total_steps = best_total_reward = float('-inf')
steps_done = 0

# Per-run artefact paths: a timestamped log file and a fixed best-model path.
run_tag = f'{datetime.now():%Y-%m-%d_%H%M%S}_clean'
log_path = os.path.join(SAVE_DIR, f'run_{run_tag}.log')
best_model_path = os.path.join(SAVE_DIR, 'best_model.zip')
import logging

# Root logging goes to both the per-run log file and stdout.
# The file handler is opened explicitly as UTF-8: messages emitted later
# contain non-ASCII characters ('—', '×', '❌'), and with the locale default
# encoding (e.g. ASCII/latin-1 under a bare service environment) those writes
# raise encoding errors inside logging and the lines are lost from the file.
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    handlers=[
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
file_log = logging.getLogger('exp23')
def flog(msg):
    """Emit *msg* through the ``exp23`` logger with a ``[HH:MM:SS]`` prefix."""
    file_log.info(f'[{datetime.now():%H:%M:%S}] {msg}')
# Opening banner for the run log: PID and log path framed by separator rules.
_flog_rule = '=' * 60
for _flog_line in (
    _flog_rule,
    f'Exp 23 started — PID {os.getpid()}',
    f'Log: {log_path}',
    _flog_rule,
):
    flog(_flog_line)
# Upper bound on steps for each mid-training evaluation rollout, so a policy
# that never terminates cannot stall the training loop.
MID_EVAL_MAX_STEPS = 2000

# Train in CHECKPOINT_EVERY-sized segments. After each segment: save a numbered
# checkpoint plus a rolling 'model' snapshot, run one deterministic evaluation
# episode, and keep the best policy (longest survival, ties broken by reward).
# The outer try/finally guarantees the simulator connection is closed even if
# training aborts with an exception.
try:
    while steps_done < TOTAL_STEPS:
        seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
        # reset_num_timesteps=False keeps SB3's timestep counter running across
        # segments instead of restarting it for every learn() call.
        # NOTE(review): PPO collects whole n_steps rollouts, so
        # model.num_timesteps may run slightly ahead of steps_done — confirm
        # if exact step accounting matters.
        model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
        steps_done += seg_steps

        ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
        model.save(ckpt)
        model.save(os.path.join(SAVE_DIR, 'model'))
        flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

        # Mid-training eval on generated_road
        try:
            obs = env.reset()
            ep_rewards = np.zeros(env.num_envs)
            ep_steps = np.zeros(env.num_envs)
            done_mask = np.zeros(env.num_envs, dtype=bool)
            for _ in range(MID_EVAL_MAX_STEPS):
                action, _state = model.predict(obs, deterministic=True)
                obs, rewards, dones, _infos = env.step(action)
                # Only envs whose first episode is still running accumulate;
                # anything after the first `done` is an auto-reset episode.
                active = ~done_mask
                ep_rewards[active] += np.asarray(rewards)[active]
                ep_steps[active] += 1
                done_mask |= np.asarray(dones, dtype=bool)
                if done_mask.all():
                    break

            total_steps_eval = int(ep_steps.sum())
            total_reward_eval = float(ep_rewards.sum())
            status = '' if ep_steps[0] >= MID_EVAL_MAX_STEPS else f'❌@{int(ep_steps[0])}'
            flog(f' Eval: gen_road={total_reward_eval:.1f}r/{int(ep_steps[0])}s {status}')

            is_new_best = (
                total_steps_eval > best_total_steps
                or (total_steps_eval == best_total_steps
                    and total_reward_eval > best_total_reward)
            )
            if is_new_best:
                best_total_steps = total_steps_eval
                best_total_reward = total_reward_eval
                model.save(best_model_path)
                flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
        except Exception as e:
            # Evaluation is best-effort: a failed eval must not stop training.
            flog(f' Eval error: {e}')
        finally:
            # The eval above stepped the *training* env, so the observation PPO
            # cached at the end of the last rollout no longer matches the
            # simulator state. With reset_num_timesteps=False, learn() only
            # resets the env when no cached observation exists, so clear it to
            # force a clean reset at the start of the next segment.
            model._last_obs = None
finally:
    env.close()
# ── Final evaluation ──────────────────────────────────────────────────────────
# Runs EVAL_SETS independent deterministic episodes with the saved best model,
# each on a freshly created evaluation env, and logs per-set and mean results.
flog('=' * 60)
flog('FINAL EVALUATION: best_model on generated_road')
flog('=' * 60)
EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
steps_list = []
reward_list = []
for set_idx in range(1, EVAL_SETS + 1):
    eval_env = None
    try:
        eval_env = make_eval_env('donkey-generated-roads-v0', 9091)
        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')
        obs = eval_env.reset()
        done = False
        total_s = 0
        total_r = 0.0
        while not done and total_s < EVAL_MAX_STEPS:
            action, _state = eval_model.predict(obs, deterministic=True)
            obs, step_reward, step_done = eval_env.step(action)[:3]
            # The env may return per-env arrays or plain scalars; read the
            # first (only) entry in either case.
            done = bool(np.asarray(step_done).reshape(-1)[0])
            total_r += float(np.asarray(step_reward).reshape(-1)[0])
            total_s += 1
        status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
        flog(f' Set {set_idx}: {total_r:.1f}r / {total_s}s {status}')
        steps_list.append(total_s)
        reward_list.append(total_r)
    except Exception as e:
        # Best-effort: one failed set must not prevent the remaining sets.
        flog(f' Set {set_idx} error: {e}')
    finally:
        # Always release the simulator connection, including when loading or
        # stepping failed; otherwise the next set opens a second connection
        # on the same port while the old one is still alive.
        if eval_env is not None:
            try:
                eval_env.close()
            except Exception as close_err:
                flog(f' Set {set_idx} close error: {close_err}')
if steps_list:
    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
flog('Exp 23 complete.')