donkeycar-rl-autoresearch/agent/experiments/exp24_generated_road_discre...

291 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 24: Discrete steering + speed-based stuck detection + road regeneration.
What changed from exp23:
- Discrete action space: 7 steering bins × 1 throttle = 7 actions.
Eliminates Gaussian policy noise that caused rapid steering oscillation.
Bins: steer ∈ {-1, -0.67, -0.33, 0, 0.33, 0.67, 1}, throttle=0→clamped to 0.2.
- Speed-based stuck detection: if speed < 0.5 for 2 wall-clock seconds → terminate.
Catches car pinned against a barrier regardless of lateral sliding.
(WheelColliders don't fire OnCollisionStay on Car.cs — perpendicular contact was
undetectable. Now also fixed in Unity via forward raycast, but Python speed check
is the reliable backstop.)
- Road regeneration: env is closed and reconnected after each 10k-step segment.
Reconnecting reloads the scene → sdsandbox generates a new random road.
Training diversity: each 10k-step segment trains on a different road layout.
Eval is always on a freshly generated road (proper generalization test).
- max_episode_seconds reduced to 30s (speed check handles stuck cases faster).
- Single track: generated_road on port 9091.
- Fresh PPO weights.
- Total steps: 200k.
"""
import os
import sys
import time
from datetime import datetime
# Make the agent package (wrappers, runners) importable when this script is
# launched directly.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

_SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete'
_PIDFILE = os.path.join(_SAVE_DIR, 'current.pid')
os.makedirs(_SAVE_DIR, exist_ok=True)

# Single-instance guard: if the PID file names a different process that can
# still be signalled, refuse to start a second trainer against the same sim.
try:
    # Context manager so the PID-file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(_PIDFILE) as _pid_fh:
        _old = int(_pid_fh.read().strip())
    if _old != os.getpid():
        # Signal 0 delivers nothing; os.kill() only raises if the PID cannot
        # be signalled (e.g. the process no longer exists).
        os.kill(_old, 0)
        print(f'[exp24] Another instance already running (PID {_old}). Exiting.', flush=True)
        sys.exit(1)
except (OSError, ValueError):
    # No PID file, unreadable/malformed contents, or os.kill() failed:
    # treat the file as stale and carry on. SystemExit from sys.exit() is
    # not caught here, so the "already running" exit still takes effect.
    pass
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from discretize_action import DiscretizedActionWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper
# --- Simulator connection -------------------------------------------------
HOST = 'localhost'
PORT = 9091
TRACK_ID = 'donkey-generated-roads-v0'

# --- Training schedule ----------------------------------------------------
LR = 3e-4
TOTAL_STEPS = 200_000
CHECKPOINT_EVERY = 10_000
SCENE_RELOAD_WAIT = 5.0  # seconds after env.close() for sim to return to menu

# --- Action space ---------------------------------------------------------
THROTTLE_MIN = 0.2
N_STEER = 7      # steering bins: -1, -0.67, -0.33, 0, 0.33, 0.67, 1
N_THROTTLE = 1   # fixed at 0.0 → clamped to THROTTLE_MIN by ThrottleClampWrapper

# --- Reward wrapper params (same as exp23 v7) -----------------------------
EFFICIENCY_WINDOW = 30
MIN_EFFICIENCY = 0.15
MAX_CTE = 8.0
MIN_LAP_TIME = 12.0
PROGRESS_PATIENCE = 100

# --- StuckTerminationWrapper ----------------------------------------------
MAX_STUCK_SECONDS = 5.0       # position-based: 0.5m displacement timer
MAX_EPISODE_SECONDS = 30.0    # hard cap
LOW_SPEED_THRESHOLD = 0.5     # below this counts as stuck
MAX_LOW_SPEED_SECONDS = 2.0   # seconds at low speed before termination
def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local time as [HH:MM:SS].

    Output is flushed immediately so progress is visible when stdout is piped.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'[{stamp}] {msg}', flush=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds one wrapped simulator env.

    The factory is deferred (nothing connects until it is called), which is
    the form ``DummyVecEnv`` is given in ``connect_env``.

    Args:
        track_id: Gymnasium environment id passed to ``gym.make``.
        port: Simulator port; combined with ``HOST`` in the env ``conf``.

    Returns:
        A callable that creates the base env and wraps it, innermost to
        outermost, in ThrottleClampWrapper, DiscretizedActionWrapper,
        StuckTerminationWrapper and SpeedRewardWrapper.
    """
    def _init():
        sim_conf = {'host': HOST, 'port': port}
        stuck_kwargs = dict(
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=MAX_STUCK_SECONDS,
            max_episode_seconds=MAX_EPISODE_SECONDS,
            low_speed_threshold=LOW_SPEED_THRESHOLD,
            max_low_speed_seconds=MAX_LOW_SPEED_SECONDS,
        )
        reward_kwargs = dict(
            window_size=EFFICIENCY_WINDOW,
            min_efficiency=MIN_EFFICIENCY,
            max_cte=MAX_CTE,
            min_lap_time=MIN_LAP_TIME,
            progress_patience=PROGRESS_PATIENCE,
        )

        # Wrapper order matters: each layer wraps everything built so far.
        wrapped = gym.make(track_id, conf=sim_conf)
        wrapped = ThrottleClampWrapper(wrapped, throttle_min=THROTTLE_MIN)
        wrapped = DiscretizedActionWrapper(
            wrapped, n_steer=N_STEER, n_throttle=N_THROTTLE
        )
        wrapped = StuckTerminationWrapper(wrapped, **stuck_kwargs)
        wrapped = SpeedRewardWrapper(wrapped, **reward_kwargs)
        return wrapped

    return _init
def connect_env():
    """Open a new connection to the sim and return the vectorised env.

    Builds a single-env ``DummyVecEnv`` from ``make_env(TRACK_ID, PORT)`` and
    wraps it in ``VecTransposeImage``. Per the experiment notes, a fresh
    connection makes the sim reload the scene and generate a new random road.
    """
    factories = [make_env(TRACK_ID, PORT)]
    return VecTransposeImage(DummyVecEnv(factories))
def reconnect_env(old_env):
    """Tear down ``old_env`` and return a freshly connected replacement.

    A failure while closing the old env is logged and otherwise ignored so a
    bad shutdown cannot abort the run. The function then waits
    ``SCENE_RELOAD_WAIT`` seconds (time for the sim to return to its menu)
    before calling ``connect_env``.

    Args:
        old_env: The env to close; only its ``close()`` method is used.

    Returns:
        The new env produced by ``connect_env()``.
    """
    try:
        old_env.close()
    except Exception as close_err:
        # Best-effort close: report it and still attempt the reconnect.
        log(f' env.close() warning: {close_err}')

    time.sleep(SCENE_RELOAD_WAIT)
    return connect_env()
# Start-up banner: echo the effective configuration to stdout, then open the
# first simulator connection.
_BANNER_RULE = '=' * 60
for _banner_line in (
    _BANNER_RULE,
    'Exp 24: generated_road — discrete steering, speed stuck, road regen',
    f' Sim: {HOST}:{PORT} -> {TRACK_ID}',
    f' Discrete steering: {N_STEER} bins, throttle fixed at {THROTTLE_MIN}',
    f' LR={LR}, total={TOTAL_STEPS:,}, checkpoint every {CHECKPOINT_EVERY:,}',
    ' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)',
    f' Stuck: position/{MAX_STUCK_SECONDS}s OR speed<{LOW_SPEED_THRESHOLD}/{MAX_LOW_SPEED_SECONDS}s',
    f' Episode cap: {MAX_EPISODE_SECONDS}s | Road regen: every {CHECKPOINT_EVERY:,} steps',
    _BANNER_RULE,
    'Connecting to sim...',
):
    log(_banner_line)

env = connect_env()
log(f' obs={env.observation_space.shape}, action={env.action_space}')
# PPO hyperparameters for this experiment, gathered in one place so they can
# be read (or diffed against another experiment) at a glance.
_PPO_HYPERPARAMS = dict(
    learning_rate=LR,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    verbose=1,
    device='cpu',
)

# Fresh (untrained) CNN-policy PPO bound to the connected sim env.
model = PPO('CnnPolicy', env, **_PPO_HYPERPARAMS)
import atexit


def _remove_pidfile():
    """Delete the PID file at interpreter exit if it still records this process.

    Without this, a finished run leaves its PID behind; if the OS later
    recycles that PID for an unrelated process, the start-up guard would
    wrongly report that another instance is running.
    """
    try:
        with open(_PIDFILE) as pid_fh:
            recorded = pid_fh.read().strip()
        # Only remove our own record; a newer instance may have overwritten it.
        if recorded == str(os.getpid()):
            os.remove(_PIDFILE)
    except OSError:
        # Best-effort cleanup: a missing or unreadable file is not an error
        # worth surfacing during shutdown.
        pass


# Record this process as the active trainer (read by the start-up guard).
with open(_PIDFILE, 'w') as f:
    f.write(str(os.getpid()))
atexit.register(_remove_pidfile)

log(f'Fresh PPO (Discrete({N_STEER * N_THROTTLE})). Starting training...')

# Best-so-far eval result; -inf so the first successful eval always wins.
best_total_steps = float('-inf')
best_total_reward = float('-inf')
steps_done = 0

# Per-run artefact paths inside the save directory.
run_tag = datetime.now().strftime('%Y-%m-%d_%H%M%S') + '_discrete'
log_path = os.path.join(_SAVE_DIR, f'run_{run_tag}.log')
best_model_path = os.path.join(_SAVE_DIR, 'best_model.zip')
import logging

# logging.basicConfig is a no-op if the root logger already has handlers (e.g. from
# gym_donkeycar/SB3 imports). Add handlers directly to avoid silent file-log loss.
_plain_format = '%(message)s'

# Explicit UTF-8: log messages contain non-ASCII glyphs ("❌", "—"), which a
# locale-default encoding (e.g. ASCII under LANG=C) could fail to write.
_file_handler = logging.FileHandler(log_path, encoding='utf-8')
_file_handler.setFormatter(logging.Formatter(_plain_format))

_stream_handler = logging.StreamHandler(sys.stdout)
_stream_handler.setFormatter(logging.Formatter(_plain_format))

# Dedicated logger that writes every message to both the run log file and
# stdout; propagation is disabled so root-logger handlers do not duplicate it.
file_log = logging.getLogger('exp24')
file_log.setLevel(logging.INFO)
file_log.propagate = False
file_log.addHandler(_file_handler)
file_log.addHandler(_stream_handler)
def flog(msg):
    """Emit ``msg`` through the ``file_log`` logger with an [HH:MM:SS] prefix.

    The timestamp is the local wall-clock time at the moment of the call.
    """
    stamp = datetime.now().strftime('%H:%M:%S')
    # Lazy %-style arguments; the rendered text is "[<stamp>] <msg>".
    file_log.info('[%s] %s', stamp, msg)
# Opening banner for the run log: process id and log location.
_LOG_RULE = '=' * 60
for _header_line in (
    _LOG_RULE,
    f'Exp 24 started — PID {os.getpid()}',
    f'Log: {log_path}',
    _LOG_RULE,
):
    flog(_header_line)
# Main loop: train one segment, checkpoint, reconnect for a new road, then run
# a deterministic evaluation of up to 2000 steps on that new road.
while steps_done < TOTAL_STEPS:
    # NOTE(review): steps_done counts *requested* steps. PPO collects whole
    # n_steps rollouts, so model.num_timesteps may be somewhat higher — confirm
    # if exact step accounting matters.
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    steps_done += seg_steps

    # Save a numbered checkpoint plus a rolling "latest" copy named 'model'.
    ckpt = os.path.join(_SAVE_DIR, f'checkpoint_{steps_done:07d}')
    for _save_target in (ckpt, os.path.join(_SAVE_DIR, 'model')):
        model.save(_save_target)
    flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

    # Reconnect → sim reloads scene → new random road generated.
    # Eval runs on this fresh road, then the next training segment uses it too.
    flog(' Reconnecting for fresh road...')
    env = reconnect_env(env)
    model.set_env(env)
    flog(' Connected (new road)')

    try:
        obs = env.reset()
        n_envs = env.num_envs
        ep_rewards = np.zeros(n_envs)
        ep_steps = np.zeros(n_envs)
        done_mask = np.zeros(n_envs, dtype=bool)

        for _tick in range(2000):
            action, _state = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)

            # Only envs still in their first episode accumulate stats; the
            # step on which an env finishes is still counted.
            live = ~done_mask
            ep_rewards[live] += np.asarray(rewards)[live]
            ep_steps[live] += 1
            done_mask |= np.asarray(dones, dtype=bool)
            if done_mask.all():
                break

        total_steps_eval = int(ep_steps.sum())
        total_reward_eval = float(ep_rewards.sum())
        first_env_steps = int(ep_steps[0])
        # An episode that ended before the 2000-step budget is flagged.
        status = f'❌@{first_env_steps}' if ep_steps[0] < 2000 else ''
        flog(f' Eval: gen_road={total_reward_eval:.1f}r/{first_env_steps}s {status}')

        # Rank by steps survived first, total reward as the tie-breaker.
        if (total_steps_eval, total_reward_eval) > (best_total_steps, best_total_reward):
            best_total_steps = total_steps_eval
            best_total_reward = total_reward_eval
            model.save(best_model_path)
            flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
    except Exception as e:
        # A failed eval must not abort training; log it and move on.
        flog(f' Eval error: {e}')
# Training is finished: release the training env before the final evaluation.
env.close()

flog('=' * 60)
flog('FINAL EVALUATION: best_model on generated_road (3 fresh roads)')
flog('=' * 60)

EVAL_SETS = 3
EVAL_MAX_STEPS = 2000
steps_list = []
reward_list = []

for s in range(1, EVAL_SETS + 1):
    eval_env = None
    try:
        # Each eval set reconnects → different random road
        time.sleep(SCENE_RELOAD_WAIT)
        eval_env = connect_env()
        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')

        obs = eval_env.reset()
        done = False
        total_s = 0
        total_r = 0.0
        while not done and total_s < EVAL_MAX_STEPS:
            action, _ = eval_model.predict(obs, deterministic=True)
            result = eval_env.step(action)
            obs, r, done = result[0], result[1], result[2]
            # The env may hand back per-env sequences; only the first
            # env's values are used.
            if hasattr(done, '__len__'):
                done = bool(done[0])
            total_r += float(r[0]) if hasattr(r, '__len__') else float(r)
            total_s += 1

        # An episode that ended before the step budget is flagged.
        status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
        flog(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
        steps_list.append(total_s)
        reward_list.append(total_r)
    except Exception as e:
        # One failed set must not prevent the remaining sets from running.
        flog(f' Set {s} error: {e}')
    finally:
        # Always release the sim connection, including after a failure part-way
        # through, so the next set does not connect while this one is open.
        if eval_env is not None:
            try:
                eval_env.close()
            except Exception as close_err:
                flog(f' Set {s} close warning: {close_err}')

if steps_list:
    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
flog('Exp 24 complete.')