donkeycar-rl-autoresearch/agent/experiments/exp24_generated_road_discre...

260 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Exp 24: Discrete steering + speed-based stuck detection.
What changed from exp23:
- Discrete action space: 7 steering bins × 1 throttle = 7 actions.
Eliminates Gaussian policy noise that caused rapid steering oscillation.
Bins: steer ∈ {-1, -0.67, -0.33, 0, 0.33, 0.67, 1}, throttle=0→clamped to 0.2.
- Speed-based stuck detection: if speed < 0.5 m/s for 2 wall-clock seconds
→ terminate. Catches car pinned against a barrier regardless of lateral sliding
(lateral drift was resetting the position-based timer in exp23, leaving the car
against the wall for up to max_episode_seconds).
- max_episode_seconds reduced to 30s (stuck detection catches the bad cases faster;
120s was a consequence of stuck detection not working, not a design choice).
- Single track: generated_road on port 9091.
- Fresh PPO with CnnPolicy (Discrete action space, same CNN obs encoder).
- Total steps: 200k.
"""
import os
import sys
import time
from datetime import datetime
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
_SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete'
_PIDFILE = os.path.join(_SAVE_DIR, 'current.pid')
os.makedirs(_SAVE_DIR, exist_ok=True)
# Single-instance guard: refuse to start while the PID recorded in the pidfile
# still belongs to a live process other than this one.
if os.path.exists(_PIDFILE):
    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(_PIDFILE) as _pid_fh:
            _old = int(_pid_fh.read().strip())
    except (OSError, ValueError):
        # Unreadable or malformed pidfile: treat it as stale and carry on.
        _old = None

    # PIDs <= 0 address process groups in kill(2), not a single process, so
    # they can never identify a previous instance of this script.
    if _old is not None and _old > 0 and _old != os.getpid():
        try:
            # Signal 0 delivers nothing; it only probes existence/permission.
            os.kill(_old, 0)
        except PermissionError:
            # The process exists but is owned by someone else: still running.
            _other_alive = True
        except OSError:
            # Typically ProcessLookupError: no such process, pidfile is stale.
            _other_alive = False
        else:
            _other_alive = True

        if _other_alive:
            print(f'[exp24] Another instance already running (PID {_old}). Exiting.', flush=True)
            sys.exit(1)
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from discretize_action import DiscretizedActionWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper
# --- Simulator connection and optimisation schedule -------------------------
HOST = 'localhost'
LR = 3e-4
TOTAL_STEPS = 200_000
CHECKPOINT_EVERY = 10_000

# --- Action space ------------------------------------------------------------
THROTTLE_MIN = 0.2
N_STEER = 7     # steering bins: -1, -0.67, -0.33, 0, 0.33, 0.67, 1
N_THROTTLE = 1  # fixed at 0.0 → clamped to THROTTLE_MIN by ThrottleClampWrapper

# --- Reward wrapper params (same as exp23 v7) --------------------------------
EFFICIENCY_WINDOW = 30
MIN_EFFICIENCY = 0.15
MAX_CTE = 8.0
MIN_LAP_TIME = 12.0
PROGRESS_PATIENCE = 100

# --- StuckTerminationWrapper -------------------------------------------------
# The speed-based check is the primary stuck detector now.
LOW_SPEED_THRESHOLD = 0.5    # m/s — below this counts as "stuck"
MAX_LOW_SPEED_SECONDS = 2.0  # seconds at low speed before termination
MAX_STUCK_SECONDS = 5.0      # position-based: 0.5m displacement timer
MAX_EPISODE_SECONDS = 30.0   # hard cap (reduced from 120s — speed check handles it)
def log(msg):
    """Print *msg* to stdout, prefixed with the current time as [HH:MM:SS]."""
    stamp = datetime.now().strftime("%H:%M:%S")
    # flush=True so progress shows up immediately when stdout is piped.
    print(f'[{stamp}] {msg}', flush=True)
def make_env(track_id, port):
    """Return a zero-argument factory that builds the fully wrapped env.

    Nothing is created until the returned callable is invoked. The factory
    stacks, innermost first: ``gym.make`` -> ``ThrottleClampWrapper`` ->
    ``DiscretizedActionWrapper`` -> ``StuckTerminationWrapper`` ->
    ``SpeedRewardWrapper``, all configured from the module-level constants.

    Args:
        track_id: Gym environment id passed to ``gym.make``.
        port: Simulator port placed in the ``conf`` dict next to ``HOST``.
    """
    def _init():
        sim_conf = {'host': HOST, 'port': port}
        base = gym.make(track_id, conf=sim_conf)
        clamped = ThrottleClampWrapper(base, throttle_min=THROTTLE_MIN)
        discrete = DiscretizedActionWrapper(
            clamped, n_steer=N_STEER, n_throttle=N_THROTTLE
        )

        # Termination settings: displacement-based and speed-based timers plus
        # the hard episode cap.
        stuck_cfg = {
            'stuck_steps': 40,
            'min_displacement': 0.5,
            'max_stuck_seconds': MAX_STUCK_SECONDS,
            'max_episode_seconds': MAX_EPISODE_SECONDS,
            'low_speed_threshold': LOW_SPEED_THRESHOLD,
            'max_low_speed_seconds': MAX_LOW_SPEED_SECONDS,
        }
        guarded = StuckTerminationWrapper(discrete, **stuck_cfg)

        reward_cfg = {
            'window_size': EFFICIENCY_WINDOW,
            'min_efficiency': MIN_EFFICIENCY,
            'max_cte': MAX_CTE,
            'min_lap_time': MIN_LAP_TIME,
            'progress_patience': PROGRESS_PATIENCE,
        }
        return SpeedRewardWrapper(guarded, **reward_cfg)

    return _init
def make_eval_env(track_id, port):
    """Build one wrapped env now and return it as VecTransposeImage(DummyVecEnv).

    Args:
        track_id: Gym environment id forwarded to ``make_env``.
        port: Simulator port forwarded to ``make_env``.
    """
    built = make_env(track_id, port)()

    def _already_built():
        # DummyVecEnv expects factories; hand back the env created above.
        return built

    vec = DummyVecEnv([_already_built])
    return VecTransposeImage(vec)
# ---------------------------------------------------------------------------
# Startup: claim the pidfile, print the run banner, build env + PPO model and
# initialise the bookkeeping used by the training loop.
# ---------------------------------------------------------------------------

# Record our PID *before* the slow simulator/model setup. Writing it only after
# the model exists leaves a window in which a second launch passes the
# single-instance check and connects to the same simulator port.
with open(_PIDFILE, 'w') as f:
    f.write(str(os.getpid()))

log('=' * 60)
log('Exp 24: generated_road — discrete steering, speed-based stuck')
log(f' Sim: {HOST}:9091 -> generated_road')
log(f' Discrete steering: {N_STEER} bins, throttle fixed at {THROTTLE_MIN}')
log(f' throttle_min={THROTTLE_MIN}, lr={LR}, total={TOTAL_STEPS:,}')
log(f' Reward: v7 (speed×CTE, efficiency gate, no-progress kill)')
log(f' Stuck: position≥0.5m/{MAX_STUCK_SECONDS}s OR speed<{LOW_SPEED_THRESHOLD}/{MAX_LOW_SPEED_SECONDS}s')
log(f' Episode cap: {MAX_EPISODE_SECONDS}s (safety net)')
log(f' Checkpoints every {CHECKPOINT_EVERY:,} steps')
log('=' * 60)

# Single training env on the generated_road simulator (port 9091), wrapped in
# VecTransposeImage before it is handed to PPO.
log('Creating DummyVecEnv on generated_road...')
env = DummyVecEnv([make_env('donkey-generated-roads-v0', 9091)])
env = VecTransposeImage(env)
log(f' VecEnv num_envs={env.num_envs}, obs={env.observation_space.shape}')
log(f' Action space: {env.action_space}')

# Fresh (not resumed) PPO model; hyperparameters are fixed for this experiment.
model = PPO(
    'CnnPolicy',
    env,
    learning_rate=LR,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    verbose=1,
    device='cpu',
)
log(f'Fresh PPO model created (Discrete({N_STEER * N_THROTTLE}) actions). Starting training...')

# Best-so-far trackers start at -inf so the first successful evaluation
# always becomes the initial best model.
best_total_steps = float('-inf')
best_total_reward = float('-inf')
steps_done = 0

# Per-run artefact paths inside _SAVE_DIR.
run_tag = datetime.now().strftime('%Y-%m-%d_%H%M%S') + '_discrete'
log_path = os.path.join(_SAVE_DIR, f'run_{run_tag}.log')
best_model_path = os.path.join(_SAVE_DIR, 'best_model.zip')
import logging

# Send every record at INFO and above to both the per-run log file and stdout,
# using a bare message format (timestamps are added by flog itself).
_run_handlers = [
    logging.FileHandler(log_path),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=_run_handlers)
file_log = logging.getLogger('exp24')


def flog(msg):
    """Log *msg* at INFO level through the 'exp24' logger with an [HH:MM:SS] prefix."""
    file_log.info(f'[{datetime.now().strftime("%H:%M:%S")}] {msg}')


# Opening banner for the run log.
_rule = '=' * 60
for _banner_line in (
    _rule,
    f'Exp 24 started — PID {os.getpid()}',
    f'Log: {log_path}',
    _rule,
):
    flog(_banner_line)
# Upper bound on steps for the quick evaluation run after each checkpoint.
_CKPT_EVAL_MAX_STEPS = 2000

# Train in CHECKPOINT_EVERY-sized segments; after each one save a checkpoint,
# run a deterministic evaluation and keep the best model seen so far.
# NOTE(review): PPO collects whole rollouts, so model.num_timesteps may run
# ahead of steps_done; checkpoint names follow steps_done — confirm this is
# the intended labelling.
while steps_done < TOTAL_STEPS:
    seg_steps = min(CHECKPOINT_EVERY, TOTAL_STEPS - steps_done)
    model.learn(total_timesteps=seg_steps, reset_num_timesteps=False)
    steps_done += seg_steps

    ckpt = os.path.join(_SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    # Rolling "latest" copy next to the numbered checkpoints.
    model.save(os.path.join(_SAVE_DIR, 'model'))
    flog(f'[{steps_done:,}/{TOTAL_STEPS:,}] Checkpoint saved: {ckpt}.zip')

    # Evaluation is best-effort: a failure is logged and training continues.
    try:
        obs = env.reset()
        ep_rewards = np.zeros(env.num_envs)
        ep_steps = np.zeros(env.num_envs)
        done_mask = np.zeros(env.num_envs, dtype=bool)

        for _step in range(_CKPT_EVAL_MAX_STEPS):
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            # Only envs whose first episode is still running keep accumulating.
            active = ~done_mask
            ep_rewards[active] += np.asarray(rewards)[active]
            ep_steps[active] += 1
            done_mask |= np.asarray(dones, dtype=bool)
            if done_mask.all():
                break

        total_steps_eval = int(ep_steps.sum())
        total_reward_eval = float(ep_rewards.sum())
        status = '' if ep_steps[0] >= _CKPT_EVAL_MAX_STEPS else f'❌@{int(ep_steps[0])}'
        flog(f' Eval: gen_road={total_reward_eval:.1f}r/{int(ep_steps[0])}s {status}')

        # Rank by survived steps first, total reward as the tie-breaker.
        improved = total_steps_eval > best_total_steps or (
            total_steps_eval == best_total_steps
            and total_reward_eval > best_total_reward
        )
        if improved:
            best_total_steps = total_steps_eval
            best_total_reward = total_reward_eval
            model.save(best_model_path)
            flog(f' NEW BEST: steps={best_total_steps} reward={best_total_reward:.1f}')
    except Exception as e:
        flog(f' Eval error: {e}')

    # The evaluation above stepped the *training* env, so the observation PPO
    # cached at the end of the last learn() call no longer matches the
    # simulator state. Re-attaching the env with force_reset makes the next
    # learn() call start from a fresh reset instead of that stale observation.
    model.set_env(env, force_reset=True)

env.close()
flog('=' * 60)
flog('FINAL EVALUATION: best_model on generated_road')
flog('=' * 60)

EVAL_SETS = 3          # number of independent evaluation episodes
EVAL_MAX_STEPS = 2000  # per-episode step cap; reaching it counts as a full run
steps_list = []
reward_list = []

for s in range(1, EVAL_SETS + 1):
    eval_env = None
    try:
        # Fresh env and freshly loaded best model for every set.
        eval_env = make_eval_env('donkey-generated-roads-v0', 9091)
        eval_model = PPO.load(best_model_path, env=eval_env, device='cpu')

        obs = eval_env.reset()
        done = False
        total_s = 0
        total_r = 0.0
        while not done and total_s < EVAL_MAX_STEPS:
            action, _ = eval_model.predict(obs, deterministic=True)
            # make_eval_env returns a single-env VecEnv, so index 0 is the env.
            obs, step_rewards, step_dones, _infos = eval_env.step(action)
            done = bool(step_dones[0])
            total_r += float(step_rewards[0])
            total_s += 1

        status = '' if total_s >= EVAL_MAX_STEPS else f'❌@{total_s}'
        flog(f' Set {s}: {total_r:.1f}r / {total_s}s {status}')
        steps_list.append(total_s)
        reward_list.append(total_r)
    except Exception as e:
        flog(f' Set {s} error: {e}')
    finally:
        # Always release the env, even when loading/stepping failed, so the
        # next set does not start with a leaked env still open.
        if eval_env is not None:
            try:
                eval_env.close()
            except Exception as e:
                flog(f' Set {s} error: {e}')

if steps_list:
    flog(f' Mean: {np.mean(steps_list):.0f} steps / {np.mean(reward_list):.1f} reward')
flog('Exp 24 complete.')