# donkeycar-rl-autoresearch/agent/donkeycar_sb3_runner.py
# (190 lines, 6.6 KiB, Python)
"""
DonkeyCar RL Runner — Real Training Edition
============================================
Trains a PPO or DQN model using Stable-Baselines3, evaluates with evaluate_policy(),
saves the model to disk, and exits cleanly.
Usage:
python3 donkeycar_sb3_runner.py \
--agent ppo \
--env donkey-generated-roads-v0 \
--timesteps 10000 \
--eval-episodes 5 \
--learning-rate 0.0003 \
--save-dir agent/models/trial-0001 \
--n-steer 7 \
--n-throttle 3 \
--reward-shaping \
--seed 42
Exit codes:
0 — success, model saved, evaluation complete
100 — failed to connect to simulator
101 — training failed
102 — evaluation failed
"""
import argparse
import os
import sys
import time
import numpy as np
import gymnasium as gym
import gym_donkeycar
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy
from discretize_action import DiscretizedActionWrapper
# Optional reward shaping — imported only if available
try:
from reward_wrapper import SpeedRewardWrapper
REWARD_WRAPPER_AVAILABLE = True
except ImportError:
REWARD_WRAPPER_AVAILABLE = False
def log(msg):
    """Print *msg* to stdout and flush immediately so external monitors see output in real time."""
    print(msg, flush=True)
def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
    """Build the gym environment and apply the wrappers the agent needs.

    DQN requires a discrete action space, so its env is wrapped in
    DiscretizedActionWrapper. Speed reward shaping is applied for any
    agent when requested, provided the optional wrapper module imported.
    """
    wrapped = gym.make(env_id)
    if agent == 'dqn':
        wrapped = DiscretizedActionWrapper(wrapped, n_steer=n_steer, n_throttle=n_throttle)
        log(f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, throttle={n_throttle}. {time.ctime()}')
    if reward_shaping and REWARD_WRAPPER_AVAILABLE:
        wrapped = SpeedRewardWrapper(wrapped)
        log(f'[SB3 Runner][MONITOR] Speed reward shaping ENABLED. {time.ctime()}')
    elif reward_shaping:
        # Requested but reward_wrapper.py was not importable at module load.
        log(f'[SB3 Runner][MONITOR] WARNING: reward_wrapper.py not found — reward shaping disabled. {time.ctime()}')
    return wrapped
def train_model(agent, env, learning_rate, timesteps, seed):
    """Instantiate the requested SB3 algorithm, run .learn(), and return the model.

    Raises:
        ValueError: if *agent* is not one of 'ppo' / 'dqn'.
    """
    algo_classes = {'ppo': PPO, 'dqn': DQN}
    if agent not in algo_classes:
        raise ValueError(f'Unknown agent: {agent}. Use ppo or dqn.')
    # Both algorithms take the identical constructor arguments here, so a
    # single dispatch replaces the duplicated if/elif construction.
    model = algo_classes[agent](
        'CnnPolicy',
        env,
        learning_rate=learning_rate,
        verbose=1,
        seed=seed,
    )
    log(f'[SB3 Runner][MONITOR] Starting training: agent={agent} timesteps={timesteps} lr={learning_rate} {time.ctime()}')
    t0 = time.time()
    model.learn(total_timesteps=timesteps)
    elapsed = time.time() - t0
    log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}')
    return model
def evaluate_model(model, env, eval_episodes):
    """Evaluate the model using SB3 evaluate_policy and print per-episode detail.

    Fix: the original passed return_episode_rewards=False, so despite the
    docstring no per-episode detail was ever printed. We now request the
    per-episode rewards/lengths, log each episode, and aggregate to the
    same mean/std that evaluate_policy would have returned itself.

    Args:
        model: trained SB3 model (PPO/DQN).
        env: the gym environment to roll out in.
        eval_episodes: number of evaluation episodes.

    Returns:
        (mean_reward, std_reward) as floats — interface unchanged.
    """
    log(f'[SB3 Runner][MONITOR] Evaluating model for {eval_episodes} episodes. {time.ctime()}')
    episode_rewards, episode_lengths = evaluate_policy(
        model,
        env,
        n_eval_episodes=eval_episodes,
        return_episode_rewards=True,
        deterministic=True,
    )
    for i, (ep_reward, ep_len) in enumerate(zip(episode_rewards, episode_lengths)):
        log(f'[SB3 Runner][TEST] episode={i} reward={ep_reward:.4f} length={ep_len}')
    # Same aggregation evaluate_policy performs internally when
    # return_episode_rewards=False, so callers see identical values.
    mean_reward = float(np.mean(episode_rewards))
    std_reward = float(np.std(episode_rewards))
    log(f'[SB3 Runner][TEST] mean_reward={mean_reward:.4f}')
    log(f'[SB3 Runner][TEST] std_reward={std_reward:.4f}')
    return mean_reward, std_reward
def save_model(model, save_dir):
    """Persist the model under *save_dir* as model.zip; return the .zip path.

    SB3's Model.save() appends the .zip suffix itself, so we pass the
    bare path and report/return the suffixed one.
    """
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, 'model')
    model.save(target)
    log(f'[SB3 Runner][MONITOR] Model saved to {target}.zip {time.ctime()}')
    return f'{target}.zip'
def teardown(env):
    """Close the environment, logging either outcome, then pause 2s.

    The sleep gives the simulator side time to notice the disconnect
    before this process exits (avoids a shutdown race).
    """
    log(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}')
    try:
        env.close()
    except Exception as close_err:
        # Best-effort close at process-exit boundary: log, never re-raise.
        log(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {close_err} {time.ctime()}')
    else:
        log(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}')
    log(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}')
    time.sleep(2)
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='Train and evaluate an RL agent on DonkeyCar.')
    cli.add_argument('--agent', type=str, default='ppo', choices=['ppo', 'dqn'], help='RL agent type')
    cli.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym env ID')
    cli.add_argument('--timesteps', type=int, default=10000, help='Training timesteps')
    cli.add_argument('--eval-episodes', type=int, default=5, help='Evaluation episodes')
    cli.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate')
    cli.add_argument('--save-dir', type=str, default=None, help='Directory to save model')
    cli.add_argument('--n-steer', type=int, default=7, help='Steer bins (DQN only)')
    cli.add_argument('--n-throttle', type=int, default=3, help='Throttle bins (DQN only)')
    cli.add_argument('--reward-shaping', action='store_true', help='Enable speed reward shaping')
    cli.add_argument('--seed', type=int, default=None, help='Random seed')
    args = cli.parse_args()

    log(f'[SB3 Runner] Starting: agent={args.agent} timesteps={args.timesteps} lr={args.learning_rate} {time.ctime()}')

    # Step 1: connect to the simulator — exit 100 on failure.
    env = None
    try:
        env = make_env(args.env, args.agent, args.n_steer, args.n_throttle, args.reward_shaping)
        log(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}')
    except Exception as err:
        log(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {err}')
        sys.exit(100)

    # Step 2: train — exit 101 on failure (env torn down first).
    model = None
    try:
        model = train_model(args.agent, env, args.learning_rate, args.timesteps, args.seed)
    except Exception as err:
        log(f'[SB3 Runner][MONITOR ALERT] Training failed: {err} {time.ctime()}')
        teardown(env)
        sys.exit(101)

    # Step 3: save the model — exit 101 on failure.
    # Falls back to a timestamped /tmp directory when --save-dir is omitted.
    save_dir = args.save_dir or f'/tmp/donkeycar-trial-{int(time.time())}'
    try:
        saved_path = save_model(model, save_dir)
    except Exception as err:
        log(f'[SB3 Runner][MONITOR ALERT] Model save failed: {err} {time.ctime()}')
        teardown(env)
        sys.exit(101)

    # Step 4: evaluate the trained policy — exit 102 on failure.
    try:
        mean_reward, std_reward = evaluate_model(model, env, args.eval_episodes)
    except Exception as err:
        log(f'[SB3 Runner][MONITOR ALERT] Evaluation failed: {err} {time.ctime()}')
        teardown(env)
        sys.exit(102)

    # Step 5: clean shutdown — exit 0.
    teardown(env)
    log(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}')