"""
|
|
DonkeyCar RL Runner — Real Training Edition
|
|
============================================
|
|
Trains a PPO or DQN model using Stable-Baselines3, evaluates with evaluate_policy(),
|
|
saves the model to disk, and exits cleanly.
|
|
|
|
Usage:
|
|
python3 donkeycar_sb3_runner.py \
|
|
--agent ppo \
|
|
--env donkey-generated-roads-v0 \
|
|
--timesteps 10000 \
|
|
--eval-episodes 5 \
|
|
--learning-rate 0.0003 \
|
|
--save-dir agent/models/trial-0001 \
|
|
--n-steer 7 \
|
|
--n-throttle 3 \
|
|
--reward-shaping \
|
|
--seed 42
|
|
|
|
Exit codes:
|
|
0 — success, model saved, evaluation complete
|
|
100 — failed to connect to simulator
|
|
101 — training failed
|
|
102 — evaluation failed
|
|
"""
import argparse
import os
import sys
import time

import numpy as np

import gymnasium as gym
import gym_donkeycar

from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy

from discretize_action import DiscretizedActionWrapper

# Optional reward shaping — imported only if available
try:
    from reward_wrapper import SpeedRewardWrapper
    REWARD_WRAPPER_AVAILABLE = True
except ImportError:
    REWARD_WRAPPER_AVAILABLE = False


def log(msg):
|
|
print(msg, flush=True)
|
|
|
|
|
|
def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
|
|
"""Create and wrap the gym environment."""
|
|
env = gym.make(env_id)
|
|
|
|
if agent == 'dqn':
|
|
env = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
|
|
log(f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, throttle={n_throttle}. {time.ctime()}')
|
|
|
|
if reward_shaping:
|
|
if REWARD_WRAPPER_AVAILABLE:
|
|
env = SpeedRewardWrapper(env)
|
|
log(f'[SB3 Runner][MONITOR] Speed reward shaping ENABLED. {time.ctime()}')
|
|
else:
|
|
log(f'[SB3 Runner][MONITOR] WARNING: reward_wrapper.py not found — reward shaping disabled. {time.ctime()}')
|
|
|
|
return env
|
|
|
|
|
|
def train_model(agent, env, learning_rate, timesteps, seed):
|
|
"""Train a PPO or DQN model and return it."""
|
|
if agent == 'ppo':
|
|
model = PPO(
|
|
'CnnPolicy',
|
|
env,
|
|
learning_rate=learning_rate,
|
|
verbose=1,
|
|
seed=seed,
|
|
)
|
|
elif agent == 'dqn':
|
|
model = DQN(
|
|
'CnnPolicy',
|
|
env,
|
|
learning_rate=learning_rate,
|
|
verbose=1,
|
|
seed=seed,
|
|
)
|
|
else:
|
|
raise ValueError(f'Unknown agent: {agent}. Use ppo or dqn.')
|
|
|
|
log(f'[SB3 Runner][MONITOR] Starting training: agent={agent} timesteps={timesteps} lr={learning_rate} {time.ctime()}')
|
|
start = time.time()
|
|
model.learn(total_timesteps=timesteps)
|
|
elapsed = time.time() - start
|
|
log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}')
|
|
return model
|
|
|
|
|
|
def evaluate_model(model, env, eval_episodes):
|
|
"""Evaluate the model using SB3 evaluate_policy and print per-episode detail."""
|
|
log(f'[SB3 Runner][MONITOR] Evaluating model for {eval_episodes} episodes. {time.ctime()}')
|
|
mean_reward, std_reward = evaluate_policy(
|
|
model,
|
|
env,
|
|
n_eval_episodes=eval_episodes,
|
|
return_episode_rewards=False,
|
|
deterministic=True,
|
|
)
|
|
log(f'[SB3 Runner][TEST] mean_reward={mean_reward:.4f}')
|
|
log(f'[SB3 Runner][TEST] std_reward={std_reward:.4f}')
|
|
return mean_reward, std_reward
|
|
|
|
|
|
def save_model(model, save_dir):
|
|
"""Save the model to save_dir/model.zip."""
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
save_path = os.path.join(save_dir, 'model')
|
|
model.save(save_path)
|
|
log(f'[SB3 Runner][MONITOR] Model saved to {save_path}.zip {time.ctime()}')
|
|
return save_path + '.zip'
|
|
|
|
|
|
def teardown(env):
|
|
"""Close environment cleanly with race avoidance sleep."""
|
|
log(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}')
|
|
try:
|
|
env.close()
|
|
log(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}')
|
|
except Exception as e:
|
|
log(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {e} {time.ctime()}')
|
|
log(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}')
|
|
time.sleep(2)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = argparse.ArgumentParser(description='Train and evaluate an RL agent on DonkeyCar.')
|
|
parser.add_argument('--agent', type=str, default='ppo', choices=['ppo', 'dqn'], help='RL agent type')
|
|
parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym env ID')
|
|
parser.add_argument('--timesteps', type=int, default=10000, help='Training timesteps')
|
|
parser.add_argument('--eval-episodes', type=int, default=5, help='Evaluation episodes')
|
|
parser.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate')
|
|
parser.add_argument('--save-dir', type=str, default=None, help='Directory to save model')
|
|
parser.add_argument('--n-steer', type=int, default=7, help='Steer bins (DQN only)')
|
|
parser.add_argument('--n-throttle', type=int, default=3, help='Throttle bins (DQN only)')
|
|
parser.add_argument('--reward-shaping', action='store_true', help='Enable speed reward shaping')
|
|
parser.add_argument('--seed', type=int, default=None, help='Random seed')
|
|
args = parser.parse_args()
|
|
|
|
log(f'[SB3 Runner] Starting: agent={args.agent} timesteps={args.timesteps} lr={args.learning_rate} {time.ctime()}')
|
|
|
|
# --- 1. Connect to simulator ---
|
|
env = None
|
|
try:
|
|
env = make_env(args.env, args.agent, args.n_steer, args.n_throttle, args.reward_shaping)
|
|
log(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}')
|
|
except Exception as e:
|
|
log(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {e}')
|
|
sys.exit(100)
|
|
|
|
# --- 2. Train model ---
|
|
model = None
|
|
try:
|
|
model = train_model(args.agent, env, args.learning_rate, args.timesteps, args.seed)
|
|
except Exception as e:
|
|
log(f'[SB3 Runner][MONITOR ALERT] Training failed: {e} {time.ctime()}')
|
|
teardown(env)
|
|
sys.exit(101)
|
|
|
|
# --- 3. Save model ---
|
|
save_dir = args.save_dir or f'/tmp/donkeycar-trial-{int(time.time())}'
|
|
try:
|
|
saved_path = save_model(model, save_dir)
|
|
except Exception as e:
|
|
log(f'[SB3 Runner][MONITOR ALERT] Model save failed: {e} {time.ctime()}')
|
|
teardown(env)
|
|
sys.exit(101)
|
|
|
|
# --- 4. Evaluate trained policy ---
|
|
try:
|
|
mean_reward, std_reward = evaluate_model(model, env, args.eval_episodes)
|
|
except Exception as e:
|
|
log(f'[SB3 Runner][MONITOR ALERT] Evaluation failed: {e} {time.ctime()}')
|
|
teardown(env)
|
|
sys.exit(102)
|
|
|
|
# --- 5. Teardown ---
|
|
teardown(env)
|
|
log(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}')
|