r"""
DonkeyCar RL Runner — Real Training Edition
============================================
Trains a PPO or DQN model using Stable-Baselines3, saves the model to
disk, evaluates it with evaluate_policy(), and exits cleanly.

Usage:
    python3 donkeycar_sb3_runner.py \
        --agent ppo \
        --env donkey-generated-roads-v0 \
        --timesteps 10000 \
        --eval-episodes 5 \
        --learning-rate 0.0003 \
        --save-dir agent/models/trial-0001 \
        --n-steer 7 \
        --n-throttle 3 \
        --reward-shaping \
        --seed 42

Exit codes:
    0   — success, model saved, evaluation complete
    100 — failed to connect to simulator
    101 — training failed (also used when saving the model fails)
    102 — evaluation failed
"""

import argparse
import os
import sys
import time

import numpy as np
import gymnasium as gym
# NOTE(review): imported for its side effect only — presumably this is what
# registers the donkey-* environment IDs with the gym registry; confirm.
import gym_donkeycar  # noqa: F401
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy

from discretize_action import DiscretizedActionWrapper

# Optional reward shaping — imported only if available
try:
    from reward_wrapper import SpeedRewardWrapper
    REWARD_WRAPPER_AVAILABLE = True
except ImportError:
    REWARD_WRAPPER_AVAILABLE = False

# Process exit codes (see the module docstring).
EXIT_OK = 0
EXIT_CONNECT_FAILED = 100
EXIT_TRAINING_FAILED = 101
EXIT_EVALUATION_FAILED = 102

# Maps the --agent value to the SB3 algorithm class that implements it.
_ALGORITHMS = {'ppo': PPO, 'dqn': DQN}


def log(msg):
    """Print *msg* to stdout and flush immediately."""
    print(msg, flush=True)


def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
    """Create and wrap the gym environment.

    Args:
        env_id: Environment ID passed to ``gym.make``.
        agent: ``'ppo'`` or ``'dqn'``; only ``'dqn'`` gets the discretized
            action wrapper.
        n_steer: Number of steering bins for the discretized wrapper.
        n_throttle: Number of throttle bins for the discretized wrapper.
        reward_shaping: When true, wrap the env in ``SpeedRewardWrapper`` if
            ``reward_wrapper.py`` could be imported; otherwise log a warning
            and continue without shaping.

    Returns:
        The (possibly wrapped) environment.

    Raises:
        Exception: Whatever ``gym.make`` or a wrapper constructor raises. If
            a wrapper fails after the base env was created, the env is
            closed before the exception propagates.
    """
    env = gym.make(env_id)
    try:
        if agent == 'dqn':
            env = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
            log(
                f'[SB3 Runner][MONITOR] Action discretization: '
                f'steer={n_steer}, throttle={n_throttle}. {time.ctime()}'
            )

        if reward_shaping:
            if REWARD_WRAPPER_AVAILABLE:
                env = SpeedRewardWrapper(env)
                log(f'[SB3 Runner][MONITOR] Speed reward shaping ENABLED. {time.ctime()}')
            else:
                log(
                    f'[SB3 Runner][MONITOR] WARNING: reward_wrapper.py not found '
                    f'— reward shaping disabled. {time.ctime()}'
                )
    except Exception:
        # The caller never receives a handle when wrapping fails, so release
        # the already-created env here instead of leaking it.
        env.close()
        raise

    return env


def train_model(agent, env, learning_rate, timesteps, seed):
    """Train a PPO or DQN model and return it.

    Args:
        agent: ``'ppo'`` or ``'dqn'``.
        env: Environment to train on.
        learning_rate: Learning rate forwarded to the algorithm constructor.
        timesteps: Value passed as ``total_timesteps`` to ``model.learn``.
        seed: Seed forwarded to the algorithm constructor (may be ``None``).

    Returns:
        The trained SB3 model.

    Raises:
        ValueError: If *agent* is neither ``'ppo'`` nor ``'dqn'``.
    """
    algorithm = _ALGORITHMS.get(agent)
    if algorithm is None:
        raise ValueError(f'Unknown agent: {agent}. Use ppo or dqn.')

    # Both algorithms are constructed with the same arguments.
    model = algorithm(
        'CnnPolicy',
        env,
        learning_rate=learning_rate,
        verbose=1,
        seed=seed,
    )

    log(
        f'[SB3 Runner][MONITOR] Starting training: agent={agent} '
        f'timesteps={timesteps} lr={learning_rate} {time.ctime()}'
    )
    start = time.time()
    model.learn(total_timesteps=timesteps)
    elapsed = time.time() - start
    log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}')
    return model


def evaluate_model(model, env, eval_episodes):
    """Evaluate the model with SB3 ``evaluate_policy`` and log mean/std reward.

    Only the aggregate statistics are requested
    (``return_episode_rewards=False``); no per-episode values are logged.

    Args:
        model: Trained SB3 model.
        env: Environment to evaluate on.
        eval_episodes: Number of evaluation episodes.

    Returns:
        Tuple ``(mean_reward, std_reward)`` as returned by ``evaluate_policy``.
    """
    log(f'[SB3 Runner][MONITOR] Evaluating model for {eval_episodes} episodes. {time.ctime()}')
    mean_reward, std_reward = evaluate_policy(
        model,
        env,
        n_eval_episodes=eval_episodes,
        return_episode_rewards=False,
        deterministic=True,
    )
    log(f'[SB3 Runner][TEST] mean_reward={mean_reward:.4f}')
    log(f'[SB3 Runner][TEST] std_reward={std_reward:.4f}')
    return mean_reward, std_reward


def save_model(model, save_dir):
    """Save the model to save_dir/model.zip.

    Creates *save_dir* if it does not exist.

    Returns:
        The path of the saved archive (``<save_dir>/model.zip``).
    """
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'model')
    # The path is passed without an extension; the '.zip' suffix is added
    # to the logged and returned path below.
    model.save(save_path)
    log(f'[SB3 Runner][MONITOR] Model saved to {save_path}.zip {time.ctime()}')
    return save_path + '.zip'


def teardown(env):
    """Close environment cleanly with race avoidance sleep.

    Errors from ``env.close()`` are logged, not raised, so teardown is
    best-effort. The function always sleeps 2 seconds before returning.
    """
    log(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}')
    try:
        env.close()
        log(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}')
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {e} {time.ctime()}')
    log(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}')
    time.sleep(2)


def _parse_args(argv=None):
    """Parse command-line options; ``argv=None`` means ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser(description='Train and evaluate an RL agent on DonkeyCar.')
    parser.add_argument('--agent', type=str, default='ppo', choices=['ppo', 'dqn'], help='RL agent type')
    parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym env ID')
    parser.add_argument('--timesteps', type=int, default=10000, help='Training timesteps')
    parser.add_argument('--eval-episodes', type=int, default=5, help='Evaluation episodes')
    parser.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate')
    parser.add_argument('--save-dir', type=str, default=None, help='Directory to save model')
    parser.add_argument('--n-steer', type=int, default=7, help='Steer bins (DQN only)')
    parser.add_argument('--n-throttle', type=int, default=3, help='Throttle bins (DQN only)')
    parser.add_argument('--reward-shaping', action='store_true', help='Enable speed reward shaping')
    parser.add_argument('--seed', type=int, default=None, help='Random seed')
    return parser.parse_args(argv)


def main(argv=None):
    """Run connect -> train -> save -> evaluate -> teardown.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.

    Returns:
        The process exit code (see the module docstring).
    """
    args = _parse_args(argv)

    log(
        f'[SB3 Runner] Starting: agent={args.agent} timesteps={args.timesteps} '
        f'lr={args.learning_rate} {time.ctime()}'
    )

    # --- 1. Connect to simulator ---
    try:
        env = make_env(args.env, args.agent, args.n_steer, args.n_throttle, args.reward_shaping)
        log(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}')
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {e}')
        return EXIT_CONNECT_FAILED

    # From here on the env exists; the ``finally`` guarantees it is closed on
    # every exit path, including KeyboardInterrupt during training.
    try:
        # --- 2. Train model ---
        try:
            model = train_model(args.agent, env, args.learning_rate, args.timesteps, args.seed)
        except Exception as e:
            log(f'[SB3 Runner][MONITOR ALERT] Training failed: {e} {time.ctime()}')
            return EXIT_TRAINING_FAILED

        # --- 3. Save model ---
        save_dir = args.save_dir or f'/tmp/donkeycar-trial-{int(time.time())}'
        try:
            save_model(model, save_dir)
        except Exception as e:
            log(f'[SB3 Runner][MONITOR ALERT] Model save failed: {e} {time.ctime()}')
            # No dedicated exit code exists for save failures; the training
            # failure code is reused.
            return EXIT_TRAINING_FAILED

        # --- 4. Evaluate trained policy ---
        try:
            evaluate_model(model, env, args.eval_episodes)
        except Exception as e:
            log(f'[SB3 Runner][MONITOR ALERT] Evaluation failed: {e} {time.ctime()}')
            return EXIT_EVALUATION_FAILED
    finally:
        # --- 5. Teardown ---
        teardown(env)

    log(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}')
    return EXIT_OK


if __name__ == '__main__':
    sys.exit(main())