"""Smoke-test runner for a DonkeyCar Gym environment.

Connects to the simulator, optionally wraps the environment with a discretized
action space (DQN), plays a fixed number of random-action episodes while
logging progress, then tears the environment down.

NOTE: no agent is trained in this version of the runner. The training,
model-saving and policy-evaluation steps are not wired up; see
``run_training`` for which parameters are currently unused.
"""
import argparse
import os
import sys
import time

import gymnasium as gym
import gym_donkeycar  # noqa: F401  (presumably imported so the donkey-* env ids get registered)
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy  # noqa: F401  (unused until a model is trained)

from discretize_action import DiscretizedActionWrapper

AGENT_MAP = {
    'dqn': DQN,
    'ppo': PPO,  # For later extension
}

# Number of full env.reset runs for this special test
EPISODES = 10


def _connect(env_id):
    """Create the gym environment, exiting the process with code 100 on failure."""
    print('[SB3 Runner] Starting: Connecting to sim…', flush=True)
    try:
        env = gym.make(env_id)
    except Exception as e:
        print(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {e}', flush=True)
        sys.exit(100)
    print(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}', flush=True)
    return env


def _run_random_episode(env, episode, seed):
    """Play one episode with uniformly sampled actions and return its total reward.

    ``episode`` is the zero-based episode index (used for logging only).
    """
    # NOTE(review): every episode is reset with the same seed, as in the
    # original code. Gymnasium recommends seeding only the first reset --
    # confirm whether identical per-episode seeding is intended.
    if seed is not None:
        env.reset(seed=seed)
    else:
        env.reset()
    print(f'[SB3 Runner][TEST] Episode {episode+1}/{EPISODES} - reset at {time.ctime()}', flush=True)

    ep_reward = 0.0
    t = 0
    done = False
    while not done:
        result = env.step(env.action_space.sample())
        # Accept both step() layouts:
        #   4 items: obs, reward, done, info
        #   5 items: obs, reward, done, truncated, info
        if len(result) == 5:
            _obs, reward, done, truncated, _info = result
            done = done or truncated
        elif len(result) == 4:
            _obs, reward, done, _info = result
        else:
            print('[SB3 Runner][MONITOR] UNEXPECTED step() result shape!', flush=True)
            break

        ep_reward += reward
        t += 1
        if t % 10 == 0 or done:
            print(f'[SB3 Runner][TEST] Step {t} done={done} reward={reward} {time.ctime()}', flush=True)
        if done:
            print(
                f'[SB3 Runner][TEST] Episode {episode+1} ended after {t} steps, '
                f'total_reward={ep_reward} at {time.ctime()}',
                flush=True,
            )
    return ep_reward


def _run_random_episodes(env, seed):
    """Run ``EPISODES`` random-action episodes and return the list of episode rewards.

    Any exception raised while stepping the environment is logged and the
    process exits with code 102.
    """
    try:
        ep_rewards = [_run_random_episode(env, episode, seed) for episode in range(EPISODES)]
        print(f'[SB3 Runner][TEST] All episode rewards: {ep_rewards}', flush=True)
        if ep_rewards:
            print(f'[SB3 Runner][TEST] mean_reward={sum(ep_rewards)/len(ep_rewards):.4f}', flush=True)
    except Exception as e:
        print(f'[SB3 Runner][MONITOR ALERT] Exception during episodes: {e} {time.ctime()}', flush=True)
        sys.exit(102)
    return ep_rewards


def _teardown(env):
    """Close the environment (best effort), then pause briefly before the caller exits."""
    # ---- Ensure teardown and sleep for race avoidance ----
    print(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}', flush=True)
    try:
        env.close()
        print(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}', flush=True)
    except Exception as e:
        print(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {e} {time.ctime()}', flush=True)
    print(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}', flush=True)
    time.sleep(2)
    print(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}', flush=True)


def _mean_std(values):
    """Return ``(mean, population standard deviation)`` of ``values``; ``(0.0, 0.0)`` if empty."""
    if not values:
        return 0.0, 0.0
    mean = sum(values) / len(values)
    variance = sum((v - mean) ** 2 for v in values) / len(values)
    return mean, variance ** 0.5


def run_training(env_id, agent_name, total_timesteps, reward_shaping=False, eval_episodes=10,
                 log_dir=None, seed=None, dqn_discretize=True, n_steer=3, n_throttle=3):
    """Run the random-action smoke test against ``env_id`` and report reward statistics.

    The previous version ended by saving and evaluating a ``model`` that was
    never created, so it always raised ``NameError`` after the rollouts (and
    would have used the environment after closing it). That dead code is
    removed; the statistics of the random rollouts are returned instead.

    Args:
        env_id: Environment id passed to ``gym.make``.
        agent_name: Key of ``AGENT_MAP``. Only decides whether the action
            space is discretized (``'dqn'``); no agent is instantiated.
        total_timesteps: Currently unused (no training is performed).
        reward_shaping: Currently unused.
        eval_episodes: Currently unused; ``EPISODES`` episodes are always run.
        log_dir: If given, a notice is logged that no model was saved.
        seed: Optional seed forwarded to ``env.reset`` on every episode.
        dqn_discretize: Wrap the env in ``DiscretizedActionWrapper`` for DQN.
        n_steer: Number of steering bins for the wrapper.
        n_throttle: Number of throttle bins for the wrapper.

    Returns:
        ``(mean_reward, std_reward)`` over the random-action episodes.

    Raises:
        AssertionError: If ``agent_name`` is not a key of ``AGENT_MAP``.
        SystemExit: Code 100 if the environment cannot be created, code 102
            if an exception occurs while running episodes.
    """
    # Explicit raise (rather than ``assert``) so validation survives ``python -O``
    # while callers still see the same exception type.
    if agent_name not in AGENT_MAP:
        raise AssertionError(
            f"Agent '{agent_name}' not recognized. Available: {list(AGENT_MAP.keys())}"
        )

    env = _connect(env_id)
    try:
        if agent_name == 'dqn' and dqn_discretize:
            env = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
            print(
                f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, '
                f'throttle={n_throttle}. {time.ctime()}',
                flush=True,
            )

        ep_rewards = _run_random_episodes(env, seed)

        if log_dir:
            print(
                f'[SB3 Runner][MONITOR] No model is trained in this smoke-test run; '
                f'nothing saved to {log_dir}',
                flush=True,
            )
    finally:
        # Runs on success, on sys.exit(102) and on unexpected errors alike, so
        # the simulator connection is never left open.
        _teardown(env)

    return _mean_std(ep_rewards)


def main():
    """Parse command-line arguments and invoke ``run_training``."""
    parser = argparse.ArgumentParser(description="Train/Eval an RL agent on DonkeyCar Gym using SB3.")
    parser.add_argument('--agent', type=str, default='dqn', choices=list(AGENT_MAP), help='RL agent type')
    parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym/Gymnasium env ID')
    parser.add_argument('--timesteps', type=int, default=5000, help='Total training timesteps')
    parser.add_argument('--eval-episodes', type=int, default=10, help='Episodes for evaluation after training')
    parser.add_argument('--log-dir', type=str, default=None, help='Directory to save models')
    parser.add_argument('--seed', type=int, default=None, help='Random seed')
    parser.add_argument('--n-steer', type=int, default=3, help='Number of steer bins (DQN only)')
    parser.add_argument('--n-throttle', type=int, default=3, help='Number of throttle bins (DQN only)')
    args = parser.parse_args()

    run_training(
        env_id=args.env,
        agent_name=args.agent,
        total_timesteps=args.timesteps,
        eval_episodes=args.eval_episodes,
        log_dir=args.log_dir,
        seed=args.seed,
        n_steer=args.n_steer,
        n_throttle=args.n_throttle,
    )


if __name__ == "__main__":
    main()