"""
DonkeyCar RL Runner — Real Training Edition
============================================

Trains a PPO or DQN model using Stable-Baselines3, evaluates with evaluate_policy(),
saves the model to disk, and exits cleanly.

Usage:
    python3 donkeycar_sb3_runner.py \
        --agent ppo \
        --env donkey-generated-roads-v0 \
        --timesteps 10000 \
        --eval-episodes 5 \
        --learning-rate 0.0003 \
        --save-dir agent/models/trial-0001 \
        --n-steer 7 \
        --n-throttle 3 \
        --reward-shaping \
        --seed 42

Exit codes:
    0   — success, model saved, evaluation complete
    100 — failed to connect to simulator
    101 — training failed
    102 — evaluation failed
"""
|
|
|
|
# Standard library
import argparse
import os
import sys
import time

import numpy as np

import gymnasium as gym
# Never referenced directly — presumably imported for its side effect of
# registering the donkey-* environment IDs with gym; confirm before removing.
import gym_donkeycar

from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy

# Project-local: discrete action wrapper used for the DQN agent.
from discretize_action import DiscretizedActionWrapper

# Optional reward shaping — imported only if available
try:
    from reward_wrapper import SpeedRewardWrapper
    REWARD_WRAPPER_AVAILABLE = True
except ImportError:
    REWARD_WRAPPER_AVAILABLE = False
|
|
|
|
|
|
class ThrottleClampWrapper(gym.ActionWrapper):
    """
    Clamps the throttle dimension of a continuous action to [throttle_min, 1.0].

    Prevents PPO's random initial policy from outputting zero throttle
    and leaving the car stationary.

    Action format expected: [steer, throttle] where steer ∈ [-1,1], throttle ∈ [0,1].
    """

    def __init__(self, env, throttle_min=0.2):
        """
        :param env: wrapped gym environment with a [steer, throttle] Box action space.
        :param throttle_min: lower bound enforced on the throttle dimension.
        """
        super().__init__(env)
        self.throttle_min = throttle_min
        # Advertise the clamped bounds so SB3 samples/scales actions against
        # the real range. (numpy is already imported at module level; the
        # previous function-local `import numpy as np` was redundant.)
        low = np.array([-1.0, throttle_min], dtype=np.float32)
        high = np.array([1.0, 1.0], dtype=np.float32)
        self.action_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def action(self, action):
        """Return a float32 copy of `action` with throttle clipped to [throttle_min, 1.0]."""
        action = np.array(action, dtype=np.float32)
        action[1] = float(np.clip(action[1], self.throttle_min, 1.0))
        return action
|
|
|
|
|
|
def log(msg):
    """Emit one runner log line immediately, bypassing stdout buffering."""
    print(msg, flush=True)
|
|
|
|
|
|
def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
    """Build the gym environment and apply the action/reward wrappers for `agent`."""
    wrapped = gym.make(env_id)

    if agent == 'dqn':
        # DQN requires a discrete action space.
        wrapped = DiscretizedActionWrapper(wrapped, n_steer=n_steer, n_throttle=n_throttle)
        log(f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, throttle={n_throttle}. {time.ctime()}')
    else:
        # PPO uses continuous actions. Clip throttle to [0.2, 1.0] so the car always moves.
        # Without this, PPO's random initial policy outputs throttle~0 and the car sits still.
        log(f'[SB3 Runner][MONITOR] PPO continuous actions. Throttle clamped to [0.2, 1.0]. {time.ctime()}')
        wrapped = ThrottleClampWrapper(wrapped, throttle_min=0.2)

    if reward_shaping and REWARD_WRAPPER_AVAILABLE:
        wrapped = SpeedRewardWrapper(wrapped)
        log(f'[SB3 Runner][MONITOR] Speed reward shaping ENABLED. {time.ctime()}')
    elif reward_shaping:
        log(f'[SB3 Runner][MONITOR] WARNING: reward_wrapper.py not found — reward shaping disabled. {time.ctime()}')

    return wrapped
|
|
|
|
|
|
class SimHealthCallback:
    """
    Stable-Baselines3 compatible health monitor for the simulator connection.

    Detects two failure modes:
      * STUCK  — the reported speed stays below `min_speed` for
                 `max_stuck_steps` consecutive steps.
      * FROZEN — the observation is bit-identical for 30 consecutive steps
                 (frozen frame = connection lost).
    """

    def __init__(self, max_stuck_steps=100, min_speed=0.05):
        """
        :param max_stuck_steps: consecutive low-speed steps tolerated before failing.
        :param min_speed: speeds below this threshold count as "not moving".
        """
        self.max_stuck_steps = max_stuck_steps
        self.min_speed = min_speed
        self._stuck_count = 0    # consecutive steps with speed < min_speed
        self._last_obs = None    # snapshot of the previous observation
        self._frozen_count = 0   # consecutive steps with an unchanged observation

    def on_step(self, obs, reward, done, info):
        """Call after each env.step(). Returns False if sim appears dead."""
        # --- Stuck detection: speed is read from the info dict, if present ---
        speed = info.get('speed', None) if isinstance(info, dict) else None
        if speed is not None:
            if float(speed) < self.min_speed:
                self._stuck_count += 1
            else:
                self._stuck_count = 0
            if self._stuck_count >= self.max_stuck_steps:
                log(f'[SB3 Runner][MONITOR ALERT] Sim appears STUCK: speed<{self.min_speed} for {self._stuck_count} steps. {time.ctime()}')
                return False

        # --- Frozen detection: observation unchanged (connection lost) ---
        if obs is not None and self._last_obs is not None:
            if np.array_equal(obs, self._last_obs):
                self._frozen_count += 1
            else:
                self._frozen_count = 0
            if self._frozen_count >= 30:
                log(f'[SB3 Runner][MONITOR ALERT] Sim appears FROZEN: observation unchanged for {self._frozen_count} steps. {time.ctime()}')
                return False
        # BUGFIX: store a snapshot, not a reference. Vectorized/SB3 envs may
        # update the observation buffer in place, in which case comparing the
        # live `obs` against a stored reference compares the array with itself
        # and always matches — producing spurious FROZEN alerts.
        self._last_obs = None if obs is None else np.array(obs, copy=True)
        return True
|
|
|
|
|
|
def train_model(agent, env, learning_rate, timesteps, seed):
    """
    Train a PPO or DQN model on `env` and return it.

    :param agent: 'ppo' or 'dqn'.
    :param env: wrapped gym environment.
    :param learning_rate: optimizer learning rate passed to SB3.
    :param timesteps: total environment steps to train for.
    :param seed: random seed forwarded to SB3 (may be None).
    :raises ValueError: if `agent` is not 'ppo' or 'dqn'.
    """
    from stable_baselines3.common.callbacks import BaseCallback

    # Single source of truth for supported algorithms; validate before doing
    # any other work so a bad agent name fails fast.
    algo_classes = {'ppo': PPO, 'dqn': DQN}
    if agent not in algo_classes:
        raise ValueError(f'Unknown agent: {agent}. Use ppo or dqn.')

    class HealthCheckCallback(BaseCallback):
        """SB3 callback that checks sim health each step and stops training if stuck."""

        def __init__(self, max_stuck_steps=100, min_speed=0.05):
            super().__init__(verbose=0)
            self.health = SimHealthCallback(max_stuck_steps=max_stuck_steps, min_speed=min_speed)

        def _on_step(self):
            # SB3 exposes the latest transition via self.locals. Envs are
            # vectorized, so take index 0 (this runner drives a single env).
            infos = self.locals.get('infos', [{}])
            obs = self.locals.get('new_obs', None)
            info = infos[0] if infos else {}
            obs_arr = obs[0] if obs is not None and len(obs) > 0 else None
            healthy = self.health.on_step(obs_arr, None, None, info)
            if not healthy:
                log(f'[SB3 Runner][MONITOR ALERT] Health check failed — stopping training early. {time.ctime()}')
                return False  # Stops SB3 training
            return True

    # The PPO/DQN constructors take identical arguments here, so build the
    # model through the dispatch table instead of duplicated branches.
    model = algo_classes[agent](
        'CnnPolicy',
        env,
        learning_rate=learning_rate,
        verbose=1,
        seed=seed,
    )

    log(f'[SB3 Runner][MONITOR] Starting training: agent={agent} timesteps={timesteps} lr={learning_rate} {time.ctime()}')
    start = time.time()
    health_cb = HealthCheckCallback(max_stuck_steps=100, min_speed=0.02)
    model.learn(total_timesteps=timesteps, callback=health_cb)
    elapsed = time.time() - start
    log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}')
    return model
|
|
|
|
|
|
def evaluate_model(model, env, eval_episodes):
    """Evaluate the model using SB3 evaluate_policy and print per-episode detail."""
    log(f'[SB3 Runner][MONITOR] Evaluating model for {eval_episodes} episodes. {time.ctime()}')
    # Deterministic rollouts: report the learned policy, not sampling noise.
    scores = evaluate_policy(
        model,
        env,
        n_eval_episodes=eval_episodes,
        return_episode_rewards=False,
        deterministic=True,
    )
    mean_reward, std_reward = scores
    log(f'[SB3 Runner][TEST] mean_reward={mean_reward:.4f}')
    log(f'[SB3 Runner][TEST] std_reward={std_reward:.4f}')
    return mean_reward, std_reward
|
|
|
|
|
|
def save_model(model, save_dir):
    """Save the model to save_dir/model.zip and return that path."""
    os.makedirs(save_dir, exist_ok=True)
    # SB3's model.save() appends the .zip extension itself.
    base_path = os.path.join(save_dir, 'model')
    model.save(base_path)
    log(f'[SB3 Runner][MONITOR] Model saved to {base_path}.zip {time.ctime()}')
    return f'{base_path}.zip'
|
|
|
|
|
|
def teardown(env):
    """Close environment cleanly with race avoidance sleep.

    Best-effort: an exception from env.close() is logged but never re-raised,
    so teardown cannot mask the caller's intended exit code.

    :param env: the (possibly wrapped) gym environment to close.
    """
    log(f'[SB3 Runner][MONITOR] Calling env.close() at {time.ctime()}')
    try:
        env.close()
        log(f'[SB3 Runner][MONITOR] env.close() complete. {time.ctime()}')
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Exception during env.close(): {e} {time.ctime()}')
    # Pause before process exit — presumably gives the simulator connection
    # time to finish its shutdown handshake; confirm against the sim client
    # before shortening or removing.
    log(f'[SB3 Runner][MONITOR] Waiting 2s before process exit to avoid race. {time.ctime()}')
    time.sleep(2)
|
|
|
|
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train and evaluate an RL agent on DonkeyCar.')
    parser.add_argument('--agent', type=str, default='ppo', choices=['ppo', 'dqn'], help='RL agent type')
    parser.add_argument('--env', type=str, default='donkey-generated-roads-v0', help='Gym env ID')
    parser.add_argument('--timesteps', type=int, default=10000, help='Training timesteps')
    parser.add_argument('--eval-episodes', type=int, default=5, help='Evaluation episodes')
    parser.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate')
    parser.add_argument('--save-dir', type=str, default=None, help='Directory to save model')
    parser.add_argument('--n-steer', type=int, default=7, help='Steer bins (DQN only)')
    parser.add_argument('--n-throttle', type=int, default=3, help='Throttle bins (DQN only)')
    parser.add_argument('--reward-shaping', action='store_true', help='Enable speed reward shaping')
    parser.add_argument('--seed', type=int, default=None, help='Random seed')
    args = parser.parse_args()

    log(f'[SB3 Runner] Starting: agent={args.agent} timesteps={args.timesteps} lr={args.learning_rate} {time.ctime()}')

    # --- 1. Connect to simulator ---
    # Exit code 100 signals a connection failure to the harness (see module docstring).
    env = None
    try:
        env = make_env(args.env, args.agent, args.n_steer, args.n_throttle, args.reward_shaping)
        log(f'[SB3 Runner][MONITOR] Connected to gym env. {time.ctime()}')
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Failed to connect to sim: {e}')
        sys.exit(100)

    # --- 2. Train model ---
    # Any failure here tears the env down first so the sim is not left hanging.
    model = None
    try:
        model = train_model(args.agent, env, args.learning_rate, args.timesteps, args.seed)
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Training failed: {e} {time.ctime()}')
        teardown(env)
        sys.exit(101)

    # --- 3. Save model ---
    # Save BEFORE evaluating so a crash during evaluation can't lose the model.
    # NOTE(review): a save failure reuses exit code 101 ("training failed") —
    # the docstring defines no separate code for it; confirm that is intended.
    save_dir = args.save_dir or f'/tmp/donkeycar-trial-{int(time.time())}'
    try:
        saved_path = save_model(model, save_dir)
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Model save failed: {e} {time.ctime()}')
        teardown(env)
        sys.exit(101)

    # --- 4. Evaluate trained policy ---
    # evaluate_model logs the mean/std lines the harness parses; the returned
    # values are not otherwise used here.
    try:
        mean_reward, std_reward = evaluate_model(model, env, args.eval_episodes)
    except Exception as e:
        log(f'[SB3 Runner][MONITOR ALERT] Evaluation failed: {e} {time.ctime()}')
        teardown(env)
        sys.exit(102)

    # --- 5. Teardown ---
    # Falling off the end yields exit code 0 (success).
    teardown(env)
    log(f'[SB3 Runner][MONITOR] Exiting RL runner at {time.ctime()}')