diff --git a/agent/autoresearch_controller.py b/agent/autoresearch_controller.py
index 42c61e3..c2d7d27 100644
--- a/agent/autoresearch_controller.py
+++ b/agent/autoresearch_controller.py
@@ -52,25 +52,28 @@ os.makedirs(CHAMPION_DIR, exist_ok=True)
 
 # ---- Parameter Space ----
 # These are the parameters GP+UCB will optimize
+# NOTE: timesteps kept small (1000-5000) for Phase 1 exploration on CPU.
+# DonkeyCar sim runs ~20-50 steps/sec. 5000 steps ≈ 100-250s → fits in 600s timeout.
+# Increase max_timesteps once we confirm the pipeline works end-to-end.
 PARAM_SPACE = {
     'n_steer': {'type': 'int', 'min': 3, 'max': 9},
     'n_throttle': {'type': 'int', 'min': 2, 'max': 5},
     'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005},
-    'timesteps': {'type': 'int', 'min': 5000, 'max': 30000},
+    'timesteps': {'type': 'int', 'min': 1000, 'max': 5000},
 }
 PARAM_KEYS = list(PARAM_SPACE.keys())
 
 # Fixed params
 FIXED_PARAMS = {
     'agent': 'ppo',
-    'eval_episodes': 5,
+    'eval_episodes': 3,
     'reward_shaping': True,
 }
 
 N_CANDIDATES = 500
 UCB_KAPPA = 2.0
 MIN_TRIALS_BEFORE_GP = 3
-JOB_TIMEOUT = 600  # 10 minutes per trial (real training takes longer)
+JOB_TIMEOUT = 480  # 8 minutes — enough for 5000 steps + eval, with margin
 
 # ---- Logging ----
 def log(msg):
diff --git a/agent/donkeycar_sb3_runner.py b/agent/donkeycar_sb3_runner.py
index 2943c4c..e0e958f 100644
--- a/agent/donkeycar_sb3_runner.py
+++ b/agent/donkeycar_sb3_runner.py
@@ -46,6 +46,29 @@ except ImportError:
     REWARD_WRAPPER_AVAILABLE = False
 
 
+class ThrottleClampWrapper(gym.ActionWrapper):
+    """
+    Clamps the throttle dimension of a continuous action to [throttle_min, 1.0].
+    Prevents PPO's random initial policy from outputting zero throttle
+    and leaving the car stationary.
+    Action format expected: [steer, throttle] where steer ∈ [-1,1], throttle ∈ [0,1].
+    """
+    def __init__(self, env, throttle_min=0.2):
+        super().__init__(env)
+        self.throttle_min = throttle_min
+        # Update action space so SB3 knows the real bounds
+        import numpy as np
+        low = np.array([-1.0, throttle_min], dtype=np.float32)
+        high = np.array([1.0, 1.0], dtype=np.float32)
+        self.action_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
+
+    def action(self, action):
+        import numpy as np
+        action = np.array(action, dtype=np.float32)
+        action[1] = float(np.clip(action[1], self.throttle_min, 1.0))
+        return action
+
+
 def log(msg):
     print(msg, flush=True)
 
@@ -57,6 +80,11 @@ def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
     if agent == 'dqn':
         env = DiscretizedActionWrapper(env, n_steer=n_steer, n_throttle=n_throttle)
         log(f'[SB3 Runner][MONITOR] Action discretization: steer={n_steer}, throttle={n_throttle}. {time.ctime()}')
+    else:
+        # PPO uses continuous actions. Clamp throttle to [0.2, 1.0] so the car always moves.
+        # Without this, PPO's random initial policy outputs throttle~0 and the car sits still.
+        log(f'[SB3 Runner][MONITOR] PPO continuous actions. Throttle clamped to [0.2, 1.0]. {time.ctime()}')
+        env = ThrottleClampWrapper(env, throttle_min=0.2)
 
     if reward_shaping:
         if REWARD_WRAPPER_AVAILABLE:
@@ -68,8 +96,66 @@ def make_env(env_id, agent, n_steer, n_throttle, reward_shaping):
     return env
 
 
+class SimHealthCallback:
+    """
+    Stable-Baselines3 compatible callback that detects a stuck/dead simulator.
+    If the car speed stays near zero for too many consecutive steps, reports failure (on_step returns False).
+    Also detects if observations stop changing (frozen frame = connection lost).
+    """
+    def __init__(self, max_stuck_steps=100, min_speed=0.05):
+        self.max_stuck_steps = max_stuck_steps
+        self.min_speed = min_speed
+        self._stuck_count = 0
+        self._last_obs = None
+        self._frozen_count = 0
+
+    def on_step(self, obs, reward, done, info):
+        """Call after each env.step(). Returns False if sim appears dead."""
+        # Check speed from info dict
+        speed = info.get('speed', None) if isinstance(info, dict) else None
+        if speed is not None:
+            if float(speed) < self.min_speed:
+                self._stuck_count += 1
+            else:
+                self._stuck_count = 0
+            if self._stuck_count >= self.max_stuck_steps:
+                log(f'[SB3 Runner][MONITOR ALERT] Sim appears STUCK: speed<{self.min_speed} for {self._stuck_count} steps. {time.ctime()}')
+                return False
+
+        # Check for frozen observation (connection lost)
+        if obs is not None and self._last_obs is not None:
+            if np.array_equal(obs, self._last_obs):
+                self._frozen_count += 1
+            else:
+                self._frozen_count = 0
+            if self._frozen_count >= 30:
+                log(f'[SB3 Runner][MONITOR ALERT] Sim appears FROZEN: observation unchanged for {self._frozen_count} steps. {time.ctime()}')
+                return False
+        self._last_obs = obs
+        return True
+
+
 def train_model(agent, env, learning_rate, timesteps, seed):
     """Train a PPO or DQN model and return it."""
+    from stable_baselines3.common.callbacks import BaseCallback
+
+    class HealthCheckCallback(BaseCallback):
+        """SB3 callback that checks sim health each step and stops training if stuck."""
+        def __init__(self, max_stuck_steps=100, min_speed=0.05):
+            super().__init__(verbose=0)
+            self.health = SimHealthCallback(max_stuck_steps=max_stuck_steps, min_speed=min_speed)
+
+        def _on_step(self):
+            infos = self.locals.get('infos', [{}])
+            obs = self.locals.get('new_obs', None)
+            info = infos[0] if infos else {}
+            obs_arr = obs[0] if obs is not None and len(obs) > 0 else None
+            healthy = self.health.on_step(obs_arr, None, None, info)
+            if not healthy:
+                log(f'[SB3 Runner][MONITOR ALERT] Health check failed — stopping training early. {time.ctime()}')
+                return False  # Stops SB3 training
+            return True
+
     if agent == 'ppo':
         model = PPO(
             'CnnPolicy',
@@ -91,7 +177,8 @@ def train_model(agent, env, learning_rate, timesteps, seed):
 
     log(f'[SB3 Runner][MONITOR] Starting training: agent={agent} timesteps={timesteps} lr={learning_rate} {time.ctime()}')
     start = time.time()
-    model.learn(total_timesteps=timesteps)
+    health_cb = HealthCheckCallback(max_stuck_steps=100, min_speed=0.02)
+    model.learn(total_timesteps=timesteps, callback=health_cb)
     elapsed = time.time() - start
     log(f'[SB3 Runner][MONITOR] Training complete in {elapsed:.1f}s. {time.ctime()}')
     return model
diff --git a/agent/outerloop-results/autoresearch_phase1_log.txt b/agent/outerloop-results/autoresearch_phase1_log.txt
index 074f241..2ab8a04 100644
--- a/agent/outerloop-results/autoresearch_phase1_log.txt
+++ b/agent/outerloop-results/autoresearch_phase1_log.txt
@@ -24,3 +24,84 @@
 [2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
 [2026-04-13 10:02:55] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
 [2026-04-13 10:02:55] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-13 10:03:22] ============================================================
+[2026-04-13 10:03:22] [AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization
+[2026-04-13 10:03:22] [AutoResearch] Max trials: 50 | kappa: 2.0 | push every: 10
+[2026-04-13 10:03:22] [AutoResearch] Results: /home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results/autoresearch_results_phase1.jsonl
+[2026-04-13 10:03:22] [AutoResearch] Champion: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/champion
+[2026-04-13 10:03:22] ============================================================
+[2026-04-13 10:03:22] [AutoResearch] Loaded 0 existing Phase 1 results.
+[2026-04-13 10:03:22] [AutoResearch] No champion yet.
+[2026-04-13 10:03:22] 
+[AutoResearch] ========== Trial 1/50 ==========
+[2026-04-13 10:03:22] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:03:22] [AutoResearch] Proposed: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:03:24] [AutoResearch] Launching trial 1: {'n_steer': 7, 'n_throttle': 4, 'learning_rate': 0.0031442729980003356, 'timesteps': 28959, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:13:24] [AutoResearch] Trial 1 TIMED OUT after 600.2s
+[2026-04-13 10:13:24] [AutoResearch] Trial 1: mean_reward=None std_reward=None
+[2026-04-13 10:13:26] 
+[AutoResearch] ========== Trial 2/50 ==========
+[2026-04-13 10:13:26] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:13:26] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:13:28] [AutoResearch] Launching trial 2: {'n_steer': 4, 'n_throttle': 4, 'learning_rate': 0.0034866189644944764, 'timesteps': 19697, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:23:28] [AutoResearch] Trial 2 TIMED OUT after 600.0s
+[2026-04-13 10:23:28] [AutoResearch] Trial 2: mean_reward=None std_reward=None
+[2026-04-13 10:23:30] 
+[AutoResearch] ========== Trial 3/50 ==========
+[2026-04-13 10:23:30] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:23:30] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:23:32] [AutoResearch] Launching trial 3: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.0021394857089897554, 'timesteps': 28858, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:33:32] [AutoResearch] Trial 3 TIMED OUT after 600.1s
+[2026-04-13 10:33:32] [AutoResearch] Trial 3: mean_reward=None std_reward=None
+[2026-04-13 10:33:34] 
+[AutoResearch] ========== Trial 4/50 ==========
+[2026-04-13 10:33:34] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:33:34] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:33:36] [AutoResearch] Launching trial 4: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0005174658025335539, 'timesteps': 22022, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:43:36] [AutoResearch] Trial 4 TIMED OUT after 600.1s
+[2026-04-13 10:43:36] [AutoResearch] Trial 4: mean_reward=None std_reward=None
+[2026-04-13 10:43:39] 
+[AutoResearch] ========== Trial 5/50 ==========
+[2026-04-13 10:43:39] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:43:39] [AutoResearch] Proposed: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:43:41] [AutoResearch] Launching trial 5: {'n_steer': 4, 'n_throttle': 3, 'learning_rate': 0.004765524064388173, 'timesteps': 23582, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:53:41] [AutoResearch] Trial 5 TIMED OUT after 600.1s
+[2026-04-13 10:53:41] [AutoResearch] Trial 5: mean_reward=None std_reward=None
+[2026-04-13 10:53:43] 
+[AutoResearch] ========== Trial 6/50 ==========
+[2026-04-13 10:53:43] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 10:53:43] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 10:53:45] [AutoResearch] Launching trial 6: {'n_steer': 8, 'n_throttle': 2, 'learning_rate': 0.0008238758073115486, 'timesteps': 23327, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 11:03:45] [AutoResearch] Trial 6 TIMED OUT after 600.1s
+[2026-04-13 11:03:45] [AutoResearch] Trial 6: mean_reward=None std_reward=None
+[2026-04-13 11:03:47] 
+[AutoResearch] ========== Trial 7/50 ==========
+[2026-04-13 11:03:47] [AutoResearch] Only 0 results — using random proposal.
+[2026-04-13 11:03:47] [AutoResearch] Proposed: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 11:03:49] [AutoResearch] Launching trial 7: {'n_steer': 8, 'n_throttle': 4, 'learning_rate': 0.0021827786572140534, 'timesteps': 8101, 'agent': 'ppo', 'eval_episodes': 5, 'reward_shaping': True}
+[2026-04-13 11:16:34] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-13 11:16:34] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
+[2026-04-13 11:16:34] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
+[2026-04-13 11:16:34] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
+[2026-04-13 11:16:34] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
+[2026-04-13 11:16:34] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-13 11:16:34] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-13 11:16:34] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-13 11:16:53] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-13 11:16:53] UCB=2.7567 mu=1.2278 sigma=0.7644 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002270622623224986, 'timesteps': 3888}
+[2026-04-13 11:16:53] UCB=2.7300 mu=1.1710 sigma=0.7795 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.002011397993568161, 'timesteps': 4033}
+[2026-04-13 11:16:53] UCB=2.6457 mu=1.4878 sigma=0.5790 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.00219005726516088, 'timesteps': 4774}
+[2026-04-13 11:16:53] UCB=2.6320 mu=1.1819 sigma=0.7250 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0020813954690263674, 'timesteps': 4022}
+[2026-04-13 11:16:53] UCB=2.5412 mu=1.2499 sigma=0.6457 params={'n_steer': 8, 'n_throttle': 3, 'learning_rate': 0.0025942479713410636, 'timesteps': 4135}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-13 11:16:53] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-13 11:16:53] [AutoResearch] Only 1 results — using random proposal.
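
Why every trial above timed out: each proposal drew timesteps from the old 5000-30000 range, and at the ~20-50 steps/sec cited in the PARAM_SPACE note even the smallest draw here (19697 steps) needs roughly 394-985s of training alone, before the evaluation episodes even start. No trial could finish inside the 600s JOB_TIMEOUT, which is exactly what the controller changes at the top of this patch address. A back-of-envelope budget check (a sketch, not part of the patch; steps_per_sec uses the pessimistic end of the cited range, and eval_sec_per_episode is an assumed figure for illustration):

    # Rough wall-clock budget for one trial: training time plus evaluation time.
    def trial_seconds(timesteps, eval_episodes, steps_per_sec=20.0, eval_sec_per_episode=30.0):
        return timesteps / steps_per_sec + eval_episodes * eval_sec_per_episode

    print(trial_seconds(30000, 5))  # old worst case: 1650.0s, far beyond the 600s timeout
    print(trial_seconds(5000, 3))   # new worst case: 340.0s, inside the new 480s timeout
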
diff --git a/agent/outerloop-results/autoresearch_results_phase1.jsonl b/agent/outerloop-results/autoresearch_results_phase1.jsonl
new file mode 100644
index 0000000..6ab5a20
--- /dev/null
+++ b/agent/outerloop-results/autoresearch_results_phase1.jsonl
@@ -0,0 +1,6 @@
+{"trial": 1, "timestamp": "2026-04-13T10:13:24.756815", "params": {"n_steer": 7, "n_throttle": 4, "learning_rate": 0.0031442729980003356, "timesteps": 28959, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.2142441272736}
+{"trial": 2, "timestamp": "2026-04-13T10:23:28.811316", "params": {"n_steer": 4, "n_throttle": 4, "learning_rate": 0.0034866189644944764, "timesteps": 19697, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.0406067371368}
+{"trial": 3, "timestamp": "2026-04-13T10:33:32.891060", "params": {"n_steer": 4, "n_throttle": 3, "learning_rate": 0.0021394857089897554, "timesteps": 28858, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.0660693645477}
+{"trial": 4, "timestamp": "2026-04-13T10:43:36.999174", "params": {"n_steer": 8, "n_throttle": 2, "learning_rate": 0.0005174658025335539, "timesteps": 22022, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.093513250351}
+{"trial": 5, "timestamp": "2026-04-13T10:53:41.112283", "params": {"n_steer": 4, "n_throttle": 3, "learning_rate": 0.004765524064388173, "timesteps": 23582, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.0998013019562}
+{"trial": 6, "timestamp": "2026-04-13T11:03:45.201524", "params": {"n_steer": 8, "n_throttle": 2, "learning_rate": 0.0008238758073115486, "timesteps": 23327, "agent": "ppo", "eval_episodes": 5, "reward_shaping": true}, "mean_reward": null, "std_reward": null, "model_path": null, "champion": false, "run_status": "timeout", "elapsed_sec": 600.0790619850159}
diff --git a/tests/test_autoresearch_controller.py b/tests/test_autoresearch_controller.py
index 6ce0831..61d1ff9 100644
--- a/tests/test_autoresearch_controller.py
+++ b/tests/test_autoresearch_controller.py
@@ -19,7 +19,7 @@ import autoresearch_controller as ctrl
 
 def test_param_encode_decode_roundtrip():
     """encode → decode should reproduce original values (within int rounding)."""
-    params = {'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002, 'timesteps': 10000}
+    params = {'n_steer': 7, 'n_throttle': 3, 'learning_rate': 0.002, 'timesteps': 3000}
    vec = ctrl.encode_params(params)
     recovered = ctrl.decode_params(vec)
     assert recovered['n_steer'] == params['n_steer']
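
For reference, a minimal sketch of how the new pieces compose at runtime. This is illustrative only: HealthCheckCallback is nested inside train_model in the patch and is hoisted here for clarity, the 'donkey-generated-track-v0' env id is an assumption about the local gym_donkeycar setup, and real donkey envs usually also need a conf dict (sim exe path, port) that make_env handles in the actual runner.

    # Sketch: ThrottleClampWrapper + health-check callback wired into SB3 PPO.
    import gym
    import gym_donkeycar  # noqa: F401  (registers the donkey-* env ids)
    from stable_baselines3 import PPO

    env = gym.make('donkey-generated-track-v0')
    env = ThrottleClampWrapper(env, throttle_min=0.2)  # car can never idle at zero throttle
    model = PPO('CnnPolicy', env, learning_rate=3e-4, seed=0)
    # HealthCheckCallback._on_step() returns False when the sim looks stuck or frozen,
    # which makes model.learn() stop early instead of burning the whole JOB_TIMEOUT.
    model.learn(total_timesteps=5000,
                callback=HealthCheckCallback(max_stuck_steps=100, min_speed=0.02))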