""" ============================================================= DonkeyCar RL Autoresearch Controller — Phase 1 (Real Training) ============================================================= Uses Gaussian Process + UCB Bayesian optimization to propose hyperparameters for REAL PPO/DQN training runs (not random policy). Each trial: 1. GP+UCB proposes next hyperparameters 2. Launches donkeycar_sb3_runner.py with REAL training 3. Runner saves a trained model to disk 4. Controller records result, updates GP, tracks champion 5. Repeat Results go to: outerloop-results/autoresearch_results_phase1.jsonl Champion: models/champion/model.zip + manifest.json Usage: python3 autoresearch_controller.py --trials 50 --explore 2.0 --push-every 10 Stop at any time with Ctrl+C. Restart and it picks up from existing results. ============================================================= """ import os import sys import json import time import subprocess import re import shutil import numpy as np from datetime import datetime # ---- Project Paths ---- PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) RUNNER_SCRIPT = os.path.join(PROJECT_DIR, 'donkeycar_sb3_runner.py') RESULTS_DIR = os.path.join(PROJECT_DIR, 'outerloop-results') MODELS_DIR = os.path.join(PROJECT_DIR, 'models') CHAMPION_DIR = os.path.join(MODELS_DIR, 'champion') # Phase 1 uses a separate results file — do NOT mix with random-policy data PHASE1_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results_phase1.jsonl') PHASE1_LOG = os.path.join(RESULTS_DIR, 'autoresearch_phase1_log.txt') # Legacy base data (discretization insights, valid for n_steer/n_throttle) BASE_DATA_FILE = os.path.join(RESULTS_DIR, 'clean_sweep_results.jsonl') os.makedirs(RESULTS_DIR, exist_ok=True) os.makedirs(MODELS_DIR, exist_ok=True) os.makedirs(CHAMPION_DIR, exist_ok=True) # ---- Parameter Space ---- # These are the parameters GP+UCB will optimize # NOTE: timesteps kept small (1000-5000) for Phase 1 exploration on CPU. # DonkeyCar sim runs ~20-50 steps/sec. 5000 steps ≈ 100-250s → fits in 600s timeout. # Increase max_timesteps once we confirm the pipeline works end-to-end. PARAM_SPACE = { 'n_steer': {'type': 'int', 'min': 3, 'max': 9}, 'n_throttle': {'type': 'int', 'min': 2, 'max': 5}, 'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005}, 'timesteps': {'type': 'int', 'min': 1000, 'max': 5000}, } PARAM_KEYS = list(PARAM_SPACE.keys()) # Fixed params FIXED_PARAMS = { 'agent': 'ppo', 'eval_episodes': 3, 'reward_shaping': True, } N_CANDIDATES = 500 UCB_KAPPA = 2.0 MIN_TRIALS_BEFORE_GP = 3 JOB_TIMEOUT = 480 # 8 minutes — enough for 5000 steps + eval, with margin # ---- Logging ---- def log(msg): ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S') line = f'[{ts}] {msg}' print(line, flush=True) with open(PHASE1_LOG, 'a') as f: f.write(line + '\n') # ---- Parameter Encoding ---- def encode_params(params): vec = [] for k in PARAM_KEYS: if k not in params: continue spec = PARAM_SPACE[k] v = params[k] norm = (v - spec['min']) / (spec['max'] - spec['min']) vec.append(np.clip(norm, 0.0, 1.0)) return np.array(vec) def decode_params(vec): params = {} for i, k in enumerate(PARAM_KEYS): spec = PARAM_SPACE[k] v = float(vec[i]) * (spec['max'] - spec['min']) + spec['min'] if spec['type'] == 'int': v = int(round(v)) v = max(spec['min'], min(spec['max'], v)) else: v = float(v) v = max(spec['min'], min(spec['max'], v)) params[k] = v return params def random_candidate(): return np.random.uniform(0, 1, len(PARAM_KEYS)) # ---- Gaussian Process Surrogate Model ---- class TinyGP: """Minimal RBF-kernel Gaussian Process for surrogate modelling.""" def __init__(self, length_scale=0.3, noise=1e-3): self.ls = length_scale self.noise = noise self.X = None self.alpha = None self.K_inv = None def _rbf(self, X1, X2): diff = X1[:, np.newaxis, :] - X2[np.newaxis, :, :] sq = np.sum(diff ** 2, axis=-1) return np.exp(-sq / (2 * self.ls ** 2)) def fit(self, X, y): self.X = np.array(X) n = len(y) K = self._rbf(self.X, self.X) + self.noise * np.eye(n) try: self.K_inv = np.linalg.inv(K) except np.linalg.LinAlgError: self.K_inv = np.linalg.pinv(K) self.alpha = self.K_inv @ np.array(y) def predict(self, X_new): X_new = np.atleast_2d(X_new) K_s = self._rbf(X_new, self.X) mu = K_s @ self.alpha var = np.maximum( 1.0 + self.noise - np.sum((K_s @ self.K_inv) * K_s, axis=1), 1e-9 ) return mu, np.sqrt(var) # ---- Champion Tracker ---- class ChampionTracker: def __init__(self, champion_dir): self.champion_dir = champion_dir self.manifest_path = os.path.join(champion_dir, 'manifest.json') os.makedirs(champion_dir, exist_ok=True) self._best = self._load() def _load(self): if os.path.exists(self.manifest_path): try: with open(self.manifest_path) as f: return json.load(f) except Exception: pass return {'mean_reward': float('-inf'), 'trial': None} @property def best_reward(self): return self._best.get('mean_reward', float('-inf')) def update_if_better(self, mean_reward, params, model_zip_path, trial): if mean_reward is None or mean_reward <= self.best_reward: return False dest = os.path.join(self.champion_dir, 'model.zip') if model_zip_path and os.path.exists(model_zip_path): try: shutil.copy2(model_zip_path, dest) except Exception as e: log(f'[Champion] WARNING: Could not copy model: {e}') dest = model_zip_path manifest = { 'trial': trial, 'timestamp': datetime.now().isoformat(), 'params': params, 'mean_reward': mean_reward, 'model_path': dest, } with open(self.manifest_path, 'w') as f: json.dump(manifest, f, indent=2) self._best = manifest log(f'[Champion] 🏆 NEW BEST! Trial {trial}: mean_reward={mean_reward:.4f} params={params}') return True def summary(self): if self._best['trial'] is None: return 'No champion yet.' return f"Champion: trial={self._best['trial']} mean_reward={self._best['mean_reward']:.4f} params={self._best['params']}" # ---- Load Results ---- def load_phase1_results(): """Load Phase 1 results only — no random-policy contamination.""" results = [] if not os.path.exists(PHASE1_RESULTS): return results with open(PHASE1_RESULTS) as f: for line in f: line = line.strip() if not line: continue try: rec = json.loads(line) mr = rec.get('mean_reward') if mr is not None: results.append({'params': rec['params'], 'mean_reward': float(mr)}) except Exception: pass return results # ---- GP+UCB Proposal ---- def propose_next_params(results, trial_num, kappa=UCB_KAPPA): if len(results) < MIN_TRIALS_BEFORE_GP: log(f'[AutoResearch] Only {len(results)} results — using random proposal.') return decode_params(random_candidate()) X = np.array([encode_params(r['params']) for r in results]) y = np.array([r['mean_reward'] for r in results]) y_mean, y_std = y.mean(), y.std() if y.std() > 0 else 1.0 y_norm = (y - y_mean) / y_std gp = TinyGP(length_scale=0.3, noise=1e-3) gp.fit(X, y_norm) candidates = np.random.uniform(0, 1, (N_CANDIDATES, len(PARAM_KEYS))) mu, sigma = gp.predict(candidates) ucb = mu + kappa * sigma top5 = np.argsort(ucb)[-5:][::-1] log(f'[AutoResearch] GP UCB top-5 candidates:') for idx in top5: p = decode_params(candidates[idx]) log(f' UCB={ucb[idx]:.4f} mu={mu[idx]:.4f} sigma={sigma[idx]:.4f} params={p}') return decode_params(candidates[np.argmax(ucb)]) # ---- Job Launcher ---- def kill_stale(): subprocess.run(['pkill', '-9', '-f', 'donkeycar_sb3_runner.py'], check=False) time.sleep(2) def launch_job(params, trial_num): save_dir = os.path.join(MODELS_DIR, f'trial-{trial_num:04d}') os.makedirs(save_dir, exist_ok=True) cmd = [ 'python3', RUNNER_SCRIPT, '--agent', params.get('agent', FIXED_PARAMS['agent']), '--env', 'donkey-generated-roads-v0', '--timesteps', str(int(params.get('timesteps', 10000))), '--eval-episodes', str(FIXED_PARAMS['eval_episodes']), '--learning-rate', str(params.get('learning_rate', 0.0003)), '--n-steer', str(int(params.get('n_steer', 7))), '--n-throttle', str(int(params.get('n_throttle', 3))), '--save-dir', save_dir, ] if FIXED_PARAMS.get('reward_shaping'): cmd.append('--reward-shaping') log(f'[AutoResearch] Launching trial {trial_num}: {params}') start = time.time() try: proc = subprocess.run(cmd, capture_output=True, text=True, timeout=JOB_TIMEOUT) elapsed = time.time() - start output = proc.stdout + '\n' + proc.stderr status = 'ok' if proc.returncode == 0 else 'error' log(f'[AutoResearch] Trial {trial_num} finished in {elapsed:.1f}s, returncode={proc.returncode}') except subprocess.TimeoutExpired as e: elapsed = time.time() - start output = f'[TIMEOUT after {elapsed:.1f}s]' status = 'timeout' log(f'[AutoResearch] Trial {trial_num} TIMED OUT after {elapsed:.1f}s') # Print last 2000 chars of output print('--- Runner Output (tail) ---', flush=True) print(output[-2000:], flush=True) print('--- End Runner Output ---', flush=True) # Parse results mean_reward = None std_reward = None m = re.search(r'\[SB3 Runner\]\[TEST\] mean_reward=([\d.]+)', output) if m: mean_reward = float(m.group(1)) m = re.search(r'\[SB3 Runner\]\[TEST\] std_reward=([\d.]+)', output) if m: std_reward = float(m.group(1)) log(f'[AutoResearch] Trial {trial_num}: mean_reward={mean_reward} std_reward={std_reward}') model_zip = os.path.join(save_dir, 'model.zip') if not os.path.exists(model_zip): model_zip = None return mean_reward, std_reward, model_zip, output, status, elapsed, save_dir # ---- Result Saving ---- def save_result(trial, params, mean_reward, std_reward, model_path, champion, status, elapsed): rec = { 'trial': trial, 'timestamp': datetime.now().isoformat(), 'params': params, 'mean_reward': mean_reward, 'std_reward': std_reward, 'model_path': model_path, 'champion': champion, 'run_status': status, 'elapsed_sec': elapsed, } with open(PHASE1_RESULTS, 'a') as f: f.write(json.dumps(rec) + '\n') # ---- Git Push ---- def git_push(project_root, trial_num): try: repo_root = os.path.dirname(PROJECT_DIR) subprocess.run(['git', '-C', repo_root, 'add', '-A'], check=True, capture_output=True) subprocess.run([ 'git', '-C', repo_root, 'commit', '-m', f'autoresearch: phase1 trial {trial_num} results\n\nAgent: pi\nTests: N/A\nTests-Added: 0\nTypeScript: N/A' ], check=True, capture_output=True) subprocess.run(['git', '-C', repo_root, 'push'], check=True, capture_output=True) log(f'[AutoResearch] Git push complete after trial {trial_num}') except subprocess.CalledProcessError as e: log(f'[AutoResearch] Git push failed: {e}') # ---- Summary ---- def print_summary(results, champion, trial): if not results: return log(f'[AutoResearch] === Trial {trial} Summary ===') log(f' Total Phase 1 runs: {len(results)}') log(f' {champion.summary()}') sorted_r = sorted(results, key=lambda r: r['mean_reward'], reverse=True) log(f' Top 5:') for r in sorted_r[:5]: log(f' mean_reward={r["mean_reward"]:.4f} params={r["params"]}') # ---- Main Loop ---- def run_autoresearch(max_trials=50, kappa=UCB_KAPPA, push_every=10): log('=' * 60) log('[AutoResearch] Phase 1 — Real PPO Training + GP+UCB Optimization') log(f'[AutoResearch] Max trials: {max_trials} | kappa: {kappa} | push every: {push_every}') log(f'[AutoResearch] Results: {PHASE1_RESULTS}') log(f'[AutoResearch] Champion: {CHAMPION_DIR}') log('=' * 60) results = load_phase1_results() champion = ChampionTracker(CHAMPION_DIR) log(f'[AutoResearch] Loaded {len(results)} existing Phase 1 results.') log(f'[AutoResearch] {champion.summary()}') for trial in range(1, max_trials + 1): log(f'\n[AutoResearch] ========== Trial {trial}/{max_trials} ==========') # 1. Propose params proposed = propose_next_params(results, trial, kappa=kappa) full_params = {**proposed, **FIXED_PARAMS} log(f'[AutoResearch] Proposed: {full_params}') # 2. Kill stale jobs kill_stale() # 3. Launch real training job mean_reward, std_reward, model_zip, output, status, elapsed, save_dir = launch_job(full_params, trial) # 4. Update champion is_champion = champion.update_if_better(mean_reward, full_params, model_zip, trial) # 5. Save result save_result(trial, full_params, mean_reward, std_reward, model_zip, is_champion, status, elapsed) # 6. Add to GP data (only successful runs with valid reward) if mean_reward is not None: results.append({'params': full_params, 'mean_reward': mean_reward}) # 7. Print summary print_summary(results, champion, trial) # 8. Git push periodically if push_every > 0 and trial % push_every == 0: git_push(PROJECT_DIR, trial) time.sleep(2) log('[AutoResearch] All trials complete!') print_summary(results, champion, trial=max_trials) # Final push git_push(PROJECT_DIR, max_trials) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Phase 1 Autoresearch: Real PPO training + GP+UCB.') parser.add_argument('--trials', type=int, default=50, help='Number of trials (default: 50)') parser.add_argument('--explore', type=float, default=2.0, help='UCB kappa (default: 2.0)') parser.add_argument('--push-every', type=int, default=10, help='Git push every N trials (0=disabled)') args = parser.parse_args() run_autoresearch(max_trials=args.trials, kappa=args.explore, push_every=args.push_every)