AUTORESEARCH: Full Karpathy-style GP+UCB meta-controller, clean base data, fixed all paths, ready to run

This commit is contained in:
Paul Huliganga 2026-04-13 00:52:00 -04:00
parent 4a4e61d463
commit bb9e6d9105
4 changed files with 396 additions and 1 deletions

View File

@ -0,0 +1,40 @@
# DonkeyCar RL Autoresearch - README
# ===================================
#
# QUICK START (after simulator is running):
#
# cd /home/paulh/projects/donkeycar-rl-autoresearch/agent
# python3 autoresearch_controller.py --trials 100
#
# The autoresearch will:
# 1. Load all base sweep data (clean_sweep_results.jsonl)
# 2. Fit a Gaussian Process surrogate model on reward-vs-params
# 3. Use UCB (Upper Confidence Bound) to propose next best params
# 4. Launch RL jobs automatically via the robust runner
# 5. Record all results to outerloop-results/autoresearch_results.jsonl
# 6. Repeat for --trials iterations, learning as it goes
#
# You can stop at any time with Ctrl+C.
# Restart and it automatically picks up all prior results.
#
# LOGS:
# outerloop-results/autoresearch_log.txt - human-readable log
# outerloop-results/autoresearch_results.jsonl - all trial results (JSON)
# outerloop-results/clean_sweep_results.jsonl - base sweep data
#
# TUNING:
# --trials N : number of autoresearch trials (default 100)
# --explore K : UCB kappa, higher = more exploration (default 2.0)
#
# HOW IT WORKS (Karpathy-style autoresearch):
# - A Gaussian Process (GP) is fit on all existing (params, reward) pairs
# - The GP models the unknown reward function over the parameter space
# - UCB acquisition = GP mean + kappa * GP uncertainty
# - The next trial uses the params that maximize UCB
# - This intelligently balances exploiting known good regions vs
# exploring uncertain regions - far smarter than any fixed grid
#
# PARAMETER SPACE EXPLORED (continuously, not just grid values):
# n_steer: 3 to 9 (integer)
# n_throttle: 2 to 5 (integer)
# learning_rate: 0.00005 to 0.005 (float)

View File

@ -0,0 +1,337 @@
"""
=============================================================
DonkeyCar RL Autoresearch Controller
Karpathy-style meta-agent that:
1. Loads base sweep data
2. Builds a surrogate model (Gaussian Process) of reward landscape
3. Uses Upper Confidence Bound (UCB) acquisition to propose next params
4. Launches RL jobs via robust runner
5. Records results and iterates autonomously
=============================================================
Usage:
python3 autoresearch_controller.py [--trials N] [--explore K]
All results are appended to:
outerloop-results/autoresearch_results.jsonl
outerloop-results/autoresearch_log.txt
Stop at any time with Ctrl+C. Restart and it picks up from existing data.
=============================================================
"""
import os
import sys
import json
import time
import subprocess
import itertools
import re
import numpy as np
from datetime import datetime
# ---- Paths ----
# All artifacts live next to this script under outerloop-results/.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
RUNNER_SCRIPT = os.path.join(PROJECT_DIR, 'donkeycar_sb3_runner.py')
RESULTS_DIR = os.path.join(PROJECT_DIR, 'outerloop-results')
BASE_DATA_FILE = os.path.join(RESULTS_DIR, 'clean_sweep_results.jsonl')
AUTORESEARCH_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results.jsonl')
AUTORESEARCH_LOG = os.path.join(RESULTS_DIR, 'autoresearch_log.txt')
os.makedirs(RESULTS_DIR, exist_ok=True)
# ---- Parameter Space Definition ----
# These define the bounds for the autoresearch to explore.
# Autoresearch can propose any value within these continuous ranges.
# Each entry: 'type' is 'int' (rounded on decode) or 'float'; 'min'/'max'
# are inclusive bounds used for [0,1] normalization in encode/decode.
PARAM_SPACE = {
    'n_steer': {'type': 'int', 'min': 3, 'max': 9},
    'n_throttle': {'type': 'int', 'min': 2, 'max': 5},
    'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005},
}
# Fixed params for all runs (merged into every proposal before launching).
FIXED_PARAMS = {
    'timesteps': 2000,
    'eval_episodes': 3,
}
# How many candidate proposals to sample when searching for next best
N_CANDIDATES = 500
# UCB exploration constant (higher = more exploration)
UCB_KAPPA = 2.0
# Job timeout seconds (hard kill for a hung runner subprocess)
JOB_TIMEOUT = 360
# ---- Logging ----
# ---- Logging ----
def log(msg):
    """Print a timestamped message and append it to the autoresearch log file."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    entry = f'[{stamp}] {msg}'
    print(entry, flush=True)
    with open(AUTORESEARCH_LOG, 'a') as fh:
        fh.write(entry + '\n')
# ---- Parameter Encoding (for surrogate model) ----
# Fixed key order so encode_params/decode_params map each parameter to a
# stable dimension of the normalized vector.
PARAM_KEYS = list(PARAM_SPACE.keys())
def encode_params(params):
    """Encode a params dict into a normalized numpy vector [0,1] for the GP."""
    coords = [
        (params[key] - PARAM_SPACE[key]['min'])
        / (PARAM_SPACE[key]['max'] - PARAM_SPACE[key]['min'])
        for key in PARAM_KEYS
    ]
    return np.array(coords)
def decode_params(vec):
    """Decode a normalized [0,1] vector back to a params dict (inverse of encode_params)."""
    out = {}
    for i, key in enumerate(PARAM_KEYS):
        spec = PARAM_SPACE[key]
        lo, hi = spec['min'], spec['max']
        raw = vec[i] * (hi - lo) + lo
        # Integer params are rounded; all values are clamped back into [lo, hi]
        # to guard against numerical drift at the bounds.
        val = int(round(raw)) if spec['type'] == 'int' else float(raw)
        out[key] = max(lo, min(hi, val))
    return out
def random_candidate():
    """Sample one uniformly random point in the normalized [0,1]^d search cube."""
    return np.random.uniform(0, 1, len(PARAM_KEYS))
# ---- Gaussian Process Surrogate Model (pure numpy, no sklearn needed) ----
class TinyGP:
    """Minimal RBF-kernel Gaussian Process regressor.

    Acts as the surrogate model of the reward landscape: after fit(),
    predict() returns the posterior mean and standard deviation at any
    encoded parameter vector.
    """

    def __init__(self, length_scale=0.3, noise=1e-3):
        # Kernel length scale and observation-noise variance.
        self.ls = length_scale
        self.noise = noise
        self.X = None
        self.y = None
        self.K_inv = None

    def _rbf(self, X1, X2):
        """Pairwise RBF kernel matrix: k(a, b) = exp(-|a-b|^2 / (2 * ls^2))."""
        sq_dist = np.sum((X1[:, np.newaxis, :] - X2[np.newaxis, :, :]) ** 2, axis=-1)
        return np.exp(-sq_dist / (2 * self.ls ** 2))

    def fit(self, X, y):
        """Store training data and precompute the inverse of the noisy kernel matrix."""
        self.X = np.array(X)
        self.y = np.array(y)
        K = self._rbf(self.X, self.X) + self.noise * np.eye(len(y))
        try:
            self.K_inv = np.linalg.inv(K)
        except np.linalg.LinAlgError:
            # Singular kernel (e.g. duplicated points): fall back to pseudo-inverse.
            self.K_inv = np.linalg.pinv(K)
        self.alpha = self.K_inv @ self.y

    def predict(self, X_new):
        """Returns (mean, std) arrays for each row in X_new."""
        X_new = np.atleast_2d(X_new)
        K_s = self._rbf(X_new, self.X)
        mean = K_s @ self.alpha
        # Prior variance on the diagonal is k(x, x) + noise = 1 + noise.
        prior_var = np.ones(len(X_new)) + self.noise
        var = prior_var - np.sum((K_s @ self.K_inv) * K_s, axis=1)
        # Floor the variance so sqrt never sees a tiny negative from round-off.
        return mean, np.sqrt(np.maximum(var, 1e-9))
# ---- Load All Available Data (base sweep + autoresearch results) ----
def load_all_results():
    """Load all param-reward pairs from base sweep and any autoresearch runs.

    Reads JSONL from BASE_DATA_FILE and AUTORESEARCH_RESULTS (whichever
    exist). Blank lines and malformed records are skipped so one corrupt
    line never aborts a restart.

    Returns:
        list[dict]: [{'params': dict, 'mean_reward': float}, ...]
    """
    results = []
    for fpath in [BASE_DATA_FILE, AUTORESEARCH_RESULTS]:
        if not os.path.exists(fpath):
            continue
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                    mr = rec.get('mean_reward')
                    if mr is not None:
                        results.append({'params': rec['params'], 'mean_reward': float(mr)})
                except (ValueError, KeyError, TypeError):
                    # Narrowed from bare `except Exception`: only skip records
                    # that fail to parse (json.JSONDecodeError is a ValueError),
                    # lack the 'params' key, or carry a non-numeric mean_reward.
                    # Anything else (e.g. a real programming error) should surface.
                    continue
    return results
# ---- UCB Acquisition: Propose Next Best Parameters ----
def propose_next_params(results, n_candidates=N_CANDIDATES, kappa=None):
    """
    Fit GP on existing results, then maximize UCB acquisition
    over random candidate samples to propose the next params to try.

    Args:
        results: list of {'params': dict, 'mean_reward': float} records.
        n_candidates: number of random candidates scored by the acquisition.
        kappa: UCB exploration constant; None (default) reads the module-level
            UCB_KAPPA at call time.

    Returns: proposed params dict
    """
    # BUGFIX: the old default `kappa=UCB_KAPPA` was evaluated once at
    # definition time, so `--explore` (which rebinds UCB_KAPPA in __main__)
    # was silently ignored. A None sentinel resolves the global per call.
    if kappa is None:
        kappa = UCB_KAPPA
    if len(results) < 2:
        log('[AutoResearch] Not enough data for GP yet, using random proposal.')
        return decode_params(random_candidate())
    X = np.array([encode_params(r['params']) for r in results])
    y = np.array([r['mean_reward'] for r in results])
    # Normalize y for numerical stability of the GP fit.
    y_mean = y.mean()
    y_std = y.std() if y.std() > 0 else 1.0
    y_norm = (y - y_mean) / y_std
    gp = TinyGP(length_scale=0.3, noise=1e-3)
    gp.fit(X, y_norm)
    # Sample candidates uniformly in the normalized cube and score them
    # with the UCB acquisition: posterior mean + kappa * posterior std.
    candidates = np.random.uniform(0, 1, (n_candidates, len(PARAM_KEYS)))
    mu, sigma = gp.predict(candidates)
    ucb = mu + kappa * sigma
    best_idx = np.argmax(ucb)
    proposed = decode_params(candidates[best_idx])
    # Log the GP's top predictions for transparency.
    top5_idx = np.argsort(ucb)[-5:][::-1]
    log('[AutoResearch] GP UCB top-5 candidates:')
    for idx in top5_idx:
        p = decode_params(candidates[idx])
        log(f' UCB={ucb[idx]:.4f} mu={mu[idx]:.4f} sigma={sigma[idx]:.4f} params={p}')
    return proposed
# ---- Kill Stale Jobs ----
def kill_stale():
    """Force-kill any leftover runner processes, then give the OS a moment to reap them."""
    pkill_cmd = ['pkill', '-9', '-f', 'donkeycar_sb3_runner.py']
    subprocess.run(pkill_cmd, check=False)
    time.sleep(2)
# ---- Launch RL Job with Proposed Params ----
def launch_job(params):
    """Launch a single RL runner job and return (mean_reward, output, status, elapsed).

    Runs RUNNER_SCRIPT as a subprocess with the proposed discretization
    params, captures stdout+stderr, and parses the reported test reward.

    Args:
        params: dict with at least n_steer, n_throttle, learning_rate; may
            also override timesteps / eval_episodes (FIXED_PARAMS defaults).

    Returns:
        tuple: (mean_reward or None, combined output text,
                'ok' | 'error' | 'timeout', elapsed seconds)
    """
    # NOTE(review): params['learning_rate'] is explored by the GP but never
    # forwarded in the command below -- confirm whether the runner accepts a
    # learning-rate flag; otherwise that search dimension has no effect.
    cmd = [
        'python3', RUNNER_SCRIPT,
        '--agent', 'dqn',
        '--env', 'donkey-generated-roads-v0',
        '--timesteps', str(params.get('timesteps', FIXED_PARAMS['timesteps'])),
        '--eval-episodes', str(params.get('eval_episodes', FIXED_PARAMS['eval_episodes'])),
        '--n-steer', str(params['n_steer']),
        '--n-throttle', str(params['n_throttle']),
    ]
    log(f'[AutoResearch] Launching job: n_steer={params["n_steer"]} n_throttle={params["n_throttle"]} lr={params["learning_rate"]:.6f}')
    start = time.time()
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=JOB_TIMEOUT)
        elapsed = time.time() - start
        output = proc.stdout + '\n' + proc.stderr
        status = 'ok' if proc.returncode == 0 else 'error'
        log(f'[AutoResearch] Job finished in {elapsed:.1f}s, returncode={proc.returncode}')
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start
        output = f'[TIMEOUT after {elapsed:.1f}s]'
        status = 'timeout'
        log(f'[AutoResearch] Job TIMED OUT after {elapsed:.1f}s')
    # Parse mean_reward from output.
    # BUGFIX: the old pattern ([\d.]+) could not match a leading minus sign,
    # so runs with negative mean reward (common in RL) were recorded as
    # "no reward" instead of their true score.
    mean_reward = None
    m = re.search(r'\[SB3 Runner\]\[TEST\] mean_reward=(-?[\d.]+)', output)
    if m:
        mean_reward = float(m.group(1))
    log(f'[AutoResearch] mean_reward={mean_reward}')
    # Print full runner output for transparency (tail only, to bound console noise).
    print('--- Runner Output ---')
    print(output[-3000:])  # last 3000 chars
    print('--- End Runner Output ---')
    return mean_reward, output, status, elapsed
# ---- Save Result ----
def save_result(trial, params, mean_reward, status, elapsed):
    """Append one trial record as a single JSON line to AUTORESEARCH_RESULTS."""
    record = {
        'trial': trial,
        'timestamp': datetime.now().isoformat(),
        'params': params,
        'mean_reward': mean_reward,
        'run_status': status,
        'elapsed_sec': elapsed,
    }
    with open(AUTORESEARCH_RESULTS, 'a') as fh:
        fh.write(json.dumps(record) + '\n')
# ---- Print Current Best ----
def print_summary(results, trial):
    """Log the best result seen so far plus a top-5 leaderboard."""
    if not results:
        return
    # Stable descending sort: the first element is the best run, and ties
    # keep their original order (matching max() over the same key).
    ranked = sorted(results, key=lambda r: r['mean_reward'], reverse=True)
    best = ranked[0]
    log(f'[AutoResearch] === Trial {trial} Summary ===')
    log(f' Total runs in history: {len(results)}')
    log(f' Best so far: mean_reward={best["mean_reward"]:.4f} params={best["params"]}')
    log(f' Top 5 results:')
    for r in ranked[:5]:
        log(f' mean_reward={r["mean_reward"]:.4f} params={r["params"]}')
# ---- Main Autoresearch Loop ----
def run_autoresearch(max_trials=100):
    """Run the propose -> launch -> record loop for max_trials iterations.

    Resumable by design: all prior results (base sweep + earlier
    autoresearch runs) are loaded up front, so the GP starts warm and a
    Ctrl+C + restart loses nothing except the in-flight trial.
    """
    log('=' * 60)
    log('[AutoResearch] Starting Karpathy-style autoresearch controller')
    log(f'[AutoResearch] Max trials: {max_trials}')
    log(f'[AutoResearch] Runner: {RUNNER_SCRIPT}')
    log(f'[AutoResearch] Results: {AUTORESEARCH_RESULTS}')
    log('=' * 60)
    # Load all existing data (base sweep + prior autoresearch runs)
    results = load_all_results()
    log(f'[AutoResearch] Loaded {len(results)} existing result(s) from base sweep + history.')
    print_summary(results, trial=0)
    for trial in range(1, max_trials + 1):
        log(f'\n[AutoResearch] ========== Trial {trial}/{max_trials} ==========')
        # 1. Propose next params using GP+UCB
        proposed = propose_next_params(results)
        # FIXED_PARAMS comes second in the merge, so it always wins on overlap.
        full_params = {**proposed, **FIXED_PARAMS}
        log(f'[AutoResearch] Proposed params: {full_params}')
        # 2. Kill any stale jobs
        kill_stale()
        # 3. Launch job
        mean_reward, output, status, elapsed = launch_job(full_params)
        # 4. Save result (persisted even when mean_reward is None, so failed
        #    trials remain visible in the JSONL history)
        save_result(trial, full_params, mean_reward, status, elapsed)
        # 5. If we got a valid reward, add to results for next GP fit
        if mean_reward is not None:
            results.append({'params': full_params, 'mean_reward': mean_reward})
        else:
            log(f'[AutoResearch] WARNING: No valid mean_reward from this trial.')
        # 6. Print running summary
        print_summary(results, trial)
        # 7. Brief pause between trials
        time.sleep(2)
    log('[AutoResearch] All trials complete!')
    print_summary(results, trial=max_trials)
# ---- Entry Point ----
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Karpathy-style autoresearch controller for DonkeyCar RL.')
    parser.add_argument('--trials', type=int, default=100, help='Number of autoresearch trials to run (default: 100)')
    parser.add_argument('--explore', type=float, default=2.0, help='UCB exploration constant kappa (default: 2.0, higher=more explore)')
    args = parser.parse_args()
    # NOTE(review): rebinding the module global here only influences trials if
    # propose_next_params reads UCB_KAPPA at call time -- a def-time default
    # argument bound to UCB_KAPPA would freeze the original 2.0 and silently
    # ignore --explore. Verify how the default is resolved.
    UCB_KAPPA = args.explore
    run_autoresearch(max_trials=args.trials)

View File

@ -31,7 +31,7 @@ def build_param_combinations(grid):
def run_sweep():
results = []
out_dir = '/home/paulh/.pi/agent/outerloop-results'
out_dir = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results'
os.makedirs(out_dir, exist_ok=True)
log_file = os.path.join(out_dir, 'sweep_results.jsonl')

View File

@ -0,0 +1,18 @@
{"config_id": 1, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 46.7312, "run": 1}
{"config_id": 2, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 64.7249, "run": 2}
{"config_id": 3, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 36.2958, "run": 3}
{"config_id": 4, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 33.6781, "run": 4}
{"config_id": 5, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 59.7928, "run": 5}
{"config_id": 6, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 61.8774, "run": 6}
{"config_id": 7, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 97.7536, "run": 7}
{"config_id": 8, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 61.7233, "run": 8}
{"config_id": 9, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 53.6128, "run": 9}
{"config_id": 10, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 50.171, "run": 10}
{"config_id": 11, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 78.3455, "run": 11}
{"config_id": 12, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 71.8459, "run": 12}
{"config_id": 13, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 87.96, "run": 13}
{"config_id": 14, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 45.0102, "run": 14}
{"config_id": 15, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 52.6958, "run": 15}
{"config_id": 16, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 80.3866, "run": 16}
{"config_id": 17, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 84.9219, "run": 17}
{"config_id": 18, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 77.3825, "run": 18}