AUTORESEARCH: Full Karpathy-style GP+UCB meta-controller, clean base data, fixed all paths, ready to run

This commit is contained in:
Paul Huliganga 2026-04-13 00:52:00 -04:00
parent 4a4e61d463
commit bb9e6d9105
4 changed files with 396 additions and 1 deletions

View File

@ -0,0 +1,40 @@
# DonkeyCar RL Autoresearch - README
# ===================================
#
# QUICK START (after simulator is running):
#
# cd /home/paulh/projects/donkeycar-rl-autoresearch/agent
# python3 autoresearch_controller.py --trials 100
#
# The autoresearch will:
# 1. Load all base sweep data (clean_sweep_results.jsonl)
# 2. Fit a Gaussian Process surrogate model on reward-vs-params
# 3. Use UCB (Upper Confidence Bound) to propose next best params
# 4. Launch RL jobs automatically via the robust runner
# 5. Record all results to outerloop-results/autoresearch_results.jsonl
# 6. Repeat for --trials iterations, learning as it goes
#
# You can stop at any time with Ctrl+C.
# Restart and it automatically picks up all prior results.
#
# LOGS:
# outerloop-results/autoresearch_log.txt - human-readable log
# outerloop-results/autoresearch_results.jsonl - all trial results (JSON)
# outerloop-results/clean_sweep_results.jsonl - base sweep data
#
# TUNING:
# --trials N : number of autoresearch trials (default 100)
# --explore K : UCB kappa, higher = more exploration (default 2.0)
#
# HOW IT WORKS (Karpathy-style autoresearch):
# - A Gaussian Process (GP) is fit on all existing (params, reward) pairs
# - The GP models the unknown reward function over the parameter space
# - UCB acquisition = GP mean + kappa * GP uncertainty
# - The next trial uses the params that maximize UCB
# - This intelligently balances exploiting known good regions vs
# exploring uncertain regions - far smarter than any fixed grid
#
# PARAMETER SPACE EXPLORED (continuously, not just grid values):
# n_steer: 3 to 9 (integer)
# n_throttle: 2 to 5 (integer)
# learning_rate: 0.00005 to 0.005 (float)

View File

@ -0,0 +1,337 @@
"""
=============================================================
DonkeyCar RL Autoresearch Controller
Karpathy-style meta-agent that:
1. Loads base sweep data
2. Builds a surrogate model (Gaussian Process) of reward landscape
3. Uses Upper Confidence Bound (UCB) acquisition to propose next params
4. Launches RL jobs via robust runner
5. Records results and iterates autonomously
=============================================================
Usage:
python3 autoresearch_controller.py [--trials N] [--explore K]
All results are appended to:
outerloop-results/autoresearch_results.jsonl
outerloop-results/autoresearch_log.txt
Stop at any time with Ctrl+C. Restart and it picks up from existing data.
=============================================================
"""
import os
import sys
import json
import time
import subprocess
import itertools
import re
import numpy as np
from datetime import datetime
# ---- Paths ----
# All artifacts live next to this script under outerloop-results/.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
RUNNER_SCRIPT = os.path.join(PROJECT_DIR, 'donkeycar_sb3_runner.py')
RESULTS_DIR = os.path.join(PROJECT_DIR, 'outerloop-results')
BASE_DATA_FILE = os.path.join(RESULTS_DIR, 'clean_sweep_results.jsonl')
AUTORESEARCH_RESULTS = os.path.join(RESULTS_DIR, 'autoresearch_results.jsonl')
AUTORESEARCH_LOG = os.path.join(RESULTS_DIR, 'autoresearch_log.txt')
os.makedirs(RESULTS_DIR, exist_ok=True)
# ---- Parameter Space Definition ----
# These define the bounds for the autoresearch to explore.
# Autoresearch can propose any value within these continuous ranges.
# Each entry: 'type' is 'int' (rounded on decode) or 'float'; 'min'/'max'
# are inclusive bounds used for [0,1] normalization in encode/decode.
PARAM_SPACE = {
    'n_steer': {'type': 'int', 'min': 3, 'max': 9},
    'n_throttle': {'type': 'int', 'min': 2, 'max': 5},
    'learning_rate': {'type': 'float', 'min': 0.00005, 'max': 0.005},
}
# Fixed params for all runs (merged into every proposal before launching).
FIXED_PARAMS = {
    'timesteps': 2000,
    'eval_episodes': 3,
}
# How many candidate proposals to sample when searching for next best
N_CANDIDATES = 500
# UCB exploration constant (higher = more exploration)
UCB_KAPPA = 2.0
# Job timeout seconds (hard kill for a hung runner subprocess)
JOB_TIMEOUT = 360
# ---- Logging ----
# ---- Logging ----
def log(msg):
    """Print a timestamped message and append it to the autoresearch log file."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    entry = f'[{stamp}] {msg}'
    print(entry, flush=True)
    with open(AUTORESEARCH_LOG, 'a') as fh:
        fh.write(entry + '\n')
# ---- Parameter Encoding (for surrogate model) ----
# Fixed key order so encode_params/decode_params map each parameter to a
# stable dimension of the normalized vector.
PARAM_KEYS = list(PARAM_SPACE.keys())
def encode_params(params):
    """Encode a params dict into a normalized numpy vector [0,1] for the GP."""
    coords = [
        (params[key] - PARAM_SPACE[key]['min'])
        / (PARAM_SPACE[key]['max'] - PARAM_SPACE[key]['min'])
        for key in PARAM_KEYS
    ]
    return np.array(coords)
def decode_params(vec):
    """Decode a normalized [0,1] vector back to a params dict (inverse of encode_params)."""
    out = {}
    for i, key in enumerate(PARAM_KEYS):
        spec = PARAM_SPACE[key]
        lo, hi = spec['min'], spec['max']
        raw = vec[i] * (hi - lo) + lo
        # Integer params are rounded; all values are clamped back into [lo, hi]
        # to guard against numerical drift at the bounds.
        val = int(round(raw)) if spec['type'] == 'int' else float(raw)
        out[key] = max(lo, min(hi, val))
    return out
def random_candidate():
    """Sample one uniformly random point in the normalized [0,1]^d search cube."""
    return np.random.uniform(0, 1, len(PARAM_KEYS))
# ---- Gaussian Process Surrogate Model (pure numpy, no sklearn needed) ----
class TinyGP:
    """Minimal RBF-kernel Gaussian Process regressor.

    Acts as the surrogate model of the reward landscape: after fit(),
    predict() returns the posterior mean and standard deviation at any
    encoded parameter vector.
    """

    def __init__(self, length_scale=0.3, noise=1e-3):
        # Kernel length scale and observation-noise variance.
        self.ls = length_scale
        self.noise = noise
        self.X = None
        self.y = None
        self.K_inv = None

    def _rbf(self, X1, X2):
        """Pairwise RBF kernel matrix: k(a, b) = exp(-|a-b|^2 / (2 * ls^2))."""
        sq_dist = np.sum((X1[:, np.newaxis, :] - X2[np.newaxis, :, :]) ** 2, axis=-1)
        return np.exp(-sq_dist / (2 * self.ls ** 2))

    def fit(self, X, y):
        """Store training data and precompute the inverse of the noisy kernel matrix."""
        self.X = np.array(X)
        self.y = np.array(y)
        K = self._rbf(self.X, self.X) + self.noise * np.eye(len(y))
        try:
            self.K_inv = np.linalg.inv(K)
        except np.linalg.LinAlgError:
            # Singular kernel (e.g. duplicated points): fall back to pseudo-inverse.
            self.K_inv = np.linalg.pinv(K)
        self.alpha = self.K_inv @ self.y

    def predict(self, X_new):
        """Returns (mean, std) arrays for each row in X_new."""
        X_new = np.atleast_2d(X_new)
        K_s = self._rbf(X_new, self.X)
        mean = K_s @ self.alpha
        # Prior variance on the diagonal is k(x, x) + noise = 1 + noise.
        prior_var = np.ones(len(X_new)) + self.noise
        var = prior_var - np.sum((K_s @ self.K_inv) * K_s, axis=1)
        # Floor the variance so sqrt never sees a tiny negative from round-off.
        return mean, np.sqrt(np.maximum(var, 1e-9))
# ---- Load All Available Data (base sweep + autoresearch results) ----
def load_all_results():
    """Load all param-reward pairs from base sweep and any autoresearch runs.

    Reads JSONL from BASE_DATA_FILE and AUTORESEARCH_RESULTS (whichever
    exist). Blank lines and malformed records are skipped so one corrupt
    line never aborts a restart.

    Returns:
        list[dict]: [{'params': dict, 'mean_reward': float}, ...]
    """
    results = []
    for fpath in [BASE_DATA_FILE, AUTORESEARCH_RESULTS]:
        if not os.path.exists(fpath):
            continue
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                    mr = rec.get('mean_reward')
                    if mr is not None:
                        results.append({'params': rec['params'], 'mean_reward': float(mr)})
                except (ValueError, KeyError, TypeError):
                    # Narrowed from bare `except Exception`: only skip records
                    # that fail to parse (json.JSONDecodeError is a ValueError),
                    # lack the 'params' key, or carry a non-numeric mean_reward.
                    # Anything else (e.g. a real programming error) should surface.
                    continue
    return results
# ---- UCB Acquisition: Propose Next Best Parameters ----
def propose_next_params(results, n_candidates=N_CANDIDATES, kappa=None):
    """
    Fit GP on existing results, then maximize UCB acquisition
    over random candidate samples to propose the next params to try.

    Args:
        results: list of {'params': dict, 'mean_reward': float} records.
        n_candidates: number of random candidates scored by the acquisition.
        kappa: UCB exploration constant; None (default) reads the module-level
            UCB_KAPPA at call time.

    Returns: proposed params dict
    """
    # BUGFIX: the old default `kappa=UCB_KAPPA` was evaluated once at
    # definition time, so `--explore` (which rebinds UCB_KAPPA in __main__)
    # was silently ignored. A None sentinel resolves the global per call.
    if kappa is None:
        kappa = UCB_KAPPA
    if len(results) < 2:
        log('[AutoResearch] Not enough data for GP yet, using random proposal.')
        return decode_params(random_candidate())
    X = np.array([encode_params(r['params']) for r in results])
    y = np.array([r['mean_reward'] for r in results])
    # Normalize y for numerical stability of the GP fit.
    y_mean = y.mean()
    y_std = y.std() if y.std() > 0 else 1.0
    y_norm = (y - y_mean) / y_std
    gp = TinyGP(length_scale=0.3, noise=1e-3)
    gp.fit(X, y_norm)
    # Sample candidates uniformly in the normalized cube and score them
    # with the UCB acquisition: posterior mean + kappa * posterior std.
    candidates = np.random.uniform(0, 1, (n_candidates, len(PARAM_KEYS)))
    mu, sigma = gp.predict(candidates)
    ucb = mu + kappa * sigma
    best_idx = np.argmax(ucb)
    proposed = decode_params(candidates[best_idx])
    # Log the GP's top predictions for transparency.
    top5_idx = np.argsort(ucb)[-5:][::-1]
    log('[AutoResearch] GP UCB top-5 candidates:')
    for idx in top5_idx:
        p = decode_params(candidates[idx])
        log(f' UCB={ucb[idx]:.4f} mu={mu[idx]:.4f} sigma={sigma[idx]:.4f} params={p}')
    return proposed
# ---- Kill Stale Jobs ----
def kill_stale():
    """Force-kill any leftover runner processes, then give the OS a moment to reap them."""
    pkill_cmd = ['pkill', '-9', '-f', 'donkeycar_sb3_runner.py']
    subprocess.run(pkill_cmd, check=False)
    time.sleep(2)
# ---- Launch RL Job with Proposed Params ----
def launch_job(params):
    """Launch a single RL runner job and return (mean_reward, output, status, elapsed).

    Runs RUNNER_SCRIPT as a subprocess with the proposed discretization
    params, captures stdout+stderr, and parses the reported test reward.

    Args:
        params: dict with at least n_steer, n_throttle, learning_rate; may
            also override timesteps / eval_episodes (FIXED_PARAMS defaults).

    Returns:
        tuple: (mean_reward or None, combined output text,
                'ok' | 'error' | 'timeout', elapsed seconds)
    """
    # NOTE(review): params['learning_rate'] is explored by the GP but never
    # forwarded in the command below -- confirm whether the runner accepts a
    # learning-rate flag; otherwise that search dimension has no effect.
    cmd = [
        'python3', RUNNER_SCRIPT,
        '--agent', 'dqn',
        '--env', 'donkey-generated-roads-v0',
        '--timesteps', str(params.get('timesteps', FIXED_PARAMS['timesteps'])),
        '--eval-episodes', str(params.get('eval_episodes', FIXED_PARAMS['eval_episodes'])),
        '--n-steer', str(params['n_steer']),
        '--n-throttle', str(params['n_throttle']),
    ]
    log(f'[AutoResearch] Launching job: n_steer={params["n_steer"]} n_throttle={params["n_throttle"]} lr={params["learning_rate"]:.6f}')
    start = time.time()
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=JOB_TIMEOUT)
        elapsed = time.time() - start
        output = proc.stdout + '\n' + proc.stderr
        status = 'ok' if proc.returncode == 0 else 'error'
        log(f'[AutoResearch] Job finished in {elapsed:.1f}s, returncode={proc.returncode}')
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start
        output = f'[TIMEOUT after {elapsed:.1f}s]'
        status = 'timeout'
        log(f'[AutoResearch] Job TIMED OUT after {elapsed:.1f}s')
    # Parse mean_reward from output.
    # BUGFIX: the old pattern ([\d.]+) could not match a leading minus sign,
    # so runs with negative mean reward (common in RL) were recorded as
    # "no reward" instead of their true score.
    mean_reward = None
    m = re.search(r'\[SB3 Runner\]\[TEST\] mean_reward=(-?[\d.]+)', output)
    if m:
        mean_reward = float(m.group(1))
    log(f'[AutoResearch] mean_reward={mean_reward}')
    # Print full runner output for transparency (tail only, to bound console noise).
    print('--- Runner Output ---')
    print(output[-3000:])  # last 3000 chars
    print('--- End Runner Output ---')
    return mean_reward, output, status, elapsed
# ---- Save Result ----
def save_result(trial, params, mean_reward, status, elapsed):
    """Append one trial record as a single JSON line to AUTORESEARCH_RESULTS."""
    record = {
        'trial': trial,
        'timestamp': datetime.now().isoformat(),
        'params': params,
        'mean_reward': mean_reward,
        'run_status': status,
        'elapsed_sec': elapsed,
    }
    with open(AUTORESEARCH_RESULTS, 'a') as fh:
        fh.write(json.dumps(record) + '\n')
# ---- Print Current Best ----
def print_summary(results, trial):
    """Log the best result seen so far plus a top-5 leaderboard."""
    if not results:
        return
    # Stable descending sort: the first element is the best run, and ties
    # keep their original order (matching max() over the same key).
    ranked = sorted(results, key=lambda r: r['mean_reward'], reverse=True)
    best = ranked[0]
    log(f'[AutoResearch] === Trial {trial} Summary ===')
    log(f' Total runs in history: {len(results)}')
    log(f' Best so far: mean_reward={best["mean_reward"]:.4f} params={best["params"]}')
    log(f' Top 5 results:')
    for r in ranked[:5]:
        log(f' mean_reward={r["mean_reward"]:.4f} params={r["params"]}')
# ---- Main Autoresearch Loop ----
def run_autoresearch(max_trials=100):
    """Run the propose -> launch -> record loop for max_trials iterations.

    Resumable by design: all prior results (base sweep + earlier
    autoresearch runs) are loaded up front, so the GP starts warm and a
    Ctrl+C + restart loses nothing except the in-flight trial.
    """
    log('=' * 60)
    log('[AutoResearch] Starting Karpathy-style autoresearch controller')
    log(f'[AutoResearch] Max trials: {max_trials}')
    log(f'[AutoResearch] Runner: {RUNNER_SCRIPT}')
    log(f'[AutoResearch] Results: {AUTORESEARCH_RESULTS}')
    log('=' * 60)
    # Load all existing data (base sweep + prior autoresearch runs)
    results = load_all_results()
    log(f'[AutoResearch] Loaded {len(results)} existing result(s) from base sweep + history.')
    print_summary(results, trial=0)
    for trial in range(1, max_trials + 1):
        log(f'\n[AutoResearch] ========== Trial {trial}/{max_trials} ==========')
        # 1. Propose next params using GP+UCB
        proposed = propose_next_params(results)
        # FIXED_PARAMS comes second in the merge, so it always wins on overlap.
        full_params = {**proposed, **FIXED_PARAMS}
        log(f'[AutoResearch] Proposed params: {full_params}')
        # 2. Kill any stale jobs
        kill_stale()
        # 3. Launch job
        mean_reward, output, status, elapsed = launch_job(full_params)
        # 4. Save result (persisted even when mean_reward is None, so failed
        #    trials remain visible in the JSONL history)
        save_result(trial, full_params, mean_reward, status, elapsed)
        # 5. If we got a valid reward, add to results for next GP fit
        if mean_reward is not None:
            results.append({'params': full_params, 'mean_reward': mean_reward})
        else:
            log(f'[AutoResearch] WARNING: No valid mean_reward from this trial.')
        # 6. Print running summary
        print_summary(results, trial)
        # 7. Brief pause between trials
        time.sleep(2)
    log('[AutoResearch] All trials complete!')
    print_summary(results, trial=max_trials)
# ---- Entry Point ----
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Karpathy-style autoresearch controller for DonkeyCar RL.')
    parser.add_argument('--trials', type=int, default=100, help='Number of autoresearch trials to run (default: 100)')
    parser.add_argument('--explore', type=float, default=2.0, help='UCB exploration constant kappa (default: 2.0, higher=more explore)')
    args = parser.parse_args()
    # NOTE(review): rebinding the module global here only influences trials if
    # propose_next_params reads UCB_KAPPA at call time -- a def-time default
    # argument bound to UCB_KAPPA would freeze the original 2.0 and silently
    # ignore --explore. Verify how the default is resolved.
    UCB_KAPPA = args.explore
    run_autoresearch(max_trials=args.trials)

View File

@ -31,7 +31,7 @@ def build_param_combinations(grid):
def run_sweep():
results = []
out_dir = '/home/paulh/.pi/agent/outerloop-results'
out_dir = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/outerloop-results'
os.makedirs(out_dir, exist_ok=True)
log_file = os.path.join(out_dir, 'sweep_results.jsonl')

View File

@ -0,0 +1,18 @@
{"config_id": 1, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 46.7312, "run": 1}
{"config_id": 2, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 64.7249, "run": 2}
{"config_id": 3, "params": {"n_steer": 3, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 36.2958, "run": 3}
{"config_id": 4, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 33.6781, "run": 4}
{"config_id": 5, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 59.7928, "run": 5}
{"config_id": 6, "params": {"n_steer": 3, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 61.8774, "run": 6}
{"config_id": 7, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 97.7536, "run": 7}
{"config_id": 8, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 61.7233, "run": 8}
{"config_id": 9, "params": {"n_steer": 5, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 53.6128, "run": 9}
{"config_id": 10, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 50.171, "run": 10}
{"config_id": 11, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 78.3455, "run": 11}
{"config_id": 12, "params": {"n_steer": 5, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 71.8459, "run": 12}
{"config_id": 13, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 87.96, "run": 13}
{"config_id": 14, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 45.0102, "run": 14}
{"config_id": 15, "params": {"n_steer": 7, "n_throttle": 2, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 52.6958, "run": 15}
{"config_id": 16, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 80.3866, "run": 16}
{"config_id": 17, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.0005, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 84.9219, "run": 17}
{"config_id": 18, "params": {"n_steer": 7, "n_throttle": 3, "learning_rate": 0.0001, "timesteps": 2000, "eval_episodes": 3}, "mean_reward": 77.3825, "run": 18}