From 0615b22cb922e3c3840927dbf99a200964c6097e Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Thu, 14 May 2026 15:32:21 -0400 Subject: [PATCH] =?UTF-8?q?feat(eval):=20cross-model=20evaluation=20script?= =?UTF-8?q?s=20for=20exp24/25/26=20+=20gentrack=E2=86=92minimonaco?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eval_best_models.py: evaluates exp24/25/26 best models across 10 fixed random roads (regen_road with fixed seeds) for fair head-to-head comparison. eval_gentrack_on_minimonaco.py: zero-shot evaluation of gentrack specialists (exp13, wave5-gentrack-only, wave4-trial-0009) on mini-monaco. Results: exp26 > exp25 > exp24 on random roads. Co-Authored-By: Claude Sonnet 4.6 --- agent/experiments/eval_best_models.py | 197 ++++++++++++++++++ .../eval_gentrack_on_minimonaco.py | 195 +++++++++++++++++ .../eval_best_models_20260506_102952.log | 88 ++++++++ ...al_gentrack_minimonaco_20260506_184305.log | 47 +++++ ...al_gentrack_minimonaco_20260506_184636.log | 49 +++++ ...al_gentrack_minimonaco_20260506_184902.log | 49 +++++ ...al_gentrack_minimonaco_20260506_211519.log | 49 +++++ ...al_gentrack_minimonaco_20260506_212714.log | 33 +++ 8 files changed, 707 insertions(+) create mode 100644 agent/experiments/eval_best_models.py create mode 100644 agent/experiments/eval_gentrack_on_minimonaco.py create mode 100644 agent/models/eval_best_models_20260506_102952.log create mode 100644 agent/models/eval_gentrack_minimonaco_20260506_184305.log create mode 100644 agent/models/eval_gentrack_minimonaco_20260506_184636.log create mode 100644 agent/models/eval_gentrack_minimonaco_20260506_184902.log create mode 100644 agent/models/eval_gentrack_minimonaco_20260506_211519.log create mode 100644 agent/models/eval_gentrack_minimonaco_20260506_212714.log diff --git a/agent/experiments/eval_best_models.py b/agent/experiments/eval_best_models.py new file mode 100644 index 0000000..35e2fb3 --- /dev/null +++ 
"""
Evaluate best models from exp24/25/26 across 10 truly different random roads.

Uses the regen_road TCP message (rand_seed) to get genuinely different roads —
NOT the same road-2 that all prior training used.

Road style 0 = default generated road geometry.
10 different seeds per model, 2000 steps per road.

NOTE: this is a flat script. Running (or importing) it creates the log file,
builds the environment against HOST:PORT and evaluates every entry in MODELS.
"""
import json
import logging
import os
import random
import sys
import time
from datetime import datetime

# Project modules live outside site-packages; this must precede their imports.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

from discretize_action import DiscretizedActionWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from multitrack_runner import StuckTerminationWrapper
from reward_wrapper import SpeedRewardWrapper

HOST = 'localhost'
PORT = 9091
THROTTLE_MIN = 0.2
N_STEER = 7
N_THROTTLE = 1
TRACK_ID = 'donkey-generated-roads-v0'

MAX_EVAL_STEPS = 2000
REGEN_WAIT = 3.0  # seconds after regen_road before reset

MODELS = {
    'exp24': '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete/best_model.zip',
    'exp25': '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp25-wheel-fix/best_model.zip',
    'exp26': '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp26-warmstart/best_model.zip',
}

# 10 fixed seeds so every model is evaluated on the same set of roads
EVAL_SEEDS = [1001, 2002, 3003, 4004, 5005, 6006, 7007, 8008, 9009, 1234]
# Derived from the seed list so the "x/N" counters in the log can never
# disagree with the number of roads that were actually driven.
N_ROADS = len(EVAL_SEEDS)

LOG_PATH = f'/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_best_models_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'

# The messages contain non-ASCII glyphs (✅, ❌, ─, →, —); pin the file
# encoding so logging does not depend on the process locale.
_fh = logging.FileHandler(LOG_PATH, encoding='utf-8')
_fh.setFormatter(logging.Formatter('%(message)s'))
_sh = logging.StreamHandler(sys.stdout)
_sh.setFormatter(logging.Formatter('%(message)s'))
log = logging.getLogger('eval')
log.setLevel(logging.INFO)
log.propagate = False
log.addHandler(_fh)
log.addHandler(_sh)


def ts():
    """Return the current local time formatted as HH:MM:SS."""
    return datetime.now().strftime('%H:%M:%S')


def flog(msg):
    """Emit ``msg`` prefixed with a ``[HH:MM:SS]`` stamp to file and stdout."""
    log.info('[%s] %s', ts(), msg)


def make_env():
    """Return a zero-argument factory that builds the wrapped evaluation env.

    The factory form is what ``DummyVecEnv`` expects. The wrapper order is
    throttle clamp -> discretized actions -> stuck termination -> speed
    reward, all on top of ``gym.make(TRACK_ID)``.
    """
    def _init():
        raw = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT})
        env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
        env = DiscretizedActionWrapper(env, n_steer=N_STEER, n_throttle=N_THROTTLE)
        env = StuckTerminationWrapper(
            env,
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=5.0,
            max_episode_seconds=30.0,
            low_speed_threshold=1.0,
            max_low_speed_seconds=1.5,
            max_cte=3.0,
            max_high_cte_seconds=1.0,
        )
        env = SpeedRewardWrapper(
            env,
            window_size=30,
            min_efficiency=0.15,
            max_cte=8.0,
            min_lap_time=12.0,
            progress_patience=100,
        )
        return env
    return _init


def get_handler(vec_env):
    """Navigate wrapper stack to reach DonkeyUnitySimHandler.

    Expects ``vec_env`` to be a vec-env wrapper exposing ``.venv`` (here the
    ``VecTransposeImage`` around a ``DummyVecEnv``) and uses its first env.
    """
    return vec_env.venv.envs[0].unwrapped.viewer.handler


def regen_road(vec_env, seed, road_style=0):
    """Send regen_road message to sim with the given seed.

    Blocks for ``REGEN_WAIT`` seconds afterwards so the caller's next
    ``reset()`` happens after the wait.
    """
    msg = {
        'msg_type': 'regen_road',
        'road_style': str(road_style),
        'rand_seed': str(seed),
        'turn_increment': '0.0',
    }
    get_handler(vec_env).queue_message(msg)
    time.sleep(REGEN_WAIT)


def run_episode(model, env, max_steps=None):
    """Run one deterministic episode, return (steps, reward).

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        env: object exposing ``reset()`` and a 4-tuple ``step(action)``;
            rewards/dones may be scalars or length-1 sequences (vec-env).
        max_steps: step cap for the episode; ``None`` (default) uses
            ``MAX_EVAL_STEPS``.

    Returns:
        ``(steps_taken, total_reward)``.
    """
    limit = MAX_EVAL_STEPS if max_steps is None else max_steps
    obs = env.reset()
    total_r = 0.0
    total_s = 0
    done = False

    while not done and total_s < limit:
        action, _ = model.predict(obs, deterministic=True)
        obs, r, d, _ = env.step(action)
        # Vec-envs return arrays of length n_envs; plain envs return scalars.
        total_r += float(r[0]) if hasattr(r, '__len__') else float(r)
        done = bool(d[0]) if hasattr(d, '__len__') else bool(d)
        total_s += 1

    return total_s, total_r


flog('=' * 70)
flog(f'Evaluating best models on {N_ROADS} genuinely different random roads')
flog(f'Seeds: {EVAL_SEEDS}')
flog(f'Log: {LOG_PATH}')
flog('=' * 70)

# Connect once — reuse env for all models/roads
flog('Connecting to sim...')
env = DummyVecEnv([make_env()])
env = VecTransposeImage(env)
flog(f' Connected. obs={env.observation_space.shape}, action={env.action_space}')

results = {}

# try/finally so the env is closed even when an episode raises mid-run.
try:
    for model_name, model_path in MODELS.items():
        flog('')
        flog(f'── {model_name} ──────────────────────────────────────')
        flog(f' Model: {model_path}')

        # Best-effort: one unloadable checkpoint must not abort the others.
        try:
            model = PPO.load(model_path, env=env, device='cpu')
        except Exception as e:
            flog(f' LOAD ERROR: {e}')
            continue

        steps_list = []
        reward_list = []

        for i, seed in enumerate(EVAL_SEEDS, 1):
            flog(f' Road {i:2d}/{N_ROADS} (seed={seed}) — regenerating...')
            regen_road(env, seed)

            steps, reward = run_episode(model, env)
            status = '✅' if steps >= MAX_EVAL_STEPS else f'❌@{steps}'
            flog(f' → {reward:.1f}r / {steps}s {status}')
            steps_list.append(steps)
            reward_list.append(reward)

        # Plain floats keep `results` free of NumPy scalar types.
        mean_steps = float(np.mean(steps_list))
        mean_reward = float(np.mean(reward_list))
        full_eps = sum(1 for s in steps_list if s >= MAX_EVAL_STEPS)

        flog(f' {model_name} SUMMARY: {full_eps}/{N_ROADS} full | mean {mean_steps:.0f}s / {mean_reward:.1f}r')
        results[model_name] = {
            'full': full_eps,
            'mean_steps': mean_steps,
            'mean_reward': mean_reward,
            'per_road': list(zip(EVAL_SEEDS, steps_list, reward_list)),
        }
finally:
    env.close()

flog('')
flog('=' * 70)
flog('FINAL RANKING')
flog('=' * 70)
# Rank by number of full-length episodes first, mean episode length second.
ranked = sorted(results.items(), key=lambda x: (x[1]['full'], x[1]['mean_steps']), reverse=True)
if not ranked:
    flog(' No models could be evaluated.')
for rank, (name, r) in enumerate(ranked, 1):
    flog(f' #{rank} {name:8s} {r["full"]}/{N_ROADS} full mean {r["mean_steps"]:.0f}s / {r["mean_reward"]:.1f}r')

flog('')
flog('Evaluation complete.')
"""
eval_gentrack_on_minimonaco.py

Evaluate generated-track specialist models on mini-monaco (zero-shot).

Key question: does a model trained on generated-track generalize to
mini-monaco, given that both tracks are visually very similar?

Models tested:
  - exp13-gentrack-v4/best_model.zip (30k steps, clean gentrack specialist)
  - wave5-gentrack-only/model.zip (90k steps, gentrack from scratch)
  - wave4-trial-0009/model.zip (the one run that drove mini-monaco)

Track: donkey-minimonaco-track-v0 (never seen during any of these trainings)
Episodes: 7 per model
Max steps: 3000 per episode
"""
import os
import sys
import time
from collections import deque
from datetime import datetime

import numpy as np

# Project modules live outside site-packages; this must precede their imports.
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from donkeycar_sb3_runner import ThrottleClampWrapper
import gymnasium as gym

HOST = 'localhost'
PORT = 9091
TRACK_ID = 'donkey-minimonaco-track-v0'
EPISODES = 7
MAX_STEPS = 3000  # enough for 2+ laps
THROTTLE_MIN = 0.2
STUCK_STEPS = 60  # terminate if car hasn't moved in this many steps
STUCK_DIST = 0.3  # minimum displacement (metres) to not be considered stuck

BASE = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models'

MODELS = [
    ('exp13-gentrack-v4', f'{BASE}/exp13-gentrack-v4/best_model.zip'),
    ('wave5-gentrack-only', f'{BASE}/wave5-gentrack-only/model.zip'),
    ('wave4-trial-0009', f'{BASE}/wave4-trial-0009/model.zip'),
]

# Log to file + stdout
log_path = os.path.join(
    BASE,
    f'eval_gentrack_minimonaco_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
)
# Line-buffered so the log can be tailed live. The messages contain non-ASCII
# glyphs (✅, ❌, ─, —), so the encoding is pinned rather than left to the locale.
_logfile = open(log_path, 'w', buffering=1, encoding='utf-8')


def log(msg):
    """Write ``msg`` with a ``[HH:MM:SS]`` prefix to stdout and the log file."""
    ts = datetime.now().strftime('%H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line, flush=True)
    _logfile.write(line + '\n')


class MiniMonacoWrapper(gym.Wrapper):
    """Two fixes for mini-monaco evaluation:
    1. Suppress starting_line termination until lap_count >= 1 (car spawns
       just before the line; the first crossing is not a lap completion).
    2. Terminate if the car hasn't moved STUCK_DIST metres in STUCK_STEPS steps.
    """

    def __init__(self, env):
        super().__init__(env)
        # Initialised here as well as in reset() so that a step() issued
        # before the first reset() does not fail on missing attributes.
        self._lap_count = 0
        self._pos_history = deque(maxlen=STUCK_STEPS)

    def reset(self, **kwargs):
        """Clear per-episode lap/position state and reset the wrapped env."""
        self._lap_count = 0
        self._pos_history = deque(maxlen=STUCK_STEPS)
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env, then apply the two overrides described above.

        Returns the usual ``(obs, reward, terminated, truncated, info)``.
        On a stuck termination ``reward`` is -1.0 and ``info['hit']`` is
        ``'stuck'``; on a suppressed start-line hit ``reward`` is 0.0.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        # `or 0` guards against an explicit None in info.
        laps = int(info.get('lap_count', 0) or 0)
        if laps > self._lap_count:
            self._lap_count = laps

        # Suppress initial starting_line crossing
        if terminated and info.get('hit') == 'starting_line' and self._lap_count < 1:
            terminated = False
            reward = 0.0

        # Stuck detection
        pos = info.get('pos')
        if pos is not None:
            # deque(maxlen=...) drops the oldest sample on overflow, giving a
            # sliding window of the last STUCK_STEPS positions without pop(0).
            self._pos_history.append(np.array(list(pos)[:3]))
            if len(self._pos_history) == STUCK_STEPS:
                displacement = np.linalg.norm(
                    self._pos_history[-1] - self._pos_history[0])
                if displacement < STUCK_DIST:
                    terminated = True
                    reward = -1.0
                    info['hit'] = 'stuck'

        return obs, reward, terminated, truncated, info


def make_env():
    """Build the mini-monaco env: throttle clamp plus MiniMonacoWrapper."""
    raw = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT})
    env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN)
    env = MiniMonacoWrapper(env)
    return env


def run_eval(model_label, model_path):
    """Evaluate one checkpoint for EPISODES deterministic episodes.

    Args:
        model_label: short name used in log lines and in the result dict.
        model_path: path of the PPO ``.zip`` checkpoint.

    Returns:
        Dict with keys ``label``, ``lapped``, ``total_laps``, ``mean_steps``
        and ``mean_reward``; or ``None`` when the checkpoint is missing or
        fails to load.
    """
    log('')
    log(f'── {model_label} ──────────────────────────────────────')
    log(f' Model: {model_path}')

    if not os.path.exists(model_path):
        log(' ERROR: model file not found — skipping')
        return None

    env = VecTransposeImage(DummyVecEnv([make_env]))
    # try/finally so the env is closed even when an episode raises.
    try:
        # Best-effort: an unloadable checkpoint is reported and skipped.
        try:
            model = PPO.load(model_path, env=env, device='cpu')
        except Exception as e:
            log(f' ERROR loading model: {e}')
            return None

        rewards, steps_list, laps_list = [], [], []

        for ep in range(1, EPISODES + 1):
            obs = env.reset()
            total_r, steps, done = 0.0, 0, False
            laps = 0
            raw_info = {}
            while not done and steps < MAX_STEPS:
                action, _ = model.predict(obs, deterministic=True)
                obs, r, d, info = env.step(action)
                total_r += float(r[0])
                steps += 1
                done = bool(d[0])
                raw_info = info[0] if isinstance(info, (list, tuple)) else info
                laps = int((raw_info.get('lap_count', 0) or 0))

            hit = raw_info.get('hit', '?')
            # `not done` (rather than steps >= MAX_STEPS) so an episode that
            # terminates exactly on the last allowed step is still reported
            # as a failure, not as a timeout.
            if not done:
                status = f'✅ timeout ({laps} laps)'
            elif hit == 'stuck':
                status = f'❌ STUCK @{steps} ({laps} laps)'
            else:
                status = f'❌ crash @{steps} hit={hit} ({laps} laps)'
            log(f' ep{ep}: {total_r:.1f}r / {steps}s {status}')
            rewards.append(total_r)
            steps_list.append(steps)
            laps_list.append(laps)
            time.sleep(0.3)

        mean_r = float(np.mean(rewards))
        mean_s = float(np.mean(steps_list))
        total_laps = sum(laps_list)
        lapped = sum(1 for n in laps_list if n >= 1)

        log(f' SUMMARY: {lapped}/{EPISODES} completed a lap | '
            f'total laps={total_laps} | mean {mean_s:.0f}s / {mean_r:.1f}r')
    finally:
        env.close()

    # Pause after closing before the next model opens a new env.
    time.sleep(2)

    return {'label': model_label, 'lapped': lapped, 'total_laps': total_laps,
            'mean_steps': mean_s, 'mean_reward': mean_r}


def main():
    """Evaluate every entry of MODELS and log a table sorted by total laps."""
    try:
        log('=' * 70)
        log('Eval: generated-track specialists on mini-monaco (zero-shot)')
        log(f'Track : {TRACK_ID}')
        log(f'Episodes: {EPISODES} x max {MAX_STEPS} steps')
        log(f'Host : {HOST}:{PORT}')
        log(f'Log : {log_path}')
        log('=' * 70)

        results = []
        for label, path in MODELS:
            r = run_eval(label, path)
            if r is not None:
                results.append(r)

        log('')
        log('=' * 70)
        log('FINAL RESULTS')
        log('=' * 70)
        for r in sorted(results, key=lambda x: -x['total_laps']):
            log(f" {r['label']:<25} lapped={r['lapped']}/{EPISODES} "
                f"total_laps={r['total_laps']} mean {r['mean_steps']:>5.0f}s / {r['mean_reward']:>6.1f}r")

        log(f'\nLog saved: {log_path}')
    finally:
        # Close the log file even when an evaluation aborts with an error.
        _logfile.close()


if __name__ == '__main__':
    main()
--- /dev/null +++ b/agent/models/eval_best_models_20260506_102952.log @@ -0,0 +1,88 @@ +[10:29:52] ====================================================================== +[10:29:52] Evaluating best models on 10 genuinely different random roads +[10:29:52] Seeds: [1001, 2002, 3003, 4004, 5005, 6006, 7007, 8008, 9009, 1234] +[10:29:52] Log: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_best_models_20260506_102952.log +[10:29:52] ====================================================================== +[10:29:52] Connecting to sim... +[10:29:52] Connected. obs=(3, 120, 160), action=Discrete(7) +[10:29:52] +[10:29:52] ── exp24 ────────────────────────────────────── +[10:29:52] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete/best_model.zip +[10:29:55] Road 1/10 (seed=1001) — regenerating... +[10:30:24] → 371.0r / 2000s ✅ +[10:30:24] Road 2/10 (seed=2002) — regenerating... +[10:30:53] → 365.2r / 2000s ✅ +[10:30:53] Road 3/10 (seed=3003) — regenerating... +[10:31:22] → 365.0r / 2000s ✅ +[10:31:22] Road 4/10 (seed=4004) — regenerating... +[10:31:51] → 372.2r / 2000s ✅ +[10:31:51] Road 5/10 (seed=5005) — regenerating... +[10:32:21] → 363.3r / 2000s ✅ +[10:32:21] Road 6/10 (seed=6006) — regenerating... +[10:32:50] → 365.8r / 2000s ✅ +[10:32:50] Road 7/10 (seed=7007) — regenerating... +[10:33:19] → 371.5r / 2000s ✅ +[10:33:19] Road 8/10 (seed=8008) — regenerating... +[10:33:36] → 157.7r / 912s ❌@912 +[10:33:36] Road 9/10 (seed=9009) — regenerating... +[10:34:05] → 371.6r / 2000s ✅ +[10:34:05] Road 10/10 (seed=1234) — regenerating... +[10:34:35] → 372.1r / 2000s ✅ +[10:34:35] exp24 SUMMARY: 9/10 full | mean 1891s / 347.5r +[10:34:35] +[10:34:35] ── exp25 ────────────────────────────────────── +[10:34:35] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp25-wheel-fix/best_model.zip +[10:34:36] Road 1/10 (seed=1001) — regenerating... 
+[10:35:05] → 378.5r / 2000s ✅ +[10:35:05] Road 2/10 (seed=2002) — regenerating... +[10:35:34] → 382.9r / 2000s ✅ +[10:35:34] Road 3/10 (seed=3003) — regenerating... +[10:36:03] → 382.0r / 2000s ✅ +[10:36:03] Road 4/10 (seed=4004) — regenerating... +[10:36:18] → 122.8r / 694s ❌@694 +[10:36:18] Road 5/10 (seed=5005) — regenerating... +[10:36:47] → 384.3r / 2000s ✅ +[10:36:47] Road 6/10 (seed=6006) — regenerating... +[10:37:16] → 379.7r / 2000s ✅ +[10:37:16] Road 7/10 (seed=7007) — regenerating... +[10:37:45] → 382.7r / 2000s ✅ +[10:37:45] Road 8/10 (seed=8008) — regenerating... +[10:38:15] → 382.8r / 2000s ✅ +[10:38:15] Road 9/10 (seed=9009) — regenerating... +[10:38:44] → 383.2r / 2000s ✅ +[10:38:44] Road 10/10 (seed=1234) — regenerating... +[10:39:13] → 383.9r / 2000s ✅ +[10:39:13] exp25 SUMMARY: 9/10 full | mean 1869s / 356.3r +[10:39:13] +[10:39:13] ── exp26 ────────────────────────────────────── +[10:39:13] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp26-warmstart/best_model.zip +[10:39:14] Road 1/10 (seed=1001) — regenerating... +[10:39:43] → 392.2r / 2000s ✅ +[10:39:43] Road 2/10 (seed=2002) — regenerating... +[10:40:10] → 307.0r / 1583s ❌@1583 +[10:40:10] Road 3/10 (seed=3003) — regenerating... +[10:40:39] → 387.6r / 2000s ✅ +[10:40:39] Road 4/10 (seed=4004) — regenerating... +[10:41:08] → 392.5r / 2000s ✅ +[10:41:08] Road 5/10 (seed=5005) — regenerating... +[10:41:37] → 390.6r / 2000s ✅ +[10:41:37] Road 6/10 (seed=6006) — regenerating... +[10:42:07] → 389.4r / 2000s ✅ +[10:42:07] Road 7/10 (seed=7007) — regenerating... +[10:42:36] → 388.2r / 2000s ✅ +[10:42:36] Road 8/10 (seed=8008) — regenerating... +[10:43:05] → 389.1r / 2000s ✅ +[10:43:05] Road 9/10 (seed=9009) — regenerating... +[10:43:34] → 389.0r / 2000s ✅ +[10:43:34] Road 10/10 (seed=1234) — regenerating... 
+[10:44:04] → 386.5r / 2000s ✅ +[10:44:04] exp26 SUMMARY: 9/10 full | mean 1958s / 381.2r +[10:44:04] +[10:44:04] ====================================================================== +[10:44:04] FINAL RANKING +[10:44:04] ====================================================================== +[10:44:04] #1 exp26 9/10 full mean 1958s / 381.2r +[10:44:04] #2 exp24 9/10 full mean 1891s / 347.5r +[10:44:04] #3 exp25 9/10 full mean 1869s / 356.3r +[10:44:04] +[10:44:04] Evaluation complete. diff --git a/agent/models/eval_gentrack_minimonaco_20260506_184305.log b/agent/models/eval_gentrack_minimonaco_20260506_184305.log new file mode 100644 index 0000000..2281e17 --- /dev/null +++ b/agent/models/eval_gentrack_minimonaco_20260506_184305.log @@ -0,0 +1,47 @@ +[18:43:05] ====================================================================== +[18:43:05] Eval: generated-track specialists on mini-monaco (zero-shot) +[18:43:05] Track : donkey-minimonaco-track-v0 +[18:43:05] Episodes: 7 x max 2000 steps +[18:43:05] Host : localhost:9091 +[18:43:05] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184305.log +[18:43:05] ====================================================================== +[18:43:05] +[18:43:05] ── exp13-gentrack-v4 ────────────────────────────────────── +[18:43:05] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip +[18:43:12] ep1: 4.5r / 29s ❌@29 +[18:43:15] ep2: 4.5r / 28s ❌@28 +[18:43:18] ep3: 4.6r / 28s ❌@28 +[18:43:21] ep4: 4.7r / 28s ❌@28 +[18:43:24] ep5: 4.6r / 28s ❌@28 +[18:43:27] ep6: 4.6r / 28s ❌@28 +[18:43:30] ep7: 4.6r / 28s ❌@28 +[18:43:31] SUMMARY: 0/7 full | mean 28s / 4.6r | ❌ CRASHES +[18:43:33] +[18:43:33] ── wave5-gentrack-only ────────────────────────────────────── +[18:43:33] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip +[18:43:36] ep1: 4.8r / 28s ❌@28 +[18:43:39] ep2: 4.7r / 28s ❌@28 
+[18:43:42] ep3: 4.9r / 28s ❌@28 +[18:43:45] ep4: 4.7r / 28s ❌@28 +[18:43:49] ep5: 4.6r / 27s ❌@27 +[18:43:52] ep6: 4.9r / 28s ❌@28 +[18:43:55] ep7: 4.9r / 28s ❌@28 +[18:43:55] SUMMARY: 0/7 full | mean 28s / 4.8r | ❌ CRASHES +[18:43:57] +[18:43:57] ── wave4-trial-0009 ────────────────────────────────────── +[18:43:57] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip +[18:44:01] ep1: 4.9r / 28s ❌@28 +[18:44:04] ep2: 5.3r / 28s ❌@28 +[18:44:07] ep3: 5.1r / 28s ❌@28 +[18:44:10] ep4: 5.0r / 29s ❌@29 +[18:44:13] ep5: 5.1r / 28s ❌@28 +[18:44:16] ep6: 5.3r / 29s ❌@29 +[18:44:19] ep7: 5.3r / 29s ❌@29 +[18:44:19] SUMMARY: 0/7 full | mean 28s / 5.1r | ❌ CRASHES +[18:44:21] +[18:44:21] ====================================================================== +[18:44:21] FINAL RESULTS +[18:44:21] ====================================================================== +[18:44:21] wave4-trial-0009 0/7 full mean 28s / 5.1r ❌ CRASHES +[18:44:21] exp13-gentrack-v4 0/7 full mean 28s / 4.6r ❌ CRASHES +[18:44:21] wave5-gentrack-only 0/7 full mean 28s / 4.8r ❌ CRASHES diff --git a/agent/models/eval_gentrack_minimonaco_20260506_184636.log b/agent/models/eval_gentrack_minimonaco_20260506_184636.log new file mode 100644 index 0000000..de4125f --- /dev/null +++ b/agent/models/eval_gentrack_minimonaco_20260506_184636.log @@ -0,0 +1,49 @@ +[18:46:36] ====================================================================== +[18:46:36] Eval: generated-track specialists on mini-monaco (zero-shot) +[18:46:36] Track : donkey-minimonaco-track-v0 +[18:46:36] Episodes: 7 x max 2000 steps +[18:46:36] Host : localhost:9091 +[18:46:36] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184636.log +[18:46:36] ====================================================================== +[18:46:36] +[18:46:36] ── exp13-gentrack-v4 ────────────────────────────────────── +[18:46:36] Model: 
/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip +[18:46:43] ep1: 4.6r / 100s ❌@100 +[18:46:47] ep2: 4.6r / 100s ❌@100 +[18:46:51] ep3: 4.5r / 100s ❌@100 +[18:46:55] ep4: 4.5r / 100s ❌@100 +[18:46:58] ep5: 4.8r / 100s ❌@100 +[18:47:02] ep6: 4.5r / 100s ❌@100 +[18:47:06] ep7: 4.7r / 100s ❌@100 +[18:47:07] SUMMARY: 0/7 full | mean 100s / 4.6r | ❌ CRASHES +[18:47:09] +[18:47:09] ── wave5-gentrack-only ────────────────────────────────────── +[18:47:09] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip +[18:47:14] ep1: 4.8r / 100s ❌@100 +[18:47:17] ep2: 4.8r / 100s ❌@100 +[18:47:21] ep3: 4.6r / 100s ❌@100 +[18:47:25] ep4: 4.8r / 100s ❌@100 +[18:47:29] ep5: 4.6r / 100s ❌@100 +[18:47:33] ep6: 4.6r / 100s ❌@100 +[18:47:37] ep7: 4.8r / 100s ❌@100 +[18:47:38] SUMMARY: 0/7 full | mean 100s / 4.7r | ❌ CRASHES +[18:47:40] +[18:47:40] ── wave4-trial-0009 ────────────────────────────────────── +[18:47:40] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip +[18:47:44] ep1: 5.1r / 100s ❌@100 +[18:47:48] ep2: 5.1r / 100s ❌@100 +[18:47:52] ep3: 5.0r / 100s ❌@100 +[18:47:56] ep4: 5.3r / 100s ❌@100 +[18:48:00] ep5: 4.9r / 100s ❌@100 +[18:48:04] ep6: 5.0r / 100s ❌@100 +[18:48:08] ep7: 5.1r / 100s ❌@100 +[18:48:08] SUMMARY: 0/7 full | mean 100s / 5.1r | ❌ CRASHES +[18:48:10] +[18:48:10] ====================================================================== +[18:48:10] FINAL RESULTS +[18:48:10] ====================================================================== +[18:48:10] exp13-gentrack-v4 0/7 full mean 100s / 4.6r ❌ CRASHES +[18:48:10] wave5-gentrack-only 0/7 full mean 100s / 4.7r ❌ CRASHES +[18:48:10] wave4-trial-0009 0/7 full mean 100s / 5.1r ❌ CRASHES +[18:48:10] +Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184636.log diff --git a/agent/models/eval_gentrack_minimonaco_20260506_184902.log 
b/agent/models/eval_gentrack_minimonaco_20260506_184902.log new file mode 100644 index 0000000..a93f704 --- /dev/null +++ b/agent/models/eval_gentrack_minimonaco_20260506_184902.log @@ -0,0 +1,49 @@ +[18:49:02] ====================================================================== +[18:49:02] Eval: generated-track specialists on mini-monaco (zero-shot) +[18:49:02] Track : donkey-minimonaco-track-v0 +[18:49:02] Episodes: 7 x max 2000 steps +[18:49:02] Host : localhost:9091 +[18:49:02] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184902.log +[18:49:02] ====================================================================== +[18:49:02] +[18:49:02] ── exp13-gentrack-v4 ────────────────────────────────────── +[18:49:02] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip +[18:49:31] ep1: 5.5r / 2000s ✅ +[18:49:58] ep2: 5.9r / 2000s ✅ +[18:50:24] ep3: 5.8r / 2000s ✅ +[18:50:51] ep4: 5.6r / 2000s ✅ +[18:51:17] ep5: 5.8r / 2000s ✅ +[18:51:44] ep6: 5.5r / 2000s ✅ +[18:52:10] ep7: 5.5r / 2000s ✅ +[18:52:11] SUMMARY: 7/7 full | mean 2000s / 5.7r | ✅ DRIVES +[18:52:13] +[18:52:13] ── wave5-gentrack-only ────────────────────────────────────── +[18:52:13] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip +[18:52:40] ep1: 5.6r / 2000s ✅ +[18:53:06] ep2: 5.7r / 2000s ✅ +[18:53:33] ep3: 5.9r / 2000s ✅ +[18:53:42] ep4: 4.6r / 550s ❌@550 +[18:54:09] ep5: 5.9r / 2000s ✅ +[18:54:18] ep6: 4.5r / 540s ❌@540 +[18:54:45] ep7: 5.8r / 2000s ✅ +[18:54:45] SUMMARY: 5/7 full | mean 1584s / 5.4r | ✅ DRIVES +[18:54:47] +[18:54:47] ── wave4-trial-0009 ────────────────────────────────────── +[18:54:47] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip +[18:55:01] ep1: 5.3r / 865s ❌@865 +[18:55:28] ep2: 6.0r / 2000s ✅ +[18:55:42] ep3: 5.0r / 874s ❌@874 +[18:55:55] ep4: 5.2r / 858s ❌@858 +[18:56:22] ep5: 6.0r / 
2000s ✅ +[18:56:37] ep6: 5.3r / 845s ❌@845 +[18:56:51] ep7: 5.1r / 890s ❌@890 +[18:56:52] SUMMARY: 2/7 full | mean 1190s / 5.4r | ✅ DRIVES +[18:56:54] +[18:56:54] ====================================================================== +[18:56:54] FINAL RESULTS +[18:56:54] ====================================================================== +[18:56:54] exp13-gentrack-v4 7/7 full mean 2000s / 5.7r ✅ DRIVES +[18:56:54] wave5-gentrack-only 5/7 full mean 1584s / 5.4r ✅ DRIVES +[18:56:54] wave4-trial-0009 2/7 full mean 1190s / 5.4r ✅ DRIVES +[18:56:54] +Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184902.log diff --git a/agent/models/eval_gentrack_minimonaco_20260506_211519.log b/agent/models/eval_gentrack_minimonaco_20260506_211519.log new file mode 100644 index 0000000..e80a838 --- /dev/null +++ b/agent/models/eval_gentrack_minimonaco_20260506_211519.log @@ -0,0 +1,49 @@ +[21:15:19] ====================================================================== +[21:15:19] Eval: generated-track specialists on mini-monaco (zero-shot) +[21:15:19] Track : donkey-minimonaco-track-v0 +[21:15:19] Episodes: 7 x max 2000 steps +[21:15:19] Host : localhost:9091 +[21:15:19] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_211519.log +[21:15:19] ====================================================================== +[21:15:19] +[21:15:19] ── exp13-gentrack-v4 ────────────────────────────────────── +[21:15:19] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip +[21:15:48] ep1: 5.7r / 2000s ✅ +[21:16:15] ep2: 5.8r / 2000s ✅ +[21:16:41] ep3: 5.6r / 2000s ✅ +[21:17:08] ep4: 5.8r / 2000s ✅ +[21:17:34] ep5: 5.7r / 2000s ✅ +[21:18:01] ep6: 5.8r / 2000s ✅ +[21:18:27] ep7: 5.7r / 2000s ✅ +[21:18:27] SUMMARY: 7/7 full | mean 2000s / 5.7r | ✅ DRIVES +[21:18:29] +[21:18:29] ── wave5-gentrack-only ────────────────────────────────────── 
+[21:18:29] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip +[21:18:57] ep1: 5.7r / 2000s ✅ +[21:19:23] ep2: 5.9r / 2000s ✅ +[21:19:50] ep3: 5.8r / 2000s ✅ +[21:20:16] ep4: 5.8r / 2000s ✅ +[21:20:43] ep5: 5.9r / 2000s ✅ +[21:21:09] ep6: 5.6r / 2000s ✅ +[21:21:36] ep7: 5.7r / 2000s ✅ +[21:21:36] SUMMARY: 7/7 full | mean 2000s / 5.8r | ✅ DRIVES +[21:21:38] +[21:21:38] ── wave4-trial-0009 ────────────────────────────────────── +[21:21:38] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip +[21:21:52] ep1: 5.3r / 859s ❌@859 +[21:22:06] ep2: 5.3r / 847s ❌@847 +[21:22:19] ep3: 4.9r / 850s ❌@850 +[21:22:33] ep4: 5.0r / 904s ❌@904 +[21:23:00] ep5: 6.3r / 2000s ✅ +[21:23:27] ep6: 6.0r / 2000s ✅ +[21:23:40] ep7: 4.9r / 857s ❌@857 +[21:23:41] SUMMARY: 2/7 full | mean 1188s / 5.4r | ✅ DRIVES +[21:23:43] +[21:23:43] ====================================================================== +[21:23:43] FINAL RESULTS +[21:23:43] ====================================================================== +[21:23:43] exp13-gentrack-v4 7/7 full mean 2000s / 5.7r ✅ DRIVES +[21:23:43] wave5-gentrack-only 7/7 full mean 2000s / 5.8r ✅ DRIVES +[21:23:43] wave4-trial-0009 2/7 full mean 1188s / 5.4r ✅ DRIVES +[21:23:43] +Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_211519.log diff --git a/agent/models/eval_gentrack_minimonaco_20260506_212714.log b/agent/models/eval_gentrack_minimonaco_20260506_212714.log new file mode 100644 index 0000000..1907786 --- /dev/null +++ b/agent/models/eval_gentrack_minimonaco_20260506_212714.log @@ -0,0 +1,33 @@ +[21:27:14] ====================================================================== +[21:27:14] Eval: generated-track specialists on mini-monaco (zero-shot) +[21:27:14] Track : donkey-minimonaco-track-v0 +[21:27:14] Episodes: 7 x max 3000 steps +[21:27:14] Host : localhost:9091 +[21:27:14] Log : 
/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_212714.log +[21:27:14] ====================================================================== +[21:27:14] +[21:27:14] ── exp13-gentrack-v4 ────────────────────────────────────── +[21:27:14] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip +[21:27:21] ep1: 4.5r / 157s ❌ STUCK @157 (0 laps) +[21:27:25] ep2: 4.8r / 156s ❌ STUCK @156 (0 laps) +[21:27:30] ep3: 4.7r / 184s ❌ STUCK @184 (0 laps) +[21:27:35] ep4: 4.8r / 182s ❌ STUCK @182 (0 laps) +[21:27:40] ep5: 4.6r / 197s ❌ STUCK @197 (0 laps) +[21:27:46] ep6: 4.8r / 209s ❌ STUCK @209 (0 laps) +[21:27:50] ep7: 4.6r / 161s ❌ STUCK @161 (0 laps) +[21:27:51] SUMMARY: 0/7 completed a lap | total laps=0 | mean 178s / 4.7r +[21:27:53] +[21:27:53] ── wave5-gentrack-only ────────────────────────────────────── +[21:27:53] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip +[21:27:59] ep1: 4.7r / 258s ❌ STUCK @258 (0 laps) +[21:28:10] ep2: 5.0r / 611s ❌ STUCK @611 (0 laps) +[21:28:16] ep3: 4.8r / 271s ❌ STUCK @271 (0 laps) +[21:28:22] ep4: 4.9r / 272s ❌ STUCK @272 (0 laps) +[21:28:31] ep5: 4.6r / 497s ❌ STUCK @497 (0 laps) +[21:28:40] ep6: 4.9r / 514s ❌ STUCK @514 (0 laps) +[21:28:46] ep7: 4.9r / 289s ❌ STUCK @289 (0 laps) +[21:28:47] SUMMARY: 0/7 completed a lap | total laps=0 | mean 387s / 4.8r +[21:28:49] +[21:28:49] ── wave4-trial-0009 ────────────────────────────────────── +[21:28:49] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip +[21:28:56] ep1: 5.3r / 331s ❌ STUCK @331 (0 laps)