feat(eval): cross-model evaluation scripts for exp24/25/26 + gentrack→minimonaco
eval_best_models.py: evaluates exp24/25/26 best models across 10 fixed random roads (regen_road with fixed seeds) for fair head-to-head comparison. eval_gentrack_on_minimonaco.py: zero-shot evaluation of gentrack specialists (exp13, wave5-gentrack-only, wave4-trial-0009) on mini-monaco. Results: exp26 > exp25 > exp24 on random roads. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8de4838c6b
commit
0615b22cb9
|
|
@ -0,0 +1,197 @@
|
|||
"""
|
||||
Evaluate best models from exp24/25/26 across 10 truly different random roads.
|
||||
|
||||
Uses the regen_road TCP message (rand_seed) to get genuinely different roads —
|
||||
NOT the same road-2 that all prior training used.
|
||||
|
||||
Road style 0 = default generated road geometry.
|
||||
10 different seeds per model, 2000 steps per road.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
|
||||
|
||||
from discretize_action import DiscretizedActionWrapper
|
||||
from donkeycar_sb3_runner import ThrottleClampWrapper
|
||||
from multitrack_runner import StuckTerminationWrapper
|
||||
from reward_wrapper import SpeedRewardWrapper
|
||||
|
||||
# --- Simulator connection ---
HOST = 'localhost'
PORT = 9091

# --- Environment / action-space settings (consumed by make_env) ---
TRACK_ID = 'donkey-generated-roads-v0'
THROTTLE_MIN = 0.2
N_STEER = 7
N_THROTTLE = 1

# --- Evaluation schedule ---
N_ROADS = 10
MAX_EVAL_STEPS = 2000
REGEN_WAIT = 3.0  # seconds after regen_road before reset

# Label -> checkpoint path for every model under comparison.
MODELS = dict(
    exp24='/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete/best_model.zip',
    exp25='/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp25-wheel-fix/best_model.zip',
    exp26='/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp26-warmstart/best_model.zip',
)

# 10 fixed seeds so every model is evaluated on the same set of roads
EVAL_SEEDS = [1001, 2002, 3003, 4004, 5005, 6006, 7007, 8008, 9009, 1234]

# One log file per run, named after the start time of the run.
_RUN_STAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
LOG_PATH = f'/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_best_models_{_RUN_STAMP}.log'
|
||||
|
||||
import logging

# Dedicated 'eval' logger that mirrors every message to LOG_PATH and stdout.
# The log file is opened as UTF-8 explicitly: the messages written by this
# script contain non-ASCII characters (check/cross marks, box-drawing dashes,
# arrows), which would fail to encode under a non-UTF-8 locale default.
_fh = logging.FileHandler(LOG_PATH, encoding='utf-8')
_fh.setFormatter(logging.Formatter('%(message)s'))
_sh = logging.StreamHandler(sys.stdout)
_sh.setFormatter(logging.Formatter('%(message)s'))
log = logging.getLogger('eval')
log.setLevel(logging.INFO)
# Do not forward records to the root logger; the two handlers below are the
# only outputs.
log.propagate = False
log.addHandler(_fh)
log.addHandler(_sh)
|
||||
|
||||
|
||||
def ts():
    """Return the current local time as an ``HH:MM:SS`` string."""
    now = datetime.now()
    return f'{now:%H:%M:%S}'
|
||||
|
||||
|
||||
def flog(msg):
    """Log ``msg`` at INFO level on the module logger, prefixed with ``[HH:MM:SS]``."""
    stamped = '[{}] {}'.format(ts(), msg)
    log.info(stamped)
|
||||
|
||||
|
||||
def make_env():
    """Return a zero-argument factory that builds the wrapped evaluation env.

    The factory creates ``TRACK_ID`` against the simulator at ``HOST:PORT``
    and stacks the wrappers in this order: throttle clamp, discretized
    actions, stuck termination, speed reward.
    """
    stuck_settings = {
        'stuck_steps': 40,
        'min_displacement': 0.5,
        'max_stuck_seconds': 5.0,
        'max_episode_seconds': 30.0,
        'low_speed_threshold': 1.0,
        'max_low_speed_seconds': 1.5,
        'max_cte': 3.0,
        'max_high_cte_seconds': 1.0,
    }
    reward_settings = {
        'window_size': 30,
        'min_efficiency': 0.15,
        'max_cte': 8.0,
        'min_lap_time': 12.0,
        'progress_patience': 100,
    }

    def _init():
        sim_env = gym.make(TRACK_ID, conf={'host': HOST, 'port': PORT})
        clamped = ThrottleClampWrapper(sim_env, throttle_min=THROTTLE_MIN)
        discrete = DiscretizedActionWrapper(
            clamped, n_steer=N_STEER, n_throttle=N_THROTTLE)
        guarded = StuckTerminationWrapper(discrete, **stuck_settings)
        return SpeedRewardWrapper(guarded, **reward_settings)

    return _init
|
||||
|
||||
|
||||
def get_handler(vec_env):
    """Walk from the vectorized env down to the simulator handler object.

    Follows ``vec_env.venv.envs[0].unwrapped.viewer.handler``; the original
    note identifies the result as the DonkeyUnitySimHandler.
    """
    inner_vec = vec_env.venv
    first_env = inner_vec.envs[0]
    base_env = first_env.unwrapped
    return base_env.viewer.handler
|
||||
|
||||
|
||||
def regen_road(vec_env, seed, road_style=0):
    """Ask the simulator to regenerate the road for ``seed``, then wait.

    Queues a ``regen_road`` message (all values sent as strings) on the sim
    handler and sleeps ``REGEN_WAIT`` seconds so the caller can reset
    afterwards.
    """
    payload = dict(
        msg_type='regen_road',
        road_style=str(road_style),
        rand_seed=str(seed),
        turn_increment='0.0',
    )
    get_handler(vec_env).queue_message(payload)
    time.sleep(REGEN_WAIT)
|
||||
|
||||
|
||||
def run_episode(model, env):
    """Play one deterministic episode and return ``(steps, total_reward)``.

    The episode ends when the env reports done or after ``MAX_EVAL_STEPS``
    steps, whichever comes first.
    """
    observation = env.reset()
    reward_sum = 0.0
    steps_taken = 0

    for _ in range(MAX_EVAL_STEPS):
        action, _state = model.predict(observation, deterministic=True)
        observation, rewards, dones, _infos = env.step(action)

        # Vectorized envs return per-env arrays; plain scalars are accepted too.
        reward_sum += float(rewards[0] if hasattr(rewards, '__len__') else rewards)
        steps_taken += 1
        if bool(dones[0] if hasattr(dones, '__len__') else dones):
            break

    return steps_taken, reward_sum
|
||||
|
||||
|
||||
# ── Evaluation driver ───────────────────────────────────────────────────────
# Evaluates every entry of MODELS on every seed of EVAL_SEEDS, logs a per-road
# line and a per-model summary, then prints a final ranking.

# The road count reported in the log is derived from the seed list so the
# messages stay correct if EVAL_SEEDS is edited.
_n_roads = len(EVAL_SEEDS)

flog('=' * 70)
flog(f'Evaluating best models on {_n_roads} genuinely different random roads')
flog(f'Seeds: {EVAL_SEEDS}')
flog(f'Log: {LOG_PATH}')
flog('=' * 70)

# Connect once — reuse env for all models/roads
flog('Connecting to sim...')
env = DummyVecEnv([make_env()])

results = {}

try:
    env = VecTransposeImage(env)
    flog(f' Connected. obs={env.observation_space.shape}, action={env.action_space}')

    for model_name, model_path in MODELS.items():
        flog('')
        flog(f'── {model_name} ──────────────────────────────────────')
        flog(f' Model: {model_path}')

        # Best-effort: a model that fails to load is reported and skipped so
        # the remaining models are still evaluated.
        try:
            model = PPO.load(model_path, env=env, device='cpu')
        except Exception as e:
            flog(f' LOAD ERROR: {e}')
            continue

        steps_list = []
        reward_list = []

        for road_no, seed in enumerate(EVAL_SEEDS, 1):
            flog(f' Road {road_no:2d}/{_n_roads} (seed={seed}) — regenerating...')
            regen_road(env, seed)

            steps, reward = run_episode(model, env)
            # An episode counts as "full" when it used the whole step budget.
            status = '✅' if steps >= MAX_EVAL_STEPS else f'❌@{steps}'
            flog(f' → {reward:.1f}r / {steps}s {status}')
            steps_list.append(steps)
            reward_list.append(reward)

        mean_steps = np.mean(steps_list)
        mean_reward = np.mean(reward_list)
        full_eps = sum(1 for s in steps_list if s >= MAX_EVAL_STEPS)

        flog(f' {model_name} SUMMARY: {full_eps}/{_n_roads} full | mean {mean_steps:.0f}s / {mean_reward:.1f}r')
        results[model_name] = {
            'full': full_eps,
            'mean_steps': mean_steps,
            'mean_reward': mean_reward,
            'per_road': list(zip(EVAL_SEEDS, steps_list, reward_list)),
        }
finally:
    # Always release the simulator connection, even if an episode raised.
    env.close()

flog('')
flog('=' * 70)
flog('FINAL RANKING')
flog('=' * 70)
# Rank by number of full episodes first, then by mean episode length.
ranked = sorted(results.items(), key=lambda x: (x[1]['full'], x[1]['mean_steps']), reverse=True)
for rank, (name, r) in enumerate(ranked, 1):
    flog(f' #{rank} {name:8s} {r["full"]}/{_n_roads} full mean {r["mean_steps"]:.0f}s / {r["mean_reward"]:.1f}r')

flog('')
flog('Evaluation complete.')
|
||||
|
|
@ -0,0 +1,195 @@
|
|||
"""
|
||||
eval_gentrack_on_minimonaco.py
|
||||
|
||||
Evaluate generated-track specialist models on mini-monaco (zero-shot).
|
||||
|
||||
Key question: does a model trained on generated-track generalize to
|
||||
mini-monaco, given that both tracks are visually very similar?
|
||||
|
||||
Models tested:
|
||||
- exp13-gentrack-v4/best_model.zip (30k steps, clean gentrack specialist)
|
||||
- wave5-gentrack-only/model.zip (90k steps, gentrack from scratch)
|
||||
- wave4-trial-0009/model.zip (the one run that drove mini-monaco)
|
||||
|
||||
Track: donkey-minimonaco-track-v0 (never seen during any of these trainings)
|
||||
Episodes: 7 per model
|
||||
Max steps: 3000 per episode
|
||||
"""
|
||||
import sys, os, time
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
|
||||
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
|
||||
from donkeycar_sb3_runner import ThrottleClampWrapper
|
||||
import gymnasium as gym
|
||||
|
||||
# --- Simulator connection and track ---
HOST, PORT = 'localhost', 9091
TRACK_ID = 'donkey-minimonaco-track-v0'

# --- Evaluation schedule ---
EPISODES = 7
MAX_STEPS = 3000      # enough for 2+ laps
THROTTLE_MIN = 0.2

# --- Stuck detection (used by MiniMonacoWrapper) ---
STUCK_STEPS = 60      # terminate if car hasn't moved in this many steps
STUCK_DIST = 0.3      # minimum displacement (metres) to not be considered stuck

BASE = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models'

# (label, checkpoint path) pairs, evaluated in this order.
MODELS = [
    ('exp13-gentrack-v4', '/'.join((BASE, 'exp13-gentrack-v4', 'best_model.zip'))),
    ('wave5-gentrack-only', '/'.join((BASE, 'wave5-gentrack-only', 'model.zip'))),
    ('wave4-trial-0009', '/'.join((BASE, 'wave4-trial-0009', 'model.zip'))),
]
|
||||
|
||||
# Log to file + stdout
|
||||
# One log file per run, named after the start time of the run.
log_path = os.path.join(
    BASE,
    f'eval_gentrack_minimonaco_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
)
# Line-buffered so each message reaches disk as it is written. Opened as
# UTF-8 explicitly because the messages contain non-ASCII characters
# (check/cross marks, box-drawing dashes) that a non-UTF-8 locale default
# encoding cannot represent.
_logfile = open(log_path, 'w', buffering=1, encoding='utf-8')
|
||||
|
||||
def log(msg):
    """Write ``msg`` with an ``[HH:MM:SS]`` prefix to stdout and to the run log file."""
    stamp = datetime.now().strftime('%H:%M:%S')
    line = '[{}] {}'.format(stamp, msg)
    print(line, flush=True)
    _logfile.write(f'{line}\n')
|
||||
|
||||
|
||||
class MiniMonacoWrapper(gym.Wrapper):
    """Evaluation-time adjustments for the mini-monaco track.

    1. A termination whose ``info['hit']`` is ``'starting_line'`` is
       suppressed (reward forced to 0.0) while no lap has been counted yet:
       the car spawns just before the line, so the first crossing is not a
       lap completion.
    2. The episode is terminated with reward -1.0 and ``info['hit'] = 'stuck'``
       when the car has moved less than STUCK_DIST metres over the last
       STUCK_STEPS recorded positions.
    """

    def reset(self, **kwargs):
        """Clear the lap counter and the position window, then reset the env."""
        self._lap_count, self._pos_history = 0, []
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and apply both adjustments to its result."""
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Track the highest lap count reported so far this episode.
        self._lap_count = max(
            self._lap_count, int(info.get('lap_count', 0) or 0))

        # Suppress initial starting_line crossing
        if (terminated and self._lap_count < 1
                and info.get('hit') == 'starting_line'):
            terminated, reward = False, 0.0

        # Stuck detection
        pos = info.get('pos')
        if pos is not None and self._record_and_check_stuck(pos):
            terminated, reward = True, -1.0
            info['hit'] = 'stuck'

        return obs, reward, terminated, truncated, info

    def _record_and_check_stuck(self, pos):
        """Append ``pos`` to the sliding window; report whether the car is stuck."""
        window = self._pos_history
        window.append(np.array(list(pos)[:3]))
        if len(window) > STUCK_STEPS:
            window.pop(0)
        if len(window) != STUCK_STEPS:
            # Not enough samples yet to judge displacement.
            return False
        return np.linalg.norm(window[-1] - window[0]) < STUCK_DIST
|
||||
|
||||
|
||||
def make_env():
    """Create the mini-monaco env wrapped with throttle clamp and MiniMonacoWrapper."""
    sim_env = gym.make(TRACK_ID, conf=dict(host=HOST, port=PORT))
    clamped = ThrottleClampWrapper(sim_env, throttle_min=THROTTLE_MIN)
    return MiniMonacoWrapper(clamped)
|
||||
|
||||
|
||||
def _run_one_episode(model, env):
    """Play one deterministic episode on ``env``.

    Returns ``(total_reward, steps, laps, done, hit)`` where ``done`` tells
    whether the env ended the episode (as opposed to the MAX_STEPS budget
    running out) and ``hit`` is the last reported ``info['hit']`` ('?' when
    absent).
    """
    obs = env.reset()
    total_r, steps, laps, done = 0.0, 0, 0, False
    last_info = {}
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs, deterministic=True)
        obs, r, d, info = env.step(action)
        total_r += float(r[0])
        steps += 1
        done = bool(d[0])
        # Vectorized envs return one info dict per sub-env; use the first.
        last_info = info[0] if isinstance(info, (list, tuple)) else info
        laps = int(last_info.get('lap_count', 0) or 0)
    return total_r, steps, laps, done, last_info.get('hit', '?')


def run_eval(model_label, model_path):
    """Evaluate one model for EPISODES episodes on the mini-monaco env.

    Args:
        model_label: Name used in log lines and in the returned summary.
        model_path: Path of the PPO checkpoint to load.

    Returns:
        A dict with keys ``label``, ``lapped``, ``total_laps``,
        ``mean_steps`` and ``mean_reward``, or ``None`` when the model file
        is missing or cannot be loaded.
    """
    log('')
    log(f'── {model_label} ──────────────────────────────────────')
    log(f' Model: {model_path}')

    if not os.path.exists(model_path):
        log(f' ERROR: model file not found — skipping')
        return None

    env = VecTransposeImage(DummyVecEnv([make_env]))
    try:
        # Best-effort: an unloadable model is reported and skipped.
        try:
            model = PPO.load(model_path, env=env, device='cpu')
        except Exception as e:
            log(f' ERROR loading model: {e}')
            return None

        rewards, steps_list, laps_list = [], [], []

        for ep in range(1, EPISODES + 1):
            total_r, steps, laps, done, hit = _run_one_episode(model, env)

            # Only an episode the env did NOT end counts as a timeout; an
            # episode that terminates exactly on the last allowed step is
            # still a stuck/crash outcome.
            if not done:
                status = f'✅ timeout ({laps} laps)'
            elif hit == 'stuck':
                status = f'❌ STUCK @{steps} ({laps} laps)'
            else:
                status = f'❌ crash @{steps} hit={hit} ({laps} laps)'
            log(f' ep{ep}: {total_r:.1f}r / {steps}s {status}')
            rewards.append(total_r)
            steps_list.append(steps)
            laps_list.append(laps)
            time.sleep(0.3)

        mean_r = np.mean(rewards)
        mean_s = np.mean(steps_list)
        total_laps = sum(laps_list)
        lapped = sum(1 for l in laps_list if l >= 1)

        log(f' SUMMARY: {lapped}/{EPISODES} completed a lap | '
            f'total laps={total_laps} | mean {mean_s:.0f}s / {mean_r:.1f}r')
    finally:
        # Release the simulator connection on every exit path, including
        # exceptions raised mid-episode.
        env.close()

    # Pause after closing the env before the caller opens the next one.
    time.sleep(2)

    return {'label': model_label, 'lapped': lapped, 'total_laps': total_laps,
            'mean_steps': mean_s, 'mean_reward': mean_r}
|
||||
|
||||
|
||||
def main():
    """Evaluate every entry of MODELS, log a results table, and close the log file."""
    rule = '=' * 70
    header = (
        rule,
        'Eval: generated-track specialists on mini-monaco (zero-shot)',
        f'Track : {TRACK_ID}',
        f'Episodes: {EPISODES} x max {MAX_STEPS} steps',
        f'Host : {HOST}:{PORT}',
        f'Log : {log_path}',
        rule,
    )
    for line in header:
        log(line)

    # Models are evaluated one after another; skipped models (None) are dropped.
    outcomes = (run_eval(label, path) for label, path in MODELS)
    results = [summary for summary in outcomes if summary]

    for line in ('', rule, 'FINAL RESULTS', rule):
        log(line)

    # Most total laps first; ties keep evaluation order.
    by_laps = sorted(results, key=lambda s: s['total_laps'], reverse=True)
    for r in by_laps:
        log(f" {r['label']:<25} lapped={r['lapped']}/{EPISODES} "
            f"total_laps={r['total_laps']} mean {r['mean_steps']:>5.0f}s / {r['mean_reward']:>6.1f}r")

    log(f'\nLog saved: {log_path}')
    _logfile.close()


if __name__ == '__main__':
    main()
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
[10:29:52] ======================================================================
|
||||
[10:29:52] Evaluating best models on 10 genuinely different random roads
|
||||
[10:29:52] Seeds: [1001, 2002, 3003, 4004, 5005, 6006, 7007, 8008, 9009, 1234]
|
||||
[10:29:52] Log: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_best_models_20260506_102952.log
|
||||
[10:29:52] ======================================================================
|
||||
[10:29:52] Connecting to sim...
|
||||
[10:29:52] Connected. obs=(3, 120, 160), action=Discrete(7)
|
||||
[10:29:52]
|
||||
[10:29:52] ── exp24 ──────────────────────────────────────
|
||||
[10:29:52] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp24-discrete/best_model.zip
|
||||
[10:29:55] Road 1/10 (seed=1001) — regenerating...
|
||||
[10:30:24] → 371.0r / 2000s ✅
|
||||
[10:30:24] Road 2/10 (seed=2002) — regenerating...
|
||||
[10:30:53] → 365.2r / 2000s ✅
|
||||
[10:30:53] Road 3/10 (seed=3003) — regenerating...
|
||||
[10:31:22] → 365.0r / 2000s ✅
|
||||
[10:31:22] Road 4/10 (seed=4004) — regenerating...
|
||||
[10:31:51] → 372.2r / 2000s ✅
|
||||
[10:31:51] Road 5/10 (seed=5005) — regenerating...
|
||||
[10:32:21] → 363.3r / 2000s ✅
|
||||
[10:32:21] Road 6/10 (seed=6006) — regenerating...
|
||||
[10:32:50] → 365.8r / 2000s ✅
|
||||
[10:32:50] Road 7/10 (seed=7007) — regenerating...
|
||||
[10:33:19] → 371.5r / 2000s ✅
|
||||
[10:33:19] Road 8/10 (seed=8008) — regenerating...
|
||||
[10:33:36] → 157.7r / 912s ❌@912
|
||||
[10:33:36] Road 9/10 (seed=9009) — regenerating...
|
||||
[10:34:05] → 371.6r / 2000s ✅
|
||||
[10:34:05] Road 10/10 (seed=1234) — regenerating...
|
||||
[10:34:35] → 372.1r / 2000s ✅
|
||||
[10:34:35] exp24 SUMMARY: 9/10 full | mean 1891s / 347.5r
|
||||
[10:34:35]
|
||||
[10:34:35] ── exp25 ──────────────────────────────────────
|
||||
[10:34:35] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp25-wheel-fix/best_model.zip
|
||||
[10:34:36] Road 1/10 (seed=1001) — regenerating...
|
||||
[10:35:05] → 378.5r / 2000s ✅
|
||||
[10:35:05] Road 2/10 (seed=2002) — regenerating...
|
||||
[10:35:34] → 382.9r / 2000s ✅
|
||||
[10:35:34] Road 3/10 (seed=3003) — regenerating...
|
||||
[10:36:03] → 382.0r / 2000s ✅
|
||||
[10:36:03] Road 4/10 (seed=4004) — regenerating...
|
||||
[10:36:18] → 122.8r / 694s ❌@694
|
||||
[10:36:18] Road 5/10 (seed=5005) — regenerating...
|
||||
[10:36:47] → 384.3r / 2000s ✅
|
||||
[10:36:47] Road 6/10 (seed=6006) — regenerating...
|
||||
[10:37:16] → 379.7r / 2000s ✅
|
||||
[10:37:16] Road 7/10 (seed=7007) — regenerating...
|
||||
[10:37:45] → 382.7r / 2000s ✅
|
||||
[10:37:45] Road 8/10 (seed=8008) — regenerating...
|
||||
[10:38:15] → 382.8r / 2000s ✅
|
||||
[10:38:15] Road 9/10 (seed=9009) — regenerating...
|
||||
[10:38:44] → 383.2r / 2000s ✅
|
||||
[10:38:44] Road 10/10 (seed=1234) — regenerating...
|
||||
[10:39:13] → 383.9r / 2000s ✅
|
||||
[10:39:13] exp25 SUMMARY: 9/10 full | mean 1869s / 356.3r
|
||||
[10:39:13]
|
||||
[10:39:13] ── exp26 ──────────────────────────────────────
|
||||
[10:39:13] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp26-warmstart/best_model.zip
|
||||
[10:39:14] Road 1/10 (seed=1001) — regenerating...
|
||||
[10:39:43] → 392.2r / 2000s ✅
|
||||
[10:39:43] Road 2/10 (seed=2002) — regenerating...
|
||||
[10:40:10] → 307.0r / 1583s ❌@1583
|
||||
[10:40:10] Road 3/10 (seed=3003) — regenerating...
|
||||
[10:40:39] → 387.6r / 2000s ✅
|
||||
[10:40:39] Road 4/10 (seed=4004) — regenerating...
|
||||
[10:41:08] → 392.5r / 2000s ✅
|
||||
[10:41:08] Road 5/10 (seed=5005) — regenerating...
|
||||
[10:41:37] → 390.6r / 2000s ✅
|
||||
[10:41:37] Road 6/10 (seed=6006) — regenerating...
|
||||
[10:42:07] → 389.4r / 2000s ✅
|
||||
[10:42:07] Road 7/10 (seed=7007) — regenerating...
|
||||
[10:42:36] → 388.2r / 2000s ✅
|
||||
[10:42:36] Road 8/10 (seed=8008) — regenerating...
|
||||
[10:43:05] → 389.1r / 2000s ✅
|
||||
[10:43:05] Road 9/10 (seed=9009) — regenerating...
|
||||
[10:43:34] → 389.0r / 2000s ✅
|
||||
[10:43:34] Road 10/10 (seed=1234) — regenerating...
|
||||
[10:44:04] → 386.5r / 2000s ✅
|
||||
[10:44:04] exp26 SUMMARY: 9/10 full | mean 1958s / 381.2r
|
||||
[10:44:04]
|
||||
[10:44:04] ======================================================================
|
||||
[10:44:04] FINAL RANKING
|
||||
[10:44:04] ======================================================================
|
||||
[10:44:04] #1 exp26 9/10 full mean 1958s / 381.2r
|
||||
[10:44:04] #2 exp24 9/10 full mean 1891s / 347.5r
|
||||
[10:44:04] #3 exp25 9/10 full mean 1869s / 356.3r
|
||||
[10:44:04]
|
||||
[10:44:04] Evaluation complete.
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
[18:43:05] ======================================================================
|
||||
[18:43:05] Eval: generated-track specialists on mini-monaco (zero-shot)
|
||||
[18:43:05] Track : donkey-minimonaco-track-v0
|
||||
[18:43:05] Episodes: 7 x max 2000 steps
|
||||
[18:43:05] Host : localhost:9091
|
||||
[18:43:05] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184305.log
|
||||
[18:43:05] ======================================================================
|
||||
[18:43:05]
|
||||
[18:43:05] ── exp13-gentrack-v4 ──────────────────────────────────────
|
||||
[18:43:05] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip
|
||||
[18:43:12] ep1: 4.5r / 29s ❌@29
|
||||
[18:43:15] ep2: 4.5r / 28s ❌@28
|
||||
[18:43:18] ep3: 4.6r / 28s ❌@28
|
||||
[18:43:21] ep4: 4.7r / 28s ❌@28
|
||||
[18:43:24] ep5: 4.6r / 28s ❌@28
|
||||
[18:43:27] ep6: 4.6r / 28s ❌@28
|
||||
[18:43:30] ep7: 4.6r / 28s ❌@28
|
||||
[18:43:31] SUMMARY: 0/7 full | mean 28s / 4.6r | ❌ CRASHES
|
||||
[18:43:33]
|
||||
[18:43:33] ── wave5-gentrack-only ──────────────────────────────────────
|
||||
[18:43:33] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip
|
||||
[18:43:36] ep1: 4.8r / 28s ❌@28
|
||||
[18:43:39] ep2: 4.7r / 28s ❌@28
|
||||
[18:43:42] ep3: 4.9r / 28s ❌@28
|
||||
[18:43:45] ep4: 4.7r / 28s ❌@28
|
||||
[18:43:49] ep5: 4.6r / 27s ❌@27
|
||||
[18:43:52] ep6: 4.9r / 28s ❌@28
|
||||
[18:43:55] ep7: 4.9r / 28s ❌@28
|
||||
[18:43:55] SUMMARY: 0/7 full | mean 28s / 4.8r | ❌ CRASHES
|
||||
[18:43:57]
|
||||
[18:43:57] ── wave4-trial-0009 ──────────────────────────────────────
|
||||
[18:43:57] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip
|
||||
[18:44:01] ep1: 4.9r / 28s ❌@28
|
||||
[18:44:04] ep2: 5.3r / 28s ❌@28
|
||||
[18:44:07] ep3: 5.1r / 28s ❌@28
|
||||
[18:44:10] ep4: 5.0r / 29s ❌@29
|
||||
[18:44:13] ep5: 5.1r / 28s ❌@28
|
||||
[18:44:16] ep6: 5.3r / 29s ❌@29
|
||||
[18:44:19] ep7: 5.3r / 29s ❌@29
|
||||
[18:44:19] SUMMARY: 0/7 full | mean 28s / 5.1r | ❌ CRASHES
|
||||
[18:44:21]
|
||||
[18:44:21] ======================================================================
|
||||
[18:44:21] FINAL RESULTS
|
||||
[18:44:21] ======================================================================
|
||||
[18:44:21] wave4-trial-0009 0/7 full mean 28s / 5.1r ❌ CRASHES
|
||||
[18:44:21] exp13-gentrack-v4 0/7 full mean 28s / 4.6r ❌ CRASHES
|
||||
[18:44:21] wave5-gentrack-only 0/7 full mean 28s / 4.8r ❌ CRASHES
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
[18:46:36] ======================================================================
|
||||
[18:46:36] Eval: generated-track specialists on mini-monaco (zero-shot)
|
||||
[18:46:36] Track : donkey-minimonaco-track-v0
|
||||
[18:46:36] Episodes: 7 x max 2000 steps
|
||||
[18:46:36] Host : localhost:9091
|
||||
[18:46:36] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184636.log
|
||||
[18:46:36] ======================================================================
|
||||
[18:46:36]
|
||||
[18:46:36] ── exp13-gentrack-v4 ──────────────────────────────────────
|
||||
[18:46:36] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip
|
||||
[18:46:43] ep1: 4.6r / 100s ❌@100
|
||||
[18:46:47] ep2: 4.6r / 100s ❌@100
|
||||
[18:46:51] ep3: 4.5r / 100s ❌@100
|
||||
[18:46:55] ep4: 4.5r / 100s ❌@100
|
||||
[18:46:58] ep5: 4.8r / 100s ❌@100
|
||||
[18:47:02] ep6: 4.5r / 100s ❌@100
|
||||
[18:47:06] ep7: 4.7r / 100s ❌@100
|
||||
[18:47:07] SUMMARY: 0/7 full | mean 100s / 4.6r | ❌ CRASHES
|
||||
[18:47:09]
|
||||
[18:47:09] ── wave5-gentrack-only ──────────────────────────────────────
|
||||
[18:47:09] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip
|
||||
[18:47:14] ep1: 4.8r / 100s ❌@100
|
||||
[18:47:17] ep2: 4.8r / 100s ❌@100
|
||||
[18:47:21] ep3: 4.6r / 100s ❌@100
|
||||
[18:47:25] ep4: 4.8r / 100s ❌@100
|
||||
[18:47:29] ep5: 4.6r / 100s ❌@100
|
||||
[18:47:33] ep6: 4.6r / 100s ❌@100
|
||||
[18:47:37] ep7: 4.8r / 100s ❌@100
|
||||
[18:47:38] SUMMARY: 0/7 full | mean 100s / 4.7r | ❌ CRASHES
|
||||
[18:47:40]
|
||||
[18:47:40] ── wave4-trial-0009 ──────────────────────────────────────
|
||||
[18:47:40] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip
|
||||
[18:47:44] ep1: 5.1r / 100s ❌@100
|
||||
[18:47:48] ep2: 5.1r / 100s ❌@100
|
||||
[18:47:52] ep3: 5.0r / 100s ❌@100
|
||||
[18:47:56] ep4: 5.3r / 100s ❌@100
|
||||
[18:48:00] ep5: 4.9r / 100s ❌@100
|
||||
[18:48:04] ep6: 5.0r / 100s ❌@100
|
||||
[18:48:08] ep7: 5.1r / 100s ❌@100
|
||||
[18:48:08] SUMMARY: 0/7 full | mean 100s / 5.1r | ❌ CRASHES
|
||||
[18:48:10]
|
||||
[18:48:10] ======================================================================
|
||||
[18:48:10] FINAL RESULTS
|
||||
[18:48:10] ======================================================================
|
||||
[18:48:10] exp13-gentrack-v4 0/7 full mean 100s / 4.6r ❌ CRASHES
|
||||
[18:48:10] wave5-gentrack-only 0/7 full mean 100s / 4.7r ❌ CRASHES
|
||||
[18:48:10] wave4-trial-0009 0/7 full mean 100s / 5.1r ❌ CRASHES
|
||||
[18:48:10]
|
||||
Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184636.log
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
[18:49:02] ======================================================================
|
||||
[18:49:02] Eval: generated-track specialists on mini-monaco (zero-shot)
|
||||
[18:49:02] Track : donkey-minimonaco-track-v0
|
||||
[18:49:02] Episodes: 7 x max 2000 steps
|
||||
[18:49:02] Host : localhost:9091
|
||||
[18:49:02] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184902.log
|
||||
[18:49:02] ======================================================================
|
||||
[18:49:02]
|
||||
[18:49:02] ── exp13-gentrack-v4 ──────────────────────────────────────
|
||||
[18:49:02] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip
|
||||
[18:49:31] ep1: 5.5r / 2000s ✅
|
||||
[18:49:58] ep2: 5.9r / 2000s ✅
|
||||
[18:50:24] ep3: 5.8r / 2000s ✅
|
||||
[18:50:51] ep4: 5.6r / 2000s ✅
|
||||
[18:51:17] ep5: 5.8r / 2000s ✅
|
||||
[18:51:44] ep6: 5.5r / 2000s ✅
|
||||
[18:52:10] ep7: 5.5r / 2000s ✅
|
||||
[18:52:11] SUMMARY: 7/7 full | mean 2000s / 5.7r | ✅ DRIVES
|
||||
[18:52:13]
|
||||
[18:52:13] ── wave5-gentrack-only ──────────────────────────────────────
|
||||
[18:52:13] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip
|
||||
[18:52:40] ep1: 5.6r / 2000s ✅
|
||||
[18:53:06] ep2: 5.7r / 2000s ✅
|
||||
[18:53:33] ep3: 5.9r / 2000s ✅
|
||||
[18:53:42] ep4: 4.6r / 550s ❌@550
|
||||
[18:54:09] ep5: 5.9r / 2000s ✅
|
||||
[18:54:18] ep6: 4.5r / 540s ❌@540
|
||||
[18:54:45] ep7: 5.8r / 2000s ✅
|
||||
[18:54:45] SUMMARY: 5/7 full | mean 1584s / 5.4r | ✅ DRIVES
|
||||
[18:54:47]
|
||||
[18:54:47] ── wave4-trial-0009 ──────────────────────────────────────
|
||||
[18:54:47] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip
|
||||
[18:55:01] ep1: 5.3r / 865s ❌@865
|
||||
[18:55:28] ep2: 6.0r / 2000s ✅
|
||||
[18:55:42] ep3: 5.0r / 874s ❌@874
|
||||
[18:55:55] ep4: 5.2r / 858s ❌@858
|
||||
[18:56:22] ep5: 6.0r / 2000s ✅
|
||||
[18:56:37] ep6: 5.3r / 845s ❌@845
|
||||
[18:56:51] ep7: 5.1r / 890s ❌@890
|
||||
[18:56:52] SUMMARY: 2/7 full | mean 1190s / 5.4r | ✅ DRIVES
|
||||
[18:56:54]
|
||||
[18:56:54] ======================================================================
|
||||
[18:56:54] FINAL RESULTS
|
||||
[18:56:54] ======================================================================
|
||||
[18:56:54] exp13-gentrack-v4 7/7 full mean 2000s / 5.7r ✅ DRIVES
|
||||
[18:56:54] wave5-gentrack-only 5/7 full mean 1584s / 5.4r ✅ DRIVES
|
||||
[18:56:54] wave4-trial-0009 2/7 full mean 1190s / 5.4r ✅ DRIVES
|
||||
[18:56:54]
|
||||
Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_184902.log
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
[21:15:19] ======================================================================
|
||||
[21:15:19] Eval: generated-track specialists on mini-monaco (zero-shot)
|
||||
[21:15:19] Track : donkey-minimonaco-track-v0
|
||||
[21:15:19] Episodes: 7 x max 2000 steps
|
||||
[21:15:19] Host : localhost:9091
|
||||
[21:15:19] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_211519.log
|
||||
[21:15:19] ======================================================================
|
||||
[21:15:19]
|
||||
[21:15:19] ── exp13-gentrack-v4 ──────────────────────────────────────
|
||||
[21:15:19] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip
|
||||
[21:15:48] ep1: 5.7r / 2000s ✅
|
||||
[21:16:15] ep2: 5.8r / 2000s ✅
|
||||
[21:16:41] ep3: 5.6r / 2000s ✅
|
||||
[21:17:08] ep4: 5.8r / 2000s ✅
|
||||
[21:17:34] ep5: 5.7r / 2000s ✅
|
||||
[21:18:01] ep6: 5.8r / 2000s ✅
|
||||
[21:18:27] ep7: 5.7r / 2000s ✅
|
||||
[21:18:27] SUMMARY: 7/7 full | mean 2000s / 5.7r | ✅ DRIVES
|
||||
[21:18:29]
|
||||
[21:18:29] ── wave5-gentrack-only ──────────────────────────────────────
|
||||
[21:18:29] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip
|
||||
[21:18:57] ep1: 5.7r / 2000s ✅
|
||||
[21:19:23] ep2: 5.9r / 2000s ✅
|
||||
[21:19:50] ep3: 5.8r / 2000s ✅
|
||||
[21:20:16] ep4: 5.8r / 2000s ✅
|
||||
[21:20:43] ep5: 5.9r / 2000s ✅
|
||||
[21:21:09] ep6: 5.6r / 2000s ✅
|
||||
[21:21:36] ep7: 5.7r / 2000s ✅
|
||||
[21:21:36] SUMMARY: 7/7 full | mean 2000s / 5.8r | ✅ DRIVES
|
||||
[21:21:38]
|
||||
[21:21:38] ── wave4-trial-0009 ──────────────────────────────────────
|
||||
[21:21:38] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip
|
||||
[21:21:52] ep1: 5.3r / 859s ❌@859
|
||||
[21:22:06] ep2: 5.3r / 847s ❌@847
|
||||
[21:22:19] ep3: 4.9r / 850s ❌@850
|
||||
[21:22:33] ep4: 5.0r / 904s ❌@904
|
||||
[21:23:00] ep5: 6.3r / 2000s ✅
|
||||
[21:23:27] ep6: 6.0r / 2000s ✅
|
||||
[21:23:40] ep7: 4.9r / 857s ❌@857
|
||||
[21:23:41] SUMMARY: 2/7 full | mean 1188s / 5.4r | ✅ DRIVES
|
||||
[21:23:43]
|
||||
[21:23:43] ======================================================================
|
||||
[21:23:43] FINAL RESULTS
|
||||
[21:23:43] ======================================================================
|
||||
[21:23:43] exp13-gentrack-v4 7/7 full mean 2000s / 5.7r ✅ DRIVES
|
||||
[21:23:43] wave5-gentrack-only 7/7 full mean 2000s / 5.8r ✅ DRIVES
|
||||
[21:23:43] wave4-trial-0009 2/7 full mean 1188s / 5.4r ✅ DRIVES
|
||||
[21:23:43]
|
||||
Log saved: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_211519.log
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
[21:27:14] ======================================================================
|
||||
[21:27:14] Eval: generated-track specialists on mini-monaco (zero-shot)
|
||||
[21:27:14] Track : donkey-minimonaco-track-v0
|
||||
[21:27:14] Episodes: 7 x max 3000 steps
|
||||
[21:27:14] Host : localhost:9091
|
||||
[21:27:14] Log : /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/eval_gentrack_minimonaco_20260506_212714.log
|
||||
[21:27:14] ======================================================================
|
||||
[21:27:14]
|
||||
[21:27:14] ── exp13-gentrack-v4 ──────────────────────────────────────
|
||||
[21:27:14] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp13-gentrack-v4/best_model.zip
|
||||
[21:27:21] ep1: 4.5r / 157s ❌ STUCK @157 (0 laps)
|
||||
[21:27:25] ep2: 4.8r / 156s ❌ STUCK @156 (0 laps)
|
||||
[21:27:30] ep3: 4.7r / 184s ❌ STUCK @184 (0 laps)
|
||||
[21:27:35] ep4: 4.8r / 182s ❌ STUCK @182 (0 laps)
|
||||
[21:27:40] ep5: 4.6r / 197s ❌ STUCK @197 (0 laps)
|
||||
[21:27:46] ep6: 4.8r / 209s ❌ STUCK @209 (0 laps)
|
||||
[21:27:50] ep7: 4.6r / 161s ❌ STUCK @161 (0 laps)
|
||||
[21:27:51] SUMMARY: 0/7 completed a lap | total laps=0 | mean 178s / 4.7r
|
||||
[21:27:53]
|
||||
[21:27:53] ── wave5-gentrack-only ──────────────────────────────────────
|
||||
[21:27:53] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave5-gentrack-only/model.zip
|
||||
[21:27:59] ep1: 4.7r / 258s ❌ STUCK @258 (0 laps)
|
||||
[21:28:10] ep2: 5.0r / 611s ❌ STUCK @611 (0 laps)
|
||||
[21:28:16] ep3: 4.8r / 271s ❌ STUCK @271 (0 laps)
|
||||
[21:28:22] ep4: 4.9r / 272s ❌ STUCK @272 (0 laps)
|
||||
[21:28:31] ep5: 4.6r / 497s ❌ STUCK @497 (0 laps)
|
||||
[21:28:40] ep6: 4.9r / 514s ❌ STUCK @514 (0 laps)
|
||||
[21:28:46] ep7: 4.9r / 289s ❌ STUCK @289 (0 laps)
|
||||
[21:28:47] SUMMARY: 0/7 completed a lap | total laps=0 | mean 387s / 4.8r
|
||||
[21:28:49]
|
||||
[21:28:49] ── wave4-trial-0009 ──────────────────────────────────────
|
||||
[21:28:49] Model: /home/paulh/projects/donkeycar-rl-autoresearch/agent/models/wave4-trial-0009/model.zip
|
||||
[21:28:56] ep1: 5.3r / 331s ❌ STUCK @331 (0 laps)
|
||||
Loading…
Reference in New Issue