""" Exp 8: mountain_track, v5 reward (speed x CTE), throttle_min=0.5 - Single TCP connection for the entire run (no disconnect/reconnect) - Saves numbered checkpoint every 6000 steps - Saves best_model.zip whenever a new best is found - Circle exploit: episode terminates immediately on short lap """ import sys, os, time sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent') from multitrack_runner import log, _send_exit_scene, StuckTerminationWrapper from donkeycar_sb3_runner import ThrottleClampWrapper from reward_wrapper import SpeedRewardWrapper from stable_baselines3 import PPO from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage from stable_baselines3.common.utils import get_schedule_fn import gymnasium as gym THROTTLE_MIN = 0.5 LR = 0.000725 TOTAL_STEPS = 90000 STEPS_PER_SEG = 6000 # checkpoint frequency — NOT track switching SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp8-mountain-clean' os.makedirs(SAVE_DIR, exist_ok=True) def make_env(): raw = gym.make('donkey-mountain-track-v0') env = ThrottleClampWrapper(raw, throttle_min=THROTTLE_MIN) env = StuckTerminationWrapper(env, stuck_steps=80, min_displacement=0.5) env = SpeedRewardWrapper(env) return env log('='*60) log('Exp 8: mountain_track ONLY — single connection throughout') log(f' throttle_min={THROTTLE_MIN}, lr={LR}, total_steps={TOTAL_STEPS:,}') log(f' Checkpoint every {STEPS_PER_SEG:,} steps ({TOTAL_STEPS//STEPS_PER_SEG} checkpoints)') log(f' Reward: v5 (speed x CTE-quality)') log(f' Circle fix: short lap terminates episode immediately') log(f' NO disconnect/reconnect between chunks') log('='*60) # Connect ONCE — stay connected for the entire run log('Connecting to mountain_track...') tmp = gym.make('donkey-mountain-track-v0'); time.sleep(2) _send_exit_scene(tmp, verbose=False); tmp.close(); time.sleep(5) env = VecTransposeImage(DummyVecEnv([make_env])) model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu') log('Connected. Training begins — sim will NOT go to main menu between segments.') log('You will see: car runs → crashes/stuck → resets to start → runs again.') best_reward = float('-inf') steps_done = 0 seg_num = 0 while steps_done < TOTAL_STEPS: seg_steps = min(STEPS_PER_SEG, TOTAL_STEPS - steps_done) seg_num += 1 log(f'\n[Seg {seg_num}] steps {steps_done:,} → {steps_done+seg_steps:,}') model.learn(total_timesteps=seg_steps, reset_num_timesteps=False) steps_done += seg_steps # Numbered checkpoint — never overwritten ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}') model.save(ckpt) log(f'[Seg {seg_num}] Checkpoint saved: {ckpt}.zip') # Quick 1-episode deterministic eval to measure quality try: obs = env.reset() ep_reward, ep_steps, done = 0.0, 0, False while not done and ep_steps < 2000: action, _ = model.predict(obs, deterministic=True) result = env.step(action) if len(result)==5: obs,r,t,tr,_ = result; done=bool(t[0] or tr[0]) else: obs,r,d,_ = result; done=bool(d[0]) ep_reward += float(r[0]); ep_steps += 1 log(f'[Seg {seg_num}] Eval: {ep_reward:.1f} reward / {ep_steps} steps (deterministic)') if ep_reward > best_reward: best_reward = ep_reward best_path = os.path.join(SAVE_DIR, 'best_model') model.save(best_path) log(f'[Seg {seg_num}] ⭐ NEW BEST: {best_reward:.1f} → best_model.zip') except Exception as e: log(f'[Seg {seg_num}] Eval failed: {e}') env.close() time.sleep(2) log(f'\nTraining complete. 
log(f'\nTraining complete. Best reward: {best_reward:.1f}')
log(f'Checkpoints: {SAVE_DIR}/')
for f in sorted(os.listdir(SAVE_DIR)):
    log(f'  {f}')

# Eval best model on all 4 tracks
best_path = os.path.join(SAVE_DIR, 'best_model.zip')
log('\nEvaluating best_model.zip on all tracks...')


def eval_track(current_id, track_id, name, n=3):
    """Exit the currently loaded scene (current_id), then evaluate best_model
    on track_id for n deterministic episodes. Returns track_id so calls can
    be chained, each one tearing down the scene the previous call loaded."""
    log(f'\n--- EVAL: {name} ---')
    tmp2 = gym.make(current_id); time.sleep(2)
    _send_exit_scene(tmp2, verbose=False); tmp2.close(); time.sleep(5)
    ev = VecTransposeImage(DummyVecEnv([lambda: SpeedRewardWrapper(
        StuckTerminationWrapper(
            ThrottleClampWrapper(gym.make(track_id), throttle_min=THROTTLE_MIN),
            stuck_steps=80, min_displacement=0.5
        )
    )]))
    m = PPO.load(best_path, env=ev, device='cpu')
    for ep in range(1, n + 1):
        obs = ev.reset()
        total, steps, done = 0.0, 0, False
        while not done and steps < 2000:
            action, _ = m.predict(obs, deterministic=True)
            result = ev.step(action)
            # Same dual 4-/5-tuple step handling as the training-loop eval.
            if len(result) == 5:
                obs, r, t, tr, _ = result
                done = bool(t[0] or tr[0])
            else:
                obs, r, d, _ = result
                done = bool(d[0])
            total += float(r[0])
            steps += 1
        status = '✅ FULL' if steps >= 2000 else f'❌ crash@{steps}'
        log(f'  ep{ep}: {total:.1f} reward / {steps} steps — {status}')
        time.sleep(1)
    ev.close(); time.sleep(3)
    return track_id


current = 'donkey-mountain-track-v0'
current = eval_track(current, 'donkey-mountain-track-v0', 'mountain_track (training)')
current = eval_track(current, 'donkey-generated-track-v0', 'generated_track (zero-shot)')
current = eval_track(current, 'donkey-minimonaco-track-v0', 'mini_monaco (zero-shot)')
current = eval_track(current, 'donkey-generated-roads-v0', 'generated_road (zero-shot)')

log('\n=== Exp 8 COMPLETE ===')
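
# Usage note (a sketch, not executed by this script): any numbered checkpoint,
# or best_model.zip, can be reloaded later for offline evaluation or resumed
# training via SB3's standard PPO.load. 'checkpoint_0006000' below is just an
# example of the f'checkpoint_{steps_done:07d}' naming used above; to resume
# training, an env must be re-attached first with set_env.
#
#   m = PPO.load(os.path.join(SAVE_DIR, 'checkpoint_0006000'), device='cpu')
#   m.set_env(VecTransposeImage(DummyVecEnv([make_env])))  # reconnect to sim
#   m.learn(total_timesteps=6000, reset_num_timesteps=False)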