fix: reward v6.1 — active_node progress terminator kills circle/stuck exploits

User's insight: a circling car stays near the same track waypoints, so
active_node (sim's track progress indicator) never advances. Track the
maximum active_node reached this episode. If it hasn't increased in
progress_patience=60 steps (~3.3s), terminate.

This catches:
  - Circular driving (active_node oscillates, max never increases)
  - Stuck on cone/barrier (active_node frozen)

It is NOT triggered by legitimate cornering, slow forward progress, or lap resets.

On lap completion, active_node wraps to 0 — reset max_node_seen and counter.

Also: Exp 12 — single track mountain training with lap-based stopping criterion.
Train until 3 consecutive laps in eval, not fixed step count.
This commit is contained in:
Paul Huliganga 2026-04-19 17:01:41 -04:00
parent 8b84409e58
commit 813f888502
3 changed files with 300 additions and 29 deletions

View File

@ -0,0 +1,150 @@
"""
Exp 12: Single track mountain_track, v6.1 reward, lap-based stopping.
Strategy: train until the car consistently completes multiple laps.
No fixed step count — eval every 5k steps, stop when 3 consecutive laps are
achieved in a single eval episode (2000 steps).
Key fixes applied:
- v6.1 reward: speed × CTE, efficiency gate, CTE patience (grass fix),
track progress patience (circle/stuck fix using active_node)
- stuck_steps=40, wall-clock timeout=12s
- Single track: all training budget goes to mountain_track
- throttle_min=0.2 (proven to work in Exp 9 with v5 reward + 90k steps)
Based on Exp 9 success: single track + v5/v6 reward + throttle_min=0.2
CAN learn mountain. The circle/stuck exploit was contaminating multi-track
training. Single track eliminates that interaction.
"""
import sys, os, time
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
from multitrack_runner import log, StuckTerminationWrapper
from donkeycar_sb3_runner import ThrottleClampWrapper
from reward_wrapper import SpeedRewardWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import gymnasium as gym
import numpy as np
# --- Experiment configuration -------------------------------------------------
# Host/port are passed to gym.make() via conf={'host': ..., 'port': ...}.
# NOTE(review): presumably the address of the running simulator — confirm.
HOST = '10.0.0.55'
PORT = 9091
# Gym environment id used by gym.make(), and the short name used in log output.
TRACK_ID = 'donkey-mountain-track-v0'
TRACK_NAME = 'mountain_track'
# Value handed to ThrottleClampWrapper(throttle_min=...).
THROTTLE_MIN = 0.2
# Learning rate handed to PPO(learning_rate=...).
LR = 0.000725
MAX_STEPS = 300000 # safety ceiling — stop via lap criterion first
EVAL_EVERY = 5000 # eval after every N training steps
EVAL_MAX_STEPS = 2000 # steps per eval episode
LAP_STOP_THRESHOLD = 3 # stop when eval achieves this many laps in one episode
# Directory that receives the per-segment checkpoints, the rolling 'model' and
# 'best_model'. It is created as a side effect of running this module.
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp12-mountain-single'
os.makedirs(SAVE_DIR, exist_ok=True)
def make_env():
    """Return a zero-argument factory that builds the fully wrapped track env.

    The factory form is what ``DummyVecEnv`` expects: a list of callables,
    each of which creates one environment when invoked.

    Wrapper order (innermost first):
      1. ``ThrottleClampWrapper``   - throttle_min=THROTTLE_MIN
      2. ``StuckTerminationWrapper`` - stuck_steps=40, min_displacement=0.5,
         max_stuck_seconds=12.0
      3. ``SpeedRewardWrapper``     - max_cte_terminate=4.0, cte_patience=20,
         progress_patience=60 (terminate if the max active_node has not
         advanced for 60 steps)
    """
    def _build():
        sim_conf = {'host': HOST, 'port': PORT}
        wrapped = ThrottleClampWrapper(
            gym.make(TRACK_ID, conf=sim_conf),
            throttle_min=THROTTLE_MIN,
        )
        wrapped = StuckTerminationWrapper(
            wrapped,
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=12.0,
        )
        return SpeedRewardWrapper(
            wrapped,
            max_cte_terminate=4.0,
            cte_patience=20,
            progress_patience=60,
        )

    return _build
# Startup banner: one log() call per line, in the same order as before.
for _banner_line in (
    '='*60,
    f'Exp 12: Single track — {TRACK_NAME}',
    f' Host: {HOST}:{PORT}',
    f' throttle_min={THROTTLE_MIN}, lr={LR}',
    f' Reward: v6.1 (speed×CTE + efficiency gate + progress termination)',
    f' Stopping: eval every {EVAL_EVERY:,} steps, stop at {LAP_STOP_THRESHOLD} laps',
    f' Safety ceiling: {MAX_STEPS:,} steps',
    '='*60,
):
    log(_banner_line)

# Single-env vectorised environment, then the image-transposing wrapper that
# the CNN policy is given as its env.
env = DummyVecEnv([make_env()])
env = VecTransposeImage(env)
log(f' obs={env.observation_space.shape}')

model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu')
log('PPO created. Training begins...')

# Running bests and loop bookkeeping for the train/eval loop that follows.
best_reward, best_laps = float('-inf'), 0
steps_done, stop_reason = 0, None
# Train in EVAL_EVERY-sized segments. After each segment: checkpoint, run one
# deterministic eval episode, and stop as soon as that episode reaches
# LAP_STOP_THRESHOLD laps. MAX_STEPS is only a safety ceiling.
while steps_done < MAX_STEPS:
    seg = min(EVAL_EVERY, MAX_STEPS - steps_done)
    model.learn(total_timesteps=seg, reset_num_timesteps=False)
    # PPO collects whole rollouts, so one learn() call normally runs past
    # `seg`. Take the real count from the model so logs, checkpoint names and
    # the MAX_STEPS ceiling reflect what was actually trained. max() keeps the
    # original guarantee that every iteration advances by at least `seg`.
    steps_done = max(steps_done + seg, int(model.num_timesteps))

    # Save checkpoint (numbered snapshot + rolling "latest" copy)
    ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
    model.save(ckpt)
    model.save(os.path.join(SAVE_DIR, 'model'))

    # Eval: count laps in one deterministic episode
    try:
        obs = env.reset()
        ep_reward, ep_steps, laps = 0.0, 0, 0
        for _ in range(EVAL_MAX_STEPS):
            action, _state = model.predict(obs, deterministic=True)
            result = env.step(action)
            if len(result) == 4:
                # VecEnv-style return: (obs, rewards, dones, infos)
                obs, r, d, info = result
                done = bool(d[0])
            else:
                # Gymnasium-style 5-tuple: (obs, reward, terminated, truncated, info)
                obs, r, t, tr, info = result
                done = bool(t[0] or tr[0])
            ep_reward += float(r[0])
            ep_steps += 1

            # Count laps from info. Best-effort: a missing or malformed
            # lap_count must not abort the eval, but only the errors that the
            # lookup/conversion can raise are suppressed.
            try:
                step_info = info[0] if isinstance(info, (list, tuple)) else info
                laps = max(laps, int(step_info.get('lap_count', 0) or 0))
            except (AttributeError, IndexError, TypeError, ValueError):
                pass

            if done:
                break

        status = '' if ep_steps >= EVAL_MAX_STEPS else f'❌@{ep_steps}'
        log(f'[{steps_done:,}/{MAX_STEPS:,}] reward={ep_reward:.1f} '
            f'steps={ep_steps} laps={laps} {status}')

        if ep_reward > best_reward:
            best_reward = ep_reward
            model.save(os.path.join(SAVE_DIR, 'best_model'))
            log(f' ⭐ NEW BEST reward: {best_reward:.1f}')
        if laps > best_laps:
            best_laps = laps
            log(f' 🏆 NEW BEST laps: {best_laps}')
        if laps >= LAP_STOP_THRESHOLD:
            stop_reason = f'achieved {laps} laps in eval at {steps_done:,} steps'
            log(f' 🎯 STOPPING CRITERION MET: {stop_reason}')
            break
    except Exception as e:
        # Top-level boundary: an eval failure is reported but must not end training.
        log(f' Eval error: {e}')
        import traceback; traceback.print_exc()
    finally:
        # The eval episode stepped the same env that training uses, so the
        # observation PPO cached from its last rollout no longer matches the
        # simulator state. Clearing it makes the next learn() call reset the
        # env instead of resuming from a stale frame.
        # NOTE(review): _last_obs is a private stable-baselines3 attribute —
        # confirm against the installed SB3 version.
        model._last_obs = None
# Close the environment and pause briefly before printing the final report.
env.close()
time.sleep(3)

# Final summary: one log() call per line, in the same order as before.
for _summary_line in (
    f'\n{"="*60}',
    f'Training complete.',
    f' Stop reason : {stop_reason or "reached max steps"}',
    f' Total steps : {steps_done:,}',
    f' Best laps : {best_laps}',
    f' Best reward : {best_reward:.1f}',
    f' Best model : {SAVE_DIR}/best_model.zip',
    f'{"="*60}',
    f'\n=== Exp 12 COMPLETE ===',
):
    log(_summary_line)

View File

@ -67,15 +67,23 @@ class SpeedRewardWrapper(gym.Wrapper):
Completely ignores the sim's own reward (which uses forward_vel and is Completely ignores the sim's own reward (which uses forward_vel and is
exploitable by circular/spinning motion). exploitable by circular/spinning motion).
Exploit termination:
- Sustained high CTE (> max_cte_terminate for cte_patience steps): grass exploit
- No track progress (active_node max not advancing for progress_patience steps):
catches circular driving, stuck-on-cone, stuck-on-barrier.
A circling car stays near the same waypoints, so active_node never advances.
A stuck car never advances either. Forward driving always advances.
Args: Args:
env: gymnasium environment env: gymnasium environment
speed_scale: speed bonus multiplier (default 0.1) speed_scale: speed bonus multiplier (default 0.1)
window_size: steps for efficiency calculation (default 30) window_size: steps for efficiency gate (default 30)
min_efficiency: efficiency below which no reward (default 0.15) min_efficiency: efficiency gate threshold (default 0.15)
max_cte: track half-width for normalization (default 8.0) max_cte: track half-width for reward normalization (default 8.0)
min_lap_time: laps faster than this are penalised as exploits min_lap_time: laps faster than this are penalised as exploits
max_cte_terminate: terminate if CTE exceeds this for cte_patience steps max_cte_terminate: terminate if CTE > this for cte_patience steps
cte_patience: steps of sustained high CTE before termination (default 20) cte_patience: steps of sustained high CTE before termination
progress_patience: steps without new max active_node before termination
""" """
def __init__( def __init__(
@ -86,26 +94,32 @@ class SpeedRewardWrapper(gym.Wrapper):
min_efficiency: float = 0.15, min_efficiency: float = 0.15,
max_cte: float = 8.0, max_cte: float = 8.0,
min_lap_time: float = 5.0, min_lap_time: float = 5.0,
max_cte_terminate: float = 4.0, # terminate early if CTE sustained > 4m max_cte_terminate: float = 4.0,
cte_patience: int = 20, # steps of high CTE before terminate cte_patience: int = 20,
progress_patience: int = 60, # ~3.3s at 18 steps/sec
): ):
super().__init__(env) super().__init__(env)
self.speed_scale = speed_scale self.speed_scale = speed_scale
self.window_size = window_size self.window_size = window_size
self.min_efficiency = min_efficiency self.min_efficiency = min_efficiency
self.max_cte = max_cte self.max_cte = max_cte
self.min_lap_time = min_lap_time self.min_lap_time = min_lap_time
self.max_cte_terminate = max_cte_terminate self.max_cte_terminate = max_cte_terminate
self.cte_patience = cte_patience self.cte_patience = cte_patience
self._pos_history = deque(maxlen=window_size + 1) self.progress_patience = progress_patience
self._last_lap_count = 0 self._pos_history = deque(maxlen=window_size + 1)
self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate self._last_lap_count = 0
self._high_cte_steps = 0
self._max_node_seen = -1 # highest active_node reached this episode
self._no_progress_steps = 0 # steps since max_node last increased
def reset(self, **kwargs):
    """Reset the wrapped env, then clear every per-episode tracker.

    All keyword arguments are forwarded to the wrapped env's ``reset`` and its
    return value is passed back unchanged.
    """
    reset_output = self.env.reset(**kwargs)
    self._pos_history.clear()
    # Lap counter, high-CTE streak and no-progress streak all restart at zero.
    self._last_lap_count = self._high_cte_steps = self._no_progress_steps = 0
    # -1 means "no active_node seen yet", so the first reported node counts as progress.
    self._max_node_seen = -1
    return reset_output
def step(self, action): def step(self, action):
@ -176,14 +190,39 @@ class SpeedRewardWrapper(gym.Wrapper):
else: else:
self._high_cte_steps = 0 self._high_cte_steps = 0
# --- Short-lap exploit detection --- # --- Circle / stuck exploit: no track progress termination ---
# Track the highest active_node (track waypoint) reached this episode.
# A circling car stays near the same waypoints — max_node never advances.
# A stuck car never advances either. Only genuine forward driving advances.
# On lap completion, active_node resets to 0 — we reset our tracker too.
try:
active_node = int(info.get('active_node', -1) or 0)
total_nodes = int(info.get('total_nodes', 1) or 1)
except (TypeError, ValueError):
active_node = -1
total_nodes = 1
if active_node >= 0:
if active_node > self._max_node_seen:
# New furthest point reached — genuine forward progress
self._max_node_seen = active_node
self._no_progress_steps = 0
else:
self._no_progress_steps += 1
if self._no_progress_steps >= self.progress_patience:
return -1.0, True # no forward progress — terminate
try: try:
current_lap_count = int(info.get('lap_count', 0) or 0) current_lap_count = int(info.get('lap_count', 0) or 0)
except (TypeError, ValueError): except (TypeError, ValueError):
current_lap_count = self._last_lap_count current_lap_count = self._last_lap_count
if current_lap_count > self._last_lap_count: if current_lap_count > self._last_lap_count:
self._last_lap_count = current_lap_count self._last_lap_count = current_lap_count
# Reset progress tracker — active_node wraps to 0 on new lap
self._max_node_seen = -1
self._no_progress_steps = 0
try: try:
lap_time = float(info.get('last_lap_time', 999.0) or 999.0) lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
except (TypeError, ValueError): except (TypeError, ValueError):

View File

@ -362,16 +362,98 @@ def test_high_cte_resets_when_back_on_track():
def test_no_track_progress_terminates_episode():
    """
    Circle/stuck exploit fix: when the max active_node fails to advance for
    progress_patience steps, the episode must be force-terminated.
    A circling car stays near the same waypoints, so max_node never increases.
    """
    wrapper = SpeedRewardWrapper(MockEnv(speed=3.0, cte=0.5), progress_patience=10)
    wrapper.reset()
    # Telemetry that is identical on every step apart from a creeping x position.
    frozen = {'cte': 0.5, 'speed': 3.0,
              'active_node': 5, 'total_nodes': 100,
              'lap_count': 0, 'last_lap_time': 0.0}
    # The first call records node 5 as the max; every later call is stuck at 5.
    for step in range(12):
        telemetry = dict(frozen, pos=(float(step)*0.1, 0., 0.))
        reward, forced = wrapper._compute_reward_and_done(done=False, info=telemetry)
        if forced:
            break
    assert forced == True, 'Should terminate when max active_node not advancing'
    assert reward == -1.0
def test_track_progress_resets_counter():
    """
    Reaching a new max active_node must reset the no-progress counter.
    """
    wrapper = SpeedRewardWrapper(MockEnv(speed=3.0, cte=0.5), progress_patience=5)
    wrapper.reset()
    # Visit nodes 0, 1, 2, 3 in order — each one is a new maximum.
    node = 0
    while node < 4:
        telemetry = {
            'cte': 0.5, 'speed': 3.0,
            'pos': (float(node)*0.5, 0., 0.),
            'active_node': node, 'total_nodes': 100,
            'lap_count': 0, 'last_lap_time': 0.0,
        }
        _reward, forced = wrapper._compute_reward_and_done(done=False, info=telemetry)
        assert forced == False, f'Should not terminate when advancing (node {node})'
        assert wrapper._no_progress_steps == 0, 'Counter should reset on new max node'
        node += 1
def test_circle_exploit_terminates():
    """
    A car circling near one spot must be terminated: its active_node
    oscillates but never exceeds the maximum it first reached.
    """
    wrapper = SpeedRewardWrapper(MockEnv(speed=3.0, cte=0.5), progress_patience=10)
    wrapper.reset()

    def _forced_at(node):
        # Feed one step at a fixed position with the given waypoint index and
        # report whether the wrapper asked for termination.
        telemetry = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.),
                     'active_node': node, 'total_nodes': 100,
                     'lap_count': 0, 'last_lap_time': 0.0}
        _reward, forced = wrapper._compute_reward_and_done(done=False, info=telemetry)
        return forced

    # Establish node 10 as the furthest point reached.
    _forced_at(10)
    # Then oscillate 8, 9, 10, 8, 9, 10... for up to 20 steps; any() stops
    # stepping at the first forced termination, like the original break.
    terminated = any(_forced_at(8 + (i % 3)) for i in range(20))
    assert terminated, 'Circling (oscillating active_node, no new max) should terminate'
def test_lap_completion_resets_progress_tracker():
    """
    When a lap completes, active_node goes back to 0. The progress tracker has
    to reset as well, otherwise the car would be terminated for 'no progress'
    right after crossing the line.
    """
    wrapper = SpeedRewardWrapper(
        MockEnv(speed=3.0, cte=0.5), progress_patience=5, min_lap_time=5.0
    )
    wrapper.reset()

    # Step 1: car is near the end of the track (node 99 of 100), no lap yet.
    near_finish = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.),
                   'active_node': 99, 'total_nodes': 100,
                   'lap_count': 0, 'last_lap_time': 0.0}
    wrapper._compute_reward_and_done(done=False, info=near_finish)
    assert wrapper._max_node_seen == 99

    # Step 2: lap counter ticks over with a 12s lap, which is above min_lap_time.
    lap_done = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.),
                'active_node': 0, 'total_nodes': 100,
                'lap_count': 1, 'last_lap_time': 12.0}
    _reward, forced = wrapper._compute_reward_and_done(done=False, info=lap_done)

    # Progress tracker should be back to its initial state.
    assert wrapper._max_node_seen == -1, 'max_node_seen should reset on lap completion'
    assert wrapper._no_progress_steps == 0
    assert forced == False, 'Valid lap should not terminate'