fix: reward v6.1 — active_node progress terminator kills circle/stuck exploits
User's insight: a circling car stays near the same track waypoints, so active_node (the sim's track-progress indicator) never advances. Track the maximum active_node reached in the current episode; if it has not increased within progress_patience=60 steps (~3.3 s), terminate the episode.

This catches:
- Circular driving (active_node oscillates, but the max never increases)
- Stuck on a cone/barrier (active_node frozen)

It is NOT triggered by legitimate cornering, slow forward progress, or lap resets. On lap completion active_node wraps to 0, so max_node_seen and the no-progress counter are reset.

Also: Exp 12 — single-track mountain training with a lap-based stopping criterion. Train until 3 consecutive laps are achieved in an eval episode, rather than for a fixed step count.
This commit is contained in:
parent
8b84409e58
commit
813f888502
|
|
@ -0,0 +1,150 @@
|
|||
"""
|
||||
Exp 12: Single track — mountain_track, v6.1 reward, lap-based stopping.
|
||||
|
||||
Strategy: train until the car consistently completes multiple laps.
|
||||
No fixed step count — eval every 5k steps, stop when 3 consecutive laps
|
||||
achieved in a single eval episode (2000 steps).
|
||||
|
||||
Key fixes applied:
|
||||
- v6.1 reward: speed × CTE, efficiency gate, CTE patience (grass fix),
|
||||
track progress patience (circle/stuck fix using active_node)
|
||||
- stuck_steps=40, wall-clock timeout=12s
|
||||
- Single track: all training budget goes to mountain_track
|
||||
- throttle_min=0.2 (proven to work in Exp 9 with v5 reward + 90k steps)
|
||||
|
||||
Based on Exp 9 success: single track + v5/v6 reward + throttle_min=0.2
|
||||
CAN learn mountain. The circle/stuck exploit was contaminating multi-track
|
||||
training. Single track eliminates that interaction.
|
||||
"""
|
||||
import sys, os, time
|
||||
sys.path.insert(0, '/home/paulh/projects/donkeycar-rl-autoresearch/agent')
|
||||
|
||||
from multitrack_runner import log, StuckTerminationWrapper
|
||||
from donkeycar_sb3_runner import ThrottleClampWrapper
|
||||
from reward_wrapper import SpeedRewardWrapper
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
|
||||
HOST = '10.0.0.55'
|
||||
PORT = 9091
|
||||
TRACK_ID = 'donkey-mountain-track-v0'
|
||||
TRACK_NAME = 'mountain_track'
|
||||
THROTTLE_MIN = 0.2
|
||||
LR = 0.000725
|
||||
MAX_STEPS = 300000 # safety ceiling — stop via lap criterion first
|
||||
EVAL_EVERY = 5000 # eval after every N training steps
|
||||
EVAL_MAX_STEPS = 2000 # steps per eval episode
|
||||
LAP_STOP_THRESHOLD = 3 # stop when eval achieves this many laps in one episode
|
||||
SAVE_DIR = '/home/paulh/projects/donkeycar-rl-autoresearch/agent/models/exp12-mountain-single'
|
||||
os.makedirs(SAVE_DIR, exist_ok=True)
|
||||
|
||||
def make_env():
    """Return a zero-argument factory that builds the wrapped simulator env.

    The factory form is what ``DummyVecEnv`` expects: the environment is only
    constructed (and the simulator only contacted) when the returned callable
    is invoked.

    Wrapper order, innermost first:
      1. ``gym.make(TRACK_ID)`` connected to ``HOST:PORT``
      2. ``ThrottleClampWrapper`` with ``throttle_min=THROTTLE_MIN``
      3. ``StuckTerminationWrapper`` (40 steps / 0.5 displacement / 12 s)
      4. ``SpeedRewardWrapper`` (v6.1 reward with CTE and progress patience)
    """
    def _init():
        sim_conf = {'host': HOST, 'port': PORT}
        base_env = gym.make(TRACK_ID, conf=sim_conf)
        clamped = ThrottleClampWrapper(base_env, throttle_min=THROTTLE_MIN)
        guarded = StuckTerminationWrapper(
            clamped,
            stuck_steps=40,
            min_displacement=0.5,
            max_stuck_seconds=12.0,
        )
        return SpeedRewardWrapper(
            guarded,
            max_cte_terminate=4.0,
            cte_patience=20,
            # terminate if active_node max not advancing for 60 steps
            progress_patience=60,
        )

    return _init
|
||||
|
||||
# --- Run banner --------------------------------------------------------------
log('=' * 60)
log(f'Exp 12: Single track — {TRACK_NAME}')
log(f' Host: {HOST}:{PORT}')
log(f' throttle_min={THROTTLE_MIN}, lr={LR}')
log(' Reward: v6.1 (speed×CTE + efficiency gate + progress termination)')
log(f' Stopping: eval every {EVAL_EVERY:,} steps, stop at {LAP_STOP_THRESHOLD} laps')
log(f' Safety ceiling: {MAX_STEPS:,} steps')
log('=' * 60)

# One vectorised env (a single simulator connection) is used for both
# training and the periodic evaluation episodes below.
env = VecTransposeImage(DummyVecEnv([make_env()]))
log(f' obs={env.observation_space.shape}')

model = PPO('CnnPolicy', env, learning_rate=LR, verbose=1, device='cpu')
log('PPO created. Training begins...')

best_reward = float('-inf')   # best eval-episode return seen so far
best_laps = 0                 # most laps completed in a single eval episode
steps_done = 0                # training steps requested from model.learn so far
stop_reason = None            # set only when the lap criterion ends training

# try/finally guarantees the simulator connection is released even when
# training is interrupted or model.learn() raises.
try:
    while steps_done < MAX_STEPS:
        # Train one segment; never overshoot the safety ceiling.
        seg = min(EVAL_EVERY, MAX_STEPS - steps_done)
        model.learn(total_timesteps=seg, reset_num_timesteps=False)
        steps_done += seg

        # Save a numbered checkpoint plus a rolling "latest" copy.
        ckpt = os.path.join(SAVE_DIR, f'checkpoint_{steps_done:07d}')
        model.save(ckpt)
        model.save(os.path.join(SAVE_DIR, 'model'))

        # Eval: count laps in one deterministic episode.
        # NOTE(review): this resets and steps the same env the model trains
        # on; with reset_num_timesteps=False the next learn() call presumably
        # resumes from its cached last observation — confirm that the one
        # stale transition per eval is acceptable.
        try:
            obs = env.reset()
            ep_reward, ep_steps, laps = 0.0, 0, 0
            for _ in range(EVAL_MAX_STEPS):
                action, _state = model.predict(obs, deterministic=True)
                result = env.step(action)
                # Accept both the 4-tuple (VecEnv / old gym) and the 5-tuple
                # (gymnasium) step signatures.
                if len(result) == 4:
                    obs, r, d, info = result
                    done = bool(d[0])
                else:
                    obs, r, t, tr, info = result
                    done = bool(t[0] or tr[0])
                ep_reward += float(r[0])
                ep_steps += 1

                # Count laps from info. Best effort: a missing or malformed
                # lap_count leaves the running total untouched, but only the
                # failure modes of this lookup are tolerated — anything else
                # surfaces through the eval error handler below.
                try:
                    step_info = info[0] if isinstance(info, (list, tuple)) else info
                    lc = int(step_info.get('lap_count', 0) or 0)
                except (AttributeError, IndexError, TypeError, ValueError):
                    lc = 0
                laps = max(laps, lc)

                if done:
                    break

            status = '✅' if ep_steps >= EVAL_MAX_STEPS else f'❌@{ep_steps}'
            log(f'[{steps_done:,}/{MAX_STEPS:,}] reward={ep_reward:.1f} '
                f'steps={ep_steps} laps={laps} {status}')

            if ep_reward > best_reward:
                best_reward = ep_reward
                model.save(os.path.join(SAVE_DIR, 'best_model'))
                log(f' ⭐ NEW BEST reward: {best_reward:.1f}')

            if laps > best_laps:
                best_laps = laps
                log(f' 🏆 NEW BEST laps: {best_laps}')

            if laps >= LAP_STOP_THRESHOLD:
                stop_reason = f'achieved {laps} laps in eval at {steps_done:,} steps'
                log(f' 🎯 STOPPING CRITERION MET: {stop_reason}')
                break

        except Exception as e:
            # A failed evaluation must not abort a long training run: log it
            # and carry on with the next training segment.
            log(f' Eval error: {e}')
            import traceback
            traceback.print_exc()
finally:
    env.close()
    time.sleep(3)

log('\n' + '=' * 60)
log('Training complete.')
log(f' Stop reason : {stop_reason or "reached max steps"}')
log(f' Total steps : {steps_done:,}')
log(f' Best laps : {best_laps}')
log(f' Best reward : {best_reward:.1f}')
log(f' Best model : {SAVE_DIR}/best_model.zip')
log('=' * 60)
log('\n=== Exp 12 COMPLETE ===')
|
||||
|
|
@ -67,15 +67,23 @@ class SpeedRewardWrapper(gym.Wrapper):
|
|||
Completely ignores the sim's own reward (which uses forward_vel and is
|
||||
exploitable by circular/spinning motion).
|
||||
|
||||
Exploit termination:
|
||||
- Sustained high CTE (> max_cte_terminate for cte_patience steps): grass exploit
|
||||
- No track progress (active_node max not advancing for progress_patience steps):
|
||||
catches circular driving, stuck-on-cone, stuck-on-barrier.
|
||||
A circling car stays near the same waypoints — active_node never advances.
|
||||
A stuck car never advances either. Forward driving always advances.
|
||||
|
||||
Args:
|
||||
env: gymnasium environment
|
||||
speed_scale: speed bonus multiplier (default 0.1)
|
||||
window_size: steps for efficiency calculation (default 30)
|
||||
min_efficiency: efficiency below which no reward (default 0.15)
|
||||
max_cte: track half-width for normalization (default 8.0)
|
||||
window_size: steps for efficiency gate (default 30)
|
||||
min_efficiency: efficiency gate threshold (default 0.15)
|
||||
max_cte: track half-width for reward normalization (default 8.0)
|
||||
min_lap_time: laps faster than this are penalised as exploits
|
||||
max_cte_terminate: terminate if CTE exceeds this for cte_patience steps
|
||||
cte_patience: steps of sustained high CTE before termination (default 20)
|
||||
max_cte_terminate: terminate if CTE > this for cte_patience steps
|
||||
cte_patience: steps of sustained high CTE before termination
|
||||
progress_patience: steps without new max active_node before termination
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
|
@ -86,8 +94,9 @@ class SpeedRewardWrapper(gym.Wrapper):
|
|||
min_efficiency: float = 0.15,
|
||||
max_cte: float = 8.0,
|
||||
min_lap_time: float = 5.0,
|
||||
max_cte_terminate: float = 4.0, # terminate early if CTE sustained > 4m
|
||||
cte_patience: int = 20, # steps of high CTE before terminate
|
||||
max_cte_terminate: float = 4.0,
|
||||
cte_patience: int = 20,
|
||||
progress_patience: int = 60, # ~3.3s at 18 steps/sec
|
||||
):
|
||||
super().__init__(env)
|
||||
self.speed_scale = speed_scale
|
||||
|
|
@ -97,15 +106,20 @@ class SpeedRewardWrapper(gym.Wrapper):
|
|||
self.min_lap_time = min_lap_time
|
||||
self.max_cte_terminate = max_cte_terminate
|
||||
self.cte_patience = cte_patience
|
||||
self.progress_patience = progress_patience
|
||||
self._pos_history = deque(maxlen=window_size + 1)
|
||||
self._last_lap_count = 0
|
||||
self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate
|
||||
self._high_cte_steps = 0
|
||||
self._max_node_seen = -1 # highest active_node reached this episode
|
||||
self._no_progress_steps = 0 # steps since max_node last increased
|
||||
|
||||
def reset(self, **kwargs):
    """Reset the wrapped env and clear all per-episode tracking state.

    Keyword arguments are forwarded unchanged to the wrapped env's
    ``reset``; its return value is handed back untouched.
    """
    reset_output = self.env.reset(**kwargs)

    # Every tracker restarts with the new episode.
    self._pos_history.clear()
    self._last_lap_count = self._high_cte_steps = self._no_progress_steps = 0
    self._max_node_seen = -1  # -1 == no active_node observed yet
    return reset_output
|
||||
|
||||
def step(self, action):
|
||||
|
|
@ -176,7 +190,29 @@ class SpeedRewardWrapper(gym.Wrapper):
|
|||
else:
|
||||
self._high_cte_steps = 0
|
||||
|
||||
# --- Short-lap exploit detection ---
|
||||
# --- Circle / stuck exploit: no track progress termination ---
|
||||
# Track the highest active_node (track waypoint) reached this episode.
|
||||
# A circling car stays near the same waypoints — max_node never advances.
|
||||
# A stuck car never advances either. Only genuine forward driving advances.
|
||||
# On lap completion, active_node resets to 0 — we reset our tracker too.
|
||||
try:
|
||||
active_node = int(info.get('active_node', -1) or 0)
|
||||
total_nodes = int(info.get('total_nodes', 1) or 1)
|
||||
except (TypeError, ValueError):
|
||||
active_node = -1
|
||||
total_nodes = 1
|
||||
|
||||
if active_node >= 0:
|
||||
if active_node > self._max_node_seen:
|
||||
# New furthest point reached — genuine forward progress
|
||||
self._max_node_seen = active_node
|
||||
self._no_progress_steps = 0
|
||||
else:
|
||||
self._no_progress_steps += 1
|
||||
if self._no_progress_steps >= self.progress_patience:
|
||||
return -1.0, True # no forward progress — terminate
|
||||
|
||||
|
||||
try:
|
||||
current_lap_count = int(info.get('lap_count', 0) or 0)
|
||||
except (TypeError, ValueError):
|
||||
|
|
@ -184,6 +220,9 @@ class SpeedRewardWrapper(gym.Wrapper):
|
|||
|
||||
if current_lap_count > self._last_lap_count:
|
||||
self._last_lap_count = current_lap_count
|
||||
# Reset progress tracker — active_node wraps to 0 on new lap
|
||||
self._max_node_seen = -1
|
||||
self._no_progress_steps = 0
|
||||
try:
|
||||
lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
|
||||
except (TypeError, ValueError):
|
||||
|
|
|
|||
|
|
@ -362,16 +362,98 @@ def test_high_cte_resets_when_back_on_track():
|
|||
|
||||
def test_no_track_progress_terminates_episode():
    """
    Circle/stuck exploit fix: if max active_node doesn't advance for
    progress_patience steps, the episode must be force-terminated.
    A circling car stays near the same waypoints — max_node never increases.
    """
    env = MockEnv(speed=3.0, cte=0.5)
    wrapper = SpeedRewardWrapper(env, progress_patience=10)
    wrapper.reset()

    # First step initialises max_node to 5, then 10 more steps stuck at 5 → terminate
    r, ft = None, False
    for i in range(12):
        info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(i)*0.1, 0., 0.),
                'active_node': 5, 'total_nodes': 100,
                'lap_count': 0, 'last_lap_time': 0.0}
        r, ft = wrapper._compute_reward_and_done(done=False, info=info)
        if ft:
            break

    assert ft, 'Should terminate when max active_node not advancing'
    assert r == -1.0
|
||||
|
||||
|
||||
def test_track_progress_resets_counter():
    """
    Advancing to a new max active_node must reset the no-progress counter.
    """
    env = MockEnv(speed=3.0, cte=0.5)
    wrapper = SpeedRewardWrapper(env, progress_patience=5)
    wrapper.reset()

    # Step forward: nodes 0, 1, 2, 3 — each new node resets counter
    for node in range(4):
        info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(node)*0.5, 0., 0.),
                'active_node': node, 'total_nodes': 100,
                'lap_count': 0, 'last_lap_time': 0.0}
        _reward, ft = wrapper._compute_reward_and_done(done=False, info=info)
        assert not ft, f'Should not terminate when advancing (node {node})'
        assert wrapper._no_progress_steps == 0, 'Counter should reset on new max node'
|
||||
|
||||
|
||||
def test_circle_exploit_terminates():
    """
    Circling must end the episode: active_node keeps moving between nearby
    waypoints but never climbs above the maximum already reached.
    """
    wrapper = SpeedRewardWrapper(MockEnv(speed=3.0, cte=0.5), progress_patience=10)
    wrapper.reset()

    def info_at(node):
        # Fresh info dict per call; only the waypoint index varies.
        return {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.),
                'active_node': node, 'total_nodes': 100,
                'lap_count': 0, 'last_lap_time': 0.0}

    # Establish node 10 as the furthest point reached.
    wrapper._compute_reward_and_done(done=False, info=info_at(10))

    # Cycle 8, 9, 10, 8, 9, 10... — movement without a new maximum.
    terminated = False
    for step in range(20):
        _, force_term = wrapper._compute_reward_and_done(
            done=False, info=info_at(8 + (step % 3)))
        if force_term:
            terminated = True
            break

    assert terminated, 'Circling (oscillating active_node, no new max) should terminate'
|
||||
|
||||
|
||||
def test_lap_completion_resets_progress_tracker():
    """
    On lap completion, active_node resets to 0. Progress tracker must also
    reset so the car isn't immediately terminated for 'no progress'.
    """
    env = MockEnv(speed=3.0, cte=0.5)
    wrapper = SpeedRewardWrapper(env, progress_patience=5, min_lap_time=5.0)
    wrapper.reset()

    # Drive to near end of track
    info = {'cte': 0.5, 'speed': 3.0, 'pos': (1., 0., 0.),
            'active_node': 99, 'total_nodes': 100,
            'lap_count': 0, 'last_lap_time': 0.0}
    wrapper._compute_reward_and_done(done=False, info=info)
    assert wrapper._max_node_seen == 99

    # Complete a valid lap
    info = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.),
            'active_node': 0, 'total_nodes': 100,
            'lap_count': 1, 'last_lap_time': 12.0}  # 12s lap = valid
    _reward, ft = wrapper._compute_reward_and_done(done=False, info=info)

    # Progress tracker should be reset
    assert wrapper._max_node_seen == -1, 'max_node_seen should reset on lap completion'
    assert wrapper._no_progress_steps == 0
    assert not ft, 'Valid lap should not terminate'
|
||||
|
|
|
|||
Loading…
Reference in New Issue