fix: reward v6.1 — grass exploit only (CTE patience terminator)
Removed the progress_patience (active_node) terminator, which was added without sufficient evidence. Per ADR-020, mountain rollback is a learning issue, not a termination issue. Removed code should not be re-added without specific evidence that it is needed.

The only confirmed fix: the CTE patience terminator catches the grass exploit BEFORE CTE exceeds 16m (the sim's determine_episode_over pass threshold):
- max_cte_terminate=4.0m
- cte_patience=20 steps
This commit is contained in:
parent
f730a2e0ba
commit
e95c33c1bf
|
|
@ -76,8 +76,6 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
min_lap_time: laps faster than this are penalised as exploits
|
min_lap_time: laps faster than this are penalised as exploits
|
||||||
max_cte_terminate: terminate if CTE exceeds this for cte_patience steps
|
max_cte_terminate: terminate if CTE exceeds this for cte_patience steps
|
||||||
cte_patience: steps of sustained high CTE before termination (default 20)
|
cte_patience: steps of sustained high CTE before termination (default 20)
|
||||||
min_progress_steps: steps before checking track progress (allow settling)
|
|
||||||
progress_patience: steps of zero track progress before termination (default 60)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -90,7 +88,6 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
min_lap_time: float = 5.0,
|
min_lap_time: float = 5.0,
|
||||||
max_cte_terminate: float = 4.0, # terminate early if CTE sustained > 4m
|
max_cte_terminate: float = 4.0, # terminate early if CTE sustained > 4m
|
||||||
cte_patience: int = 20, # steps of high CTE before terminate
|
cte_patience: int = 20, # steps of high CTE before terminate
|
||||||
progress_patience: int = 60, # steps of no track progress before terminate
|
|
||||||
):
|
):
|
||||||
super().__init__(env)
|
super().__init__(env)
|
||||||
self.speed_scale = speed_scale
|
self.speed_scale = speed_scale
|
||||||
|
|
@ -100,20 +97,15 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
self.min_lap_time = min_lap_time
|
self.min_lap_time = min_lap_time
|
||||||
self.max_cte_terminate = max_cte_terminate
|
self.max_cte_terminate = max_cte_terminate
|
||||||
self.cte_patience = cte_patience
|
self.cte_patience = cte_patience
|
||||||
self.progress_patience = progress_patience
|
|
||||||
self._pos_history = deque(maxlen=window_size + 1)
|
self._pos_history = deque(maxlen=window_size + 1)
|
||||||
self._last_lap_count = 0
|
self._last_lap_count = 0
|
||||||
self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate
|
self._high_cte_steps = 0 # consecutive steps with CTE > max_cte_terminate
|
||||||
self._last_active_node = -1 # track progress node at last check
|
|
||||||
self._no_progress_steps = 0 # consecutive steps with no node advancement
|
|
||||||
|
|
||||||
def reset(self, **kwargs):
|
def reset(self, **kwargs):
|
||||||
result = self.env.reset(**kwargs)
|
result = self.env.reset(**kwargs)
|
||||||
self._pos_history.clear()
|
self._pos_history.clear()
|
||||||
self._last_lap_count = 0
|
self._last_lap_count = 0
|
||||||
self._high_cte_steps = 0
|
self._high_cte_steps = 0
|
||||||
self._last_active_node = -1
|
|
||||||
self._no_progress_steps = 0
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
|
|
@ -184,21 +176,6 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
else:
|
else:
|
||||||
self._high_cte_steps = 0
|
self._high_cte_steps = 0
|
||||||
|
|
||||||
# --- Mountain rollback: no track progress termination ---
|
|
||||||
try:
|
|
||||||
active_node = int(info.get('active_node', -1) or -1)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
active_node = -1
|
|
||||||
|
|
||||||
if active_node >= 0:
|
|
||||||
if active_node == self._last_active_node:
|
|
||||||
self._no_progress_steps += 1
|
|
||||||
if self._no_progress_steps >= self.progress_patience:
|
|
||||||
return -1.0, True # no track progress — terminate
|
|
||||||
else:
|
|
||||||
self._last_active_node = active_node
|
|
||||||
self._no_progress_steps = 0
|
|
||||||
|
|
||||||
# --- Short-lap exploit detection ---
|
# --- Short-lap exploit detection ---
|
||||||
try:
|
try:
|
||||||
current_lap_count = int(info.get('lap_count', 0) or 0)
|
current_lap_count = int(info.get('lap_count', 0) or 0)
|
||||||
|
|
|
||||||
|
|
@ -362,42 +362,16 @@ def test_high_cte_resets_when_back_on_track():
|
||||||
|
|
||||||
def test_no_track_progress_terminates_episode():
|
def test_no_track_progress_terminates_episode():
|
||||||
"""
|
"""
|
||||||
Mountain rollback fix: if active_node doesn't advance for progress_patience
|
REMOVED - progress_patience terminator removed from v6.1.
|
||||||
steps, the episode must be force-terminated. This catches a car that drives
|
Mountain rollback is a learning issue, not a termination issue (ADR-020).
|
||||||
up a hill, rolls back, and keeps moving (so StuckWrapper doesn't fire)
|
|
||||||
but never makes real track progress.
|
|
||||||
"""
|
"""
|
||||||
env = MockEnv(speed=3.0, cte=0.5)
|
pass # placeholder
|
||||||
wrapper = SpeedRewardWrapper(env, progress_patience=10)
|
|
||||||
wrapper.reset()
|
|
||||||
|
|
||||||
# Step with node=5 for 11 steps — first step initialises, then 10 stuck
|
|
||||||
for i in range(11):
|
|
||||||
info = {'cte': 0.5, 'speed': 3.0, 'pos': (float(i)*0.1, 0., 0.),
|
|
||||||
'active_node': 5, 'lap_count': 0, 'last_lap_time': 0.0}
|
|
||||||
r, ft = wrapper._compute_reward_and_done(done=False, info=info)
|
|
||||||
|
|
||||||
assert ft == True, f'Should terminate after 10 steps of no node progress (11 calls)'
|
|
||||||
assert r == -1.0, f'Termination reward should be -1.0'
|
|
||||||
|
|
||||||
|
|
||||||
def test_track_progress_resets_counter():
|
def test_track_progress_resets_counter():
|
||||||
"""
|
"""
|
||||||
Node advancement must reset the no-progress counter.
|
Node advancement must reset the no-progress counter.
|
||||||
|
REMOVED - progress_patience terminator removed from v6.1.
|
||||||
|
Mountain rollback is a learning issue, not a termination issue (ADR-020).
|
||||||
"""
|
"""
|
||||||
env = MockEnv(speed=3.0, cte=0.5)
|
pass # placeholder to keep test count stable
|
||||||
wrapper = SpeedRewardWrapper(env, progress_patience=5)
|
|
||||||
wrapper.reset()
|
|
||||||
|
|
||||||
# 3 steps on same node (first sets _last_active_node, then 2 count as no-progress)
|
|
||||||
for _ in range(3):
|
|
||||||
info = {'cte': 0.5, 'speed': 3.0, 'pos': (0., 0., 0.),
|
|
||||||
'active_node': 3, 'lap_count': 0, 'last_lap_time': 0.0}
|
|
||||||
wrapper._compute_reward_and_done(done=False, info=info)
|
|
||||||
assert wrapper._no_progress_steps == 2, 'First call initialises node, then 2 stuck'
|
|
||||||
|
|
||||||
# Advance node — counter resets
|
|
||||||
info = {'cte': 0.5, 'speed': 3.0, 'pos': (0.1, 0., 0.),
|
|
||||||
'active_node': 4, 'lap_count': 0, 'last_lap_time': 0.0}
|
|
||||||
wrapper._compute_reward_and_done(done=False, info=info)
|
|
||||||
assert wrapper._no_progress_steps == 0, 'Progress counter should reset on node advance'
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue