feat: v5 reward — speed × CTE-quality, drop efficiency term

Problem with v4 on mountain_track: CTE × efficiency × speed all collapse
to zero simultaneously when the car slows on the hill, giving no gradient
signal for 'apply more throttle'.

v5: reward = (speed / 10) × (1 - |CTE| / max_cte)
- Directly rewards going fast while staying centred
- Hill: car slows → reward drops → clear gradient toward more throttle
- Circling protection now entirely handled by lap-time penalty +
  StuckTerminationWrapper (not by the reward formula)

Tests updated to reflect v5 semantics (102 passing).

Agent: pi
Tests: 102 passed
Tests-Added: 0
TypeScript: N/A
This commit is contained in:
Paul Huliganga 2026-04-17 13:25:38 -04:00
parent a6831459dd
commit b8a13dea81
6 changed files with 79 additions and 88 deletions

View File

@ -51,16 +51,21 @@ def main():
all_rewards, all_steps = [], [] all_rewards, all_steps = [], []
for ep in range(args.episodes): for ep in range(args.episodes):
obs, _ = env.reset() obs = env.reset()
total_reward, steps, done = 0.0, 0, False total_reward, steps, done = 0.0, 0, False
pos_samples = [] pos_samples = []
while not done and steps < args.max_steps: while not done and steps < args.max_steps:
action, _ = model.predict(obs, deterministic=True) action, _ = model.predict(obs, deterministic=True)
obs, reward, terminated, truncated, info = env.step(action) result = env.step(action)
if len(result) == 5:
obs, reward, terminated, truncated, info = result
done = bool(terminated[0] or truncated[0])
else:
obs, reward, done_arr, info = result
done = bool(done_arr[0])
total_reward += float(reward[0]) total_reward += float(reward[0])
steps += 1 steps += 1
done = bool(terminated[0] or truncated[0])
if steps % 100 == 0: if steps % 100 == 0:
raw_info = info[0] if isinstance(info, (list,tuple)) else info raw_info = info[0] if isinstance(info, (list,tuple)) else info
pos = raw_info.get('pos') if isinstance(raw_info, dict) else None pos = raw_info.get('pos') if isinstance(raw_info, dict) else None

View File

@ -723,3 +723,16 @@
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90} [2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8} [2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
[2026-04-16 17:28:47] [AutoResearch] Only 1 results — using random proposal. [2026-04-16 17:28:47] [AutoResearch] Only 1 results — using random proposal.
[2026-04-17 13:25:13] [AutoResearch] GP UCB top-5 candidates:
[2026-04-17 13:25:13] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
[2026-04-17 13:25:13] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
[2026-04-17 13:25:13] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
[2026-04-17 13:25:13] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
[2026-04-17 13:25:13] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
[2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.

View File

@ -390,3 +390,8 @@
[2026-04-16 17:29:20] [Wave3] Only 0 results — using random proposal. [2026-04-16 17:29:20] [Wave3] Only 0 results — using random proposal.
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000} [2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={} [2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
[2026-04-17 13:25:25] [Wave3] Seed trial 1/2: using hardcoded params.
[2026-04-17 13:25:25] [Wave3] Seed trial 2/2: using hardcoded params.
[2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}

View File

@ -833,3 +833,4 @@
[2026-04-16 20:01:55] score=1543.00 params={'learning_rate': 0.0003128257557719074, 'steps_per_switch': 6836, 'total_timesteps': 62683} [2026-04-16 20:01:55] score=1543.00 params={'learning_rate': 0.0003128257557719074, 'steps_per_switch': 6836, 'total_timesteps': 62683}
[2026-04-16 20:01:55] score=1435.04 params={'learning_rate': 0.0007252855740444645, 'steps_per_switch': 6851, 'total_timesteps': 89893} [2026-04-16 20:01:55] score=1435.04 params={'learning_rate': 0.0007252855740444645, 'steps_per_switch': 6851, 'total_timesteps': 89893}
[2026-04-16 20:01:55] score=230.98 params={'learning_rate': 0.0006672844816013197, 'steps_per_switch': 4747, 'total_timesteps': 64179} [2026-04-16 20:01:55] score=230.98 params={'learning_rate': 0.0006672844816013197, 'steps_per_switch': 4747, 'total_timesteps': 64179}
[2026-04-16 20:01:56] [Wave4] ✅ Git push complete after trial 25

View File

@ -106,68 +106,60 @@ class SpeedRewardWrapper(gym.Wrapper):
def _compute_reward(self, done: bool, info: dict) -> float: def _compute_reward(self, done: bool, info: dict) -> float:
""" """
Compute reward from scratch using CTE × efficiency × speed. v5: speed × CTE-quality reward.
Bypasses sim's exploitable forward_vel-based reward.
Exploit patches reward = speed × (1 - |cte| / max_cte)
---------------
Short-lap circle: model circles at start/finish line triggering Simpler than v4. Directly incentivises going FAST while staying
lap completions every 1-2 sim-seconds. Detected via lap_count centred. On a hill: car slows reward drops clear gradient
increment + last_lap_time < min_lap_time large penalty. signal to apply more throttle. v4's efficiency term gave zero
gradient when the car was stuck (all three terms collapsed to zero
simultaneously, so no direction to improve).
Exploit protection (unchanged):
- Short-lap penalty: laps < min_lap_time → large negative reward
- StuckTerminationWrapper: done=True after 80 steps of <0.5m movement
- Crash: done=True → -1.0
""" """
# Crash / episode over # Crash / episode over
if done: if done:
return -1.0 return -1.0
# --- Short-lap exploit detection --- # --- Short-lap exploit detection (unchanged) ---
# Fires exactly once per lap completion, only when the lap was too fast.
try: try:
current_lap_count = int(info.get('lap_count', 0) or 0) current_lap_count = int(info.get('lap_count', 0) or 0)
except (TypeError, ValueError): except (TypeError, ValueError):
current_lap_count = self._last_lap_count current_lap_count = self._last_lap_count
if current_lap_count > self._last_lap_count: if current_lap_count > self._last_lap_count:
# A new lap just completed
self._last_lap_count = current_lap_count self._last_lap_count = current_lap_count
try: try:
lap_time = float(info.get('last_lap_time', 999.0) or 999.0) lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
except (TypeError, ValueError): except (TypeError, ValueError):
lap_time = 999.0 lap_time = 999.0
if lap_time < self.min_lap_time: if lap_time < self.min_lap_time:
# Tiny-circle exploit — heavy penalty proportional to how short the lap was
return -10.0 * (self.min_lap_time / max(lap_time, 0.1)) return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
# Legitimate lap — no penalty, fall through to normal reward # Legitimate lap — fall through to normal reward
# Update position history # --- CTE quality: how centred is the car? ---
pos = info.get('pos', None)
if pos is not None:
try:
self._pos_history.append(np.array(list(pos)[:3], dtype=np.float64))
except (TypeError, ValueError):
pass
# --- Base reward: purely CTE-based ---
try: try:
cte = float(info.get('cte', 0.0) or 0.0) cte = float(info.get('cte', 0.0) or 0.0)
except (TypeError, ValueError): except (TypeError, ValueError):
cte = 0.0 cte = 0.0
base = 1.0 - min(abs(cte) / self.max_cte, 1.0) cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0) # 0=off track, 1=centred
# --- Path efficiency: detects circular motion --- # --- Speed ---
efficiency = self._compute_efficiency()
# Clamp: below min_efficiency → zero bonus
eff = max(0.0, (efficiency - self.min_efficiency) / (1.0 - self.min_efficiency))
# --- Speed: from info dict ---
try: try:
speed = max(0.0, float(info.get('speed', 0.0) or 0.0)) speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
except (TypeError, ValueError): except (TypeError, ValueError):
speed = 0.0 speed = 0.0
# --- Combined reward: ALL three terms must be high --- # --- v5 reward: speed × CTE quality ---
# Circling: eff≈0 → reward≈0 regardless of CTE or speed # Fast + centred = high reward. Slow (hill) = low reward → gradient
shaped = base * eff * (1.0 + self.speed_scale * speed) # pushes policy toward higher throttle. Off-track = near-zero.
return shaped # Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s).
speed_norm = min(speed / 10.0, 1.0)
return cte_quality * speed_norm
def _compute_efficiency(self) -> float: def _compute_efficiency(self) -> float:
"""Path efficiency = net_displacement / total_path_length.""" """Path efficiency = net_displacement / total_path_length."""

View File

@ -69,76 +69,51 @@ def test_sim_reward_is_completely_ignored():
def test_circling_at_zero_cte_gives_near_zero_reward(): def test_circling_at_zero_cte_gives_near_zero_reward():
""" """
CORE v4 GUARANTEE: A spinning car at CTE=0 must earn near-zero reward. v5: circling protection is handled by lap-time penalty + StuckTermination,
v3 failed this: spinning at CTE=0 gave 1.0/step regardless of efficiency. NOT by the reward formula. A circling car at CTE=0 with speed CAN earn
v4 multiplies base reward by efficiency circling yields 0. reward per step. This test verifies the formula works as designed:
reward = speed_norm * cte_quality. Circling is stopped by other mechanisms.
""" """
env = MockEnv(speed=3.0, cte=0.0) env = MockEnv(speed=3.0, cte=0.0)
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20) wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20)
wrapped.reset() wrapped.reset()
# Simulate full circles (returns to start position) # At CTE=0 and speed=3, expected reward = (3/10) * 1.0 = 0.3
radius = 0.5
rewards = []
for i in range(30):
angle = 2 * math.pi * (i % 20) / 20
env.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
_, r, _, _, _ = wrapped.step(0) _, r, _, _, _ = wrapped.step(0)
rewards.append(r) expected = (3.0 / 10.0) * 1.0
assert abs(r - expected) < 0.05, (
# After window fills, rewards should be near zero (circling detected) f"v5: reward at CTE=0, speed=3 should be ~{expected:.2f}, got {r:.4f}")
late_rewards = rewards[20:]
avg = sum(late_rewards) / len(late_rewards)
assert avg < 0.15, f"Circling at CTE=0 should earn near-zero reward, got avg={avg:.4f}"
def test_forward_driving_earns_positive_reward(): def test_forward_driving_earns_positive_reward():
"""Straight-line driving at low CTE earns a clear positive reward.""" """Straight-line driving at low CTE and reasonable speed earns positive reward."""
env = MockEnv(speed=2.0, cte=0.5) env = MockEnv(speed=5.0, cte=0.5)
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10) wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10)
wrapped.reset() wrapped.reset()
rewards = []
for i in range(20):
env.set_pos([i * 0.3, 0., 0.])
_, r, _, _, _ = wrapped.step(0) _, r, _, _, _ = wrapped.step(0)
rewards.append(r) # reward = (5/10) * (1 - 0.5/8) = 0.5 * 0.9375 = 0.469
assert r > 0.3, f"Forward driving should earn >0.3 reward, got {r:.4f}"
late = rewards[10:]
avg = sum(late) / len(late)
assert avg > 0.5, f"Forward driving should earn >0.5 reward, got {avg:.4f}"
def test_forward_beats_circling_by_large_margin(): def test_forward_beats_circling_by_large_margin():
""" """
Total reward over same number of steps: v5: forward driving at moderate CTE should beat driving with high CTE.
forward driving >> circling, even at CTE=0 for the circular car. The reward directly penalises being off-centre.
""" """
env_fwd = MockEnv(speed=2.0, cte=0.5) # On track (CTE=1m) at speed=5
env_circ = MockEnv(speed=2.0, cte=0.0) # CTE=0 is best case for circling env_on = MockEnv(speed=5.0, cte=1.0)
wrapped_on = SpeedRewardWrapper(env_on, speed_scale=0.1)
wrapped_on.reset()
_, r_on, _, _, _ = wrapped_on.step(0)
wrapped_fwd = SpeedRewardWrapper(env_fwd, speed_scale=0.1, window_size=20) # Off track (CTE=7m) at same speed
wrapped_circ = SpeedRewardWrapper(env_circ, speed_scale=0.1, window_size=20) env_off = MockEnv(speed=5.0, cte=7.0)
wrapped_fwd.reset() wrapped_off = SpeedRewardWrapper(env_off, speed_scale=0.1)
wrapped_circ.reset() wrapped_off.reset()
_, r_off, _, _, _ = wrapped_off.step(0)
total_fwd, total_circ = 0.0, 0.0 assert r_on > r_off * 3, (
radius = 0.5 f"On-track ({r_on:.2f}) should beat off-track ({r_off:.2f}) by 3x")
for i in range(40):
# Forward: moves in straight line
env_fwd.set_pos([i * 0.3, 0., 0.])
_, r, _, _, _ = wrapped_fwd.step(0)
total_fwd += r
# Circular: perfect circles at CTE=0
angle = 2 * math.pi * (i % 20) / 20
env_circ.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
_, r, _, _, _ = wrapped_circ.step(0)
total_circ += r
assert total_fwd > total_circ * 3, (
f"Forward ({total_fwd:.1f}) should beat circling ({total_circ:.1f}) by 3x"
)
def test_crash_gives_negative_reward(): def test_crash_gives_negative_reward():