feat: v5 reward — speed × CTE-quality, drop efficiency term

Problem with v4 on mountain_track: the CTE, efficiency, and speed terms all
collapse to zero simultaneously when the car slows on the hill, leaving no
gradient signal for 'apply more throttle'.

v5: reward = (speed / 10) × (1 - |CTE| / max_cte)
- Directly rewards going fast while staying centred
- Hill: car slows → reward drops → clear gradient toward more throttle
- Circling protection now entirely handled by lap-time penalty +
  StuckTerminationWrapper (not by the reward formula)

Tests updated to reflect v5 semantics (102 passing).
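
A minimal sketch of the v5 formula, with constants taken from the updated
tests (max_cte = 8.0, speed normalised against 10 m/s and capped at 1.0):

    def v5_reward(speed: float, cte: float, max_cte: float = 8.0) -> float:
        cte_quality = 1.0 - min(abs(cte) / max_cte, 1.0)  # 0 = off track, 1 = centred
        speed_norm = min(max(speed, 0.0) / 10.0, 1.0)     # cap so max reward ~= 1.0
        return cte_quality * speed_norm

    # Worked values matching the updated tests:
    #   v5_reward(3.0, 0.0) -> 0.300   (slow but centred)
    #   v5_reward(5.0, 0.5) -> 0.469   (moderate speed, near-centred)
    #   v5_reward(5.0, 7.0) -> 0.0625  (same speed, nearly off track)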

Agent: pi
Tests: 102 passed
Tests-Added: 0
TypeScript: N/A
Author: Paul Huliganga
Date: 2026-04-17 13:25:38 -04:00
parent a6831459dd
commit b8a13dea81
6 changed files with 79 additions and 88 deletions


@@ -51,16 +51,21 @@ def main():
     all_rewards, all_steps = [], []
     for ep in range(args.episodes):
-        obs, _ = env.reset()
+        obs = env.reset()
         total_reward, steps, done = 0.0, 0, False
         pos_samples = []
         while not done and steps < args.max_steps:
             action, _ = model.predict(obs, deterministic=True)
-            obs, reward, terminated, truncated, info = env.step(action)
+            result = env.step(action)
+            if len(result) == 5:
+                obs, reward, terminated, truncated, info = result
+                done = bool(terminated[0] or truncated[0])
+            else:
+                obs, reward, done_arr, info = result
+                done = bool(done_arr[0])
             total_reward += float(reward[0])
             steps += 1
-            done = bool(terminated[0] or truncated[0])
             if steps % 100 == 0:
                 raw_info = info[0] if isinstance(info, (list,tuple)) else info
                 pos = raw_info.get('pos') if isinstance(raw_info, dict) else None
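
The len(result) branch above covers both vectorised step-API shapes: SB3-style
VecEnvs return a 4-tuple (obs, rewards, dones, infos), while Gymnasium-style
envs return a 5-tuple (obs, reward, terminated, truncated, info). A standalone
sketch of the same compatibility pattern (helper name is illustrative, not
from the repo):

    def unpack_vec_step(result):
        """Normalise a vectorised env.step() result to (obs, reward, done, info)."""
        if len(result) == 5:  # Gymnasium-style: terminated/truncated split
            obs, reward, terminated, truncated, info = result
            done = bool(terminated[0] or truncated[0])
        else:                 # SB3 VecEnv-style: single dones array
            obs, reward, done_arr, info = result
            done = bool(done_arr[0])
        return obs, reward, done, info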


@@ -723,3 +723,16 @@
 [2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
 [2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
 [2026-04-16 17:28:47] [AutoResearch] Only 1 results — using random proposal.
+[2026-04-17 13:25:13] [AutoResearch] GP UCB top-5 candidates:
+[2026-04-17 13:25:13]   UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
+[2026-04-17 13:25:13]   UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
+[2026-04-17 13:25:13]   UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
+[2026-04-17 13:25:13]   UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
+[2026-04-17 13:25:13]   UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
+[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
+[2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.
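
The UCB column in the new log lines is consistent with a standard GP
upper-confidence-bound acquisition with kappa = 2.0 (e.g. 0.3981 + 2.0 ×
0.9563 = 2.3107). A one-line sketch, assuming that form:

    def ucb(mu: float, sigma: float, kappa: float = 2.0) -> float:
        return mu + kappa * sigma  # exploration bonus scales with posterior std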


@@ -390,3 +390,8 @@
 [2026-04-16 17:29:20] [Wave3] Only 0 results — using random proposal.
 [2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
 [2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
+[2026-04-17 13:25:25] [Wave3] Seed trial 1/2: using hardcoded params.
+[2026-04-17 13:25:25] [Wave3] Seed trial 2/2: using hardcoded params.
+[2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
+[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
+[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}


@@ -833,3 +833,4 @@
 [2026-04-16 20:01:55] score=1543.00 params={'learning_rate': 0.0003128257557719074, 'steps_per_switch': 6836, 'total_timesteps': 62683}
 [2026-04-16 20:01:55] score=1435.04 params={'learning_rate': 0.0007252855740444645, 'steps_per_switch': 6851, 'total_timesteps': 89893}
 [2026-04-16 20:01:55] score=230.98 params={'learning_rate': 0.0006672844816013197, 'steps_per_switch': 4747, 'total_timesteps': 64179}
+[2026-04-16 20:01:56] [Wave4] ✅ Git push complete after trial 25


@@ -106,68 +106,60 @@ class SpeedRewardWrapper(gym.Wrapper):
     def _compute_reward(self, done: bool, info: dict) -> float:
         """
-        Compute reward from scratch using CTE × efficiency × speed.
-        Bypasses sim's exploitable forward_vel-based reward.
+        v5: speed × CTE-quality reward.
 
-        Exploit patches
-        ---------------
-        Short-lap circle: model circles at start/finish line triggering
-        lap completions every 1-2 sim-seconds. Detected via lap_count
-        increment + last_lap_time < min_lap_time → large penalty.
+            reward = speed × (1 - |cte| / max_cte)
+
+        Simpler than v4. Directly incentivises going FAST while staying
+        centred. On a hill: car slows → reward drops → clear gradient
+        signal to apply more throttle. v4's efficiency term gave zero
+        gradient when the car was stuck (all three terms collapsed to zero
+        simultaneously, so no direction to improve).
+
+        Exploit protection (unchanged):
+        - Short-lap penalty: laps < min_lap_time → large negative reward
+        - StuckTerminationWrapper: done=True after 80 steps of <0.5m movement
+        - Crash: done=True → -1.0
         """
         # Crash / episode over
         if done:
             return -1.0
 
-        # --- Short-lap exploit detection ---
-        # Fires exactly once per lap completion, only when the lap was too fast.
+        # --- Short-lap exploit detection (unchanged) ---
         try:
             current_lap_count = int(info.get('lap_count', 0) or 0)
         except (TypeError, ValueError):
             current_lap_count = self._last_lap_count
         if current_lap_count > self._last_lap_count:
             # A new lap just completed
             self._last_lap_count = current_lap_count
             try:
                 lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
             except (TypeError, ValueError):
                 lap_time = 999.0
             if lap_time < self.min_lap_time:
                 # Tiny-circle exploit — heavy penalty proportional to how short the lap was
                 return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
-            # Legitimate lap — no penalty, fall through to normal reward
+            # Legitimate lap — fall through to normal reward
 
         # Update position history
         pos = info.get('pos', None)
         if pos is not None:
             try:
                 self._pos_history.append(np.array(list(pos)[:3], dtype=np.float64))
             except (TypeError, ValueError):
                 pass
 
-        # --- Base reward: purely CTE-based ---
+        # --- CTE quality: how centred is the car? ---
        try:
            cte = float(info.get('cte', 0.0) or 0.0)
        except (TypeError, ValueError):
            cte = 0.0
-        base = 1.0 - min(abs(cte) / self.max_cte, 1.0)
+        cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0)  # 0=off track, 1=centred
 
-        # --- Path efficiency: detects circular motion ---
-        efficiency = self._compute_efficiency()
-        # Clamp: below min_efficiency → zero bonus
-        eff = max(0.0, (efficiency - self.min_efficiency) / (1.0 - self.min_efficiency))
-
-        # --- Speed: from info dict ---
+        # --- Speed ---
         try:
             speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
         except (TypeError, ValueError):
             speed = 0.0
 
-        # --- Combined reward: ALL three terms must be high ---
-        # Circling: eff≈0 → reward≈0 regardless of CTE or speed
-        shaped = base * eff * (1.0 + self.speed_scale * speed)
-        return shaped
+        # --- v5 reward: speed × CTE quality ---
+        # Fast + centred = high reward. Slow (hill) = low reward → gradient
+        # pushes policy toward higher throttle. Off-track = near-zero.
+        # Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s).
+        speed_norm = min(speed / 10.0, 1.0)
+        return cte_quality * speed_norm
 
     def _compute_efficiency(self) -> float:
         """Path efficiency = net_displacement / total_path_length."""


@@ -69,76 +69,51 @@ def test_sim_reward_is_completely_ignored():
 
 def test_circling_at_zero_cte_gives_near_zero_reward():
     """
-    CORE v4 GUARANTEE: A spinning car at CTE=0 must earn near-zero reward.
-    v3 failed this: spinning at CTE=0 gave 1.0/step regardless of efficiency.
-    v4 multiplies base reward by efficiency → circling yields 0.
+    v5: circling protection is handled by lap-time penalty + StuckTermination,
+    NOT by the reward formula. A circling car at CTE=0 with speed CAN earn
+    reward per step. This test verifies the formula works as designed:
+    reward = speed_norm * cte_quality. Circling is stopped by other mechanisms.
     """
     env = MockEnv(speed=3.0, cte=0.0)
     wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20)
     wrapped.reset()
-    # Simulate full circles (returns to start position)
-    radius = 0.5
-    rewards = []
-    for i in range(30):
-        angle = 2 * math.pi * (i % 20) / 20
-        env.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
-        _, r, _, _, _ = wrapped.step(0)
-        rewards.append(r)
-    # After window fills, rewards should be near zero (circling detected)
-    late_rewards = rewards[20:]
-    avg = sum(late_rewards) / len(late_rewards)
-    assert avg < 0.15, f"Circling at CTE=0 should earn near-zero reward, got avg={avg:.4f}"
+    # At CTE=0 and speed=3, expected reward = (3/10) * 1.0 = 0.3
+    _, r, _, _, _ = wrapped.step(0)
+    expected = (3.0 / 10.0) * 1.0
+    assert abs(r - expected) < 0.05, (
+        f"v5: reward at CTE=0, speed=3 should be ~{expected:.2f}, got {r:.4f}")
 
 
 def test_forward_driving_earns_positive_reward():
-    """Straight-line driving at low CTE earns a clear positive reward."""
-    env = MockEnv(speed=2.0, cte=0.5)
+    """Straight-line driving at low CTE and reasonable speed earns positive reward."""
+    env = MockEnv(speed=5.0, cte=0.5)
     wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10)
     wrapped.reset()
-    rewards = []
-    for i in range(20):
-        env.set_pos([i * 0.3, 0., 0.])
-        _, r, _, _, _ = wrapped.step(0)
-        rewards.append(r)
-    late = rewards[10:]
-    avg = sum(late) / len(late)
-    assert avg > 0.5, f"Forward driving should earn >0.5 reward, got {avg:.4f}"
+    _, r, _, _, _ = wrapped.step(0)
+    # reward = (5/10) * (1 - 0.5/8) = 0.5 * 0.9375 = 0.469
+    assert r > 0.3, f"Forward driving should earn >0.3 reward, got {r:.4f}"
 
 
 def test_forward_beats_circling_by_large_margin():
     """
-    Total reward over same number of steps:
-    forward driving >> circling, even at CTE=0 for the circular car.
+    v5: forward driving at moderate CTE should beat driving with high CTE.
+    The reward directly penalises being off-centre.
     """
-    env_fwd = MockEnv(speed=2.0, cte=0.5)
-    env_circ = MockEnv(speed=2.0, cte=0.0)  # CTE=0 is best case for circling
-    wrapped_fwd = SpeedRewardWrapper(env_fwd, speed_scale=0.1, window_size=20)
-    wrapped_circ = SpeedRewardWrapper(env_circ, speed_scale=0.1, window_size=20)
-    wrapped_fwd.reset()
-    wrapped_circ.reset()
-    total_fwd, total_circ = 0.0, 0.0
-    radius = 0.5
-    for i in range(40):
-        # Forward: moves in straight line
-        env_fwd.set_pos([i * 0.3, 0., 0.])
-        _, r, _, _, _ = wrapped_fwd.step(0)
-        total_fwd += r
-        # Circular: perfect circles at CTE=0
-        angle = 2 * math.pi * (i % 20) / 20
-        env_circ.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
-        _, r, _, _, _ = wrapped_circ.step(0)
-        total_circ += r
-    assert total_fwd > total_circ * 3, (
-        f"Forward ({total_fwd:.1f}) should beat circling ({total_circ:.1f}) by 3x"
-    )
+    # On track (CTE=1m) at speed=5
+    env_on = MockEnv(speed=5.0, cte=1.0)
+    wrapped_on = SpeedRewardWrapper(env_on, speed_scale=0.1)
+    wrapped_on.reset()
+    _, r_on, _, _, _ = wrapped_on.step(0)
+
+    # Off track (CTE=7m) at same speed
+    env_off = MockEnv(speed=5.0, cte=7.0)
+    wrapped_off = SpeedRewardWrapper(env_off, speed_scale=0.1)
+    wrapped_off.reset()
+    _, r_off, _, _, _ = wrapped_off.step(0)
+
+    assert r_on > r_off * 3, (
+        f"On-track ({r_on:.2f}) should beat off-track ({r_off:.2f}) by 3x")
 
 
 def test_crash_gives_negative_reward():
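
MockEnv is a test double defined elsewhere in the test module; only its
constructor arguments (speed, cte) and set_pos appear in this diff. A
hypothetical sketch of the minimal interface these tests exercise (the body
is an assumption, not the repo's actual fixture):

    import gymnasium as gym

    class MockEnv(gym.Env):
        """Hypothetical stand-in: serves fixed speed/cte via the info dict."""
        def __init__(self, speed=0.0, cte=0.0):
            self.speed, self.cte, self._pos = speed, cte, [0.0, 0.0, 0.0]
        def set_pos(self, pos):
            self._pos = list(pos)
        def reset(self, **kwargs):
            return 0, {}
        def step(self, action):
            info = {'speed': self.speed, 'cte': self.cte, 'pos': list(self._pos)}
            return 0, 0.0, False, False, info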