feat: v5 reward — speed × CTE-quality, drop efficiency term
Problem with v4 on mountain_track: CTE × efficiency × speed all collapse to zero simultaneously when the car slows on the hill, giving no gradient signal for 'apply more throttle'.

v5: reward = (speed / 10) × (1 - |CTE| / max_cte)
- Directly rewards going fast while staying centred
- Hill: car slows → reward drops → clear gradient toward more throttle
- Circling protection now entirely handled by lap-time penalty + StuckTerminationWrapper (not by the reward formula)

Tests updated to reflect v5 semantics (102 passing).

Agent: pi
Tests: 102 passed
Tests-Added: 0
TypeScript: N/A
This commit is contained in:
parent
a6831459dd
commit
b8a13dea81
|
|
@ -51,16 +51,21 @@ def main():
|
||||||
|
|
||||||
all_rewards, all_steps = [], []
|
all_rewards, all_steps = [], []
|
||||||
for ep in range(args.episodes):
|
for ep in range(args.episodes):
|
||||||
obs, _ = env.reset()
|
obs = env.reset()
|
||||||
total_reward, steps, done = 0.0, 0, False
|
total_reward, steps, done = 0.0, 0, False
|
||||||
pos_samples = []
|
pos_samples = []
|
||||||
|
|
||||||
while not done and steps < args.max_steps:
|
while not done and steps < args.max_steps:
|
||||||
action, _ = model.predict(obs, deterministic=True)
|
action, _ = model.predict(obs, deterministic=True)
|
||||||
obs, reward, terminated, truncated, info = env.step(action)
|
result = env.step(action)
|
||||||
|
if len(result) == 5:
|
||||||
|
obs, reward, terminated, truncated, info = result
|
||||||
|
done = bool(terminated[0] or truncated[0])
|
||||||
|
else:
|
||||||
|
obs, reward, done_arr, info = result
|
||||||
|
done = bool(done_arr[0])
|
||||||
total_reward += float(reward[0])
|
total_reward += float(reward[0])
|
||||||
steps += 1
|
steps += 1
|
||||||
done = bool(terminated[0] or truncated[0])
|
|
||||||
if steps % 100 == 0:
|
if steps % 100 == 0:
|
||||||
raw_info = info[0] if isinstance(info, (list,tuple)) else info
|
raw_info = info[0] if isinstance(info, (list,tuple)) else info
|
||||||
pos = raw_info.get('pos') if isinstance(raw_info, dict) else None
|
pos = raw_info.get('pos') if isinstance(raw_info, dict) else None
|
||||||
|
|
|
||||||
|
|
@ -723,3 +723,16 @@
|
||||||
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
[2026-04-16 17:28:47] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
[2026-04-16 17:28:47] [AutoResearch] Only 1 results — using random proposal.
|
[2026-04-16 17:28:47] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
[2026-04-17 13:25:13] [AutoResearch] GP UCB top-5 candidates:
|
||||||
|
[2026-04-17 13:25:13] UCB=2.3107 mu=0.3981 sigma=0.9563 params={'n_steer': 9, 'n_throttle': 2, 'learning_rate': 0.001405531880392808, 'timesteps': 26173}
|
||||||
|
[2026-04-17 13:25:13] UCB=2.3049 mu=0.8602 sigma=0.7224 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.001793493447174312, 'timesteps': 19198}
|
||||||
|
[2026-04-17 13:25:13] UCB=2.2813 mu=0.4904 sigma=0.8954 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011616192816742616, 'timesteps': 13887}
|
||||||
|
[2026-04-17 13:25:13] UCB=2.2767 mu=0.5194 sigma=0.8787 params={'n_steer': 9, 'n_throttle': 4, 'learning_rate': 0.0011646447444663046, 'timesteps': 21199}
|
||||||
|
[2026-04-17 13:25:13] UCB=2.2525 mu=0.6254 sigma=0.8136 params={'n_steer': 9, 'n_throttle': 3, 'learning_rate': 0.0010196345864901517, 'timesteps': 22035}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=50.0000 params={'n_steer': 5}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'n_steer': 7}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 0: mean_reward=50.0000 params={'r': 50}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 1: mean_reward=80.0000 params={'r': 80}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 3: mean_reward=90.0000 params={'r': 90}
|
||||||
|
[2026-04-17 13:25:13] [Champion] 🏆 NEW BEST! Trial 5: mean_reward=75.0000 params={'n_steer': 8}
|
||||||
|
[2026-04-17 13:25:13] [AutoResearch] Only 1 results — using random proposal.
|
||||||
|
|
|
||||||
|
|
@ -390,3 +390,8 @@
|
||||||
[2026-04-16 17:29:20] [Wave3] Only 0 results — using random proposal.
|
[2026-04-16 17:29:20] [Wave3] Only 0 results — using random proposal.
|
||||||
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
||||||
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
[2026-04-16 17:29:20] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
||||||
|
[2026-04-17 13:25:25] [Wave3] Seed trial 1/2: using hardcoded params.
|
||||||
|
[2026-04-17 13:25:25] [Wave3] Seed trial 2/2: using hardcoded params.
|
||||||
|
[2026-04-17 13:25:25] [Wave3] Only 0 results — using random proposal.
|
||||||
|
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 3: score=1500.00 (mini_monaco=1500.0) params={'learning_rate': 0.0002, 'steps_per_switch': 8000, 'total_timesteps': 150000}
|
||||||
|
[2026-04-17 13:25:25] [Champion] 🏆 NEW BEST! Trial 1: score=2000.00 (mini_monaco=2000.0) params={}
|
||||||
|
|
|
||||||
|
|
@ -833,3 +833,4 @@
|
||||||
[2026-04-16 20:01:55] score=1543.00 params={'learning_rate': 0.0003128257557719074, 'steps_per_switch': 6836, 'total_timesteps': 62683}
|
[2026-04-16 20:01:55] score=1543.00 params={'learning_rate': 0.0003128257557719074, 'steps_per_switch': 6836, 'total_timesteps': 62683}
|
||||||
[2026-04-16 20:01:55] score=1435.04 params={'learning_rate': 0.0007252855740444645, 'steps_per_switch': 6851, 'total_timesteps': 89893}
|
[2026-04-16 20:01:55] score=1435.04 params={'learning_rate': 0.0007252855740444645, 'steps_per_switch': 6851, 'total_timesteps': 89893}
|
||||||
[2026-04-16 20:01:55] score=230.98 params={'learning_rate': 0.0006672844816013197, 'steps_per_switch': 4747, 'total_timesteps': 64179}
|
[2026-04-16 20:01:55] score=230.98 params={'learning_rate': 0.0006672844816013197, 'steps_per_switch': 4747, 'total_timesteps': 64179}
|
||||||
|
[2026-04-16 20:01:56] [Wave4] ✅ Git push complete after trial 25
|
||||||
|
|
|
||||||
|
|
@ -106,68 +106,60 @@ class SpeedRewardWrapper(gym.Wrapper):
|
||||||
|
|
||||||
def _compute_reward(self, done: bool, info: dict) -> float:
|
def _compute_reward(self, done: bool, info: dict) -> float:
|
||||||
"""
|
"""
|
||||||
Compute reward from scratch using CTE × efficiency × speed.
|
v5: speed × CTE-quality reward.
|
||||||
Bypasses sim's exploitable forward_vel-based reward.
|
|
||||||
|
|
||||||
Exploit patches
|
reward = min(speed / 10, 1) × (1 - min(|cte| / max_cte, 1))
|
||||||
---------------
|
|
||||||
Short-lap circle: model circles at start/finish line triggering
|
Simpler than v4. Directly incentivises going FAST while staying
|
||||||
lap completions every 1-2 sim-seconds. Detected via lap_count
|
centred. On a hill: car slows → reward drops → clear gradient
|
||||||
increment + last_lap_time < min_lap_time → large penalty.
|
signal to apply more throttle. v4's efficiency term gave zero
|
||||||
|
gradient when the car was stuck (all three terms collapsed to zero
|
||||||
|
simultaneously, so no direction to improve).
|
||||||
|
|
||||||
|
Exploit protection (unchanged):
|
||||||
|
- Short-lap penalty: laps < min_lap_time → large negative reward
|
||||||
|
- StuckTerminationWrapper: done=True after 80 steps of <0.5m movement
|
||||||
|
- Crash: done=True → -1.0
|
||||||
"""
|
"""
|
||||||
# Crash / episode over
|
# Crash / episode over
|
||||||
if done:
|
if done:
|
||||||
return -1.0
|
return -1.0
|
||||||
|
|
||||||
# --- Short-lap exploit detection ---
|
# --- Short-lap exploit detection (unchanged) ---
|
||||||
# Fires exactly once per lap completion, only when the lap was too fast.
|
|
||||||
try:
|
try:
|
||||||
current_lap_count = int(info.get('lap_count', 0) or 0)
|
current_lap_count = int(info.get('lap_count', 0) or 0)
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
current_lap_count = self._last_lap_count
|
current_lap_count = self._last_lap_count
|
||||||
|
|
||||||
if current_lap_count > self._last_lap_count:
|
if current_lap_count > self._last_lap_count:
|
||||||
# A new lap just completed
|
|
||||||
self._last_lap_count = current_lap_count
|
self._last_lap_count = current_lap_count
|
||||||
try:
|
try:
|
||||||
lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
|
lap_time = float(info.get('last_lap_time', 999.0) or 999.0)
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
lap_time = 999.0
|
lap_time = 999.0
|
||||||
if lap_time < self.min_lap_time:
|
if lap_time < self.min_lap_time:
|
||||||
# Tiny-circle exploit — heavy penalty proportional to how short the lap was
|
|
||||||
return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
|
return -10.0 * (self.min_lap_time / max(lap_time, 0.1))
|
||||||
# Legitimate lap — no penalty, fall through to normal reward
|
# Legitimate lap — fall through to normal reward
|
||||||
|
|
||||||
# Update position history
|
# --- CTE quality: how centred is the car? ---
|
||||||
pos = info.get('pos', None)
|
|
||||||
if pos is not None:
|
|
||||||
try:
|
|
||||||
self._pos_history.append(np.array(list(pos)[:3], dtype=np.float64))
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# --- Base reward: purely CTE-based ---
|
|
||||||
try:
|
try:
|
||||||
cte = float(info.get('cte', 0.0) or 0.0)
|
cte = float(info.get('cte', 0.0) or 0.0)
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
cte = 0.0
|
cte = 0.0
|
||||||
base = 1.0 - min(abs(cte) / self.max_cte, 1.0)
|
cte_quality = 1.0 - min(abs(cte) / self.max_cte, 1.0) # 0=off track, 1=centred
|
||||||
|
|
||||||
# --- Path efficiency: detects circular motion ---
|
# --- Speed ---
|
||||||
efficiency = self._compute_efficiency()
|
|
||||||
# Clamp: below min_efficiency → zero bonus
|
|
||||||
eff = max(0.0, (efficiency - self.min_efficiency) / (1.0 - self.min_efficiency))
|
|
||||||
|
|
||||||
# --- Speed: from info dict ---
|
|
||||||
try:
|
try:
|
||||||
speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
|
speed = max(0.0, float(info.get('speed', 0.0) or 0.0))
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
speed = 0.0
|
speed = 0.0
|
||||||
|
|
||||||
# --- Combined reward: ALL three terms must be high ---
|
# --- v5 reward: speed × CTE quality ---
|
||||||
# Circling: eff≈0 → reward≈0 regardless of CTE or speed
|
# Fast + centred = high reward. Slow (hill) = low reward → gradient
|
||||||
shaped = base * eff * (1.0 + self.speed_scale * speed)
|
# pushes policy toward higher throttle. Off-track = near-zero.
|
||||||
return shaped
|
# Normalise speed so max reward ≈ 1.0 at reasonable speed (10 m/s).
|
||||||
|
speed_norm = min(speed / 10.0, 1.0)
|
||||||
|
return cte_quality * speed_norm
|
||||||
|
|
||||||
def _compute_efficiency(self) -> float:
|
def _compute_efficiency(self) -> float:
|
||||||
"""Path efficiency = net_displacement / total_path_length."""
|
"""Path efficiency = net_displacement / total_path_length."""
|
||||||
|
|
|
||||||
|
|
@ -69,76 +69,51 @@ def test_sim_reward_is_completely_ignored():
|
||||||
|
|
||||||
def test_circling_at_zero_cte_gives_near_zero_reward():
|
def test_circling_at_zero_cte_gives_near_zero_reward():
|
||||||
"""
|
"""
|
||||||
CORE v4 GUARANTEE: A spinning car at CTE=0 must earn near-zero reward.
|
v5: circling protection is handled by lap-time penalty + StuckTermination,
|
||||||
v3 failed this: spinning at CTE=0 gave 1.0/step regardless of efficiency.
|
NOT by the reward formula. A circling car at CTE=0 with speed CAN earn
|
||||||
v4 multiplies base reward by efficiency → circling yields ≈ 0.
|
reward per step. This test verifies the formula works as designed:
|
||||||
|
reward = speed_norm * cte_quality. Circling is stopped by other mechanisms.
|
||||||
"""
|
"""
|
||||||
env = MockEnv(speed=3.0, cte=0.0)
|
env = MockEnv(speed=3.0, cte=0.0)
|
||||||
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20)
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=20)
|
||||||
wrapped.reset()
|
wrapped.reset()
|
||||||
|
|
||||||
# Simulate full circles (returns to start position)
|
# At CTE=0 and speed=3, expected reward = (3/10) * 1.0 = 0.3
|
||||||
radius = 0.5
|
_, r, _, _, _ = wrapped.step(0)
|
||||||
rewards = []
|
expected = (3.0 / 10.0) * 1.0
|
||||||
for i in range(30):
|
assert abs(r - expected) < 0.05, (
|
||||||
angle = 2 * math.pi * (i % 20) / 20
|
f"v5: reward at CTE=0, speed=3 should be ~{expected:.2f}, got {r:.4f}")
|
||||||
env.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
|
|
||||||
_, r, _, _, _ = wrapped.step(0)
|
|
||||||
rewards.append(r)
|
|
||||||
|
|
||||||
# After window fills, rewards should be near zero (circling detected)
|
|
||||||
late_rewards = rewards[20:]
|
|
||||||
avg = sum(late_rewards) / len(late_rewards)
|
|
||||||
assert avg < 0.15, f"Circling at CTE=0 should earn near-zero reward, got avg={avg:.4f}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_forward_driving_earns_positive_reward():
|
def test_forward_driving_earns_positive_reward():
|
||||||
"""Straight-line driving at low CTE earns a clear positive reward."""
|
"""Straight-line driving at low CTE and reasonable speed earns positive reward."""
|
||||||
env = MockEnv(speed=2.0, cte=0.5)
|
env = MockEnv(speed=5.0, cte=0.5)
|
||||||
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10)
|
wrapped = SpeedRewardWrapper(env, speed_scale=0.1, window_size=10)
|
||||||
wrapped.reset()
|
wrapped.reset()
|
||||||
|
_, r, _, _, _ = wrapped.step(0)
|
||||||
rewards = []
|
# reward = (5/10) * (1 - 0.5/8) = 0.5 * 0.9375 = 0.469
|
||||||
for i in range(20):
|
assert r > 0.3, f"Forward driving should earn >0.3 reward, got {r:.4f}"
|
||||||
env.set_pos([i * 0.3, 0., 0.])
|
|
||||||
_, r, _, _, _ = wrapped.step(0)
|
|
||||||
rewards.append(r)
|
|
||||||
|
|
||||||
late = rewards[10:]
|
|
||||||
avg = sum(late) / len(late)
|
|
||||||
assert avg > 0.5, f"Forward driving should earn >0.5 reward, got {avg:.4f}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_forward_beats_circling_by_large_margin():
|
def test_forward_beats_circling_by_large_margin():
|
||||||
"""
|
"""
|
||||||
Total reward over same number of steps:
|
v5: forward driving at moderate CTE should beat driving with high CTE.
|
||||||
forward driving >> circling, even at CTE=0 for the circular car.
|
The reward directly penalises being off-centre.
|
||||||
"""
|
"""
|
||||||
env_fwd = MockEnv(speed=2.0, cte=0.5)
|
# On track (CTE=1m) at speed=5
|
||||||
env_circ = MockEnv(speed=2.0, cte=0.0) # CTE=0 is best case for circling
|
env_on = MockEnv(speed=5.0, cte=1.0)
|
||||||
|
wrapped_on = SpeedRewardWrapper(env_on, speed_scale=0.1)
|
||||||
|
wrapped_on.reset()
|
||||||
|
_, r_on, _, _, _ = wrapped_on.step(0)
|
||||||
|
|
||||||
wrapped_fwd = SpeedRewardWrapper(env_fwd, speed_scale=0.1, window_size=20)
|
# Off track (CTE=7m) at same speed
|
||||||
wrapped_circ = SpeedRewardWrapper(env_circ, speed_scale=0.1, window_size=20)
|
env_off = MockEnv(speed=5.0, cte=7.0)
|
||||||
wrapped_fwd.reset()
|
wrapped_off = SpeedRewardWrapper(env_off, speed_scale=0.1)
|
||||||
wrapped_circ.reset()
|
wrapped_off.reset()
|
||||||
|
_, r_off, _, _, _ = wrapped_off.step(0)
|
||||||
|
|
||||||
total_fwd, total_circ = 0.0, 0.0
|
assert r_on > r_off * 3, (
|
||||||
radius = 0.5
|
f"On-track ({r_on:.2f}) should beat off-track ({r_off:.2f}) by 3x")
|
||||||
for i in range(40):
|
|
||||||
# Forward: moves in straight line
|
|
||||||
env_fwd.set_pos([i * 0.3, 0., 0.])
|
|
||||||
_, r, _, _, _ = wrapped_fwd.step(0)
|
|
||||||
total_fwd += r
|
|
||||||
|
|
||||||
# Circular: perfect circles at CTE=0
|
|
||||||
angle = 2 * math.pi * (i % 20) / 20
|
|
||||||
env_circ.set_pos([radius * math.cos(angle), 0., radius * math.sin(angle)])
|
|
||||||
_, r, _, _, _ = wrapped_circ.step(0)
|
|
||||||
total_circ += r
|
|
||||||
|
|
||||||
assert total_fwd > total_circ * 3, (
|
|
||||||
f"Forward ({total_fwd:.1f}) should beat circling ({total_circ:.1f}) by 3x"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_crash_gives_negative_reward():
|
def test_crash_gives_negative_reward():
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue