donkeycar-rl-autoresearch/monitor_training.sh

62 lines
2.2 KiB
Bash

#!/bin/bash
# monitor_training.sh — standalone watcher for a training run; it keeps
# working without Claude being active.
#
# Usage:  bash monitor_training.sh <log_file> <pid>
#   <log_file>  training log to inspect (default: /tmp/exp19.log)
#   <pid>       PID of the training process (optional; default: empty)
# Output: appended to /tmp/training_monitor.log
#
# Once per interval (5 minutes) the monitor records:
#   - whether the training process is still alive
#   - the most recent checkpoint / eval / best-score log lines
#   - the latest step-count log line
#   - lap-count (possible exploit) and error indicators

# Positional arguments, with fallbacks when they are omitted.
LOG_FILE="${1:-/tmp/exp19.log}"
TRAIN_PID="${2:-}"

# Fixed settings.
MONITOR_OUT="/tmp/training_monitor.log"
INTERVAL=300 # seconds between checks (5 minutes)

# Startup banner, appended through a single redirection.
{
  printf '%s\n' "======================================"
  printf 'Monitor started: %s\n' "$(date)"
  printf 'Watching: %s PID: %s\n' "$LOG_FILE" "$TRAIN_PID"
  printf '%s\n' "======================================"
} >> "$MONITOR_OUT"
# Polling loop. After each $INTERVAL-second sleep, one status report is
# appended to $MONITOR_OUT. The loop ends once the watched PID (if one was
# given) is no longer running; with no PID it runs until the monitor is killed.

# Single source of truth for what counts as an error line, so the count and
# the excerpt shown below it can never disagree.
ERROR_PATTERN='ERROR|Traceback|Exception'

while true; do
  sleep "$INTERVAL"

  # One redirection for the whole report. A redirected brace group still runs
  # in the current shell, so `break` / `continue` act on the while loop.
  {
    echo ""
    echo "--- $(date) ---"

    # Check process alive (skipped entirely when no PID was supplied).
    if [[ -n "$TRAIN_PID" ]]; then
      if ps -p "$TRAIN_PID" > /dev/null 2>&1; then
        echo "Process $TRAIN_PID: RUNNING"
      else
        echo "Process $TRAIN_PID: STOPPED"
        echo "Training ended at $(date)"
        break
      fi
    fi

    # Report an unreadable log explicitly instead of emitting empty sections.
    if [[ ! -r "$LOG_FILE" ]]; then
      echo "Log file not readable: $LOG_FILE"
      continue
    fi

    # Latest checkpoint/eval/best lines.
    # 2>/dev/null is kept on log reads: the file may vanish between the
    # readability check above and the read itself.
    echo "Recent checkpoints:"
    grep -E 'Checkpoint|Eval:|NEW BEST' -- "$LOG_FILE" 2>/dev/null | tail -n 6

    # Step progress: most recent line mentioning total_timesteps.
    echo "Step progress:"
    grep -- "total_timesteps" "$LOG_FILE" 2>/dev/null | tail -n 1

    # Exploit warning: more than 10 lap times in the last 100 log lines.
    LAP_COUNT=$(tail -n 100 -- "$LOG_FILE" 2>/dev/null | grep -c "New lap time")
    LAP_COUNT=${LAP_COUNT:-0}
    echo "Laps in last 100 log lines: $LAP_COUNT"
    if (( LAP_COUNT > 10 )); then
      echo "WARNING: high lap count may indicate circular exploit"
    fi

    # Any errors. grep -c prints nothing when it cannot read the file, so
    # default to 0 to keep the numeric comparison valid.
    ERRORS=$(grep -cE -- "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null)
    ERRORS=${ERRORS:-0}
    if (( ERRORS > 0 )); then
      echo "ERRORS DETECTED: $ERRORS error lines in log"
      grep -E -- "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null | tail -n 3
    fi
  } >> "$MONITOR_OUT"
done

echo "Monitor exiting: $(date)" >> "$MONITOR_OUT"