#!/bin/bash
# Standalone training monitor — runs independently of Claude.
# Usage: bash monitor_training.sh [log_file] [pid]
#   Both arguments are optional; log_file defaults to /tmp/exp19.log.
# Output: appended to /tmp/training_monitor.log
#
# Checks every 5 minutes:
# - Is the training process still alive?
# - What are the most recent checkpoint eval scores?
# - Are there any errors or exploit laps?
# - What is the current step count?

# Positional arguments (both optional):
#   $1 - training log to scan; falls back to /tmp/exp19.log
#   $2 - PID of the training process; when empty the liveness check is skipped
LOG_FILE="${1:-/tmp/exp19.log}"
TRAIN_PID="${2:-}"

# Report destination (always appended to) and polling period. Neither is
# reassigned anywhere in the script, so both are locked down.
readonly MONITOR_OUT="/tmp/training_monitor.log"
readonly INTERVAL=300 # seconds (5 minutes)

# Start-of-run banner: records when monitoring began and what is being watched.
# The four lines share a single redirection so the report file is opened once.
{
  echo "======================================"
  echo "Monitor started: $(date)"
  echo "Watching: $LOG_FILE PID: $TRAIN_PID"
  echo "======================================"
} >> "$MONITOR_OUT"

# Main polling loop. Every $INTERVAL seconds one report section is appended to
# $MONITOR_OUT containing: process liveness (only when a PID was supplied), the
# latest checkpoint/eval lines, the latest step-count line, a lap-count
# heuristic and an error summary. The loop ends after the first report in which
# the training process is found dead; with no PID it runs until the monitor
# itself is killed.

# Number of "New lap time" lines tolerated in the last 100 log lines before the
# exploit warning is written.
LAP_WARN_THRESHOLD=10

# One pattern drives both the error count and the error excerpt, so every
# counted line is also eligible to be shown.
ERROR_PATTERN='ERROR|Traceback|Exception'

while true; do
  sleep "$INTERVAL"

  training_stopped=0

  # A brace group runs in the current shell, so the flag set inside survives,
  # while the whole section is written through one redirection.
  {
    echo ""
    echo "--- $(date) ---"

    # Check process alive
    if [ -n "$TRAIN_PID" ]; then
      if ps -p "$TRAIN_PID" > /dev/null 2>&1; then
        echo "Process $TRAIN_PID: RUNNING"
      else
        echo "Process $TRAIN_PID: STOPPED"
        echo "Training ended at $(date)"
        # Do not leave yet: the log still has to be scanned one last time so
        # that whatever the run printed in its final interval (for instance
        # the traceback that ended it) makes it into the report.
        training_stopped=1
      fi
    fi

    # Latest checkpoint/eval/best lines
    echo "Recent checkpoints:"
    grep -E 'Checkpoint|Eval:|NEW BEST' "$LOG_FILE" 2>/dev/null | tail -n 6

    # Step progress
    echo "Step progress:"
    grep -F 'total_timesteps' "$LOG_FILE" 2>/dev/null | tail -n 1

    # Exploit warning: more than $LAP_WARN_THRESHOLD lap times in the last
    # 100 lines. grep reads the pipe here, so it prints 0 even when the log
    # file does not exist.
    LAP_COUNT=$(tail -n 100 "$LOG_FILE" 2>/dev/null | grep -c "New lap time")
    echo "Laps in last 100 log lines: $LAP_COUNT"
    if [ "$LAP_COUNT" -gt "$LAP_WARN_THRESHOLD" ]; then
      echo "WARNING: high lap count may indicate circular exploit"
    fi

    # Any errors. grep -c prints nothing when the file cannot be opened, so an
    # empty result is normalised to 0 before the numeric comparison.
    ERRORS=$(grep -E -c "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null)
    ERRORS=${ERRORS:-0}
    if [ "$ERRORS" -gt 0 ]; then
      echo "ERRORS DETECTED: $ERRORS error lines in log"
      grep -E "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null | tail -n 3
    fi
  } >> "$MONITOR_OUT"

  if [ "$training_stopped" -eq 1 ]; then
    break
  fi
done

echo "Monitor exiting: $(date)" >> "$MONITOR_OUT"