#!/bin/bash
# Standalone training monitor — runs independently of Claude.
#
# Usage:  bash monitor_training.sh [LOG_FILE] [TRAIN_PID]
#   LOG_FILE   training log to watch (default: /tmp/exp19.log)
#   TRAIN_PID  PID of the training process (optional). When given, the
#              monitor exits as soon as that process is gone; when omitted,
#              the liveness check is skipped and the monitor runs until it
#              is killed.
# Output: appended to /tmp/training_monitor.log
#
# Every 5 minutes (the first report is written after the first interval):
#   - Is the training process still alive?
#   - What are the most recent checkpoint eval scores?
#   - What is the current step count?
#   - Are there any errors or exploit laps?
#
# NOTE: `set -e` is deliberately not enabled. grep exits non-zero when it
# finds no match (or the log does not exist yet), which is a normal state
# for a monitor and must not terminate it.

LOG_FILE="${1:-/tmp/exp19.log}"
TRAIN_PID="${2:-}"
MONITOR_OUT="/tmp/training_monitor.log"
INTERVAL=300 # 5 minutes

# Warn when more than this many "New lap time" lines show up in the last
# 100 log lines.
LAP_WARN_THRESHOLD=10

# Extended regex shared by the error counter and the error excerpt, so that
# every line that is counted can also be displayed.
ERROR_PATTERN='ERROR|Traceback|Exception'

# Print the latest checkpoint/eval/best lines and the latest step-count line.
report_progress() {
  echo "Recent checkpoints:"
  grep -E 'Checkpoint|Eval:|NEW BEST' -- "$LOG_FILE" 2>/dev/null | tail -n 6

  echo "Step progress:"
  grep 'total_timesteps' -- "$LOG_FILE" 2>/dev/null | tail -n 1
}

# Print how many lap-time lines are in the tail of the log and warn when the
# count exceeds LAP_WARN_THRESHOLD.
report_laps() {
  local lap_count
  # grep reads the (possibly empty) pipe, so this prints 0 even when the log
  # file is missing.
  lap_count=$(tail -n 100 -- "$LOG_FILE" 2>/dev/null | grep -c 'New lap time')
  lap_count=${lap_count:-0}

  echo "Laps in last 100 log lines: $lap_count"
  if ((lap_count > LAP_WARN_THRESHOLD)); then
    echo "WARNING: high lap count may indicate circular exploit"
  fi
}

# Print the number of error lines in the whole log plus the last three.
report_errors() {
  local error_count
  # With a missing log grep -c prints nothing at all; fall back to 0 so the
  # numeric comparison below never sees an empty string.
  error_count=$(grep -c -E -- "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null)
  error_count=${error_count:-0}

  if ((error_count > 0)); then
    echo "ERRORS DETECTED: $error_count error lines in log"
    grep -E -- "$ERROR_PATTERN" "$LOG_FILE" 2>/dev/null | tail -n 3
  fi
}

{
  echo "======================================"
  echo "Monitor started: $(date)"
  echo "Watching: $LOG_FILE PID: $TRAIN_PID"
  echo "======================================"
} >> "$MONITOR_OUT"

while true; do
  sleep "$INTERVAL"

  {
    echo ""
    echo "--- $(date) ---"
  } >> "$MONITOR_OUT"

  # Check process alive (only when a PID was supplied).
  if [[ -n "$TRAIN_PID" ]]; then
    if ps -p "$TRAIN_PID" > /dev/null 2>&1; then
      echo "Process $TRAIN_PID: RUNNING" >> "$MONITOR_OUT"
    else
      {
        echo "Process $TRAIN_PID: STOPPED"
        echo "Training ended at $(date)"
      } >> "$MONITOR_OUT"
      break
    fi
  fi

  {
    report_progress
    report_laps
    report_errors
  } >> "$MONITOR_OUT"
done

echo "Monitor exiting: $(date)" >> "$MONITOR_OUT"