#!/bin/bash
# =============================================================================
# All Training Modes Comparison on Math Zero (32k context)
# =============================================================================
#
# Compares all 3 training modes on the math_server_zero environment:
#   - GPU 0: shared_vllm  (CUDA IPC, zero-copy weight updates)
#   - GPU 1: lora_only    (--enforce-eager, ~13 TPS, slow)
#   - GPU 2: lora_restart (no --enforce-eager, ~108 TPS, fast)
#
# All run at 32k context length for proper math reasoning.
#
# Usage:
#   ./scripts/compare_all_modes_math_zero.sh [MODEL] [STEPS]
#
# Example:
#   ./scripts/compare_all_modes_math_zero.sh Qwen/Qwen3-4B-Instruct-2507 30
#
# =============================================================================
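
# The BATCH_SIZE, MAX_MODEL_LEN, USE_WANDB, and WANDB_PROJECT defaults below
# can also be overridden via the environment, e.g. (illustrative invocation):
#   BATCH_SIZE=4 USE_WANDB=false ./scripts/compare_all_modes_math_zero.sh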

set -e

MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
TRAINING_STEPS="${2:-30}"
BATCH_SIZE="${BATCH_SIZE:-2}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
USE_WANDB="${USE_WANDB:-true}"
WANDB_PROJECT="${WANDB_PROJECT:-math-zero-mode-comparison}"

# Port allocation (separate ports for each mode):
#   shared_vllm:  API 8001, vLLM 9001
#   lora_only:    API 8002, vLLM 9002
#   lora_restart: API 8003, vLLM 9003

SHARED_API_PORT=8001
SHARED_VLLM_PORT=9001
SHARED_GPU=0

LORA_ONLY_API_PORT=8002
LORA_ONLY_VLLM_PORT=9002
LORA_ONLY_GPU=1

LORA_RESTART_API_PORT=8003
LORA_RESTART_VLLM_PORT=9003
LORA_RESTART_GPU=2
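
# Optional sanity check (not in the original flow; assumes nvidia-smi is
# available): the three modes above expect at least 3 visible GPUs, so warn
# early instead of failing mid-run.
if command -v nvidia-smi >/dev/null 2>&1; then
    NUM_GPUS=$(nvidia-smi --list-gpus | wc -l)
    if [ "$NUM_GPUS" -lt 3 ]; then
        echo "WARNING: found only $NUM_GPUS GPU(s); this comparison expects 3." >&2
    fi
fi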

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"

LOG_DIR="${REPO_DIR}/math_zero_comparison_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"

echo "============================================================"
echo "Math Zero Mode Comparison (32k Context)"
echo "============================================================"
echo "Model: $MODEL"
echo "Steps: $TRAINING_STEPS"
echo "Batch: $BATCH_SIZE"
echo "Max Model Length: $MAX_MODEL_LEN"
echo "Wandb: $USE_WANDB (project: $WANDB_PROJECT)"
echo ""
echo "GPU Allocation:"
echo "  GPU $SHARED_GPU: shared_vllm (ports $SHARED_API_PORT, $SHARED_VLLM_PORT)"
echo "  GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
echo "  GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
echo ""
echo "Log Dir: $LOG_DIR"
echo "============================================================"
echo ""

# Cleanup function: kill leftover processes by name, then free the ports
# directly in case anything is still bound to them.
cleanup() {
    echo ""
    echo "Cleaning up all processes..."
    pkill -9 -f "vllm_api_server" 2>/dev/null || true
    pkill -9 -f "math_server_zero" 2>/dev/null || true
    pkill -9 -f "run-api" 2>/dev/null || true
    pkill -9 -f "grpo" 2>/dev/null || true
    pkill -9 -f "vllm.*EngineCore" 2>/dev/null || true
    for port in $SHARED_API_PORT $SHARED_VLLM_PORT $LORA_ONLY_API_PORT $LORA_ONLY_VLLM_PORT $LORA_RESTART_API_PORT $LORA_RESTART_VLLM_PORT; do
        fuser -k ${port}/tcp 2>/dev/null || true
    done
    sleep 2
}
trap cleanup EXIT

# Initial cleanup
cleanup

# Clear triton cache for a clean start
rm -rf ~/.triton/cache 2>/dev/null || true

cd "$REPO_DIR"

# =============================================================================
# Helper functions
# =============================================================================

# Poll a vLLM /health endpoint until it responds or max_attempts is exhausted.
wait_for_health() {
    local port=$1
    local name=$2
    local max_attempts=${3:-120}
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo "  ✓ $name ready (port $port)"
            return 0
        fi
        sleep 5
        attempt=$((attempt + 1))
    done
    echo "  ✗ $name failed to start (port $port)"
    return 1
}

# Poll an Atropos API /info endpoint until it responds or max_attempts is exhausted.
wait_for_api() {
    local port=$1
    local name=$2
    local max_attempts=${3:-30}
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
        if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
            echo "  ✓ $name ready (port $port)"
            return 0
        fi
        sleep 2
        attempt=$((attempt + 1))
    done
    echo "  ✗ $name failed to start (port $port)"
    return 1
}

# =============================================================================
# START ALL THREE MODES IN PARALLEL
# =============================================================================

echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Starting all three modes in parallel..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# Pre-create checkpoint directories
mkdir -p "$LOG_DIR/checkpoints_shared"
mkdir -p "$LOG_DIR/checkpoints_lora_only"
mkdir -p "$LOG_DIR/checkpoints_lora_restart"

# -----------------------------------------------------------------------------
# MODE 1: SHARED_VLLM (GPU 0)
# -----------------------------------------------------------------------------
echo ""
echo "[SHARED_VLLM] Starting on GPU $SHARED_GPU..."

# Start run-api for shared_vllm
run-api --port $SHARED_API_PORT > "$LOG_DIR/api_shared.log" 2>&1 &

# Start vLLM with shared weights.
# NOTE: shared_vllm needs more headroom for optimizer states (~8GB) and gradients.
# Using 0.5 leaves ~90GB for training operations on a 180GB GPU.
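# (Illustrative arithmetic, assuming the ~180GB device mentioned above:
# 180GB * 0.50 ≈ 90GB goes to vLLM weights + KV cache, and the remaining
# ~90GB is left for the trainer's weights, gradients, and optimizer states.)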
echo "[SHARED_VLLM] Starting vLLM with shared weights..."
VLLM_ENABLE_SHARED_WEIGHTS=1 VLLM_BRIDGE_CONFIG_PATH=$LOG_DIR/vllm_bridge_config_shared.json \
CUDA_VISIBLE_DEVICES=$SHARED_GPU python -u example_trainer/vllm_api_server.py \
    --model "$MODEL" \
    --port $SHARED_VLLM_PORT \
    --gpu-memory-utilization 0.50 \
    --max-model-len $MAX_MODEL_LEN \
    > "$LOG_DIR/vllm_shared.log" 2>&1 &

# -----------------------------------------------------------------------------
# MODE 2: LORA_ONLY (GPU 1)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."

# Start run-api for lora_only
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &

# Start vLLM with --enforce-eager for lora_only.
# LoRA modes need less training memory, but still need headroom at 32k.
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
    --model "$MODEL" \
    --port $LORA_ONLY_VLLM_PORT \
    --gpu-memory-utilization 0.70 \
    --max-model-len $MAX_MODEL_LEN \
    --enable-lora \
    --max-lora-rank 64 \
    --enforce-eager \
    > "$LOG_DIR/vllm_lora_only.log" 2>&1 &

# -----------------------------------------------------------------------------
# MODE 3: LORA_RESTART (GPU 2) - Trainer manages vLLM internally
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."

# Start run-api for lora_restart
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &

# =============================================================================
# WAIT FOR INFRASTRUCTURE
# =============================================================================
echo ""
echo "Waiting for infrastructure to be ready..."
echo "  (vLLM at 32k context takes ~2-5 minutes to start)"

wait_for_api $SHARED_API_PORT "shared_vllm API" || exit 1
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1

# lora_restart's vLLM is launched by its trainer and health-checked later.
wait_for_health $SHARED_VLLM_PORT "shared_vllm vLLM" 180 || exit 1
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 180 || exit 1

# =============================================================================
# START ENVIRONMENTS AND TRAINERS
# =============================================================================
echo ""
echo "Starting environments and trainers..."

# Record start time
START_TIME=$(date +%s)

# Build wandb args
WANDB_ARGS=""
if [ "$USE_WANDB" = "true" ]; then
    WANDB_ARGS="--use-wandb --wandb-project $WANDB_PROJECT"
fi

# -----------------------------------------------------------------------------
# SHARED_VLLM: Start environment and trainer
# -----------------------------------------------------------------------------
echo ""
echo "[SHARED_VLLM] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${SHARED_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${SHARED_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="shared-vllm-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
    --slurm false \
    2>&1 | tee "$LOG_DIR/env_shared.log" &
SHARED_ENV_PID=$!

echo "[SHARED_VLLM] Starting trainer..."
CUDA_VISIBLE_DEVICES=$SHARED_GPU python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode shared_vllm \
    --vllm-port $SHARED_VLLM_PORT \
    --vllm-config-path "$LOG_DIR/vllm_bridge_config_shared.json" \
    --atropos-url "http://localhost:${SHARED_API_PORT}" \
    --batch-size $BATCH_SIZE \
    --training-steps $TRAINING_STEPS \
    --max-model-len $MAX_MODEL_LEN \
    --seq-len $MAX_MODEL_LEN \
    --save-path "$LOG_DIR/checkpoints_shared" \
    $WANDB_ARGS --wandb-group "shared-vllm" \
    --benchmark \
    2>&1 | tee "$LOG_DIR/trainer_shared.log" &
SHARED_TRAINER_PID=$!

# -----------------------------------------------------------------------------
# LORA_ONLY: Start environment and trainer
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_ONLY_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="lora-only-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
    --slurm false \
    2>&1 | tee "$LOG_DIR/env_lora_only.log" &
LORA_ONLY_ENV_PID=$!

echo "[LORA_ONLY] Starting trainer..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_only \
    --vllm-port $LORA_ONLY_VLLM_PORT \
    --atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
    --batch-size $BATCH_SIZE \
    --training-steps $TRAINING_STEPS \
    --max-model-len $MAX_MODEL_LEN \
    --seq-len $MAX_MODEL_LEN \
    --lora-r 16 \
    --lora-alpha 32 \
    --vllm-restart-interval 5 \
    --save-path "$LOG_DIR/checkpoints_lora_only" \
    $WANDB_ARGS --wandb-group "lora-only" \
    --benchmark \
    2>&1 | tee "$LOG_DIR/trainer_lora_only.log" &
LORA_ONLY_TRAINER_PID=$!

# -----------------------------------------------------------------------------
# LORA_RESTART: Start trainer (it manages vLLM internally)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
# NOTE: lora_restart shares its GPU with the trainer's model (~8GB), so use a
# lower vLLM memory fraction.
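# (Illustrative, given the defaults above: a 4B-parameter model in bf16 is
# roughly 8GB of weights, and --vllm-gpu-memory-utilization 0.20 reserves about
# a fifth of the device for the trainer-managed vLLM instance.)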
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_restart \
    --vllm-port $LORA_RESTART_VLLM_PORT \
    --vllm-gpu-memory-utilization 0.20 \
    --atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
    --batch-size $BATCH_SIZE \
    --training-steps $TRAINING_STEPS \
    --max-model-len $MAX_MODEL_LEN \
    --seq-len $MAX_MODEL_LEN \
    --lora-r 16 \
    --lora-alpha 32 \
    --vllm-restart-interval 5 \
    --save-path "$LOG_DIR/checkpoints_lora_restart" \
    $WANDB_ARGS --wandb-group "lora-restart" \
    --benchmark \
    2>&1 | tee "$LOG_DIR/trainer_lora_restart.log" &
LORA_RESTART_TRAINER_PID=$!

# Wait for lora_restart's internal vLLM to start
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
echo "  NOTE: vLLM at 32k context with CUDA graphs takes 2-5 min"
sleep 60
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 300 || {
    echo "  Failed - check logs:"
    tail -50 "$LOG_DIR/trainer_lora_restart.log"
    exit 1
}

# Start environment for lora_restart
echo "[LORA_RESTART] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_RESTART_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="lora-restart-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
    --slurm false \
    2>&1 | tee "$LOG_DIR/env_lora_restart.log" &
LORA_RESTART_ENV_PID=$!

# =============================================================================
# WAIT FOR ALL TRAINERS TO COMPLETE
# =============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "All three trainers running in parallel. Waiting for completion..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "📊 WANDB: https://wandb.ai (project: $WANDB_PROJECT)"
echo ""
echo "📋 MONITOR LOGS (in another terminal):"
echo ""
echo "  # Trainer logs:"
echo "  tail -f $LOG_DIR/trainer_shared.log"
echo "  tail -f $LOG_DIR/trainer_lora_only.log"
echo "  tail -f $LOG_DIR/trainer_lora_restart.log"
echo ""
echo "  # Environment logs:"
echo "  tail -f $LOG_DIR/env_shared.log"
echo "  tail -f $LOG_DIR/env_lora_only.log"
echo "  tail -f $LOG_DIR/env_lora_restart.log"
echo ""
echo "  # vLLM logs:"
echo "  tail -f $LOG_DIR/vllm_shared.log"
echo "  tail -f $LOG_DIR/vllm_lora_only.log"
echo "  tail -f $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
echo ""

# Wait for trainers. `wait PID || VAR=$?` captures a nonzero exit status
# without tripping `set -e`.
SHARED_EXIT=0
LORA_ONLY_EXIT=0
LORA_RESTART_EXIT=0

wait $SHARED_TRAINER_PID || SHARED_EXIT=$?
SHARED_END=$(date +%s)
SHARED_TIME=$((SHARED_END - START_TIME))
echo "  ✓ shared_vllm finished in ${SHARED_TIME}s (exit: $SHARED_EXIT)"

wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
LORA_ONLY_END=$(date +%s)
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
echo "  ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"

wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
LORA_RESTART_END=$(date +%s)
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
echo "  ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"

# =============================================================================
# RESULTS
# =============================================================================
echo ""
echo "============================================================"
echo "COMPARISON RESULTS (Math Zero @ 32k Context)"
echo "============================================================"
echo ""
echo "Training Steps: $TRAINING_STEPS"
echo "Batch Size: $BATCH_SIZE"
echo "Max Context: $MAX_MODEL_LEN"
echo ""
echo "┌─────────────────┬──────┬──────────────┬────────────────────────────────┐"
echo "│ Mode            │ GPU  │ Total Time   │ Notes                          │"
echo "├─────────────────┼──────┼──────────────┼────────────────────────────────┤"
printf "│ shared_vllm     │ %d    │ %10ss  │ CUDA IPC zero-copy (~172 TPS)  │\n" "$SHARED_GPU" "$SHARED_TIME"
printf "│ lora_only       │ %d    │ %10ss  │ --enforce-eager (~13 TPS)      │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
printf "│ lora_restart    │ %d    │ %10ss  │ no --enforce-eager (~108 TPS)  │\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
echo "└─────────────────┴──────┴──────────────┴────────────────────────────────┘"
echo ""

# Calculate speedups
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
    RESTART_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
    echo "lora_restart vs lora_only speedup: ${RESTART_SPEEDUP}x"
fi
if [ $LORA_ONLY_TIME -gt 0 ] && [ $SHARED_TIME -gt 0 ]; then
    SHARED_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $SHARED_TIME" | bc)
    echo "shared_vllm vs lora_only speedup: ${SHARED_SPEEDUP}x"
fi
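
# Note (assumption, not in the original script): the speedup math above relies
# on bc(1). On minimal images without bc, an equivalent awk fallback would be:
#   RESTART_SPEEDUP=$(awk "BEGIN {printf \"%.2f\", $LORA_ONLY_TIME / $LORA_RESTART_TIME}")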

echo ""
echo "📊 BENCHMARK DETAILS:"
echo ""
echo "━━━ shared_vllm (GPU $SHARED_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_shared.log" 2>/dev/null || echo "  (check $LOG_DIR/trainer_shared.log)"
echo ""
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo "  (check $LOG_DIR/trainer_lora_only.log)"
echo ""
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo "  (check $LOG_DIR/trainer_lora_restart.log)"

echo ""
echo "============================================================"
echo "📁 All logs saved to: $LOG_DIR"
echo "============================================================"
echo ""
echo "Log files:"
echo "  Trainers:"
echo "    $LOG_DIR/trainer_shared.log"
echo "    $LOG_DIR/trainer_lora_only.log"
echo "    $LOG_DIR/trainer_lora_restart.log"
echo ""
echo "  Environments:"
echo "    $LOG_DIR/env_shared.log"
echo "    $LOG_DIR/env_lora_only.log"
echo "    $LOG_DIR/env_lora_restart.log"
echo ""
echo "  vLLM:"
echo "    $LOG_DIR/vllm_shared.log"
echo "    $LOG_DIR/vllm_lora_only.log"
echo "    $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
echo ""