atropos/example_trainer/scripts/compare_lora_modes.sh
Jai Suphavadeeprasit 9dcb362aba vllm restart 1
2026-03-02 11:18:52 -05:00

332 lines
13 KiB
Bash
Executable file

#!/bin/bash
# =============================================================================
# LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)
# =============================================================================
#
# Runs both modes IN PARALLEL on separate GPUs for fair comparison:
# - GPU 0: lora_only (--enforce-eager, ~13 TPS)
# - GPU 1: lora_restart (no --enforce-eager, ~108 TPS)
#
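# Usage:
# [BATCH_SIZE=N] ./scripts/compare_lora_modes.sh [MODEL] [STEPS]
#
# MODEL defaults to Qwen/Qwen3-4B-Instruct-2507 and STEPS to 20. The batch
# size is taken from the BATCH_SIZE environment variable (default: 2).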
#
# Example:
# ./scripts/compare_lora_modes.sh Qwen/Qwen3-4B-Instruct-2507 20
#
# =============================================================================
# Abort on the first unhandled command failure.
set -o errexit

# --- Run parameters -----------------------------------------------------------
# MODEL and TRAINING_STEPS are positional arguments with defaults; BATCH_SIZE is
# taken from the environment and falls back to 2 when unset or empty.
MODEL=${1:-Qwen/Qwen3-4B-Instruct-2507}
TRAINING_STEPS=${2:-20}
: "${BATCH_SIZE:=2}"

# --- Per-mode resources -------------------------------------------------------
# Each mode gets its own GPU and its own (API, vLLM) port pair so the two runs
# never share a device or a socket.
LORA_ONLY_GPU=0
LORA_ONLY_API_PORT=8001
LORA_ONLY_VLLM_PORT=9001

LORA_RESTART_GPU=1
LORA_RESTART_API_PORT=8002
LORA_RESTART_VLLM_PORT=9002

# --- Paths --------------------------------------------------------------------
# The script lives two directory levels below the repository root. Every run
# writes into a fresh, timestamped log directory under that root.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
TRAINER_DIR=$(dirname "$SCRIPT_DIR")
REPO_DIR=$(dirname "$TRAINER_DIR")
run_stamp=$(date +%Y%m%d_%H%M%S)
LOG_DIR="${REPO_DIR}/lora_comparison_${run_stamp}"
mkdir -p "$LOG_DIR"

# --- Banner -------------------------------------------------------------------
cat <<EOF
============================================================
LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)
============================================================
Model: $MODEL
Steps: $TRAINING_STEPS
Batch: $BATCH_SIZE

GPU Allocation:
 GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)
 GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)

Log Dir: $LOG_DIR
============================================================

EOF
# Cleanup function
#######################################
# Stop every process this comparison may have started and free its ports.
# Used both as the EXIT trap and as a pre-flight step, so every action is
# best-effort: a missing process or an unused port is not an error.
# Globals (read):
#   USER, LORA_ONLY_VLLM_PORT, LORA_ONLY_API_PORT,
#   LORA_RESTART_VLLM_PORT, LORA_RESTART_API_PORT
# Outputs:
#   A blank line and a status line on stdout.
# Returns:
#   Always 0.
#######################################
cleanup() {
  local user pattern port

  # USER is frequently unset in containers and cron jobs. With an empty,
  # unquoted $USER the old `pkill -u $USER -f pat` collapsed to
  # `pkill -u -f pat`, which fails and (because of `|| true`) silently kills
  # nothing. Fall back to the effective user name instead.
  user="${USER:-$(id -un 2>/dev/null || true)}"

  echo ""
  echo "Cleaning up all processes..."

  # NOTE: -f matches against the whole command line, so these patterns also hit
  # unrelated processes of the same user whose command line contains them.
  for pattern in vllm_api_server gsm8k_server run-api grpo; do
    pkill -u "$user" -f "$pattern" 2>/dev/null || true
  done

  # Kill whatever still holds one of our TCP ports.
  for port in "$LORA_ONLY_VLLM_PORT" "$LORA_ONLY_API_PORT" \
    "$LORA_RESTART_VLLM_PORT" "$LORA_RESTART_API_PORT"; do
    fuser -k "${port}/tcp" 2>/dev/null || true
  done

  # Give the killed processes a moment to release their sockets.
  sleep 2
}
# Run cleanup whenever the script exits, including the explicit `exit 1`
# paths and aborts triggered by `set -e`.
trap cleanup EXIT
# Initial cleanup
# Clear leftovers from an earlier run so the ports are free before anything
# new is started.
cleanup
# The launch commands use repository-relative paths, so run from the repo root.
cd "$REPO_DIR"
# =============================================================================
# Helper functions
# =============================================================================
#######################################
# Poll http://localhost:<port><path> until it answers with a successful HTTP
# status, or give up after a fixed number of attempts.
# Arguments:
#   $1 - TCP port
#   $2 - URL path to probe (e.g. /health)
#   $3 - human-readable service name used in the status message
#   $4 - maximum number of attempts
#   $5 - seconds to sleep between attempts
# Outputs:
#   One status line on stdout ("... ready" or "... failed to start").
# Returns:
#   0 once the endpoint answers successfully, 1 if every attempt failed.
#######################################
_wait_for_endpoint() {
  local port=$1 path=$2 name=$3 max_attempts=$4 interval=$5
  local attempt

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # --fail:     an HTTP error status (4xx/5xx) counts as "not ready"; without
    #             it curl exits 0 for any HTTP reply at all.
    # --max-time: bound each probe so one hung connection cannot stall the
    #             loop and defeat max_attempts.
    if curl --silent --fail --max-time 5 \
      "http://localhost:${port}${path}" >/dev/null 2>&1; then
      echo "$name ready (port $port)"
      return 0
    fi
    sleep "$interval"
  done

  echo "$name failed to start (port $port)"
  return 1
}

#######################################
# Wait for a server's /health endpoint, probing every 5 seconds.
# Arguments:
#   $1 - TCP port
#   $2 - service name for status messages
#   $3 - maximum attempts (optional, default 60)
# Returns:
#   0 when healthy, 1 on timeout.
#######################################
wait_for_health() {
  _wait_for_endpoint "$1" /health "$2" "${3:-60}" 5
}

#######################################
# Wait for a server's /info endpoint, probing every 2 seconds.
# Arguments:
#   $1 - TCP port
#   $2 - service name for status messages
#   $3 - maximum attempts (optional, default 30)
# Returns:
#   0 when reachable, 1 on timeout.
#######################################
wait_for_api() {
  _wait_for_endpoint "$1" /info "$2" "${3:-30}" 2
}
# =============================================================================
# START BOTH MODES IN PARALLEL
# =============================================================================
section_rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
printf '%s\n' "$section_rule" "Starting both modes in parallel..." "$section_rule"

# -----------------------------------------------------------------------------
# LORA_ONLY (GPU 0): rollout API + an externally managed vLLM server
# -----------------------------------------------------------------------------
printf '\n%s\n' "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."

# Rollout API for lora_only.
run-api --port "$LORA_ONLY_API_PORT" > "$LOG_DIR/api_lora_only.log" 2>&1 &
LORA_ONLY_API_PID=$!

# vLLM for lora_only, pinned to its GPU and started with --enforce-eager.
printf '%s\n' "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
lora_only_vllm_cmd=(
  python -u example_trainer/vllm_api_server.py
  --model "$MODEL"
  --port "$LORA_ONLY_VLLM_PORT"
  --gpu-memory-utilization 0.4
  --enable-lora
  --max-lora-rank 64
  --enforce-eager
)
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU "${lora_only_vllm_cmd[@]}" \
  > "$LOG_DIR/vllm_lora_only.log" 2>&1 &
LORA_ONLY_VLLM_PID=$!

# -----------------------------------------------------------------------------
# LORA_RESTART (GPU 1): only the rollout API is started here; the trainer
# launches and manages its own vLLM later.
# -----------------------------------------------------------------------------
printf '\n%s\n' "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."

# Create the checkpoint directory up front so vLLM can write its log there.
mkdir -p "$LOG_DIR/checkpoints_lora_restart"

# Rollout API for lora_restart.
run-api --port "$LORA_RESTART_API_PORT" > "$LOG_DIR/api_lora_restart.log" 2>&1 &
LORA_RESTART_API_PID=$!
# =============================================================================
# WAIT FOR INFRASTRUCTURE
# =============================================================================
printf '\n%s\n' "Waiting for infrastructure to be ready..."

# Both rollout APIs must answer before anything is pointed at them.
if ! wait_for_api "$LORA_ONLY_API_PORT" "lora_only API"; then
  exit 1
fi
if ! wait_for_api "$LORA_RESTART_API_PORT" "lora_restart API"; then
  exit 1
fi

# Only lora_only has an external vLLM at this point; allow it up to 90 probes.
if ! wait_for_health "$LORA_ONLY_VLLM_PORT" "lora_only vLLM" 90; then
  exit 1
fi
# =============================================================================
# START ENVIRONMENTS AND TRAINERS
# =============================================================================
#######################################
# Launch a GSM8k environment server in the background.
# Globals (read): MODEL, LOG_DIR
# Arguments:
#   $1 - mode label, used as the log file suffix (env_<mode>.log)
#   $2 - rollout API port
#   $3 - vLLM port
# The caller reads the PID from $! immediately after the call.
#######################################
start_gsm8k_env() {
  local mode=$1 api_port=$2 vllm_port=$3
  local -a env_cmd=(
    python -u environments/gsm8k_server.py serve
    --env.tokenizer_name "$MODEL"
    --env.use_wandb=False
    --env.rollout_server_url "http://localhost:${api_port}"
    --openai.model_name "$MODEL"
    --openai.base_url "http://localhost:${vllm_port}/v1"
    --openai.server_type vllm
    --slurm false
  )
  "${env_cmd[@]}" > "$LOG_DIR/env_${mode}.log" 2>&1 &
}

#######################################
# Launch a GRPO trainer in the background on one GPU.
# Globals (read): MODEL, BATCH_SIZE, TRAINING_STEPS, LOG_DIR
# Arguments:
#   $1 - weight-bridge mode; also names the log file and checkpoint directory
#   $2 - GPU index exported as CUDA_VISIBLE_DEVICES
#   $3 - vLLM port
#   $4 - rollout API port
#   $5.. - extra trainer flags, inserted right after --vllm-port
# The caller reads the PID from $! immediately after the call.
#######################################
start_trainer() {
  local mode=$1 gpu=$2 vllm_port=$3 api_port=$4
  shift 4
  local -a trainer_cmd=(
    python -m example_trainer.grpo
    --model-name "$MODEL"
    --weight-bridge-mode "$mode"
    --vllm-port "$vllm_port"
    "$@"
    --atropos-url "http://localhost:${api_port}"
    --batch-size "$BATCH_SIZE"
    --training-steps "$TRAINING_STEPS"
    --lora-r 16
    --lora-alpha 32
    --vllm-restart-interval 5
    --save-path "$LOG_DIR/checkpoints_${mode}"
    --benchmark
  )
  CUDA_VISIBLE_DEVICES=$gpu "${trainer_cmd[@]}" \
    > "$LOG_DIR/trainer_${mode}.log" 2>&1 &
}

echo ""
echo "Starting environments and trainers..."

# Reference point for both modes' total-time measurement.
START_TIME=$(date +%s)

# -----------------------------------------------------------------------------
# LORA_ONLY: environment first, then the trainer
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting GSM8k environment..."
start_gsm8k_env lora_only "$LORA_ONLY_API_PORT" "$LORA_ONLY_VLLM_PORT"
LORA_ONLY_ENV_PID=$!

echo "[LORA_ONLY] Starting trainer..."
start_trainer lora_only "$LORA_ONLY_GPU" \
  "$LORA_ONLY_VLLM_PORT" "$LORA_ONLY_API_PORT"
LORA_ONLY_TRAINER_PID=$!

# -----------------------------------------------------------------------------
# LORA_RESTART: the trainer comes first because it brings up vLLM itself
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
start_trainer lora_restart "$LORA_RESTART_GPU" \
  "$LORA_RESTART_VLLM_PORT" "$LORA_RESTART_API_PORT" \
  --vllm-gpu-memory-utilization 0.4
LORA_RESTART_TRAINER_PID=$!

# The internal vLLM runs without --enforce-eager, so it compiles CUDA graphs
# on startup; that takes 1-3 minutes, hence the long grace period below.
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
echo " NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
echo " Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
sleep 30 # head start for model loading before the first health probe

if ! wait_for_health "$LORA_RESTART_VLLM_PORT" "lora_restart internal vLLM" 180; then
  # Show the tail of both relevant logs before giving up.
  echo " Failed - check logs:"
  echo " Trainer log:"
  tail -30 "$LOG_DIR/trainer_lora_restart.log"
  echo ""
  echo " vLLM internal log (if exists):"
  tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo " (not found)"
  exit 1
fi

# The environment needs a live vLLM, so it starts only after the health check.
echo "[LORA_RESTART] Starting GSM8k environment..."
start_gsm8k_env lora_restart "$LORA_RESTART_API_PORT" "$LORA_RESTART_VLLM_PORT"
LORA_RESTART_ENV_PID=$!
# =============================================================================
# WAIT FOR BOTH TRAINERS TO COMPLETE
# =============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Both trainers running in parallel. Waiting for completion..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Monitor progress:"
echo " tail -f $LOG_DIR/trainer_lora_only.log"
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
echo ""
# Wait for trainers
#
# Each trainer's end time must be stamped when THAT trainer exits. Waiting on
# the two PIDs one after the other stamps the second trainer only after the
# first has finished, so a faster lora_restart run could never be reported as
# faster than lora_only. Poll both PIDs instead and record each as soon as its
# process is gone; `wait` then only collects the exit status bash has saved.
#
# NOTE(review): START_TIME is taken after the lora_only vLLM is already healthy
# but before the lora_restart trainer brings up its internal vLLM, so the
# lora_restart total includes that startup - confirm this is intended.
LORA_ONLY_EXIT=0
LORA_RESTART_EXIT=0
LORA_ONLY_END=""
LORA_RESTART_END=""
while [[ -z "$LORA_ONLY_END" || -z "$LORA_RESTART_END" ]]; do
  if [[ -z "$LORA_ONLY_END" ]] && ! kill -0 "$LORA_ONLY_TRAINER_PID" 2>/dev/null; then
    wait "$LORA_ONLY_TRAINER_PID" || LORA_ONLY_EXIT=$?
    LORA_ONLY_END=$(date +%s)
    LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
    echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
  fi
  if [[ -z "$LORA_RESTART_END" ]] && ! kill -0 "$LORA_RESTART_TRAINER_PID" 2>/dev/null; then
    wait "$LORA_RESTART_TRAINER_PID" || LORA_RESTART_EXIT=$?
    LORA_RESTART_END=$(date +%s)
    LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
    echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
  fi
  # One-second polling matches the resolution of `date +%s`; skip the sleep
  # once both trainers have been recorded.
  if [[ -z "$LORA_ONLY_END" || -z "$LORA_RESTART_END" ]]; then
    sleep 1
  fi
done
# =============================================================================
# RESULTS
# =============================================================================
# Inner widths of the table columns: Mode, GPU, Total Time, Notes. The rules
# and the rows are both derived from this list so they cannot drift apart.
_TABLE_WIDTHS=(15 4 12 29)

#######################################
# Print one horizontal rule of the results table.
# Arguments: $1 - left corner, $2 - column junction, $3 - right corner
#######################################
_table_rule() {
  local left=$1 mid=$2 right=$3
  local line=$left sep="" width seg
  for width in "${_TABLE_WIDTHS[@]}"; do
    # Column width plus one padding space on each side, drawn with box dashes.
    printf -v seg '%*s' "$((width + 2))" ''
    line+="${sep}${seg// /─}"
    sep=$mid
  done
  printf '%s%s\n' "$line" "$right"
}

#######################################
# Print one row of the results table.
# Arguments: $1 - mode, $2 - GPU, $3 - total time (right-aligned), $4 - notes
#######################################
_table_row() {
  printf '│ %-*s │ %-*s │ %*s │ %-*s │\n' \
    "${_TABLE_WIDTHS[0]}" "$1" \
    "${_TABLE_WIDTHS[1]}" "$2" \
    "${_TABLE_WIDTHS[2]}" "$3" \
    "${_TABLE_WIDTHS[3]}" "$4"
}

#######################################
# Print $1 / $2 with two decimals (truncated), using shell arithmetic only so
# the report does not depend on bc being installed.
# Arguments: $1 - numerator (integer), $2 - denominator (integer, non-zero)
#######################################
_speedup_ratio() {
  local numerator=$1 denominator=$2
  local hundredths=$((numerator * 100 / denominator))
  printf '%d.%02d\n' "$((hundredths / 100))" "$((hundredths % 100))"
}

echo ""
echo "============================================================"
echo "COMPARISON RESULTS (Parallel Execution)"
echo "============================================================"
echo ""
echo "Training Steps: $TRAINING_STEPS"
echo "Batch Size: $BATCH_SIZE"
echo ""
_table_rule "┌" "┬" "┐"
_table_row "Mode" "GPU" "Total Time" "Notes"
_table_rule "├" "┼" "┤"
_table_row "lora_only" "$LORA_ONLY_GPU" "${LORA_ONLY_TIME}s" "--enforce-eager (~13 TPS)"
_table_row "lora_restart" "$LORA_RESTART_GPU" "${LORA_RESTART_TIME}s" "no --enforce-eager (~108 TPS)"
_table_rule "└" "┴" "┘"
echo ""
# A ratio only makes sense when both runs took a measurable amount of time.
if ((${LORA_ONLY_TIME:-0} > 0 && ${LORA_RESTART_TIME:-0} > 0)); then
  SPEEDUP=$(_speedup_ratio "$LORA_ONLY_TIME" "$LORA_RESTART_TIME")
  echo "Speedup: ${SPEEDUP}x (lora_restart vs lora_only)"
fi
echo ""
echo "📊 BENCHMARK DETAILS:"
echo ""
# Show each trainer's benchmark summary, or point at the log if there is none.
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
echo ""
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
echo ""
echo "============================================================"
echo "📁 All logs saved to: $LOG_DIR"
echo "============================================================"
echo ""
echo "Log files:"
echo " $LOG_DIR/trainer_lora_only.log"
echo " $LOG_DIR/trainer_lora_restart.log"
echo " $LOG_DIR/vllm_lora_only.log"
echo " $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
echo " $LOG_DIR/env_lora_only.log"
echo " $LOG_DIR/env_lora_restart.log"
echo " $LOG_DIR/api_lora_only.log"
echo " $LOG_DIR/api_lora_restart.log"
echo ""
# A comparison in which a trainer failed is not a valid result; report that
# through the exit status so callers and CI do not mistake it for success.
if ((${LORA_ONLY_EXIT:-0} != 0 || ${LORA_RESTART_EXIT:-0} != 0)); then
  echo "At least one trainer exited with a non-zero status (lora_only: ${LORA_ONLY_EXIT:-0}, lora_restart: ${LORA_RESTART_EXIT:-0})." >&2
  exit 1
fi