mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
332 lines
13 KiB
Bash
Executable file
332 lines
13 KiB
Bash
Executable file
#!/bin/bash
|
|
# =============================================================================
|
|
# LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)
|
|
# =============================================================================
|
|
#
|
|
# Runs both modes IN PARALLEL on separate GPUs for fair comparison:
|
|
# - GPU 0: lora_only (--enforce-eager, ~13 TPS)
|
|
# - GPU 1: lora_restart (no --enforce-eager, ~108 TPS)
|
|
#
|
|
# Usage:
|
|
# ./scripts/compare_lora_modes.sh [MODEL] [STEPS]
|
|
#
|
|
# Example:
|
|
# ./scripts/compare_lora_modes.sh Qwen/Qwen3-4B-Instruct-2507 20
|
|
#
|
|
# =============================================================================
|
|
|
|
set -e
|
|
|
|
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
|
|
TRAINING_STEPS="${2:-20}"
|
|
BATCH_SIZE="${BATCH_SIZE:-2}"
|
|
|
|
# Port allocation (separate ports for each mode)
|
|
LORA_ONLY_VLLM_PORT=9001
|
|
LORA_ONLY_API_PORT=8001
|
|
|
|
LORA_RESTART_VLLM_PORT=9002
|
|
LORA_RESTART_API_PORT=8002
|
|
|
|
# GPU allocation
|
|
LORA_ONLY_GPU=0
|
|
LORA_RESTART_GPU=1
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
|
|
REPO_DIR="$(dirname "$TRAINER_DIR")"
|
|
|
|
LOG_DIR="${REPO_DIR}/lora_comparison_$(date +%Y%m%d_%H%M%S)"
|
|
mkdir -p "$LOG_DIR"
|
|
|
|
echo "============================================================"
|
|
echo "LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)"
|
|
echo "============================================================"
|
|
echo "Model: $MODEL"
|
|
echo "Steps: $TRAINING_STEPS"
|
|
echo "Batch: $BATCH_SIZE"
|
|
echo ""
|
|
echo "GPU Allocation:"
|
|
echo " GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
|
|
echo " GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
|
|
echo ""
|
|
echo "Log Dir: $LOG_DIR"
|
|
echo "============================================================"
|
|
echo ""
|
|
|
|
# Cleanup function
|
|
cleanup() {
|
|
echo ""
|
|
echo "Cleaning up all processes..."
|
|
pkill -u $USER -f "vllm_api_server" 2>/dev/null || true
|
|
pkill -u $USER -f "gsm8k_server" 2>/dev/null || true
|
|
pkill -u $USER -f "run-api" 2>/dev/null || true
|
|
pkill -u $USER -f "grpo" 2>/dev/null || true
|
|
for port in $LORA_ONLY_VLLM_PORT $LORA_ONLY_API_PORT $LORA_RESTART_VLLM_PORT $LORA_RESTART_API_PORT; do
|
|
fuser -k ${port}/tcp 2>/dev/null || true
|
|
done
|
|
sleep 2
|
|
}
|
|
trap cleanup EXIT
|
|
|
|
# Initial cleanup
|
|
cleanup
|
|
|
|
cd "$REPO_DIR"
|
|
|
|
# =============================================================================
|
|
# Helper functions
|
|
# =============================================================================
|
|
|
|
wait_for_health() {
|
|
local port=$1
|
|
local name=$2
|
|
local max_attempts=${3:-60}
|
|
local attempt=1
|
|
|
|
while [ $attempt -le $max_attempts ]; do
|
|
if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
|
|
echo " ✓ $name ready (port $port)"
|
|
return 0
|
|
fi
|
|
sleep 5
|
|
attempt=$((attempt + 1))
|
|
done
|
|
echo " ✗ $name failed to start (port $port)"
|
|
return 1
|
|
}
|
|
|
|
wait_for_api() {
|
|
local port=$1
|
|
local name=$2
|
|
local max_attempts=${3:-30}
|
|
local attempt=1
|
|
|
|
while [ $attempt -le $max_attempts ]; do
|
|
if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
|
|
echo " ✓ $name ready (port $port)"
|
|
return 0
|
|
fi
|
|
sleep 2
|
|
attempt=$((attempt + 1))
|
|
done
|
|
echo " ✗ $name failed to start (port $port)"
|
|
return 1
|
|
}
|
|
|
|
# =============================================================================
|
|
# START BOTH MODES IN PARALLEL
|
|
# =============================================================================
|
|
|
|
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
|
echo "Starting both modes in parallel..."
|
|
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# LORA_ONLY (GPU 0)
|
|
# -----------------------------------------------------------------------------
|
|
echo ""
|
|
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."
|
|
|
|
# Start run-api for lora_only
|
|
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &
|
|
LORA_ONLY_API_PID=$!
|
|
|
|
# Start vLLM with --enforce-eager for lora_only
|
|
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
|
|
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
|
|
--model "$MODEL" \
|
|
--port $LORA_ONLY_VLLM_PORT \
|
|
--gpu-memory-utilization 0.4 \
|
|
--enable-lora \
|
|
--max-lora-rank 64 \
|
|
--enforce-eager \
|
|
> "$LOG_DIR/vllm_lora_only.log" 2>&1 &
|
|
LORA_ONLY_VLLM_PID=$!
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# LORA_RESTART (GPU 1) - Trainer manages vLLM internally
|
|
# -----------------------------------------------------------------------------
|
|
echo ""
|
|
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
|
|
|
|
# Pre-create checkpoint directory so vLLM can write its log there
|
|
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
|
|
|
|
# Start run-api for lora_restart
|
|
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
|
|
LORA_RESTART_API_PID=$!
|
|
|
|
# =============================================================================
|
|
# WAIT FOR INFRASTRUCTURE
|
|
# =============================================================================
|
|
echo ""
|
|
echo "Waiting for infrastructure to be ready..."
|
|
|
|
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
|
|
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1
|
|
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 90 || exit 1
|
|
|
|
# =============================================================================
|
|
# START ENVIRONMENTS AND TRAINERS
|
|
# =============================================================================
|
|
echo ""
|
|
echo "Starting environments and trainers..."
|
|
|
|
# Record start time
|
|
START_TIME=$(date +%s)
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# LORA_ONLY: Start environment and trainer
|
|
# -----------------------------------------------------------------------------
|
|
echo ""
|
|
echo "[LORA_ONLY] Starting GSM8k environment..."
|
|
python -u environments/gsm8k_server.py serve \
|
|
--env.tokenizer_name "$MODEL" \
|
|
--env.use_wandb=False \
|
|
--env.rollout_server_url "http://localhost:${LORA_ONLY_API_PORT}" \
|
|
--openai.model_name "$MODEL" \
|
|
--openai.base_url "http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
|
|
--openai.server_type vllm \
|
|
--slurm false \
|
|
> "$LOG_DIR/env_lora_only.log" 2>&1 &
|
|
LORA_ONLY_ENV_PID=$!
|
|
|
|
echo "[LORA_ONLY] Starting trainer..."
|
|
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
|
|
--model-name "$MODEL" \
|
|
--weight-bridge-mode lora_only \
|
|
--vllm-port $LORA_ONLY_VLLM_PORT \
|
|
--atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
|
|
--batch-size $BATCH_SIZE \
|
|
--training-steps $TRAINING_STEPS \
|
|
--lora-r 16 \
|
|
--lora-alpha 32 \
|
|
--vllm-restart-interval 5 \
|
|
--save-path "$LOG_DIR/checkpoints_lora_only" \
|
|
--benchmark \
|
|
> "$LOG_DIR/trainer_lora_only.log" 2>&1 &
|
|
LORA_ONLY_TRAINER_PID=$!
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# LORA_RESTART: Start trainer (it manages vLLM internally)
|
|
# -----------------------------------------------------------------------------
|
|
echo ""
|
|
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
|
|
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
|
|
--model-name "$MODEL" \
|
|
--weight-bridge-mode lora_restart \
|
|
--vllm-port $LORA_RESTART_VLLM_PORT \
|
|
--vllm-gpu-memory-utilization 0.4 \
|
|
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
|
|
--batch-size $BATCH_SIZE \
|
|
--training-steps $TRAINING_STEPS \
|
|
--lora-r 16 \
|
|
--lora-alpha 32 \
|
|
--vllm-restart-interval 5 \
|
|
--save-path "$LOG_DIR/checkpoints_lora_restart" \
|
|
--benchmark \
|
|
> "$LOG_DIR/trainer_lora_restart.log" 2>&1 &
|
|
LORA_RESTART_TRAINER_PID=$!
|
|
|
|
# Wait for lora_restart's internal vLLM to start
|
|
# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs which takes 1-3 minutes!
|
|
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
|
|
echo " NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
|
|
echo " Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
|
|
sleep 30 # Give more time for model loading before checking health
|
|
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
|
|
echo " Failed - check logs:"
|
|
echo " Trainer log:"
|
|
tail -30 "$LOG_DIR/trainer_lora_restart.log"
|
|
echo ""
|
|
echo " vLLM internal log (if exists):"
|
|
tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo " (not found)"
|
|
exit 1
|
|
}
|
|
|
|
# Start GSM8k environment for lora_restart
|
|
echo "[LORA_RESTART] Starting GSM8k environment..."
|
|
python -u environments/gsm8k_server.py serve \
|
|
--env.tokenizer_name "$MODEL" \
|
|
--env.use_wandb=False \
|
|
--env.rollout_server_url "http://localhost:${LORA_RESTART_API_PORT}" \
|
|
--openai.model_name "$MODEL" \
|
|
--openai.base_url "http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
|
|
--openai.server_type vllm \
|
|
--slurm false \
|
|
> "$LOG_DIR/env_lora_restart.log" 2>&1 &
|
|
LORA_RESTART_ENV_PID=$!
|
|
|
|
# =============================================================================
|
|
# WAIT FOR BOTH TRAINERS TO COMPLETE
|
|
# =============================================================================
|
|
echo ""
|
|
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
|
echo "Both trainers running in parallel. Waiting for completion..."
|
|
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
|
echo ""
|
|
echo "Monitor progress:"
|
|
echo " tail -f $LOG_DIR/trainer_lora_only.log"
|
|
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
|
|
echo ""
|
|
|
|
# Wait for trainers
|
|
LORA_ONLY_EXIT=0
|
|
LORA_RESTART_EXIT=0
|
|
|
|
wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
|
|
LORA_ONLY_END=$(date +%s)
|
|
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
|
|
echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
|
|
|
|
wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
|
|
LORA_RESTART_END=$(date +%s)
|
|
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
|
|
echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
|
|
|
|
# =============================================================================
|
|
# RESULTS
|
|
# =============================================================================
|
|
echo ""
|
|
echo "============================================================"
|
|
echo "COMPARISON RESULTS (Parallel Execution)"
|
|
echo "============================================================"
|
|
echo ""
|
|
echo "Training Steps: $TRAINING_STEPS"
|
|
echo "Batch Size: $BATCH_SIZE"
|
|
echo ""
|
|
echo "┌─────────────────┬──────┬──────────────┬────────────────────────────┐"
|
|
echo "│ Mode │ GPU │ Total Time │ Notes │"
|
|
echo "├─────────────────┼──────┼──────────────┼────────────────────────────┤"
|
|
printf "│ lora_only │ %d │ %10ss │ --enforce-eager (~13 TPS) │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
|
|
printf "│ lora_restart │ %d │ %10ss │ no --enforce-eager (~108 TPS)│\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
|
|
echo "└─────────────────┴──────┴──────────────┴────────────────────────────┘"
|
|
echo ""
|
|
|
|
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
|
|
SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
|
|
echo "Speedup: ${SPEEDUP}x (lora_restart vs lora_only)"
|
|
fi
|
|
|
|
echo ""
|
|
echo "📊 BENCHMARK DETAILS:"
|
|
echo ""
|
|
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
|
|
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
|
|
echo ""
|
|
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
|
|
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
|
|
|
|
echo ""
|
|
echo "============================================================"
|
|
echo "📁 All logs saved to: $LOG_DIR"
|
|
echo "============================================================"
|
|
echo ""
|
|
echo "Log files:"
|
|
echo " $LOG_DIR/trainer_lora_only.log"
|
|
echo " $LOG_DIR/trainer_lora_restart.log"
|
|
echo " $LOG_DIR/vllm_lora_only.log"
|
|
echo " $LOG_DIR/env_lora_only.log"
|
|
echo " $LOG_DIR/env_lora_restart.log"
|
|
echo ""
|