mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
enforce eager check
This commit is contained in:
parent
84cee536a3
commit
211f91b528
4 changed files with 190 additions and 616 deletions
|
|
@ -1,326 +0,0 @@
|
|||
#!/bin/bash
# ============================================================================
# Compare lora_restart vs lora_only performance
# ============================================================================
# Runs both modes in parallel with separate APIs/environments/ports
# All commands run in background (single terminal)
# Results uploaded to W&B
#
# Usage:
#   ./compare_lora_modes.sh [steps]
#   ./compare_lora_modes.sh 30   # 30 steps (default)
#   ./compare_lora_modes.sh 10   # Quick 10-step test
# ============================================================================

# Fail fast: exit on error, error on unset variables, and propagate failures
# through pipelines (the original only enabled -e).
set -euo pipefail

# Configuration — constants for the whole run, so mark them readonly.
readonly MODEL="Qwen/Qwen3-4B-Instruct-2507"
STEPS="${1:-30}"
readonly RESTART_INTERVAL=3
readonly WANDB_PROJECT="lora-mode-comparison"

# Validate the optional [steps] argument up front so a typo fails here
# instead of being forwarded to both trainers.
if ! [[ "$STEPS" =~ ^[1-9][0-9]*$ ]]; then
    echo "Error: steps must be a positive integer (got: '$STEPS')" >&2
    exit 1
fi
readonly STEPS

# Port allocation
# lora_restart: API 8001, vLLM 9001
# lora_only: API 8002, vLLM 9002

echo "============================================================================"
echo "LoRA Mode Comparison: lora_restart vs lora_only"
echo "============================================================================"
echo "Model: $MODEL"
echo "Steps: $STEPS"
echo "Restart interval: $RESTART_INTERVAL"
echo "W&B project: $WANDB_PROJECT"
echo ""
echo "Port allocation:"
echo " lora_restart: API=8001, vLLM=9001, GPU=0"
echo " lora_only: API=8002, vLLM=9002, GPU=1"
echo "============================================================================"

# Resolve the repo root relative to this script so it works regardless of the
# caller's current directory (script lives two levels below the repo root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$REPO_ROOT"
echo "Working directory: $(pwd)"

# Timestamped log directory so repeated runs never clobber each other
LOGDIR="./lora_comparison_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOGDIR"
echo "Log directory: $LOGDIR"

# Cleanup function: terminate everything this script may have spawned —
# first by process-name pattern, then by the TCP ports they listen on.
# Registered on EXIT so it runs on success, failure, and Ctrl-C alike.
cleanup() {
    echo ""
    echo "Cleaning up all processes..."

    # Kill by name (same patterns and order as individual pkill calls)
    local pattern
    for pattern in "gsm8k_server.py" "run-api" "vllm_api_server.py" "example_trainer.grpo"; do
        pkill -f "$pattern" 2>/dev/null || true
    done

    # Kill by port, in case anything detached from its parent
    local port
    for port in 8001 8002 9001 9002; do
        fuser -k "${port}/tcp" 2>/dev/null || true
    done

    echo "Cleanup complete."
}
trap cleanup EXIT

# Initial cleanup: clear out leftovers from any previous run
echo ""
echo "Killing any existing processes on ports 8001, 8002, 9001, 9002..."
cleanup
sleep 3

# ============================================================================
# MODE 1: lora_restart (GPU 0, ports 8001/9001)
# ============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "[1/2] LORA_RESTART MODE (GPU 0, API:8001, vLLM:9001)"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# Start API for lora_restart
echo " Starting API server (port 8001)..."
run-api --port 8001 > "$LOGDIR/api_restart.log" 2>&1 &
RESTART_API_PID=$!
sleep 3

# Check API is up before starting anything that depends on it
if curl -s "http://localhost:8001/info" > /dev/null 2>&1; then
    echo " ✓ API running (PID: $RESTART_API_PID)"
else
    echo " ✗ API failed to start"
    cat "$LOGDIR/api_restart.log"
    exit 1
fi

# Start trainer (lora_restart manages vLLM internally)
echo " Starting lora_restart trainer (will launch vLLM on port 9001)..."
CUDA_VISIBLE_DEVICES=0 python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_restart \
    --vllm-port 9001 \
    --atropos-url http://localhost:8001 \
    --lora-r 16 \
    --lora-alpha 32 \
    --training-steps "$STEPS" \
    --vllm-restart-interval "$RESTART_INTERVAL" \
    --save-path "$LOGDIR/checkpoints_restart" \
    --use-wandb \
    --wandb-project "$WANDB_PROJECT" \
    --wandb-group "comparison-$(date +%Y%m%d)" \
    --benchmark \
    > "$LOGDIR/trainer_restart.log" 2>&1 &
RESTART_TRAINER_PID=$!
echo " ✓ Trainer started (PID: $RESTART_TRAINER_PID)"

# Wait for vLLM to be ready (trainer launches it). Abort on timeout instead
# of silently starting the environment against a server that never came up.
echo " Waiting for vLLM to start (port 9001)..."
VLLM_RESTART_READY=false
for i in {1..60}; do
    if curl -s "http://localhost:9001/health" > /dev/null 2>&1; then
        # Each poll sleeps 2s, so the elapsed wall time is ~2*(i-1)s.
        # (The original printed ~${i}s, which halved the real wait.)
        echo " ✓ vLLM ready after ~$(( (i - 1) * 2 ))s"
        VLLM_RESTART_READY=true
        break
    fi
    sleep 2
done
if [ "$VLLM_RESTART_READY" != "true" ]; then
    echo " ✗ vLLM (port 9001) not ready after 120s; see $LOGDIR/trainer_restart.log" >&2
    exit 1
fi

# Start environment for lora_restart
echo " Starting environment..."
python -u environments/gsm8k_server.py serve \
    --env.tokenizer_name "$MODEL" \
    --env.rollout_server_url "http://localhost:8001" \
    --env.max_token_length 2048 \
    --env.use_wandb=True \
    --env.wandb_name "lora-restart-env" \
    --openai.model_name "$MODEL" \
    --openai.base_url "http://localhost:9001/v1" \
    --openai.server_type vllm \
    --slurm false \
    > "$LOGDIR/env_restart.log" 2>&1 &
RESTART_ENV_PID=$!
echo " ✓ Environment started (PID: $RESTART_ENV_PID)"

# ============================================================================
# MODE 2: lora_only (GPU 1, ports 8002/9002)
# ============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "[2/2] LORA_ONLY MODE (GPU 1, API:8002, vLLM:9002)"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# Start API for lora_only
echo " Starting API server (port 8002)..."
run-api --port 8002 > "$LOGDIR/api_only.log" 2>&1 &
ONLY_API_PID=$!
sleep 3

# Check API is up before starting anything that depends on it
if curl -s "http://localhost:8002/info" > /dev/null 2>&1; then
    echo " ✓ API running (PID: $ONLY_API_PID)"
else
    echo " ✗ API failed to start"
    cat "$LOGDIR/api_only.log"
    exit 1
fi

# Start vLLM for lora_only (external, with --enforce-eager)
echo " Starting vLLM with --enable-lora --enforce-eager (port 9002)..."
CUDA_VISIBLE_DEVICES=1 python example_trainer/vllm_api_server.py \
    --model "$MODEL" \
    --port 9002 \
    --gpu-memory-utilization 0.45 \
    --enable-lora \
    --max-lora-rank 32 \
    --enforce-eager \
    > "$LOGDIR/vllm_only.log" 2>&1 &
ONLY_VLLM_PID=$!
echo " ✓ vLLM started (PID: $ONLY_VLLM_PID)"

# Wait for vLLM to be ready. Abort on timeout instead of silently starting
# the environment and trainer against a server that never came up.
echo " Waiting for vLLM to start (port 9002)..."
VLLM_ONLY_READY=false
for i in {1..90}; do
    if curl -s "http://localhost:9002/health" > /dev/null 2>&1; then
        # Each poll sleeps 2s, so the elapsed wall time is ~2*(i-1)s.
        # (The original printed ~${i}s, which halved the real wait.)
        echo " ✓ vLLM ready after ~$(( (i - 1) * 2 ))s"
        VLLM_ONLY_READY=true
        break
    fi
    sleep 2
done
if [ "$VLLM_ONLY_READY" != "true" ]; then
    echo " ✗ vLLM (port 9002) not ready after 180s; see $LOGDIR/vllm_only.log" >&2
    exit 1
fi

# Start environment for lora_only
echo " Starting environment..."
python -u environments/gsm8k_server.py serve \
    --env.tokenizer_name "$MODEL" \
    --env.rollout_server_url "http://localhost:8002" \
    --env.max_token_length 2048 \
    --env.use_wandb=True \
    --env.wandb_name "lora-only-env" \
    --openai.model_name "$MODEL" \
    --openai.base_url "http://localhost:9002/v1" \
    --openai.server_type vllm \
    --slurm false \
    > "$LOGDIR/env_only.log" 2>&1 &
ONLY_ENV_PID=$!
echo " ✓ Environment started (PID: $ONLY_ENV_PID)"

# Start trainer for lora_only
echo " Starting lora_only trainer..."
CUDA_VISIBLE_DEVICES=1 python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_only \
    --vllm-port 9002 \
    --atropos-url http://localhost:8002 \
    --lora-r 16 \
    --lora-alpha 32 \
    --training-steps "$STEPS" \
    --save-path "$LOGDIR/checkpoints_only" \
    --use-wandb \
    --wandb-project "$WANDB_PROJECT" \
    --wandb-group "comparison-$(date +%Y%m%d)" \
    --benchmark \
    > "$LOGDIR/trainer_only.log" 2>&1 &
ONLY_TRAINER_PID=$!
echo " ✓ Trainer started (PID: $ONLY_TRAINER_PID)"

# ============================================================================
# Save PIDs and monitor
# ============================================================================
# Persist every child PID so a human can inspect or kill them by hand later.
{
    printf 'RESTART_API_PID=%s\n' "$RESTART_API_PID"
    printf 'RESTART_TRAINER_PID=%s\n' "$RESTART_TRAINER_PID"
    printf 'RESTART_ENV_PID=%s\n' "$RESTART_ENV_PID"
    printf 'ONLY_API_PID=%s\n' "$ONLY_API_PID"
    printf 'ONLY_VLLM_PID=%s\n' "$ONLY_VLLM_PID"
    printf 'ONLY_ENV_PID=%s\n' "$ONLY_ENV_PID"
    printf 'ONLY_TRAINER_PID=%s\n' "$ONLY_TRAINER_PID"
} > "$LOGDIR/pids.txt"

# One consolidated status banner; the unquoted delimiter keeps
# $LOGDIR / $WANDB_PROJECT expansion, matching the original echo output.
cat << EOF

============================================================================
All components started!
============================================================================

📊 Monitor progress:
 tail -f $LOGDIR/trainer_restart.log # lora_restart
 tail -f $LOGDIR/trainer_only.log # lora_only

🔍 Watch both:
 tail -f $LOGDIR/trainer_*.log

📈 W&B Dashboard:
 https://wandb.ai/$WANDB_PROJECT

Waiting for trainers to complete...
(lora_restart should finish MUCH faster than lora_only)

EOF

# Wait for trainers
# Poll both trainer PIDs every 30s until neither is "running".
# For each trainer: `kill -0` sends no signal — it only probes whether the
# process is still alive. Once it has exited, `wait` reaps the child and
# returns its exit status, which classifies the run as completed or failed.
# The `&& ... || ...` chains are safe here because both branches are plain
# variable assignments, which cannot fail.
RESTART_STATUS="running"
ONLY_STATUS="running"

while [ "$RESTART_STATUS" = "running" ] || [ "$ONLY_STATUS" = "running" ]; do
    sleep 30

    # Check lora_restart
    if [ "$RESTART_STATUS" = "running" ]; then
        if ! kill -0 $RESTART_TRAINER_PID 2>/dev/null; then
            # Trainer exited: reap it and map exit status to a label.
            wait $RESTART_TRAINER_PID 2>/dev/null && RESTART_STATUS="completed" || RESTART_STATUS="failed"
            echo " lora_restart: $RESTART_STATUS"
        fi
    fi

    # Check lora_only
    if [ "$ONLY_STATUS" = "running" ]; then
        if ! kill -0 $ONLY_TRAINER_PID 2>/dev/null; then
            wait $ONLY_TRAINER_PID 2>/dev/null && ONLY_STATUS="completed" || ONLY_STATUS="failed"
            echo " lora_only: $ONLY_STATUS"
        fi
    fi

    # Show status — heartbeat line while at least one trainer is still alive
    if [ "$RESTART_STATUS" = "running" ] || [ "$ONLY_STATUS" = "running" ]; then
        echo " [$(date +%H:%M:%S)] lora_restart: $RESTART_STATUS, lora_only: $ONLY_STATUS"
    fi
done

# ============================================================================
# Print results
# ============================================================================

# Print one mode's benchmark summary from its trainer log, or a pointer to
# the log when the summary marker is missing (e.g. the trainer died early).
print_benchmark_summary() {
    local label=$1
    local logfile=$2
    echo ""
    echo "📊 $label:"
    echo "─────────────────────────────────────────────────"
    grep -A 20 "BENCHMARK SUMMARY" "$logfile" 2>/dev/null || echo " (check $logfile)"
}

echo ""
echo "============================================================================"
echo "COMPARISON RESULTS"
echo "============================================================================"

print_benchmark_summary "LORA_RESTART (CUDA graphs, vLLM restarts)" "$LOGDIR/trainer_restart.log"
print_benchmark_summary "LORA_ONLY (--enforce-eager, hot-swap)" "$LOGDIR/trainer_only.log"

# Closing summary; unquoted delimiter keeps variable expansion so the output
# matches the original run of echo statements byte-for-byte.
cat << EOF

============================================================================
📁 LOGS SAVED TO: $LOGDIR
============================================================================

Log files:
 $LOGDIR/trainer_restart.log # lora_restart trainer
 $LOGDIR/trainer_only.log # lora_only trainer
 $LOGDIR/vllm_only.log # lora_only vLLM
 $LOGDIR/env_restart.log # lora_restart environment
 $LOGDIR/env_only.log # lora_only environment

Checkpoints:
 $LOGDIR/checkpoints_restart/
 $LOGDIR/checkpoints_only/

W&B runs should be visible at:
 https://wandb.ai/$WANDB_PROJECT

============================================================================
Done!
EOF

Loading…
Add table
Add a link
Reference in a new issue