#!/bin/bash
# =============================================================================
# Concurrent GSM8k Training Test Script
# =============================================================================
#
# This script runs BOTH LoRA and Single-Copy modes concurrently on an 8-GPU node:
#   - GPUs 0-1: LoRA mode (vLLM on GPU 0, trainer on GPU 1)
#   - GPU 4:    Single-Copy mode (vLLM and trainer share GPU 4)
#
# Usage:
#   ./scripts/run_concurrent_tests.sh [MODEL] [STEPS]
#
# Example:
#   ./scripts/run_concurrent_tests.sh Qwen/Qwen2.5-3B-Instruct 100
#
# =============================================================================

set -e

# Configuration
MODEL="${1:-Qwen/Qwen2.5-3B-Instruct}"
TRAINING_STEPS="${2:-100}"
BATCH_SIZE=4
LORA_SAVE_INTERVAL=20

# Ports (separate for each mode)
LORA_VLLM_PORT=9001
LORA_GSM8K_PORT=8001

SINGLE_COPY_VLLM_PORT=9002
SINGLE_COPY_GSM8K_PORT=8002

# Directories
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"

LOG_DIR="${REPO_DIR}/test_logs_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"

LORA_CHECKPOINT_DIR="${LOG_DIR}/lora_checkpoints"
SINGLE_COPY_CHECKPOINT_DIR="${LOG_DIR}/single_copy_checkpoints"
mkdir -p "$LORA_CHECKPOINT_DIR" "$SINGLE_COPY_CHECKPOINT_DIR"

echo "============================================================"
|
|
echo "Concurrent GSM8k Training Test"
|
|
echo "============================================================"
|
|
echo "Model: $MODEL"
|
|
echo "Training Steps: $TRAINING_STEPS"
|
|
echo "Batch Size: $BATCH_SIZE"
|
|
echo "Log Directory: $LOG_DIR"
|
|
echo ""
|
|
echo "LoRA Mode: GPUs 0-1, ports ${LORA_VLLM_PORT}/${LORA_GSM8K_PORT}"
|
|
echo "Single-Copy Mode: GPU 4, ports ${SINGLE_COPY_VLLM_PORT}/${SINGLE_COPY_GSM8K_PORT}"
|
|
echo "============================================================"
|
|
|
|
# Cleanup function: kill any servers/trainers left over from this or a
# previous run (patterns match on port/mode so concurrent runs don't collide)
cleanup() {
    echo ""
    echo "Cleaning up processes..."
    pkill -u "$USER" -f "vllm_api_server.*port.${LORA_VLLM_PORT}" 2>/dev/null || true
    pkill -u "$USER" -f "vllm_api_server.*port.${SINGLE_COPY_VLLM_PORT}" 2>/dev/null || true
    pkill -u "$USER" -f "gsm8k_server.*${LORA_GSM8K_PORT}" 2>/dev/null || true
    pkill -u "$USER" -f "gsm8k_server.*${SINGLE_COPY_GSM8K_PORT}" 2>/dev/null || true
    pkill -u "$USER" -f "grpo.py.*lora_only" 2>/dev/null || true
    pkill -u "$USER" -f "grpo.py.*shared_vllm" 2>/dev/null || true
    echo "Cleanup complete."
}

trap cleanup EXIT

# Kill any existing processes
cleanup

# Clear Triton cache (for LoRA B200 compatibility)
rm -rf ~/.triton/cache

cd "$REPO_DIR"

echo ""
|
|
echo "[1/6] Starting LoRA vLLM server (GPUs 0)..."
|
|
CUDA_VISIBLE_DEVICES=0 \
|
|
VLLM_ENABLE_SHARED_WEIGHTS=1 \
|
|
python -u example_trainer/vllm_api_server.py \
|
|
--model "$MODEL" \
|
|
--tensor-parallel-size 1 \
|
|
--port $LORA_VLLM_PORT \
|
|
--dtype bfloat16 \
|
|
--gpu-memory-utilization 0.7 \
|
|
--enable-lora \
|
|
--max-loras 2 \
|
|
--max-lora-rank 64 \
|
|
--enforce-eager \
|
|
> "${LOG_DIR}/lora_vllm.log" 2>&1 &
|
|
LORA_VLLM_PID=$!
|
|
echo " PID: $LORA_VLLM_PID"
|
|
|
|
echo ""
|
|
echo "[2/6] Starting Single-Copy vLLM server (GPU 4)..."
|
|
# NOTE: --enforce-eager is REQUIRED for single-copy mode!
|
|
# Without it, CUDA graphs freeze weights and updates won't be visible to inference.
|
|
CUDA_VISIBLE_DEVICES=4 \
|
|
VLLM_ENABLE_SHARED_WEIGHTS=1 \
|
|
LOGDIR="$SINGLE_COPY_CHECKPOINT_DIR" \
|
|
python -u example_trainer/vllm_api_server.py \
|
|
--model "$MODEL" \
|
|
--tensor-parallel-size 1 \
|
|
--port $SINGLE_COPY_VLLM_PORT \
|
|
--dtype bfloat16 \
|
|
--gpu-memory-utilization 0.5 \
|
|
--enforce-eager \
|
|
> "${LOG_DIR}/single_copy_vllm.log" 2>&1 &
|
|
SINGLE_COPY_VLLM_PID=$!
|
|
echo " PID: $SINGLE_COPY_VLLM_PID"
|
|
|
|
echo ""
|
|
echo "Waiting for vLLM servers to initialize (60s)..."
|
|
sleep 60
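
# A fixed sleep can be too short on a slow node or cold model cache. As an
# optional refinement (a sketch, not part of the original flow), poll the
# same /health endpoints the hard checks below use, for up to 5 extra minutes:
for port in "$LORA_VLLM_PORT" "$SINGLE_COPY_VLLM_PORT"; do
    for _ in $(seq 1 60); do
        curl -s "http://localhost:${port}/health" > /dev/null && break
        sleep 5
    done
done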

# Verify servers are running
echo ""
echo "Verifying vLLM servers..."

if curl -s "http://localhost:${LORA_VLLM_PORT}/health" > /dev/null; then
    echo "  ✓ LoRA vLLM server healthy"
else
    echo "  ✗ LoRA vLLM server failed to start"
    tail -n 50 "${LOG_DIR}/lora_vllm.log"
    exit 1
fi

if curl -s "http://localhost:${SINGLE_COPY_VLLM_PORT}/health" > /dev/null; then
    echo "  ✓ Single-Copy vLLM server healthy"
else
    echo "  ✗ Single-Copy vLLM server failed to start"
    tail -n 50 "${LOG_DIR}/single_copy_vllm.log"
    exit 1
fi

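# Optional smoke test (a sketch, not in the original flow; assumes the server
# exposes the standard OpenAI-compatible /v1/completions route, which the
# environments below rely on via --openai.base_url):
# curl -s "http://localhost:${LORA_VLLM_PORT}/v1/completions" \
#     -H "Content-Type: application/json" \
#     -d "{\"model\": \"$MODEL\", \"prompt\": \"2+2=\", \"max_tokens\": 5}"
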
echo ""
|
|
echo "[3/6] Starting LoRA GSM8k environment..."
|
|
python -u environments/gsm8k_server.py serve \
|
|
--env.tokenizer_name "$MODEL" \
|
|
--env.use_wandb=False \
|
|
--env.rollout_server_url "http://localhost:${LORA_GSM8K_PORT}" \
|
|
--openai.model_name "$MODEL" \
|
|
--openai.base_url "http://localhost:${LORA_VLLM_PORT}/v1" \
|
|
--openai.server_type vllm \
|
|
--slurm false \
|
|
> "${LOG_DIR}/lora_gsm8k.log" 2>&1 &
|
|
LORA_GSM8K_PID=$!
|
|
echo " PID: $LORA_GSM8K_PID"
|
|
|
|
echo ""
|
|
echo "[4/6] Starting Single-Copy GSM8k environment..."
|
|
python -u environments/gsm8k_server.py serve \
|
|
--env.tokenizer_name "$MODEL" \
|
|
--env.use_wandb=False \
|
|
--env.rollout_server_url "http://localhost:${SINGLE_COPY_GSM8K_PORT}" \
|
|
--openai.model_name "$MODEL" \
|
|
--openai.base_url "http://localhost:${SINGLE_COPY_VLLM_PORT}/v1" \
|
|
--openai.server_type vllm \
|
|
--slurm false \
|
|
> "${LOG_DIR}/single_copy_gsm8k.log" 2>&1 &
|
|
SINGLE_COPY_GSM8K_PID=$!
|
|
echo " PID: $SINGLE_COPY_GSM8K_PID"
|
|
|
|
echo ""
|
|
echo "Waiting for GSM8k environments to initialize (15s)..."
|
|
sleep 15
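
# The environments run in the background, so a crash here would otherwise go
# unnoticed until the trainers stall. Optional sanity check (a sketch, not in
# the original flow): confirm both processes are still alive before training.
for pid in "$LORA_GSM8K_PID" "$SINGLE_COPY_GSM8K_PID"; do
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "  ✗ GSM8k environment (PID $pid) exited early; check logs in ${LOG_DIR}"
        exit 1
    fi
done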

echo ""
echo "[5/6] Starting LoRA trainer (GPU 1)..."
CUDA_VISIBLE_DEVICES=1 \
python -u example_trainer/grpo.py \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_only \
    --vllm-port $LORA_VLLM_PORT \
    --atropos-url "http://localhost:${LORA_GSM8K_PORT}" \
    --batch-size $BATCH_SIZE \
    --training-steps $TRAINING_STEPS \
    --vllm-restart-interval $LORA_SAVE_INTERVAL \
    --save-path "$LORA_CHECKPOINT_DIR" \
    --benchmark \
    > "${LOG_DIR}/lora_trainer.log" 2>&1 &
LORA_TRAINER_PID=$!
echo "  PID: $LORA_TRAINER_PID"

echo ""
|
|
echo "[6/6] Starting Single-Copy trainer (GPU 4 - shared with vLLM)..."
|
|
CUDA_VISIBLE_DEVICES=4 \
|
|
python -u example_trainer/grpo.py \
|
|
--model-name "$MODEL" \
|
|
--weight-bridge-mode shared_vllm \
|
|
--vllm-port $SINGLE_COPY_VLLM_PORT \
|
|
--atropos-url "http://localhost:${SINGLE_COPY_GSM8K_PORT}" \
|
|
--batch-size $BATCH_SIZE \
|
|
--training-steps $TRAINING_STEPS \
|
|
--save-path "$SINGLE_COPY_CHECKPOINT_DIR" \
|
|
--vllm-config-path "${SINGLE_COPY_CHECKPOINT_DIR}/vllm_bridge_config.json" \
|
|
--benchmark \
|
|
> "${LOG_DIR}/single_copy_trainer.log" 2>&1 &
|
|
SINGLE_COPY_TRAINER_PID=$!
|
|
echo " PID: $SINGLE_COPY_TRAINER_PID"
|
|
|
|
echo ""
|
|
echo "============================================================"
|
|
echo "Both trainers started!"
|
|
echo ""
|
|
echo "Monitor logs:"
|
|
echo " tail -f ${LOG_DIR}/lora_trainer.log"
|
|
echo " tail -f ${LOG_DIR}/single_copy_trainer.log"
|
|
echo ""
|
|
echo "Or watch both:"
|
|
echo " tail -f ${LOG_DIR}/*.log"
|
|
echo ""
|
|
echo "Waiting for training to complete..."
|
|
echo "============================================================"
|
|
|
|
# Wait for both trainers to complete. `wait` returns each trainer's exit
# status; capture it via `|| VAR=$?` so a failed trainer doesn't trip
# `set -e` and skip the summary below.
LORA_EXIT=0
wait $LORA_TRAINER_PID || LORA_EXIT=$?

SINGLE_COPY_EXIT=0
wait $SINGLE_COPY_TRAINER_PID || SINGLE_COPY_EXIT=$?

echo ""
|
|
echo "============================================================"
|
|
echo "TRAINING COMPLETE"
|
|
echo "============================================================"
|
|
echo "LoRA Trainer Exit Code: $LORA_EXIT"
|
|
echo "Single-Copy Trainer Exit Code: $SINGLE_COPY_EXIT"
|
|
echo ""
|
|
echo "Results saved to: $LOG_DIR"
|
|
echo ""
|
|
echo "Checkpoints:"
|
|
echo " LoRA: $LORA_CHECKPOINT_DIR"
|
|
echo " Single-Copy: $SINGLE_COPY_CHECKPOINT_DIR"
|
|
echo "============================================================"
|
|
|
|
# Generate summary
echo ""
echo "=== LoRA Training Summary ===" | tee "${LOG_DIR}/summary.txt"
grep -E "Step|Loss|Accuracy" "${LOG_DIR}/lora_trainer.log" | tail -20 | tee -a "${LOG_DIR}/summary.txt"

echo "" | tee -a "${LOG_DIR}/summary.txt"
echo "=== Single-Copy Training Summary ===" | tee -a "${LOG_DIR}/summary.txt"
grep -E "Step|Loss|Accuracy" "${LOG_DIR}/single_copy_trainer.log" | tail -20 | tee -a "${LOG_DIR}/summary.txt"
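
# Propagate failure to the caller (a hedged addition, not in the original
# flow): make the script's own exit status reflect the trainers' results so
# CI or a wrapper script can detect a failed run.
if [ "$LORA_EXIT" -ne 0 ] || [ "$SINGLE_COPY_EXIT" -ne 0 ]; then
    exit 1
fi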