vllm restart 1

This commit is contained in:
Jai Suphavadeeprasit 2026-02-12 12:28:00 -05:00
parent 6bd0296bac
commit 9dcb362aba
2 changed files with 22 additions and 6 deletions

View file

@ -150,6 +150,9 @@ LORA_ONLY_VLLM_PID=$!
echo ""
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
# Pre-create checkpoint directory so vLLM can write its log there
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
# Start run-api for lora_restart
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
LORA_RESTART_API_PID=$!
@ -190,7 +193,7 @@ python -u environments/gsm8k_server.py serve \
LORA_ONLY_ENV_PID=$!
echo "[LORA_ONLY] Starting trainer..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/grpo.py \ CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_only \
--vllm-port $LORA_ONLY_VLLM_PORT \
@ -210,7 +213,7 @@ LORA_ONLY_TRAINER_PID=$!
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \ CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_restart \
--vllm-port $LORA_RESTART_VLLM_PORT \
@ -227,11 +230,18 @@ CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \
LORA_RESTART_TRAINER_PID=$!
# Wait for lora_restart's internal vLLM to start
# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs which takes 1-3 minutes!
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
sleep 15 echo " NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 120 || { echo " Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
echo " Failed - check log:" sleep 30 # Give more time for model loading before checking health
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
echo " Failed - check logs:"
echo " Trainer log:"
tail -30 "$LOG_DIR/trainer_lora_restart.log"
echo ""
echo " vLLM internal log (if exists):"
tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo " (not found)"
exit 1
}

View file

@ -891,9 +891,15 @@ def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str) -> Optiona
print(f" Launching: {' '.join(cmd)}")
print(f" Adapter: {adapter_path}")
# Log vLLM output to file for debugging
vllm_log_path = os.path.join(config.save_path, "vllm_internal.log")
print(f" vLLM log: {vllm_log_path}")
try:
proc = subprocess.Popen(cmd, env=env) vllm_log_file = open(vllm_log_path, "w")
proc = subprocess.Popen(cmd, env=env, stdout=vllm_log_file, stderr=subprocess.STDOUT)
print(f" vLLM PID: {proc.pid}")
print(f" NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (takes 1-3 min)...")
# Wait for server to be ready
if not wait_for_vllm_ready(config.vllm_port, timeout=180):