vllm restart 1

2026-04-19 12:57:58 +00:00 · 2026-02-12 12:28:00 -05:00 · 2026-02-12 12:28:00 -05:00 · 9dcb362aba
commit 9dcb362aba
parent 6bd0296bac
2 changed files with 22 additions and 6 deletions
--- a/example_trainer/scripts/compare_lora_modes.sh
+++ b/example_trainer/scripts/compare_lora_modes.sh
@ -150,6 +150,9 @@ LORA_ONLY_VLLM_PID=$!
 echo ""
 echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."

+# Pre-create checkpoint directory so vLLM can write its log there
+mkdir -p "$LOG_DIR/checkpoints_lora_restart"
+
 # Start run-api for lora_restart
 run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
 LORA_RESTART_API_PID=$!
@ -190,7 +193,7 @@ python -u environments/gsm8k_server.py serve \
 LORA_ONLY_ENV_PID=$!

 echo "[LORA_ONLY] Starting trainer..."
-CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/grpo.py \
+CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_only \
    --vllm-port $LORA_ONLY_VLLM_PORT \
@ -210,7 +213,7 @@ LORA_ONLY_TRAINER_PID=$!
 # -----------------------------------------------------------------------------
 echo ""
 echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
-CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \
+CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
    --model-name "$MODEL" \
    --weight-bridge-mode lora_restart \
    --vllm-port $LORA_RESTART_VLLM_PORT \
@ -227,11 +230,18 @@ CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \
 LORA_RESTART_TRAINER_PID=$!

 # Wait for lora_restart's internal vLLM to start
+# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs which takes 1-3 minutes!
 echo "[LORA_RESTART] Waiting for internal vLLM to start..."
-sleep 15
-wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 120 || {
-    echo "  Failed - check log:"
+echo "  NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
+echo "  Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
+sleep 30  # Give more time for model loading before checking health
+wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
+    echo "  Failed - check logs:"
+    echo "  Trainer log:"
    tail -30 "$LOG_DIR/trainer_lora_restart.log"
+    echo ""
+    echo "  vLLM internal log (if exists):"
+    tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo "  (not found)"
    exit 1
 }

--- a/example_trainer/trainers.py
+++ b/example_trainer/trainers.py
@ -891,9 +891,15 @@ def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str) -> Optiona
    print(f"  Launching: {' '.join(cmd)}")
    print(f"  Adapter: {adapter_path}")
    
+    # Log vLLM output to file for debugging
+    vllm_log_path = os.path.join(config.save_path, "vllm_internal.log")
+    print(f"  vLLM log: {vllm_log_path}")
+    
    try:
-        proc = subprocess.Popen(cmd, env=env)
+        vllm_log_file = open(vllm_log_path, "w")
+        proc = subprocess.Popen(cmd, env=env, stdout=vllm_log_file, stderr=subprocess.STDOUT)
        print(f"  vLLM PID: {proc.pid}")
+        print(f"  NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (takes 1-3 min)...")
        
        # Wait for server to be ready
        if not wait_for_vllm_ready(config.vllm_port, timeout=180):