diff --git a/example_trainer/scripts/compare_lora_modes.sh b/example_trainer/scripts/compare_lora_modes.sh
index ff62e1d2..6cb81a1f 100755
--- a/example_trainer/scripts/compare_lora_modes.sh
+++ b/example_trainer/scripts/compare_lora_modes.sh
@@ -150,6 +150,9 @@ LORA_ONLY_VLLM_PID=$!
 
 echo ""
 echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
+# Pre-create checkpoint directory so vLLM can write its log there
+mkdir -p "$LOG_DIR/checkpoints_lora_restart"
+
 # Start run-api for lora_restart
 run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
 LORA_RESTART_API_PID=$!
@@ -190,7 +193,7 @@ python -u environments/gsm8k_server.py serve \
 LORA_ONLY_ENV_PID=$!
 
 echo "[LORA_ONLY] Starting trainer..."
-CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/grpo.py \
+CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
   --model-name "$MODEL" \
   --weight-bridge-mode lora_only \
   --vllm-port $LORA_ONLY_VLLM_PORT \
@@ -210,7 +213,7 @@ LORA_ONLY_TRAINER_PID=$!
 # -----------------------------------------------------------------------------
 echo ""
 echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
-CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \
+CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
   --model-name "$MODEL" \
   --weight-bridge-mode lora_restart \
   --vllm-port $LORA_RESTART_VLLM_PORT \
@@ -227,11 +230,18 @@ CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -u example_trainer/grpo.py \
 LORA_RESTART_TRAINER_PID=$!
 
 # Wait for lora_restart's internal vLLM to start
+# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs, which takes 1-3 minutes!
 echo "[LORA_RESTART] Waiting for internal vLLM to start..."
-sleep 15
-wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 120 || {
-  echo "  Failed - check log:"
+echo "  NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
+echo "  Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
+sleep 30  # Give more time for model loading before checking health
+wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
+  echo "  Failed - check logs:"
+  echo "  Trainer log:"
   tail -30 "$LOG_DIR/trainer_lora_restart.log"
+  echo ""
+  echo "  vLLM internal log (if exists):"
+  tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo "    (not found)"
   exit 1
 }
 
diff --git a/example_trainer/trainers.py b/example_trainer/trainers.py
index 23e5f6c6..9d1357b0 100644
--- a/example_trainer/trainers.py
+++ b/example_trainer/trainers.py
@@ -891,9 +891,16 @@ def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str) -> Optiona
     print(f"   Launching: {' '.join(cmd)}")
     print(f"   Adapter: {adapter_path}")
 
+    # Log vLLM output to file for debugging
+    vllm_log_path = os.path.join(config.save_path, "vllm_internal.log")
+    print(f"   vLLM log: {vllm_log_path}")
+
     try:
-        proc = subprocess.Popen(cmd, env=env)
+        vllm_log_file = open(vllm_log_path, "w")
+        proc = subprocess.Popen(cmd, env=env, stdout=vllm_log_file, stderr=subprocess.STDOUT)
+        vllm_log_file.close()  # child keeps its own copy of the fd; close the parent's
         print(f"   vLLM PID: {proc.pid}")
+        print("   NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (takes 1-3 min)...")
 
         # Wait for server to be ready
        if not wait_for_vllm_ready(config.vllm_port, timeout=180):
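
For reference, the longer timeout passed to `wait_for_vllm_ready` matters because the patch assumes a poll-until-healthy loop rather than a fixed sleep. The helper's body is not part of this diff, so the following is only a minimal sketch of what such a poller typically looks like, assuming vLLM's OpenAI-compatible server and its `GET /health` endpoint; the actual implementation in `example_trainer` may differ:

```python
import time
import urllib.error
import urllib.request


def wait_for_vllm_ready(port: int, timeout: int = 180) -> bool:
    """Poll vLLM's /health endpoint until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout
    url = f"http://localhost:{port}/health"
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # server not accepting connections yet (e.g. still compiling CUDA graphs)
        time.sleep(2)
    return False
```

Polling tolerates the variable CUDA-graph compilation time that the new log messages warn about, instead of baking a worst-case delay into every run.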