diff --git a/example_trainer/README.md b/example_trainer/README.md index 7531538c..a1f69c2a 100644 --- a/example_trainer/README.md +++ b/example_trainer/README.md @@ -136,11 +136,14 @@ Zero model duplication - trainer and vLLM share the exact same GPU memory! run-api --port 8000 # Terminal 2: Start vLLM with shared weights enabled +# IMPORTANT: --enforce-eager is REQUIRED to disable CUDA graphs +# Without it, weight updates won't be visible to inference! VLLM_ENABLE_SHARED_WEIGHTS=1 LOGDIR=$LOGDIR \ CUDA_VISIBLE_DEVICES=0 python example_trainer/vllm_api_server.py \ --model $MODEL \ --port 9001 \ - --gpu-memory-utilization 0.45 + --gpu-memory-utilization 0.45 \ + --enforce-eager # Terminal 3: Start the environment server python -u environments/gsm8k_server.py serve \ @@ -429,6 +432,20 @@ python example_trainer/vllm_api_server.py \ VLLM_ENABLE_SHARED_WEIGHTS=1 LOGDIR=/tmp/atropos python example_trainer/vllm_api_server.py ... ``` +### "LogProb Alignment: MISMATCH!" in shared_vllm mode +If you see `[MISMATCH!]` in the logprob alignment output, inference and training are seeing different weights. This is usually caused by **CUDA graphs**. + +**Symptom:** `inference_mean` stays constant while `training_mean` changes. The `diff` increases over time. + +**Fix:** Add `--enforce-eager` when starting vLLM: +```bash +VLLM_ENABLE_SHARED_WEIGHTS=1 LOGDIR=$LOGDIR \ +python example_trainer/vllm_api_server.py \ + --model $MODEL --port 9001 --enforce-eager # <-- REQUIRED! +``` + +**Why:** CUDA graphs "bake" model weights into compiled graphs at startup. Updates to the underlying tensors are NOT reflected in inference. Using `--enforce-eager` disables CUDA graphs, so vLLM reads from the shared tensors on every forward pass. + ### "Triton compilation error" on B200/Blackwell GPUs The patched vLLM server (`vllm_api_server.py`) automatically applies B200 fixes. If using standard vLLM, add `--enforce-eager`. diff --git a/example_trainer/scripts/run_comparison.sh b/example_trainer/scripts/run_comparison.sh index 4f05c42e..fa56a5cf 100755 --- a/example_trainer/scripts/run_comparison.sh +++ b/example_trainer/scripts/run_comparison.sh @@ -241,12 +241,15 @@ echo " ✓ run-api started (PID: $SHARED_API_PID, port 8002)" wait_for_api 8002 "shared" || { echo "Failed to start shared API"; exit 1; } # Start vLLM with shared weights (use separate config path) +# NOTE: --enforce-eager is REQUIRED for single-copy mode! +# Without it, CUDA graphs freeze weights and updates won't be visible to inference. echo " Starting vLLM with shared weights..." VLLM_ENABLE_SHARED_WEIGHTS=1 VLLM_BRIDGE_CONFIG_PATH=$LOGDIR/vllm_bridge_config_shared.json \ CUDA_VISIBLE_DEVICES=2 python example_trainer/vllm_api_server.py \ --model $MODEL \ --port 9002 \ --gpu-memory-utilization 0.35 \ + --enforce-eager \ > $LOGDIR/vllm_shared.log 2>&1 & SHARED_VLLM_PID=$! echo " ✓ vLLM started (PID: $SHARED_VLLM_PID, port 9002)" diff --git a/example_trainer/scripts/test_single_copy_mode.sh b/example_trainer/scripts/test_single_copy_mode.sh index 0a76e1e7..28efc6de 100644 --- a/example_trainer/scripts/test_single_copy_mode.sh +++ b/example_trainer/scripts/test_single_copy_mode.sh @@ -53,6 +53,8 @@ cd "$REPO_DIR" echo "" echo "[1/4] Starting vLLM with shared memory enabled..." +# NOTE: --enforce-eager is REQUIRED for single-copy mode! +# Without it, CUDA graphs freeze weights and updates won't be visible to inference. VLLM_ENABLE_SHARED_WEIGHTS=1 \ LOGDIR="$LOG_DIR" \ python -u example_trainer/vllm_api_server.py \ @@ -61,6 +63,7 @@ python -u example_trainer/vllm_api_server.py \ --port $VLLM_PORT \ --dtype bfloat16 \ --gpu-memory-utilization 0.5 \ + --enforce-eager \ > "${LOG_DIR}/vllm.log" 2>&1 & echo "Waiting for vLLM (45s)..."