testing 3

2026-04-19 12:57:58 +00:00 · 2026-02-02 13:08:42 -05:00 · 2026-02-02 13:08:42 -05:00 · 8a9e6945ee
commit 8a9e6945ee
parent dd7047dd5a
4 changed files with 18 additions and 1 deletion
--- a/example_trainer/scripts/run_concurrent_tests.sh
+++ b/example_trainer/scripts/run_concurrent_tests.sh
@@ -97,6 +97,8 @@ echo "  PID: $LORA_VLLM_PID"

 echo ""
 echo "[2/6] Starting Single-Copy vLLM server (GPU 4)..."
+# NOTE: --enforce-eager is REQUIRED for single-copy mode!
+# Without it, CUDA graphs freeze weights and updates won't be visible to inference.
 CUDA_VISIBLE_DEVICES=4 \
 VLLM_ENABLE_SHARED_WEIGHTS=1 \
 LOGDIR="$SINGLE_COPY_CHECKPOINT_DIR" \
@@ -106,6 +108,7 @@ python -u example_trainer/vllm_api_server.py \
    --port $SINGLE_COPY_VLLM_PORT \
    --dtype bfloat16 \
    --gpu-memory-utilization 0.5 \
+    --enforce-eager \
    > "${LOG_DIR}/single_copy_vllm.log" 2>&1 &
 SINGLE_COPY_VLLM_PID=$!
 echo "  PID: $SINGLE_COPY_VLLM_PID"