enforce eager check

Jai Suphavadeeprasit 2026-02-12 11:33:33 -05:00
parent 84cee536a3
commit 211f91b528
4 changed files with 190 additions and 616 deletions

@@ -719,7 +719,7 @@ def train_lora_restart(config: TrainingConfig):
     # Launch vLLM with the initial adapter
     print("[3/4] Launching vLLM with CUDA graphs (no --enforce-eager)...")
-    vllm_proc = _launch_vllm_with_lora(config, current_adapter_path, step=0)
+    vllm_proc = _launch_vllm_with_lora(config, current_adapter_path)
     if vllm_proc is None:
         raise RuntimeError("Failed to launch vLLM")
@@ -799,7 +799,7 @@ def train_lora_restart(config: TrainingConfig):
         # Restart vLLM with new adapter
         print(f"  [RESTART] Restarting vLLM with new adapter...")
         _terminate_vllm(vllm_proc)
-        vllm_proc = _launch_vllm_with_lora(config, current_adapter_path, step=step + 1)
+        vllm_proc = _launch_vllm_with_lora(config, current_adapter_path)
         if vllm_proc is None:
             raise RuntimeError("Failed to restart vLLM")
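
Both call sites follow the same restart-per-step pattern: save a fresh adapter, tear down the running server, and relaunch it pointing at the new weights. A minimal sketch of that loop, where train_one_step and save_adapter are hypothetical stand-ins and _terminate_vllm / _launch_vllm_with_lora are the real helpers from this file:

def run_training_loop(config, model, vllm_proc, num_steps):
    # Hedged sketch only: train_one_step and save_adapter are hypothetical;
    # _terminate_vllm and _launch_vllm_with_lora are defined in this module.
    for step in range(num_steps):
        train_one_step(model, step)               # hypothetical: one optimizer pass
        adapter_path = save_adapter(model, step)  # hypothetical: writes LoRA weights to disk
        _terminate_vllm(vllm_proc)                # stop the server holding the stale adapter
        vllm_proc = _launch_vllm_with_lora(config, adapter_path)
        if vllm_proc is None:
            raise RuntimeError("Failed to restart vLLM")
    return vllm_proc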
@@ -849,30 +849,23 @@ def train_lora_restart(config: TrainingConfig):
     print(f"Final adapter saved to {final_adapter_path}")

-def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str, step: int = 0) -> Optional[subprocess.Popen]:
+def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str) -> Optional[subprocess.Popen]:
     """
-    Launch vLLM with a LoRA adapter PRE-LOADED at startup (CUDA graphs enabled).
+    Launch vLLM with a LoRA adapter pre-loaded (CUDA graphs enabled).

-    Key insight: To get CUDA graphs with LoRA, we must load the adapter AT STARTUP
-    via --lora-modules, not dynamically via HTTP. Dynamic loading requires
-    --enforce-eager which disables CUDA graphs.
-
-    This gives us full CUDA graph speed (~170 TPS instead of ~13 TPS).
+    Unlike lora_only mode, this does NOT use --enforce-eager, so we get
+    full CUDA graph speed (~170 TPS instead of ~13 TPS).
     """
     from .vllm_manager import kill_process_on_port, wait_for_vllm_ready

     # Kill any existing process on the port
     kill_process_on_port(config.vllm_port)

-    # Resolve adapter path to absolute path
-    adapter_abs_path = os.path.abspath(adapter_path)
-
     # Find the vllm_api_server.py script
     script_dir = os.path.dirname(os.path.abspath(__file__))
     server_script = os.path.join(script_dir, "vllm_api_server.py")

-    # Build command with adapter loaded at startup via --lora-modules
-    # Format: --lora-modules name=path
+    # Build command - NO --enforce-eager for full speed
     cmd = [
         "python", server_script,
         "--model", config.model_name,
@@ -880,9 +873,8 @@ def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str, step: int
         "--gpu-memory-utilization", str(config.vllm_gpu_memory_utilization),
         "--enable-lora",
         "--max-lora-rank", str(max(config.lora_r * 2, 32)),
-        # PRE-LOAD adapter at startup so CUDA graphs include it!
-        "--lora-modules", f"training_adapter={adapter_abs_path}",
-        # Note: NOT adding --enforce-eager - CUDA graphs will be compiled WITH the adapter
+        # Note: NOT adding --enforce-eager - this is the key difference!
+        # LoRA adapter will be loaded at startup, CUDA graphs compiled with it
     ]

     # Set environment for GPU selection
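
The cmd built above targets the repo's custom vllm_api_server.py, but the --enforce-eager tradeoff it encodes can be reproduced against vLLM's stock OpenAI-compatible server. A sketch under that assumption (model, adapter directory, port, and adapter name are illustrative):

import subprocess

def launch_vllm(model: str, adapter_dir: str, port: int, eager: bool = False) -> subprocess.Popen:
    # Sketch against vLLM's stock server, not the repo's vllm_api_server.py.
    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model,
        "--port", str(port),
        "--enable-lora",
        "--max-lora-rank", "32",
        # Pre-loading the adapter at startup lets CUDA graphs be captured with it.
        "--lora-modules", f"training_adapter={adapter_dir}",
    ]
    if eager:
        # The slow path this commit's docstring warns about:
        # ~13 TPS instead of ~170 TPS once CUDA graphs are disabled.
        cmd.append("--enforce-eager")
    return subprocess.Popen(cmd)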
@@ -893,34 +885,39 @@ def _launch_vllm_with_lora(config: TrainingConfig, adapter_path: str, step: int
     else:
         print(f"  GPU: Same as trainer (inherited CUDA_VISIBLE_DEVICES)")

-    print(f"  Launching vLLM with adapter pre-loaded:")
-    print(f"    Command: {' '.join(cmd)}")
-    print(f"    Adapter: {adapter_abs_path}")
-
-    # Create log file for vLLM output
-    vllm_log_path = os.path.join(config.save_path, f"vllm_step_{step}.log")
-    print(f"    Log: {vllm_log_path}")
+    print(f"  Launching: {' '.join(cmd)}")
+    print(f"    Adapter: {adapter_path}")

     try:
-        vllm_log_file = open(vllm_log_path, "w")
-        proc = subprocess.Popen(cmd, env=env, stdout=vllm_log_file, stderr=subprocess.STDOUT)
-        print(f"    PID: {proc.pid}")
+        proc = subprocess.Popen(cmd, env=env)
+        print(f"    vLLM PID: {proc.pid}")

-        # Wait for server to be ready (may take longer as it compiles CUDA graphs with LoRA)
-        print(f"    Waiting for vLLM to be ready (compiling CUDA graphs with LoRA)...")
-        if not wait_for_vllm_ready(config.vllm_port, timeout=300):
-            print("    ERROR: vLLM failed to start. Check log:")
-            print(f"    cat {vllm_log_path}")
+        # Wait for server to be ready
+        if not wait_for_vllm_ready(config.vllm_port, timeout=180):
+            print("    ERROR: vLLM failed to start")
             proc.terminate()
             return None

-        print(f"    ✓ vLLM ready with adapter pre-loaded!")
+        # Load the LoRA adapter
+        print(f"    Loading LoRA adapter...")
+        try:
+            resp = requests.post(
+                f"http://localhost:{config.vllm_port}/lora/load",
+                json={"adapter_path": adapter_path, "adapter_name": "training_adapter"},
+                timeout=60,
+            )
+            if resp.status_code == 200:
+                print(f"    ✓ Adapter loaded successfully")
+            else:
+                print(f"    WARNING: Adapter load returned {resp.status_code}: {resp.text}")
+        except Exception as e:
+            print(f"    WARNING: Could not load adapter: {e}")
+            # Continue anyway - base model inference still works

         return proc
     except Exception as e:
         print(f"    ERROR: {e}")
         import traceback
         traceback.print_exc()
         return None
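
The new flow only works if wait_for_vllm_ready actually blocks until the server is answering before the /lora/load POST fires. That helper lives in vllm_manager and is not shown in this diff; a plausible implementation, assuming the server exposes vLLM's standard /health route, would poll until a deadline:

import time
import requests

def wait_for_vllm_ready(port: int, timeout: int = 180) -> bool:
    # Sketch of the readiness probe; the real helper is in vllm_manager
    # and may differ. Polls /health until the server responds or time runs out.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"http://localhost:{port}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server socket not up yet; keep polling
        time.sleep(2)
    return False

Note that the /lora/load route and its adapter_path/adapter_name payload belong to the custom vllm_api_server.py; stock vLLM instead exposes dynamic loading via /v1/load_lora_adapter, gated behind the VLLM_ALLOW_RUNTIME_LORA_UPDATING environment variable.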