# Mirror of https://github.com/NousResearch/atropos.git
# (synced 2026-04-19 12:57:58 +00:00)
"""
vLLM Patching Module - Enables CUDA IPC shared memory for single-copy training.

This module patches vLLM's GPUModelRunner to:
1. Call share_memory_() on model weights after loading
2. Export CUDA IPC handles to vllm_bridge_config.json
3. Enable the trainer to attach to vLLM's tensors directly

The result: ONE copy of model weights in GPU memory, shared between
vLLM (inference) and the trainer (gradient updates).

Usage:
    # Set environment BEFORE importing
    import os
    os.environ["VLLM_ENABLE_SHARED_WEIGHTS"] = "1"

    # Import and apply patches BEFORE importing vllm
    from example_trainer.vllm_patching import apply_patches
    apply_patches()

    # Then import vllm normally
    from vllm import AsyncLLM
"""

# Re-export the patching entry points from the implementation module so
# callers can import them directly from this package.
from .patched_gpu_runner import (
    PatchedGPUModelRunner,
    apply_patches,
    get_patched_runner,
    is_patched,
)

# Names exposed by ``from <package> import *``; kept in step with the
# import list above.
__all__ = [
    "PatchedGPUModelRunner",
    "apply_patches",
    "get_patched_runner",
    "is_patched",
]