diff --git a/example_trainer/README.md b/example_trainer/README.md
index 7b578a25..060a7965 100644
--- a/example_trainer/README.md
+++ b/example_trainer/README.md
@@ -223,9 +223,248 @@ CUDA_VISIBLE_DEVICES=0 LOGDIR=. python -u example_trainer/grpo.py \
 
 ---
 
-## Alternative Modes
+## How Each Mode Works (Data Flow Diagrams)
 
-### Mode 1: Legacy (Checkpoint + Restart)
+### Single-Copy Mode (`--weight-bridge-mode shared_vllm`) ⭐ RECOMMENDED
+
+**The Magic**: Trainer and vLLM share the EXACT SAME GPU memory via CUDA IPC.
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────────┐
+│                        SINGLE-COPY MODE - COMPLETE DATA FLOW                         │
+│                                                                                      │
+│ STEP 1: GSM8k sends problem                                                          │
+│ ┌──────────────────┐                                                                 │
+│ │ GSM8k Server     │──── "What is 15 × 7?" ────▶┌──────────────────┐                 │
+│ │ (Environment)    │                            │   Atropos API    │                 │
+│ └──────────────────┘                            │    (Batching)    │                 │
+│                                                 └────────┬─────────┘                 │
+│                                                          │                           │
+│ STEP 2: Atropos forwards to vLLM                         │                           │
+│                                                          ▼                           │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                                   GPU MEMORY                                   │  │
+│ │                                                                                │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │                    MODEL WEIGHTS (ONE COPY - SHARED!)                    │  │  │
+│ │  │                                                                          │  │  │
+│ │  │ embed_tokens.weight, layers.*.qkv_proj, ..., lm_head.weight              │  │  │
+│ │  │ (address: 0x7f8a12340000)                                                │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ │            ▲                                    ▲                              │  │
+│ │            │ STEP 3: READ                       │ STEP 6: WRITE                │  │
+│ │            │ (generate tokens)                  │ (optimizer.step)             │  │
+│ │   ┌────────┴────────┐                 ┌─────────┴─────────┐                    │  │
+│ │   │   vLLM Server   │                 │      Trainer      │                    │  │
+│ │   │                 │                 │     (grpo.py)     │                    │  │
+│ │   │ Generates:      │                 │                   │                    │  │
+│ │   │ "15 × 7 = 105"  │                 │ STEP 5: Compute   │                    │  │
+│ │   │                 │                 │ GRPO loss &       │                    │  │
+│ │   └────────┬────────┘                 │ gradients         │                    │  │
+│ │            │                          └─────────▲─────────┘                    │  │
+│ └────────────┼────────────────────────────────────┼───────────────────────────────┘ │
+│              │                                    │                                  │
+│              │ STEP 4: Return completion          │                                  │
+│              ▼                                    │                                  │
+│ ┌──────────────────┐                              │                                  │
+│ │   GSM8k Server   │──────────────────────────────┘                                  │
+│ │    (Scoring)     │                                                                 │
+│ │                  │  Scores: "15 × 7 = 105" ✓ reward=1.0                            │
+│ │                  │          "15 × 7 = 100" ✗ reward=0.0                            │
+│ └──────────────────┘                                                                 │
+│                                                                                      │
+│ STEP 7: IMMEDIATE UPDATE                                                             │
+│ ┌──────────────────────────────────────────────────────────────────────────────────┐│
+│ │ After optimizer.step(), vLLM's NEXT inference uses the NEW weights!              ││
+│ │ NO SYNC NEEDED - it's the same memory!                                           ││
+│ └──────────────────────────────────────────────────────────────────────────────────┘│
+└──────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Key Points:**
+- ✅ ONE copy of weights in GPU memory
+- ✅ 0ms sync latency (same memory!)
+- ✅ Memory efficient (~1x model size)
+- ⚠️ Requires same GPU for trainer and vLLM
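+
+The mechanism here is plain CUDA IPC, which you can see in isolation with a few lines of PyTorch. The sketch below is illustrative only (it is not the trainer's code): a CUDA tensor handed to a child process via `torch.multiprocessing` travels as an IPC handle, so an in-place write in one process is immediately visible in the other. That is exactly the property single-copy mode exploits.
+
+```python
+# Standalone illustration of CUDA IPC sharing; not the trainer's actual code.
+# Requires a CUDA-capable GPU.
+import torch
+import torch.multiprocessing as mp
+
+
+def fake_optimizer_step(weights: torch.Tensor) -> None:
+    # In-place update in the child process, standing in for optimizer.step().
+    with torch.no_grad():
+        weights.add_(1.0)
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn")
+    weights = torch.zeros(4, device="cuda")  # the single copy of "model weights"
+
+    p = mp.Process(target=fake_optimizer_step, args=(weights,))
+    p.start()  # the tensor crosses the process boundary as a CUDA IPC handle
+    p.join()
+
+    torch.cuda.synchronize()
+    print(weights)  # tensor([1., 1., 1., 1.]) - no copy, no explicit sync step
+```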
+
+---
+
+### LoRA Mode (`--weight-bridge-mode lora_only`)
+
+**The Idea**: Freeze the base model and train only small adapter layers, hot-swapping the adapters into vLLM.
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────────┐
+│                           LORA MODE - COMPLETE DATA FLOW                             │
+│                                                                                      │
+│ STEP 1: GSM8k sends problem                                                          │
+│ ┌──────────────────┐                                                                 │
+│ │ GSM8k Server     │──── "What is 15 × 7?" ────▶┌──────────────────┐                 │
+│ │ (Environment)    │                            │   Atropos API    │                 │
+│ └──────────────────┘                            └────────┬─────────┘                 │
+│                                                          │                           │
+│ STEP 2: Forward to vLLM                                  ▼                           │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                                vLLM GPU MEMORY                                 │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │ BASE MODEL (frozen, ~6GB)                                                │  │  │
+│ │  │ + LORA ADAPTER A (current, ~50MB)                                        │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ │            │                                                                   │  │
+│ │            │ STEP 3: Inference with base + adapter A                           │  │
+│ │            ▼                                                                   │  │
+│ │   ┌────────────────────┐                                                       │  │
+│ │   │    vLLM Server     │ ──── "15 × 7 = 105" ────▶                             │  │
+│ │   └────────────────────┘                                                       │  │
+│ └────────────────────────────────────────────────────────────────────────────────┘  │
+│                                                                                      │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                         TRAINER GPU MEMORY (separate!)                         │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │ BASE MODEL (frozen, ~6GB)                                                │  │  │
+│ │  │ + LORA ADAPTER B (training, ~50MB) ◀── gradients flow here only!         │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ │            │                                                                   │  │
+│ │            │ STEP 4-5: Receive rollout, compute loss, update adapter B        │  │
+│ │            ▼                                                                   │  │
+│ │   ┌────────────────────┐                                                       │  │
+│ │   │      Trainer       │                                                       │  │
+│ │   │     (grpo.py)      │                                                       │  │
+│ │   └────────┬───────────┘                                                       │  │
+│ └────────────┼──────────────────────────────────────────────────────────────────┘  │
+│              │                                                                      │
+│              │ STEP 6: Every N steps, save adapter B to disk                        │
+│              ▼                                                                      │
+│ ┌──────────────────┐      STEP 7: POST /lora/load      ┌──────────────────┐         │
+│ │ adapter_step_N/  │ ─────────────────────────────────▶│   vLLM Server    │         │
+│ │ (50MB on disk)   │                                   │   Swaps A → B    │         │
+│ └──────────────────┘                                   └──────────────────┘         │
+│                                                                                      │
+│ STEP 8: Next inference uses NEW adapter B                                            │
+│ ┌──────────────────────────────────────────────────────────────────────────────────┐│
+│ │ Sync latency: 1-5 seconds (save to disk + HTTP load)                             ││
+│ │ Memory: 2x base model + adapters                                                 ││
+│ └──────────────────────────────────────────────────────────────────────────────────┘│
+└──────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Key Points:**
+- ✅ Small adapter files (~50MB vs. a full multi-GB checkpoint)
+- ✅ Works on separate GPUs
+- ✅ Easy to switch between adapters
+- ⚠️ 1-5 second sync latency
+- ⚠️ 2x base model memory (trainer + vLLM)
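+
+Steps 6-7 amount to one save plus one HTTP call. A minimal sketch, assuming a PEFT-wrapped trainer model and the `/lora/load` endpoint shown in the diagram above (the payload field names are illustrative):
+
+```python
+# Hedged sketch of the adapter hot-swap (steps 6-7); the endpoint and payload
+# follow this README's description and may differ in your server build.
+import requests
+
+
+def push_adapter(peft_model, step: int, vllm_url: str = "http://localhost:9001") -> None:
+    adapter_dir = f"adapter_step_{step}"
+    peft_model.save_pretrained(adapter_dir)  # STEP 6: ~50MB of adapter weights on disk
+
+    resp = requests.post(
+        f"{vllm_url}/lora/load",  # STEP 7: ask vLLM to swap adapter A -> B
+        json={"lora_name": f"step_{step}", "lora_path": adapter_dir},
+    )
+    resp.raise_for_status()  # STEP 8: the next inference uses the new adapter
+```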
+
+---
+
+### Legacy Mode (`--weight-bridge-mode none`)
+
+**The Simple Approach**: Save full checkpoints and restart vLLM to load the new weights.
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────────┐
+│                          LEGACY MODE - COMPLETE DATA FLOW                            │
+│                                                                                      │
+│ STEP 1: GSM8k sends problem                                                          │
+│ ┌──────────────────┐                                                                 │
+│ │ GSM8k Server     │──── "What is 15 × 7?" ────▶┌──────────────────┐                 │
+│ │ (Environment)    │                            │   Atropos API    │                 │
+│ └──────────────────┘                            └────────┬─────────┘                 │
+│                                                          │                           │
+│ STEP 2: Forward to vLLM                                  ▼                           │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                                vLLM GPU MEMORY                                 │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │ FULL MODEL - Version 1 (~28GB)                                           │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ │            │                                                                   │  │
+│ │            │ STEP 3: Inference                                                 │  │
+│ │            ▼                                                                   │  │
+│ │   ┌────────────────────┐                                                       │  │
+│ │   │    vLLM Server     │ ──── "15 × 7 = 105" ────▶                             │  │
+│ │   └────────────────────┘                                                       │  │
+│ └────────────────────────────────────────────────────────────────────────────────┘  │
+│                                                                                      │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                         TRAINER GPU MEMORY (separate!)                         │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │ FULL MODEL - Version 2 (~28GB + gradients + optimizer)                   │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ │            │                                                                   │  │
+│ │            │ STEP 4-5: Receive rollout, compute loss, update weights           │  │
+│ │            ▼                                                                   │  │
+│ │   ┌────────────────────┐                                                       │  │
+│ │   │      Trainer       │                                                       │  │
+│ │   │     (grpo.py)      │                                                       │  │
+│ │   └────────┬───────────┘                                                       │  │
+│ └────────────┼──────────────────────────────────────────────────────────────────┘  │
+│              │                                                                      │
+│              │ STEP 6: Every N steps, save FULL checkpoint to disk (~28GB)          │
+│              ▼                                                                      │
+│ ┌──────────────────────┐                                                            │
+│ │ checkpoint/          │                                                            │
+│ │   step_N/            │  (28GB on disk!)                                           │
+│ │   - model.safetensors│                                                            │
+│ │   - config.json      │                                                            │
+│ └────────────┬─────────┘                                                            │
+│              │                                                                      │
+│              │ STEP 7: RESTART vLLM with new checkpoint                             │
+│              │                                                                      │
+│              │ ┌─────────────────────────────────────────────────────────────────┐ │
+│              │ │ 1. Kill vLLM process                                            │ │
+│              │ │ 2. Start new vLLM with --model checkpoint/step_N/               │ │
+│              │ │ 3. Wait for model to load (~30-60 seconds)                      │ │
+│              │ │ 4. Resume training                                              │ │
+│              │ └─────────────────────────────────────────────────────────────────┘ │
+│              ▼                                                                      │
+│ ┌────────────────────────────────────────────────────────────────────────────────┐  │
+│ │                          vLLM GPU MEMORY (restarted)                           │  │
+│ │  ┌──────────────────────────────────────────────────────────────────────────┐  │  │
+│ │  │ FULL MODEL - Version 2 (loaded from checkpoint)                          │  │  │
+│ │  └──────────────────────────────────────────────────────────────────────────┘  │  │
+│ └────────────────────────────────────────────────────────────────────────────────┘  │
+│                                                                                      │
+│ STEP 8: Next inference uses updated model                                            │
+│ ┌──────────────────────────────────────────────────────────────────────────────────┐│
+│ │ Sync latency: 30-60 seconds (save + restart + reload)                            ││
+│ │ Memory: 2x full model                                                            ││
+│ │ Disk: 28GB per checkpoint                                                        ││
+│ └──────────────────────────────────────────────────────────────────────────────────┘│
+└──────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Key Points:**
+- ✅ Simple to understand
+- ✅ Works on any setup
+- ✅ Good for debugging
+- ⚠️ 30-60 second sync latency
+- ⚠️ 2x GPU memory (trainer + vLLM)
+- ⚠️ Large checkpoint files (~28GB each)
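+
+The whole sync fits in a few lines of shell. A hedged sketch of steps 6-7 (the script name, flags, and health endpoint are assumptions, not a tested recipe):
+
+```bash
+# Sketch of the legacy sync: checkpoint, restart vLLM, wait, resume.
+STEP=42
+CKPT="checkpoint/step_${STEP}"                 # written by the trainer
+
+pkill -f vllm && sleep 5                       # 1. kill the old vLLM process
+python vllm_api_server.py \
+  --model "$CKPT" --port 9001 &                # 2. start vLLM on the new checkpoint
+until curl -sf http://localhost:9001/health > /dev/null; do
+  sleep 5                                      # 3. wait ~30-60s for the model to load
+done
+echo "vLLM ready; resume training"             # 4. resume
+```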
+
+---
+
+## Mode Comparison Summary
+
+```
+┌──────────────────────────────────────────────────────────────────────────────────┐
+│                            MODE COMPARISON AT A GLANCE                            │
+├────────────────┬───────────────┬────────────────┬────────────────────────────────┤
+│                │  SINGLE-COPY  │      LORA      │             LEGACY             │
+├────────────────┼───────────────┼────────────────┼────────────────────────────────┤
+│ Sync Latency   │    0 ms ⚡    │    1-5 sec     │           30-60 sec            │
+│ GPU Memory     │   1x model    │    2x model    │            2x model            │
+│ Disk Space     │   28GB/ckpt   │  50MB/adapter  │           28GB/ckpt            │
+│ Complexity     │    Medium     │     Medium     │             Simple             │
+│ Same GPU?      │  Required ⚠️  │    Optional    │            Optional            │
+│ Best For       │  Production   │  Experiments   │           Debugging            │
+└────────────────┴───────────────┴────────────────┴────────────────────────────────┘
+```
+
+---
+
+## Alternative Mode Commands
+
+### Legacy Mode (Checkpoint + Restart)
 
 For simple setups or debugging. Saves checkpoints and restarts vLLM to load new weights.
 
@@ -239,7 +478,7 @@ python example_trainer/grpo.py \
   --lr 1e-5
 ```
 
-### Mode 2: LoRA Adapters
+### LoRA Mode (Adapter Training)
 
 Trains only adapter weights. Small checkpoints, lower memory.
 
@@ -274,6 +513,8 @@ python example_trainer/grpo.py \
 |--------|---------|-------------|
 | `--model-name` | (required) | HuggingFace model ID |
 | `--weight-bridge-mode` | `none` | `none`, `shared_vllm`, or `lora_only` |
+| `--single-copy` | `false` | Enable TRUE single-copy mode via CUDA IPC |
+| `--vllm-config-path` | (auto-detect) | Explicit path to `vllm_bridge_config.json` |
 | `--vllm-port` | `9001` | vLLM server port |
 | `--training-steps` | `10` | Total optimization steps |
 | `--batch-size` | `2` | Micro-batch size |
@@ -288,6 +529,150 @@ python example_trainer/grpo.py \
 | `--tensor-parallel-size` | Number of GPUs (use 1 for single-copy) |
 | `--port` | Server port (default: 9001) |
 | `--dtype` | Model dtype (`bfloat16`, `float16`, `auto`) |
+| `--gpu-memory-utilization` | Fraction of GPU memory for KV cache (default: 0.9) |
+
+---
+
+## The vLLM Bridge Config (vllm_bridge_config.json)
+
+The `vllm_bridge_config.json` file is the critical communication mechanism between the vLLM inference server and the GRPO trainer in single-copy mode. Understanding this file is essential for debugging and for advanced configurations.
+
+### What It Is
+
+When you start vLLM with `VLLM_ENABLE_SHARED_WEIGHTS=1`, the patched `GPUModelRunner` exports CUDA IPC (Inter-Process Communication) handles for all model tensors. These handles allow another process (the trainer) to access the exact same GPU memory, with no copying required.
+
+### Why It's Important
+
+1. **True Single-Copy Architecture**: Instead of loading the model twice (once for training, once for inference), both processes share the same tensors in GPU memory.
+
+2. **Zero-Latency Weight Updates**: When `optimizer.step()` modifies the weights, vLLM immediately sees the changes: no serialization, no network transfer, no disk I/O.
+
+3. **Memory Efficiency**: For a 7B model (~14GB in bf16), you save ~14GB of GPU memory compared to holding two separate copies.
+
+### File Location
+
+The trainer searches for `vllm_bridge_config.json` in this order:
+
+1. **Explicit path** (if `--vllm-config-path` is provided)
+2. **`$LOGDIR/vllm_bridge_config.json`** (if the `LOGDIR` env var is set)
+3. **`./vllm_bridge_config.json`** (current directory)
+4. **`/tmp/atropos_bridge/vllm_bridge_config.json`** (default fallback)
+
+**Tip**: To avoid "Config not found" errors, always set `LOGDIR`:
+
+```bash
+export LOGDIR=.
+```
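+
+In code, this search order is a simple first-match scan. A minimal sketch of the documented behavior (the helper name is illustrative, not part of the trainer's API):
+
+```python
+# First-match scan over the documented config locations.
+import os
+from typing import Optional
+
+
+def find_bridge_config(explicit_path: Optional[str] = None) -> Optional[str]:
+    candidates = [explicit_path] if explicit_path else []
+    candidates += [
+        os.path.join(os.environ.get("LOGDIR", "."), "vllm_bridge_config.json"),
+        "./vllm_bridge_config.json",
+        "/tmp/atropos_bridge/vllm_bridge_config.json",
+    ]
+    return next((p for p in candidates if p and os.path.exists(p)), None)
+```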
+
+### File Contents
+
+The JSON file contains everything needed to reconstruct tensor references in another process:
+
+```json
+{
+  "model": "Qwen/Qwen2.5-3B-Instruct",
+  "tp_degree": 1,
+  "dp_shard_degree": 1,
+
+  "param_names": [
+    "model.embed_tokens.weight",
+    "model.layers.0.self_attn.qkv_proj.weight",
+    ...
+  ],
+
+  "param_mappings": {
+    "model.embed_tokens.weight": {
+      "vllm_name": "model.embed_tokens.weight",
+      "shape": [152064, 2048],
+      "dtype": "torch.bfloat16",
+      "device": "cuda:0"
+    },
+    ...
+  },
+
+  "ipc_handles": {
+    "model.embed_tokens.weight": {
+      "device_index": 0,
+      "ipc_handle_b64": "AmPA0pN...",
+      "storage_size": 623902720,
+      "storage_offset": 0,
+      "ref_counter_handle_b64": "Y2JY...",
+      "ref_counter_offset": 0,
+      "event_handle_b64": "wRIs...",
+      "event_sync_required": true,
+      "shape": [152064, 2048],
+      "dtype": "torch.bfloat16"
+    },
+    ...
+  },
+
+  "shared_weights_enabled": true,
+  "single_copy_enabled": true,
+  "num_params": 255
+}
+```
+
+#### Field Descriptions
+
+| Field | Description |
+|-------|-------------|
+| `model` | HuggingFace model identifier |
+| `tp_degree` | Tensor parallel degree (must be 1 for single-copy) |
+| `param_names` | List of all parameter names in the model |
+| `param_mappings` | Shape, dtype, and device info for each parameter |
+| `ipc_handles` | CUDA IPC handles for reconstructing shared tensors |
+| `ipc_handle_b64` | The actual CUDA IPC handle (base64-encoded bytes) |
+| `ref_counter_handle_b64` | Reference counter for CUDA memory (base64) |
+| `event_handle_b64` | CUDA event handle for synchronization (base64) |
+| `storage_size` | Size of the underlying storage in bytes |
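+
+Before debugging deeper, it is worth dumping a few of these fields. A quick inspection snippet (assumes the file is in the current directory):
+
+```python
+# Sanity-check an exported bridge config (illustrative).
+import json
+
+with open("vllm_bridge_config.json") as f:
+    cfg = json.load(f)
+
+print(cfg["model"], "-", cfg["num_params"], "params")
+print("single-copy enabled:", cfg.get("single_copy_enabled", False))
+
+name = cfg["param_names"][0]
+print(name, cfg["param_mappings"][name]["shape"], cfg["param_mappings"][name]["dtype"])
+
+# Empty ipc_handles means the GPUModelRunner patch never ran (see Common Issues).
+assert cfg["ipc_handles"], "ipc_handles is empty; was VLLM_ENABLE_SHARED_WEIGHTS=1 set?"
+```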
+
+### How the Trainer Uses It
+
+1. **Load Config**: The trainer reads `vllm_bridge_config.json`
+2. **Create Shell Model**: Uses `AutoModelForCausalLM.from_config()` with meta tensors (no memory allocation)
+3. **Attach IPC Handles**: For each parameter, reconstructs the tensor using `torch.UntypedStorage._new_shared_cuda()` with the IPC handles
+4. **Verify Shapes**: Ensures the trainer's model architecture matches vLLM's tensor shapes and sharding
+
+```python
+# Simplified version of what happens internally:
+for name, ipc_info in config["ipc_handles"].items():
+    # Decode the CUDA IPC handle from base64
+    ipc_handle = base64.b64decode(ipc_info["ipc_handle_b64"])
+
+    # Reconstruct the shared storage from the IPC handle
+    # (the remaining ref-counter/event components are elided here)
+    storage = torch.UntypedStorage._new_shared_cuda(
+        ipc_info["device_index"], ipc_handle, ipc_info["storage_size"], ...
+    )
+
+    # Wrap the shared storage in a tensor with the right dtype and shape
+    dtype = getattr(torch, ipc_info["dtype"].removeprefix("torch."))
+    tensor = torch.empty(0, dtype=dtype, device=f"cuda:{ipc_info['device_index']}")
+    tensor.set_(storage, 0, ipc_info["shape"])
+
+    # Replace the model parameter with the shared tensor
+    model.get_parameter(name).data = tensor
+```
+
+### Specifying the Config Path Explicitly
+
+If auto-detection isn't working (e.g., in complex cluster setups), you can specify the path explicitly:
+
+```bash
+# If vLLM writes the config to a non-standard location:
+python -u example_trainer/grpo.py \
+  --model-name Qwen/Qwen2.5-3B-Instruct \
+  --weight-bridge-mode shared_vllm \
+  --single-copy \
+  --vllm-config-path /shared/nfs/vllm_bridge_config.json \
+  --training-steps 50
+```
+
+### Common Issues
+
+| Symptom | Cause | Fix |
+|---------|-------|-----|
+| "Could not find vllm_bridge_config.json" | vLLM didn't export the config | Check that `VLLM_ENABLE_SHARED_WEIGHTS=1` was set BEFORE starting vLLM |
+| Config exists but has empty `ipc_handles` | Patch didn't run | Ensure vLLM is using our custom `vllm_api_server.py` |
+| "tuple of 8 items expected" | IPC handle format mismatch | Update to the latest code (handles all 8 CUDA IPC tuple components) |
+| "size mismatch" errors | Tensor parallel mismatch | Use `--tensor-parallel-size 1` for single-copy mode |
 
 ---
 
@@ -298,11 +683,17 @@ python example_trainer/grpo.py \
 
 **A:** vLLM didn't export the IPC handles. Check:
 
 1. `VLLM_ENABLE_SHARED_WEIGHTS=1` was set **before** starting vLLM
-2. Look for export messages in vllm.log:
+2. `LOGDIR` is set to a valid, writable directory
+3. Look for export messages in vllm.log:
 
 ```bash
 grep "Exported" vllm.log
 ```
 
+If the file exists but lives in a different location, specify it explicitly:
+```bash
+python grpo.py ... --vllm-config-path /path/to/vllm_bridge_config.json
+```
+
 ---
 
 ### Q: I get "CUDA out of memory" when starting the trainer
 
@@ -365,8 +756,7 @@ pkill -9 -u $USER -f "vllm|grpo|python|run-api"
 
 | File | Description |
 |------|-------------|
 | `__init__.py` | Module exports and patch application |
-| `patched_gpu_runner.py` | Patches GPUModelRunner to export IPC handles |
-| `distributed_utils.py` | Distributed training utilities |
+| `patched_gpu_runner.py` | Patches GPUModelRunner to export CUDA IPC handles |
 
 ---
 
diff --git a/example_trainer/grpo.py b/example_trainer/grpo.py
index 37a65195..8c82ccb7 100644
--- a/example_trainer/grpo.py
+++ b/example_trainer/grpo.py
@@ -149,6 +149,16 @@ class TrainingConfig(BaseModel):
             "vLLM must be started with VLLM_ENABLE_SHARED_WEIGHTS=1."
         ),
     )
+    vllm_config_path: Optional[str] = Field(
+        None,
+        description=(
+            "Explicit path to vllm_bridge_config.json. "
+            "If not provided, auto-detects from LOGDIR environment variable, "
+            "current directory, or /tmp/atropos_bridge. "
+            "This file is created by vLLM when VLLM_ENABLE_SHARED_WEIGHTS=1 "
+            "and contains CUDA IPC handles for single-copy mode."
+        ),
+    )
 
 
 def check_atropos_api(timeout: float = 30.0) -> bool:
 
@@ -810,32 +820,36 @@ def load_model_and_tokenizer(
 
     # Single-copy mode: attach to vLLM's shared tensors via CUDA IPC
     if single_copy or config.weight_bridge_mode == "shared_vllm":
-        # Try multiple possible locations for the config file
-        possible_paths = [
-            os.environ.get("LOGDIR", "."),
-            ".",
-            "/tmp/atropos_bridge",
-            os.path.dirname(os.path.abspath(__file__)),
-        ]
-
-        config_path = None
-        for log_dir in possible_paths:
-            candidate = os.path.join(log_dir, "vllm_bridge_config.json")
-            if os.path.exists(candidate):
-                config_path = candidate
-                print(f"[Setup] Found vLLM config at: {candidate}")
-                break
-
-        if config_path is None:
-            checked = [
-                os.path.join(p, "vllm_bridge_config.json") for p in possible_paths
-            ]
-            raise RuntimeError(
-                f"[Setup] Could not find vllm_bridge_config.json\n"
-                f"Checked: {checked}\n"
-                f"Make sure vLLM is running with VLLM_ENABLE_SHARED_WEIGHTS=1"
-            )
-
+        # Check for explicit path first
+        if config.vllm_config_path and os.path.exists(config.vllm_config_path):
+            config_path = config.vllm_config_path
+            print(f"[Setup] Using explicit vLLM config path: {config_path}")
+        else:
+            # Auto-detect from common locations
+            possible_paths = [
+                os.environ.get("LOGDIR", "."),
+                ".",
+                "/tmp/atropos_bridge",
+                os.path.dirname(os.path.abspath(__file__)),
+            ]
+
+            config_path = None
+            for log_dir in possible_paths:
+                candidate = os.path.join(log_dir, "vllm_bridge_config.json")
+                if os.path.exists(candidate):
+                    config_path = candidate
+                    print(f"[Setup] Found vLLM config at: {candidate}")
+                    break
+
+            if config_path is None:
+                checked = [os.path.join(p, "vllm_bridge_config.json") for p in possible_paths]
+                raise RuntimeError(
+                    f"[Setup] Could not find vllm_bridge_config.json\n"
+                    f"Checked: {checked}\n"
+                    f"Tip: Use --vllm-config-path to specify the path explicitly\n"
+                    f"Make sure vLLM is running with VLLM_ENABLE_SHARED_WEIGHTS=1 and LOGDIR set"
+                )
+
         model = _attach_to_vllm_shared_tensors(config, config_path)
         if model is not None:
             print("[Setup] ✓ Single-copy mode active - using vLLM's tensors directly!")
 
@@ -2043,6 +2057,17 @@ def parse_args() -> argparse.Namespace:
             "vLLM must be started with VLLM_ENABLE_SHARED_WEIGHTS=1."
         ),
     )
+    parser.add_argument(
+        "--vllm-config-path",
+        type=str,
+        default=None,
+        help=(
+            "Explicit path to vllm_bridge_config.json. "
+            "If not provided, auto-detects from LOGDIR, current directory, "
+            "or /tmp/atropos_bridge. "
+            "This file contains CUDA IPC handles created by vLLM."
+        ),
+    )
 
     return parser.parse_args()
 
@@ -2073,7 +2098,8 @@ def config_from_args(args: argparse.Namespace) -> TrainingConfig:
         lora_alpha=args.lora_alpha,
         lora_dropout=args.lora_dropout,
         lora_target_modules=args.lora_target_modules,
         single_copy=getattr(args, "single_copy", False),
+        vllm_config_path=getattr(args, "vllm_config_path", None),
     )