mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
cleanup
This commit is contained in:
parent
9f6cc64b9e
commit
0ebf3552c9
8 changed files with 388 additions and 1977 deletions
|
|
@ -6,28 +6,30 @@ A modular training framework for fine-tuning language models with **Group Relati
|
|||
|
||||
```
|
||||
example_trainer/
|
||||
├── grpo.py # CLI entry point (dispatches to trainers)
|
||||
├── run.py # Unified launcher for shared_vllm mode
|
||||
├── config.py # TrainingConfig dataclass
|
||||
├── cli.py # CLI argument parsing (single source of truth)
|
||||
├── api.py # Atropos API communication
|
||||
├── data.py # Data fetching & preprocessing
|
||||
├── model.py # Model loading & CUDA IPC shared memory
|
||||
├── training.py # GRPO loss computation & training step
|
||||
├── checkpointing.py # Save models & LoRA adapters
|
||||
├── vllm_manager.py # vLLM process management
|
||||
├── trainers.py # Training mode implementations
|
||||
├── vllm_api_server.py # Custom vLLM server (streamlined for training)
|
||||
├── vllm_patching/ # CUDA IPC patches for weight sharing
|
||||
├── grpo.py # CLI entry point (dispatches to 4 training modes)
|
||||
├── run.py # Unified launcher for shared_vllm mode (starts vLLM+trainer)
|
||||
├── config.py # TrainingConfig Pydantic model (all hyperparameters)
|
||||
├── cli.py # CLI argument parsing (modular, single source of truth)
|
||||
├── api.py # Atropos API communication (registration, batch fetching)
|
||||
├── data.py # Data fetching, preprocessing, logprob alignment
|
||||
├── model.py # Model loading, CUDA IPC, tensor mapping (QKV/Gate fusion)
|
||||
├── training.py # GRPO loss (importance sampling, KL penalty, clipping)
|
||||
├── checkpointing.py # Save models & LoRA adapters (handles fused tensor unfusing)
|
||||
├── vllm_manager.py # vLLM process lifecycle (launch, health, termination)
|
||||
├── trainers.py # 4 training mode implementations + optimizer selection
|
||||
├── vllm_api_server.py # Custom vLLM server with /generate endpoint + LoRA
|
||||
├── vllm_patching/ # CUDA IPC patches for weight sharing
|
||||
│ └── patched_gpu_runner.py
|
||||
└── scripts/ # Helper scripts
|
||||
└── scripts/ # Helper scripts and benchmarks
|
||||
├── test_lora_mode.sh
|
||||
└── test_single_copy_mode.sh
|
||||
├── test_single_copy_mode.sh
|
||||
└── compare_all_modes_math_zero.sh
|
||||
```
|
||||
|
||||
|
||||
GRPO Training Loop
|
||||
## GRPO Training Loop
|
||||
|
||||
```
|
||||
1. Generate multiple responses to the same prompt
|
||||
2. Score each response (reward)
|
||||
3. Compute ADVANTAGE = reward - mean(rewards)
|
||||
|
|
@ -47,13 +49,17 @@ GRPO Training Loop
|
|||
|
||||
## System Architecture
|
||||
|
||||
```
|
||||
Data Flow:
|
||||
1. Environment generates prompts → calls vLLM → scores responses
|
||||
2. Environment sends trajectories to run-api
|
||||
3. Trainer fetches batches from run-api
|
||||
4. Trainer updates model weights
|
||||
5. (shared_vllm) vLLM sees updates immediately via CUDA IPC
|
||||
(lora_only) Trainer pushes adapter to vLLM periodically
|
||||
5. Weight synchronization:
|
||||
- shared_vllm: vLLM sees updates immediately via CUDA IPC (zero-copy)
|
||||
- lora_only: Trainer pushes adapter to vLLM via HTTP (slow)
|
||||
- lora_restart: Trainer restarts vLLM with new adapter (fast)
|
||||
- none (legacy): Trainer saves checkpoint and restarts vLLM
|
||||
```
|
||||
|
||||
---
|
||||
|
|
@ -65,7 +71,7 @@ Data Flow:
|
|||
| **shared_vllm** | Single-copy via CUDA IPC | 1x model | ~172 TPS | Same GPU, maximum efficiency |
|
||||
| **lora_restart** | LoRA + vLLM restarts | 1x + adapter | ~108 TPS | LoRA training with speed |
|
||||
| **lora_only** | LoRA + HTTP hot-swap | 1x + adapter | ~13 TPS ⚠️ | Debugging only |
|
||||
| **legacy** | Full model, restart vLLM | 2x model | ~172 TPS | Different GPUs, simple setup |
|
||||
| **none** (legacy) | Full model, restart vLLM | 2x model | ~172 TPS | Simple setup |
|
||||
|
||||
### ⚠️ IMPORTANT: `lora_only` Performance Warning
|
||||
|
||||
|
|
@ -80,15 +86,16 @@ The `lora_only` mode requires `--enforce-eager` which **disables CUDA graphs**,
|
|||
**Use `shared_vllm`** for production training when:
|
||||
- You have enough GPU memory for the full model
|
||||
- You want fastest training (no overhead)
|
||||
- Trainer and vLLM are on the same GPU(s)
|
||||
|
||||
**Use `lora_restart`** when:
|
||||
- You want LoRA's memory efficiency
|
||||
- You want fast inference (~108 TPS vs ~13 TPS = 8x speedup)
|
||||
- You can tolerate ~45s restart overhead every N steps
|
||||
|
||||
**Avoid `lora_only`** unless you're debugging - the 8x inference penalty is severe.
|
||||
|
||||
**Use `shared_vllm`** for single-GPU training when you need maximum efficiency.
|
||||
**Use `none` (legacy)** mode when:
|
||||
- You want the simplest setup without CUDA IPC or LoRA
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -118,13 +125,14 @@ python -m example_trainer.vllm_api_server \
|
|||
|
||||
**Terminal 3: Environment**
|
||||
```bash
|
||||
# Important: Use server_type=vllm to get logprobs (required for GRPO)
|
||||
python environments/gsm8k_server.py serve \
|
||||
--env.group_size 4 \
|
||||
--env.max_num 200 \
|
||||
--slurm.num_requests_per_time_interval 16 \
|
||||
--slurm.time_interval 10 \
|
||||
--openai.api_key "dummy" \
|
||||
--openai.base_url "http://localhost:9001" \
|
||||
--openai.base_url "http://localhost:9001/v1" \
|
||||
--openai.model_name "NousResearch/Hermes-3-Llama-3.1-8B" \
|
||||
--openai.server_type vllm
|
||||
```
|
||||
|
|
@ -138,7 +146,7 @@ python -m example_trainer.grpo \
|
|||
--atropos-url "http://localhost:8002" \
|
||||
--batch-size 4 \
|
||||
--gradient-accumulation-steps 4 \
|
||||
--learning-rate 1e-5 \
|
||||
--lr 1e-5 \
|
||||
--training-steps 30 \
|
||||
--kl-coef 0.1 \
|
||||
--clip-eps 0.2 \
|
||||
|
|
@ -150,16 +158,27 @@ python -m example_trainer.grpo \
|
|||
### Startup Order
|
||||
|
||||
```bash
|
||||
# 1. Start API
|
||||
# 2. Wait 5s, start vLLM
|
||||
# 3. Wait for vLLM to load (check: curl http://localhost:9001/health)
|
||||
# 4. Start environment
|
||||
# 5. Start trainer
|
||||
# CRITICAL: Follow this exact order!
|
||||
# 1. Start API first
|
||||
run-api --port 8002
|
||||
|
||||
# 2. Wait 5s, then start vLLM
|
||||
# Check health: curl http://localhost:9001/health
|
||||
python -m example_trainer.vllm_api_server --model ... --enable-lora --enforce-eager
|
||||
|
||||
# 3. Wait for vLLM health endpoint to return 200
|
||||
while ! curl -s http://localhost:9001/health > /dev/null; do sleep 1; done
|
||||
|
||||
# 4. Start environment (MUST use --openai.server_type vllm for logprobs)
|
||||
python environments/gsm8k_server.py serve ...
|
||||
|
||||
# 5. Start trainer (will register with API and begin training)
|
||||
python -m example_trainer.grpo --weight-bridge-mode lora_only ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Shared vLLM Mode (Advanced)
|
||||
## Shared vLLM Mode
|
||||
|
||||
Single-copy mode shares GPU memory between vLLM and the trainer - zero model duplication!
|
||||
|
||||
|
|
@ -200,10 +219,14 @@ python -m example_trainer.vllm_api_server \
|
|||
|
||||
**Terminal 3: Environment**
|
||||
```bash
|
||||
# Important: Use server_type=vllm to get logprobs (required for GRPO)
|
||||
python environments/gsm8k_server.py serve \
|
||||
--openai.base_url "http://localhost:9001" \
|
||||
--openai.base_url "http://localhost:9001/v1" \
|
||||
--openai.model_name "NousResearch/Hermes-3-Llama-3.1-8B" \
|
||||
--openai.server_type vllm
|
||||
--openai.server_type vllm \
|
||||
--env.group_size 4 \
|
||||
--slurm.num_requests_per_time_interval 16 \
|
||||
--slurm.time_interval 10
|
||||
```
|
||||
|
||||
**Terminal 4: Trainer**
|
||||
|
|
@ -232,45 +255,64 @@ VLLM_ENABLE_SHARED_WEIGHTS=1 python -m example_trainer.run \
|
|||
|
||||
## Best Practices & Lessons Learned
|
||||
|
||||
### 1. Always Use `--enforce-eager` with Shared Weights
|
||||
|
||||
**Why:** CUDA graphs "bake" weights at compile time. Without eager mode, vLLM won't see weight updates!
|
||||
### 1. Use `--openai.server_type vllm` for Training
|
||||
|
||||
**CRITICAL:** The atropos environment MUST use `server_type=vllm` to get logprobs for proper GRPO training.
|
||||
|
||||
Only `server_type=vllm` calls the `/generate` endpoint which returns token-level logprobs. These logprobs serve as the reference policy (π_old) for importance sampling in GRPO.
|
||||
|
||||
```bash
|
||||
# WRONG - weight updates won't be visible to inference
|
||||
python vllm_api_server.py --model $MODEL
|
||||
|
||||
# CORRECT - disables CUDA graphs
|
||||
python vllm_api_server.py --model $MODEL --enforce-eager
|
||||
```
|
||||
|
||||
### 2. Use `--openai.server_type vllm` for Training
|
||||
|
||||
The gsm8k environment needs logprobs for GRPO. Only `server_type=vllm` uses the `/generate` endpoint which returns logprobs.
|
||||
|
||||
```bash
|
||||
# CORRECT - gets logprobs for training
|
||||
# CORRECT - gets logprobs for training (REQUIRED!)
|
||||
--openai.server_type vllm
|
||||
|
||||
# WRONG for training - no logprobs
|
||||
# WRONG for training - no logprobs, training will FAIL
|
||||
--openai.server_type openai
|
||||
```
|
||||
|
||||
### 3. KL Coefficient and Clipping Are Essential
|
||||
**What happens without logprobs:**
|
||||
- The trainer will raise an error: "GRPO requires inference_logprobs for importance sampling!"
|
||||
- Without the reference policy, GRPO degenerates to vanilla REINFORCE (leads to reward hacking)
|
||||
|
||||
Without these, training will collapse (reward hacking):
|
||||
**How logprobs flow through the system:**
|
||||
1. Environment calls vLLM `/generate` with `logprobs=true`
|
||||
2. vLLM returns token-level logprobs for each generated token
|
||||
3. Environment embeds these in trajectory data sent to API
|
||||
4. Trainer extracts and aligns logprobs with training labels
|
||||
5. GRPO loss uses logprobs as π_old for importance sampling ratio
|
||||
|
||||
### 2. KL Coefficient and Clipping Are Essential
|
||||
|
||||
**CRITICAL:** Without these hyperparameters, training WILL collapse (reward hacking):
|
||||
|
||||
```bash
|
||||
--kl-coef 0.1 # Prevents policy from drifting too far
|
||||
--clip-eps 0.2 # Limits update magnitude
|
||||
--kl-coef 0.1 # Prevents policy from drifting too far from reference
|
||||
--clip-eps 0.2 # Limits importance sampling ratio to [0.8, 1.2]
|
||||
```
|
||||
|
||||
**Symptoms of missing KL/clipping:**
|
||||
- Accuracy drops dramatically (e.g., 59% → 7%)
|
||||
- Loss goes to very negative values
|
||||
- Model outputs become repetitive/degenerate
|
||||
**Why these matter:**
|
||||
- **KL Penalty** (β): Penalizes the policy for deviating from the reference policy (inference-time policy)
|
||||
- Uses Schulman's unbiased estimator: `exp(-log_ratio) + log_ratio - 1`
|
||||
- Higher β = more conservative updates
|
||||
- Set to 0 to disable (NOT recommended - leads to instability)
|
||||
|
||||
### 4. Memory Budgeting for Large Models
|
||||
- **PPO Clipping** (ε): Clips the importance sampling ratio to `[1-ε, 1+ε]`
|
||||
- Prevents catastrophically large policy updates
|
||||
- Takes pessimistic bound (conservative update)
|
||||
|
||||
**Symptoms of missing/misconfigured KL/clipping:**
|
||||
- Accuracy drops dramatically (e.g., 59% → 7%)
|
||||
- Loss goes to very negative values (< -10)
|
||||
- Model outputs become repetitive/degenerate
|
||||
- `mean_ratio` diverges far from 1.0
|
||||
- `mean_kl` explodes (> 1.0)
|
||||
|
||||
**Healthy training metrics:**
|
||||
- `mean_ratio`: 0.8 - 1.2 (close to 1.0)
|
||||
- `mean_kl`: 0.01 - 0.1
|
||||
- `clipped_fraction`: < 0.3 (< 30% of tokens clipped)
|
||||
|
||||
### 3. Memory Budgeting for Large Models
|
||||
|
||||
| Model Size | GPU Memory | Recommended Settings |
|
||||
|------------|------------|----------------------|
|
||||
|
|
@ -278,32 +320,26 @@ Without these, training will collapse (reward hacking):
|
|||
| 14B | 80GB | `--gpu-memory-utilization 0.45`, `--batch-size 2` |
|
||||
| 24B | 192GB (B200) | `--gpu-memory-utilization 0.30`, `--optimizer adafactor` |
|
||||
|
||||
### 5. Start with Small Batch Sizes
|
||||
|
||||
```bash
|
||||
# Start conservative, increase if no OOM
|
||||
--batch-size 2 --gradient-accumulation-steps 8 # Effective batch = 16
|
||||
```
|
||||
|
||||
### 6. Optimizer Selection
|
||||
### 4. Optimizer Selection
|
||||
|
||||
The trainer supports multiple optimizer options to trade off between speed, memory, and precision:
|
||||
|
||||
| Optimizer | GPU Memory for States | Speed | Precision | Dependencies |
|
||||
|-----------|----------------------|-------|-----------|--------------|
|
||||
| `adamw` (default) | ~32GB (for 8B model) | Fastest | Full FP32 | None |
|
||||
| `adamw_8bit` | ~8GB | Fast | 8-bit quantized | `bitsandbytes` |
|
||||
| `adamw` | ~32GB (for 8B model) | Fastest | Full FP32 | None |
|
||||
| `adamw_8bit` (default) | ~8GB | Fast | 8-bit quantized | `bitsandbytes` |
|
||||
| `adafactor` | ~8GB | Fast | Full (no momentum) | `transformers` |
|
||||
| `adamw_cpu` | ~0GB (on CPU) | ~2x slower | Full FP32 | None |
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Standard AdamW (default)
|
||||
--optimizer adamw
|
||||
|
||||
# 8-bit AdamW - recommended for memory-constrained setups
|
||||
# 8-bit AdamW (default) - recommended for memory-constrained setups
|
||||
--optimizer adamw_8bit
|
||||
|
||||
# Standard AdamW - full precision
|
||||
--optimizer adamw
|
||||
|
||||
# Adafactor - no momentum states, good for large models
|
||||
--optimizer adafactor
|
||||
|
||||
|
|
@ -392,36 +428,39 @@ vLLM exports tensor mappings to `vllm_bridge_config.json`:
|
|||
|
||||
## ❓ FAQ
|
||||
|
||||
|
||||
### Q: Why isn't vLLM seeing my weight updates?
|
||||
|
||||
**A:** CUDA graphs are caching the old weights. Add `--enforce-eager`:
|
||||
|
||||
```bash
|
||||
python vllm_api_server.py --model $MODEL --enforce-eager
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Q: How do I debug logprob alignment issues?
|
||||
|
||||
**A:** Look for these log messages:
|
||||
**A:** Look for these log messages during training:
|
||||
```
|
||||
[WARNING] ref_logprobs at generated positions avg 0.85 (should be negative!)
|
||||
[WARNING] This suggests inference_logprobs alignment is wrong
|
||||
```
|
||||
|
||||
This means inference logprobs aren't being passed correctly. Check that:
|
||||
1. Environment uses `--openai.server_type vllm`
|
||||
2. vLLM returns logprobs (check `/generate` response)
|
||||
This means inference logprobs aren't being passed correctly. Debug steps:
|
||||
|
||||
### Q: Why does vLLM v1 engine fail with CUDA fork errors?
|
||||
1. **Check environment server type:**
|
||||
```bash
|
||||
# Must be 'vllm', NOT 'openai'
|
||||
--openai.server_type vllm
|
||||
```
|
||||
|
||||
**A:** vLLM v1 uses multiprocessing that conflicts with CUDA initialization. We default to v0 engine:
|
||||
2. **Verify vLLM returns logprobs:**
|
||||
```bash
|
||||
curl -X POST http://localhost:9001/generate \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"prompt": "Hello", "max_tokens": 5}'
|
||||
# Response should include "logprobs": [...]
|
||||
```
|
||||
|
||||
```python
|
||||
# vllm_api_server.py automatically sets:
|
||||
os.environ.setdefault("VLLM_USE_V1", "0")
|
||||
```
|
||||
3. **Check data.py logs:**
|
||||
```
|
||||
[Data] ✓ inference_logprobs found in batch (sample len: 128)
|
||||
```
|
||||
|
||||
4. **Monitor alignment metrics in training logs:**
|
||||
- `alignment/diff_mean` should be close to 0 at step start
|
||||
- `alignment/diff_abs_mean` < 0.1 = good alignment
|
||||
- Large values = weights not properly shared or logprobs misaligned
|
||||
|
||||
|
||||
## Troubleshooting
|
||||
|
|
@ -479,25 +518,9 @@ vLLM version incompatibility. Our server handles this automatically, but make su
|
|||
python -m example_trainer.vllm_api_server # NOT direct vllm commands
|
||||
```
|
||||
|
||||
### Training is slow / no batches
|
||||
|
||||
1. Check vLLM is running: `curl http://localhost:9001/health`
|
||||
2. Check API is running: `curl http://localhost:8002/info`
|
||||
3. Check environment is connected and generating rollouts
|
||||
|
||||
---
|
||||
|
||||
## 📊 Monitoring Training
|
||||
|
||||
### Key Metrics to Watch
|
||||
|
||||
| Metric | Healthy Range | Problem If... |
|
||||
|--------|---------------|---------------|
|
||||
| `mean_ratio` | 0.8 - 1.2 | Far from 1.0 = policy changed too much |
|
||||
| `mean_kl` | 0.01 - 0.1 | > 0.5 = policy drifting |
|
||||
| `clipped_fraction` | < 0.3 | > 0.5 = learning rate too high |
|
||||
| `loss` | Gradually decreasing | Exploding or very negative |
|
||||
|
||||
### WandB Logging
|
||||
|
||||
```bash
|
||||
|
|
@ -514,11 +537,12 @@ python -m example_trainer.vllm_api_server # NOT direct vllm commands
|
|||
|
||||
| Argument | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `--model-name` | (required) | HuggingFace model ID |
|
||||
| `--weight-bridge-mode` | `none` | `shared_vllm`, `lora_only`, or `none` |
|
||||
| `--model-name` or `--model` | (required) | HuggingFace model ID |
|
||||
| `--weight-bridge-mode` | `none` | `shared_vllm`, `lora_only`, `lora_restart`, or `none` |
|
||||
| `--training-steps` | 10 | Number of training steps |
|
||||
| `--batch-size` | 2 | Micro-batch size |
|
||||
| `--gradient-accumulation-steps` | 1 | Effective batch = batch × accum |
|
||||
| `--gradient-accumulation-steps` | 32 | Effective batch = batch × accum |
|
||||
| `--seq-len` | 2048 | Maximum sequence length |
|
||||
|
||||
### GRPO Hyperparameters
|
||||
|
||||
|
|
@ -526,15 +550,16 @@ python -m example_trainer.vllm_api_server # NOT direct vllm commands
|
|||
|----------|---------|-------------|
|
||||
| `--kl-coef` | 0.1 | KL penalty strength (higher = more conservative) |
|
||||
| `--clip-eps` | 0.2 | PPO clipping range [1-ε, 1+ε] |
|
||||
| `--learning-rate` | 1e-6 | Learning rate |
|
||||
| `--lr` | 1e-5 | Learning rate (NOT --learning-rate) |
|
||||
|
||||
### LoRA Arguments
|
||||
|
||||
| Argument | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `--lora-r` | 16 | LoRA rank |
|
||||
| `--lora-alpha` | 32 | LoRA scaling factor |
|
||||
| `--lora-dropout` | 0.05 | LoRA dropout |
|
||||
| `--lora-r` | 16 | LoRA rank (dimension of low-rank matrices) |
|
||||
| `--lora-alpha` | 32 | LoRA alpha scaling factor |
|
||||
| `--lora-dropout` | 0.05 | LoRA dropout probability |
|
||||
| `--lora-target-modules` | None | Module names to apply LoRA (default: `q_proj v_proj`) |
|
||||
|
||||
### vLLM Arguments
|
||||
|
||||
|
|
@ -542,7 +567,11 @@ python -m example_trainer.vllm_api_server # NOT direct vllm commands
|
|||
|----------|---------|-------------|
|
||||
| `--vllm-port` | 9001 | vLLM server port |
|
||||
| `--vllm-config-path` | auto | Path to bridge config (shared mode) |
|
||||
| `--gpu-memory-utilization` | 0.9 | vLLM GPU memory fraction |
|
||||
| `--gpu-memory-utilization` | 0.45 | vLLM GPU memory fraction |
|
||||
| `--vllm-gpu` | None | GPU ID for vLLM (None = same as trainer) |
|
||||
| `--max-model-len` | 4096 | Maximum context length |
|
||||
| `--dtype` | `bfloat16` | Model dtype: `bfloat16`, `float16`, or `auto` |
|
||||
| `--vllm-restart-interval` | 3 | Restart vLLM every N steps (legacy/lora_restart) |
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -550,15 +579,248 @@ python -m example_trainer.vllm_api_server # NOT direct vllm commands
|
|||
|
||||
| Module | Purpose |
|
||||
|--------|---------|
|
||||
| `grpo.py` | CLI entry point, dispatches to training modes |
|
||||
| `run.py` | Unified launcher for shared_vllm mode |
|
||||
| `cli.py` | Single source of truth for all CLI arguments |
|
||||
| `config.py` | `TrainingConfig` Pydantic model |
|
||||
| `api.py` | Communication with Atropos API |
|
||||
| `data.py` | Batch preprocessing, logprob extraction |
|
||||
| `model.py` | Model loading, CUDA IPC attachment, tensor mapping |
|
||||
| `training.py` | GRPO loss computation |
|
||||
| `trainers.py` | Mode-specific training loops |
|
||||
| `vllm_api_server.py` | Streamlined vLLM server for training |
|
||||
| `vllm_manager.py` | vLLM process lifecycle management |
|
||||
| `checkpointing.py` | Save/load checkpoints and adapters |
|
||||
| `grpo.py` | CLI entry point, dispatches to training modes (4 modes) |
|
||||
| `run.py` | Unified launcher for shared_vllm mode (starts vLLM + trainer) |
|
||||
| `cli.py` | Single source of truth for all CLI arguments (modular builders) |
|
||||
| `config.py` | `TrainingConfig` Pydantic model with all hyperparameters |
|
||||
| `api.py` | Communication with Atropos API (registration, batch fetching) |
|
||||
| `data.py` | Batch preprocessing, padding, logprob extraction and alignment |
|
||||
| `model.py` | Model loading, CUDA IPC attachment, tensor mapping (QKV/Gate fusion) |
|
||||
| `training.py` | GRPO loss computation (importance sampling, KL penalty, clipping) |
|
||||
| `trainers.py` | Mode-specific training loops (4 implementations + optimizer selection) |
|
||||
| `vllm_api_server.py` | Custom vLLM server with `/generate` endpoint and LoRA support |
|
||||
| `vllm_manager.py` | vLLM process lifecycle management (launch, health checks, termination) |
|
||||
| `checkpointing.py` | Save/load checkpoints and adapters (handles fused tensor unfusing) |
|
||||
|
||||
---
|
||||
|
||||
## Code Execution Flow
|
||||
|
||||
### High-Level Flow (All Modes)
|
||||
|
||||
```
|
||||
1. CLI Parsing (cli.py)
|
||||
↓
|
||||
2. Config Creation (config.py)
|
||||
↓
|
||||
3. Mode Dispatcher (grpo.py or run.py)
|
||||
↓
|
||||
4. Trainer Function (trainers.py)
|
||||
├─ Setup Phase
|
||||
│ ├─ Initialize W&B (training.py)
|
||||
│ ├─ Load Model (model.py)
|
||||
│ ├─ Create Optimizer (trainers.py)
|
||||
│ ├─ Check Atropos API (api.py)
|
||||
│ ├─ Register Trainer (api.py)
|
||||
│ └─ Launch/Connect vLLM (vllm_manager.py or external)
|
||||
│
|
||||
└─ Training Loop
|
||||
├─ Fetch Batch (api.py → data.py)
|
||||
│ ├─ Poll /batch endpoint
|
||||
│ ├─ Pad sequences (data.py)
|
||||
│ ├─ Extract inference logprobs (data.py)
|
||||
│ └─ Normalize advantages (data.py)
|
||||
│
|
||||
├─ Training Step (training.py)
|
||||
│ ├─ For each micro-batch:
|
||||
│ │ ├─ Forward pass (model)
|
||||
│ │ ├─ Compute GRPO loss (training.py)
|
||||
│ │ │ ├─ Temperature scaling
|
||||
│ │ │ ├─ Compute log probabilities
|
||||
│ │ │ ├─ Importance sampling ratio (using inference logprobs)
|
||||
│ │ │ ├─ PPO clipping
|
||||
│ │ │ ├─ Schulman KL penalty
|
||||
│ │ │ └─ Return loss + metrics
|
||||
│ │ └─ Backward pass (accumulate gradients)
|
||||
│ ├─ Clip gradients (norm=1.0)
|
||||
│ ├─ Optimizer step
|
||||
│ └─ Zero gradients
|
||||
│
|
||||
├─ Weight Sync (mode-dependent)
|
||||
│ ├─ shared_vllm: No sync needed (weights shared via CUDA IPC)
|
||||
│ ├─ lora_only: HTTP POST to /lora/load
|
||||
│ ├─ lora_restart: Save adapter + terminate + relaunch vLLM
|
||||
│ └─ none: Save checkpoint + terminate + relaunch vLLM
|
||||
│
|
||||
├─ Log Metrics (training.py)
|
||||
│ ├─ Console output
|
||||
│ └─ W&B logging (if enabled)
|
||||
│
|
||||
└─ Periodic Checkpoint (checkpointing.py)
|
||||
├─ Ensure tensors are contiguous (unfuse views)
|
||||
├─ Save state dict
|
||||
└─ Free GPU memory
|
||||
```
|
||||
|
||||
### Mode-Specific Details
|
||||
|
||||
#### shared_vllm Mode
|
||||
|
||||
```python
|
||||
# Entry: grpo.py → trainers.train_shared_vllm()
|
||||
|
||||
1. Model Loading (model.py):
|
||||
- Find vllm_bridge_config.json
|
||||
- Load IPC handles (CUDA memory pointers)
|
||||
- Create empty model on meta device
|
||||
- Reconstruct tensors from IPC handles
|
||||
- Map vLLM fused tensors → HF unfused parameters
|
||||
* qkv_proj → q_proj, k_proj, v_proj (views)
|
||||
* gate_up_proj → gate_proj, up_proj (views)
|
||||
- Initialize remaining meta tensors (buffers, etc.)
|
||||
|
||||
2. Training Loop:
|
||||
- optimizer.step() directly modifies vLLM's tensors
|
||||
- No weight synchronization needed!
|
||||
- Checkpoints: Unfuse views before saving (checkpointing.py)
|
||||
|
||||
3. Tensor Mapping (model.py:_create_vllm_to_hf_mapping):
|
||||
- Reads actual HF tensor shapes from model.state_dict()
|
||||
- Creates slice mappings for fused layers
|
||||
- Example: q_proj = qkv_proj[0:4096, :]
|
||||
```
|
||||
|
||||
#### lora_restart Mode
|
||||
|
||||
```python
|
||||
# Entry: grpo.py → trainers.train_lora_restart()
|
||||
|
||||
1. Model Loading (model.py):
|
||||
- Load base model with PEFT
|
||||
- Apply LoRA config to target modules
|
||||
- Freeze base weights, only LoRA trainable
|
||||
|
||||
2. vLLM Management:
|
||||
- Launch: _launch_vllm_with_lora()
|
||||
* NO --enforce-eager flag (CUDA graphs enabled)
|
||||
* Pre-load initial adapter
|
||||
- Periodic Restart:
|
||||
* Save new adapter (checkpointing.py)
|
||||
* Terminate vLLM aggressively (_terminate_vllm)
|
||||
- Kill process group
|
||||
- Kill by port (fuser)
|
||||
- Kill by process name patterns
|
||||
- Wait for GPU memory release (critical!)
|
||||
* Relaunch with new adapter
|
||||
|
||||
3. Performance:
|
||||
- ~108 TPS (CUDA graphs enabled)
|
||||
- ~45s restart overhead
|
||||
- Much faster than lora_only (~8x speedup)
|
||||
```
|
||||
|
||||
#### lora_only Mode
|
||||
|
||||
```python
|
||||
# Entry: grpo.py → trainers.train_lora()
|
||||
|
||||
1. Model Loading: Same as lora_restart
|
||||
|
||||
2. vLLM: External server (must be pre-started)
|
||||
- MUST use --enforce-eager (disables CUDA graphs)
|
||||
- MUST use --enable-lora
|
||||
|
||||
3. Weight Sync: _hotswap_lora_adapter()
|
||||
- Tries /v1/load_lora_adapter (native vLLM)
|
||||
- Falls back to /lora/load (custom endpoint)
|
||||
|
||||
4. Performance:
|
||||
- ~13 TPS (CUDA graphs disabled)
|
||||
- No restart overhead
|
||||
- 8x slower than lora_restart!
|
||||
```
|
||||
|
||||
#### none (legacy) Mode
|
||||
|
||||
```python
|
||||
# Entry: grpo.py → trainers.train_legacy()
|
||||
|
||||
1. Model Loading: Full model (model.py)
|
||||
|
||||
2. vLLM Management:
|
||||
- Launch: vllm_manager.launch_vllm_server()
|
||||
- Periodic Restart:
|
||||
* Save full checkpoint (checkpointing.py)
|
||||
* Terminate vLLM (vllm_manager.terminate_vllm_process)
|
||||
* Relaunch with new checkpoint
|
||||
|
||||
3. Use Case:
|
||||
- Different GPUs for trainer and vLLM
|
||||
- Simple setup without CUDA IPC or LoRA
|
||||
```
|
||||
|
||||
### Data Flow Detail (data.py)
|
||||
|
||||
```python
|
||||
# api.get_batch() → data.get_data() → data.pad_data_to_good_offset()
|
||||
|
||||
1. Batch Structure from API:
|
||||
{
|
||||
"batch": [
|
||||
{
|
||||
"tokens": [[tok1, tok2, ...], ...], # group_size sequences
|
||||
"masks": [[mask1, mask2, ...], ...], # -100 for prompt, token_id for generated
|
||||
"scores": [score1, score2, ...], # rewards
|
||||
"inference_logprobs": [[lp1, lp2, ...], ...], # CRITICAL for GRPO!
|
||||
"generation_params": {"temperature": 1.0},
|
||||
...
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
2. Preprocessing (pad_data_to_good_offset):
|
||||
- Normalize advantages (mean=0, std=1 per group)
|
||||
- Pad sequences to multiple of 64
|
||||
- Align inference_logprobs with labels:
|
||||
* 1.0 for prompt tokens (masked)
|
||||
* Actual negative logprobs for generated tokens
|
||||
* Shift by 1 for causal alignment
|
||||
- Extract temperatures (priority: override > generation_params > 1.0)
|
||||
- Batch into micro-batches
|
||||
|
||||
3. Output:
|
||||
- token_batches: [B, seq_len]
|
||||
- label_batches: [B, seq_len] # -100 for masked
|
||||
- advantage_batches: [B, 1]
|
||||
- temperature_batches: [B, 1, 1]
|
||||
- inference_logprob_batches: [B, seq_len] # aligned with labels!
|
||||
```
|
||||
|
||||
### GRPO Loss Computation (training.py)
|
||||
|
||||
```python
|
||||
# training.compute_grpo_loss()
|
||||
|
||||
1. Forward Pass:
|
||||
- Get logits from model
|
||||
- Apply temperature scaling (from data)
|
||||
- Compute log probabilities per token
|
||||
|
||||
2. Reference Policy (π_old):
|
||||
- Extract from inference_logprobs (from vLLM at generation time)
|
||||
- Already aligned with labels by data.py
|
||||
|
||||
3. Importance Sampling:
|
||||
- log_ratio = log π_new(a|s) - log π_old(a|s)
|
||||
- ratio = exp(log_ratio)
|
||||
- Clipped ratio = clip(ratio, 1-ε, 1+ε)
|
||||
|
||||
4. Policy Loss:
|
||||
- surr1 = ratio * advantage
|
||||
- surr2 = clipped_ratio * advantage
|
||||
- policy_loss = -min(surr1, surr2) # pessimistic bound
|
||||
|
||||
5. KL Penalty (Schulman's estimator):
|
||||
- kl = exp(-log_ratio) + log_ratio - 1
|
||||
- Guaranteed non-negative, unbiased
|
||||
|
||||
6. Total Loss:
|
||||
- loss = policy_loss + β * kl_penalty
|
||||
- Scaled by 1/gradient_accumulation_steps
|
||||
|
||||
7. Metrics:
|
||||
- mean_ratio: Average importance sampling ratio
|
||||
- mean_kl: Average KL divergence
|
||||
- clipped_fraction: % of tokens clipped
|
||||
- alignment/* : Token-level logprob alignment (verifies weight sharing)
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,394 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark LoRA inference modes to find the fastest approach.
|
||||
|
||||
This script tests multiple vLLM configurations to determine:
|
||||
1. Does --enable-lora force eager mode even without --enforce-eager?
|
||||
2. What's the actual TPS difference between configurations?
|
||||
3. Is there ANY way to get fast LoRA inference?
|
||||
|
||||
Configurations tested:
|
||||
- BASE: No LoRA flags (CUDA graphs enabled) - baseline
|
||||
- LORA_EAGER: --enable-lora --enforce-eager (required for hot-swap)
|
||||
- LORA_NO_EAGER: --enable-lora only (does vLLM force eager anyway?)
|
||||
|
||||
Usage:
|
||||
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507
|
||||
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507 --lora-path ./checkpoints/final_adapter
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Force line-buffered output so log files are updated immediately.
# hasattr() guard: sys.stdout/sys.stderr may be replaced by objects
# (e.g. pipes or capture wrappers) that lack TextIOWrapper.reconfigure.
# Plain `if` statements instead of the conditional-expression-as-statement
# idiom, which evaluates to a discarded value and obscures intent.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(line_buffering=True)
if hasattr(sys.stderr, "reconfigure"):
    sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
def log(msg: str) -> None:
    """Write *msg* to stdout followed by a newline, flushing right away.

    Immediate flushing keeps log files current even when output is
    redirected to a file or pipe.
    """
    sys.stdout.write(msg + "\n")
    sys.stdout.flush()
|
||||
|
||||
# Complex math prompt that requires extended reasoning
|
||||
BENCHMARK_PROMPT = """You are a mathematics expert. Solve this problem step by step, showing all your work:
|
||||
|
||||
A rectangular garden has a perimeter of 56 meters. The length is 4 meters more than twice the width.
|
||||
|
||||
1) Set up the equations
|
||||
2) Solve for width and length
|
||||
3) Calculate the area
|
||||
4) If we want to put a circular fountain in the center with radius equal to 1/4 of the width, what area remains for planting?
|
||||
5) Express the planting area as a percentage of the total garden area
|
||||
|
||||
Show all calculations clearly and verify your answer."""
|
||||
|
||||
# Longer prompt for extended generation
|
||||
LONG_PROMPT = """Write a detailed technical explanation of how transformer neural networks work, covering:
|
||||
|
||||
1. The attention mechanism - explain self-attention, multi-head attention, and how queries, keys, and values work
|
||||
2. The encoder-decoder architecture vs decoder-only models
|
||||
3. Positional encoding - why it's needed and different approaches
|
||||
4. Layer normalization and residual connections
|
||||
5. The feed-forward network component
|
||||
6. How training works with cross-entropy loss and backpropagation through attention
|
||||
|
||||
Include mathematical formulas where appropriate and explain the intuition behind each component. This should be comprehensive enough for someone with basic ML knowledge to understand transformers deeply."""
|
||||
|
||||
|
||||
def wait_for_server(port: int, timeout: int = 300) -> bool:
    """Poll the vLLM /health endpoint until it answers 200 or *timeout* elapses.

    Polls every 2 seconds; connection errors during startup are expected
    and ignored. Returns True once healthy, False on timeout.
    """
    health_url = f"http://localhost:{port}/health"
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(health_url, timeout=5).status_code == 200:
                return True
        except Exception:
            # Server not accepting connections yet — keep waiting.
            pass
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def start_vllm_server(
    model: str,
    port: int,
    gpu_id: int,
    mode: str = "base",  # "base", "lora_eager", "lora_no_eager"
    max_lora_rank: int = 32,
    max_model_len: int = 8192,
    log_file: str = "vllm.log",
) -> subprocess.Popen:
    """
    Start a vLLM server subprocess pinned to *gpu_id* and return its Popen handle.

    Modes:
    - base: No LoRA, CUDA graphs enabled (fastest)
    - lora_eager: --enable-lora --enforce-eager (slow, but supports hot-swap)
    - lora_no_eager: --enable-lora only (test if vLLM forces eager anyway)

    The server's stdout/stderr are redirected to *log_file*.

    Raises:
        FileNotFoundError: if vllm_api_server.py cannot be located next to
            this script's grandparent directory.
    """
    # Find the vllm_api_server.py script relative to this script
    script_dir = Path(__file__).parent.parent  # example_trainer/
    vllm_server_path = script_dir / "vllm_api_server.py"

    if not vllm_server_path.exists():
        log(f"ERROR: vllm_api_server.py not found at {vllm_server_path}")
        raise FileNotFoundError(f"vllm_api_server.py not found at {vllm_server_path}")

    cmd = [
        sys.executable, str(vllm_server_path),
        "--model", model,
        "--port", str(port),
        "--gpu-memory-utilization", "0.70",  # Higher for 32k context
        "--max-model-len", str(max_model_len),
        "--dtype", "bfloat16",
    ]

    if mode == "lora_eager":
        cmd.extend([
            "--enable-lora",
            "--max-lora-rank", str(max_lora_rank),
            "--enforce-eager",
        ])
        log("Mode: LORA_EAGER (--enable-lora --enforce-eager)")
    elif mode == "lora_no_eager":
        cmd.extend([
            "--enable-lora",
            "--max-lora-rank", str(max_lora_rank),
            # NOTE: NOT adding --enforce-eager - testing if vLLM forces it anyway
        ])
        log("Mode: LORA_NO_EAGER (--enable-lora only, NO --enforce-eager)")
    else:
        log("Mode: BASE (no LoRA flags, CUDA graphs enabled)")

    # Pin the server to a single GPU via CUDA_VISIBLE_DEVICES.
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    log(f"GPU: {gpu_id}")
    log(f"Command: {' '.join(cmd)}")

    # Popen duplicates the file descriptor for the child, so the parent's
    # copy can be closed immediately — the original left it open, leaking
    # one fd per server start.
    with open(log_file, "w") as log_f:
        proc = subprocess.Popen(
            cmd,
            env=env,
            stdout=log_f,
            stderr=subprocess.STDOUT,
        )
    log(f"Started vLLM PID={proc.pid}, log: {log_file}")
    return proc
|
||||
|
||||
|
||||
def load_lora_adapter(port: int, adapter_path: str) -> bool:
    """POST the adapter at *adapter_path* to the server's /lora/load endpoint.

    Returns True on HTTP 200, False on any other status or request failure.
    """
    payload = {"adapter_path": adapter_path, "adapter_name": "benchmark_adapter"}
    try:
        response = requests.post(
            f"http://localhost:{port}/lora/load",
            json=payload,
            timeout=30,
        )
    except Exception as e:
        log(f"Failed to load LoRA adapter: {e}")
        return False
    return response.status_code == 200
|
||||
|
||||
|
||||
def benchmark_inference(
    port: int,
    prompt: str,
    max_tokens: int = 2048,
    num_runs: int = 3,
) -> dict:
    """Send *num_runs* generation requests to the server and time each one.

    Returns a dict with per-run lists ("times", "tokens", "tps") and, when at
    least one run succeeded, aggregate keys "avg_time", "avg_tokens", "avg_tps".
    Token counts are estimated from whitespace-split word count * 1.3.
    """
    results = {
        "times": [],
        "tokens": [],
        "tps": [],
    }
    request_body = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.7,
    }

    for run_idx in range(num_runs):
        started = time.time()
        try:
            resp = requests.post(
                f"http://localhost:{port}/generate",
                json=request_body,
                timeout=300,
            )
            elapsed = time.time() - started

            if resp.status_code != 200:
                log(f" Run {run_idx+1}: FAILED ({resp.status_code})")
                continue

            output_text = resp.json().get("text", [""])[0]
            # Rough token count (words * 1.3)
            output_tokens = len(output_text.split()) * 1.3
            tps = output_tokens / elapsed if elapsed > 0 else 0

            results["times"].append(elapsed)
            results["tokens"].append(output_tokens)
            results["tps"].append(tps)

            log(f" Run {run_idx+1}: {elapsed:.2f}s, ~{output_tokens:.0f} tokens, {output_tokens/elapsed:.1f} TPS")
        except Exception as e:
            log(f" Run {run_idx+1}: ERROR - {e}")

    if results["times"]:
        run_count = len(results["times"])
        results["avg_time"] = sum(results["times"]) / run_count
        results["avg_tokens"] = sum(results["tokens"]) / run_count
        results["avg_tps"] = sum(results["tps"]) / run_count

    return results
|
||||
|
||||
|
||||
def main():
    """CLI entry point: benchmark each requested vLLM configuration sequentially.

    For every mode in --modes, starts a fresh vLLM server on the same GPU/port,
    waits for health, optionally loads a LoRA adapter, inspects the server log
    for CUDA-graph status, runs timed inference, tears the server down, and
    finally prints a TPS comparison across modes.
    """
    parser = argparse.ArgumentParser(description="Benchmark LoRA inference configurations")
    parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Instruct-2507",
                        help="Model to benchmark")
    parser.add_argument("--lora-path", type=str, default=None,
                        help="Path to LoRA adapter (optional)")
    parser.add_argument("--max-tokens", type=int, default=2048,
                        help="Max tokens to generate")
    parser.add_argument("--num-runs", type=int, default=3,
                        help="Number of benchmark runs per server")
    parser.add_argument("--gpu", type=int, default=0,
                        help="GPU to use (tests run sequentially)")
    parser.add_argument("--port", type=int, default=9001,
                        help="Port for vLLM server")
    parser.add_argument("--prompt", type=str, choices=["math", "long"], default="long",
                        help="Which prompt to use")
    parser.add_argument("--max-model-len", type=int, default=8192,
                        help="Maximum model context length (e.g., 8192, 32768)")
    parser.add_argument("--modes", type=str, default="all",
                        help="Comma-separated modes to test: base,lora_eager,lora_no_eager or 'all'")
    args = parser.parse_args()

    prompt = LONG_PROMPT if args.prompt == "long" else BENCHMARK_PROMPT

    # Parse modes to test
    if args.modes == "all":
        modes_to_test = ["base", "lora_no_eager", "lora_eager"]
    else:
        modes_to_test = [m.strip() for m in args.modes.split(",")]

    results = {}
    # Tracks the currently running vLLM server; cleanup() reads this via
    # closure, so reassignments inside the loop are visible to it.
    current_proc = None

    def cleanup():
        # Best-effort teardown: terminate gracefully, escalate to kill.
        log("\nCleaning up...")
        if current_proc:
            try:
                current_proc.terminate()
                current_proc.wait(timeout=5)
            except Exception:
                try:
                    current_proc.kill()
                except Exception:
                    pass

    # Ensure the child server dies if the benchmark is interrupted.
    signal.signal(signal.SIGINT, lambda s, f: (cleanup(), sys.exit(0)))
    signal.signal(signal.SIGTERM, lambda s, f: (cleanup(), sys.exit(0)))

    try:
        log("=" * 70)
        log("vLLM LoRA Inference Configuration Benchmark")
        log("=" * 70)
        log(f"Model: {args.model}")
        log(f"LoRA adapter: {args.lora_path or 'None'}")
        log(f"Max tokens: {args.max_tokens}")
        log(f"Max model len: {args.max_model_len}")
        log(f"Num runs: {args.num_runs}")
        log(f"Modes to test: {modes_to_test}")
        log("=" * 70)
        log("")
        log("QUESTION: Does --enable-lora force eager mode even without --enforce-eager?")
        log("=" * 70)

        # Test each mode sequentially (same GPU, restart between tests)
        for i, mode in enumerate(modes_to_test):
            log(f"\n[{i+1}/{len(modes_to_test)}] Testing mode: {mode.upper()}")
            log("-" * 70)

            # Start server
            current_proc = start_vllm_server(
                args.model, args.port, args.gpu,
                mode=mode, max_model_len=args.max_model_len,
                log_file=f"benchmark_{mode}.log"
            )

            # Wait for ready
            log(f" Waiting for server (port {args.port})...")
            if not wait_for_server(args.port, timeout=300):
                log(f" ✗ Server failed to start! Check benchmark_{mode}.log")
                results[mode] = {"error": "Server failed to start"}
                current_proc.terminate()
                current_proc = None
                continue

            log(f" ✓ Server ready")

            # Load LoRA adapter if provided and mode supports it
            if args.lora_path and mode in ["lora_eager", "lora_no_eager"]:
                log(f" Loading LoRA adapter...")
                if load_lora_adapter(args.port, args.lora_path):
                    log(f" ✓ Adapter loaded")
                else:
                    log(f" ⚠ Failed to load adapter (continuing anyway)")

            # Check the log file for CUDA graph status — this is the core
            # question the benchmark answers (does LoRA force eager mode?).
            log(f" Checking CUDA graph status in log...")
            try:
                with open(f"benchmark_{mode}.log", "r") as f:
                    log_content = f.read()
                if "Cudagraph is disabled" in log_content:
                    log(f" ⚠ CUDA GRAPHS DISABLED (eager mode)")
                elif "cudagraph" in log_content.lower():
                    # Look for other cudagraph messages
                    for line in log_content.split("\n"):
                        if "cudagraph" in line.lower():
                            log(f" Log: {line.strip()[:80]}")
                else:
                    log(f" (No cudagraph message found in log)")
            except Exception as e:
                log(f" (Could not read log: {e})")

            # Run benchmark
            log(f"\n Running {args.num_runs} inference requests...")
            mode_results = benchmark_inference(
                args.port, prompt, args.max_tokens, args.num_runs
            )
            results[mode] = mode_results

            # Terminate server
            log(f" Stopping server...")
            current_proc.terminate()
            try:
                current_proc.wait(timeout=10)
            except Exception:
                current_proc.kill()
            current_proc = None

            # Wait for port to be free
            time.sleep(3)

        # Print comparison
        log("\n" + "=" * 70)
        log("RESULTS SUMMARY")
        log("=" * 70)

        # Only modes whose benchmark produced at least one successful run
        # carry an "avg_tps" key; failed modes hold {"error": ...}.
        valid_results = {k: v for k, v in results.items() if "avg_tps" in v}

        for mode, res in valid_results.items():
            log(f"\n{mode.upper()}:")
            log(f" Avg time: {res['avg_time']:.2f}s")
            log(f" Avg tokens: {res['avg_tokens']:.0f}")
            log(f" Avg TPS: {res['avg_tps']:.1f}")

        # Compare
        if "base" in valid_results:
            base_tps = valid_results["base"]["avg_tps"]
            log(f"\n" + "-" * 70)
            log("COMPARISON TO BASE (CUDA graphs enabled):")
            for mode, res in valid_results.items():
                if mode != "base":
                    ratio = res["avg_tps"] / base_tps if base_tps > 0 else 0
                    slowdown = (1 - ratio) * 100
                    log(f" {mode}: {res['avg_tps']:.1f} TPS ({ratio:.2f}x base, {slowdown:.1f}% slower)")

        # Key finding
        log("\n" + "=" * 70)
        log("KEY FINDING:")
        if "lora_no_eager" in valid_results and "lora_eager" in valid_results:
            no_eager_tps = valid_results["lora_no_eager"]["avg_tps"]
            eager_tps = valid_results["lora_eager"]["avg_tps"]
            if abs(no_eager_tps - eager_tps) < eager_tps * 0.1:  # Within 10%
                log(" ⚠ --enable-lora FORCES eager mode regardless of --enforce-eager flag!")
                log(" ⚠ There is NO WAY to get CUDA graphs with LoRA enabled in vLLM.")
            else:
                log(" ✓ --enable-lora without --enforce-eager is FASTER!")
                log(f" ✓ lora_no_eager: {no_eager_tps:.1f} TPS vs lora_eager: {eager_tps:.1f} TPS")

        if "base" in valid_results and "lora_eager" in valid_results:
            base_tps = valid_results["base"]["avg_tps"]
            lora_tps = valid_results["lora_eager"]["avg_tps"]
            log(f"\n Base model (no LoRA): {base_tps:.1f} TPS")
            log(f" LoRA enabled: {lora_tps:.1f} TPS")
            log(f" Slowdown factor: {base_tps/lora_tps:.1f}x")

        log("=" * 70)

    finally:
        # Runs on normal exit and on any exception in the loop above.
        cleanup()
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,458 +0,0 @@
|
|||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# All Training Modes Comparison on Math Zero (32k context)
|
||||
# =============================================================================
|
||||
#
|
||||
# Compares all 3 training modes on math_server_zero environment:
|
||||
# - GPU 0: shared_vllm (CUDA IPC, zero-copy weight updates)
|
||||
# - GPU 1: lora_only (--enforce-eager, ~13 TPS, slow)
|
||||
# - GPU 2: lora_restart (no --enforce-eager, ~108 TPS, fast)
|
||||
#
|
||||
# All at 32k context length for proper math reasoning.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/compare_all_modes_math_zero.sh [MODEL] [STEPS]
|
||||
#
|
||||
# Example:
|
||||
# ./scripts/compare_all_modes_math_zero.sh Qwen/Qwen3-4B-Instruct-2507 30
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
set -e
|
||||
|
||||
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
|
||||
TRAINING_STEPS="${2:-30}"
|
||||
BATCH_SIZE="${BATCH_SIZE:-2}"
|
||||
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
|
||||
USE_WANDB="${USE_WANDB:-true}"
|
||||
WANDB_PROJECT="${WANDB_PROJECT:-math-zero-mode-comparison}"
|
||||
|
||||
# Port allocation (separate ports for each mode)
|
||||
# shared_vllm: API 8001, vLLM 9001
|
||||
# lora_only: API 8002, vLLM 9002
|
||||
# lora_restart: API 8003, vLLM 9003
|
||||
|
||||
SHARED_API_PORT=8001
|
||||
SHARED_VLLM_PORT=9001
|
||||
SHARED_GPU=0
|
||||
|
||||
LORA_ONLY_API_PORT=8002
|
||||
LORA_ONLY_VLLM_PORT=9002
|
||||
LORA_ONLY_GPU=1
|
||||
|
||||
LORA_RESTART_API_PORT=8003
|
||||
LORA_RESTART_VLLM_PORT=9003
|
||||
LORA_RESTART_GPU=2
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
REPO_DIR="$(dirname "$TRAINER_DIR")"
|
||||
|
||||
LOG_DIR="${REPO_DIR}/math_zero_comparison_$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
echo "============================================================"
|
||||
echo "Math Zero Mode Comparison (32k Context)"
|
||||
echo "============================================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Steps: $TRAINING_STEPS"
|
||||
echo "Batch: $BATCH_SIZE"
|
||||
echo "Max Model Length: $MAX_MODEL_LEN"
|
||||
echo "Wandb: $USE_WANDB (project: $WANDB_PROJECT)"
|
||||
echo ""
|
||||
echo "GPU Allocation:"
|
||||
echo " GPU $SHARED_GPU: shared_vllm (ports $SHARED_API_PORT, $SHARED_VLLM_PORT)"
|
||||
echo " GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
|
||||
echo " GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
|
||||
echo ""
|
||||
echo "Log Dir: $LOG_DIR"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
|
||||
# Cleanup function
|
||||
# Kill every process and free every port used by the three training modes.
# Best-effort: each command is allowed to fail (|| true) so cleanup never
# aborts the script even when nothing is running.
cleanup() {
    echo ""
    echo "Cleaning up all processes..."
    local pattern
    for pattern in "vllm_api_server" "math_server_zero" "run-api" "grpo" "vllm.*EngineCore"; do
        pkill -9 -f "$pattern" 2>/dev/null || true
    done
    for port in $SHARED_API_PORT $SHARED_VLLM_PORT $LORA_ONLY_API_PORT $LORA_ONLY_VLLM_PORT $LORA_RESTART_API_PORT $LORA_RESTART_VLLM_PORT; do
        fuser -k ${port}/tcp 2>/dev/null || true
    done
    sleep 2
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Initial cleanup
|
||||
cleanup
|
||||
|
||||
# Clear triton cache for clean start
|
||||
rm -rf ~/.triton/cache 2>/dev/null || true
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
# =============================================================================
|
||||
# Helper functions
|
||||
# =============================================================================
|
||||
|
||||
# Poll a vLLM /health endpoint (5s interval) until it responds or the
# attempt budget is exhausted. Args: port, display name, [max_attempts=120].
wait_for_health() {
    local port=$1
    local name=$2
    local max_attempts=${3:-120}
    local attempt=0

    until [ $attempt -ge $max_attempts ]; do
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo " ✓ $name ready (port $port)"
            return 0
        fi
        sleep 5
        attempt=$((attempt + 1))
    done
    echo " ✗ $name failed to start (port $port)"
    return 1
}
|
||||
|
||||
# Poll an Atropos API /info endpoint (2s interval) until it responds or the
# attempt budget is exhausted. Args: port, display name, [max_attempts=30].
wait_for_api() {
    local port=$1
    local name=$2
    local max_attempts=${3:-30}
    local attempt=0

    until [ $attempt -ge $max_attempts ]; do
        if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
            echo " ✓ $name ready (port $port)"
            return 0
        fi
        sleep 2
        attempt=$((attempt + 1))
    done
    echo " ✗ $name failed to start (port $port)"
    return 1
}
|
||||
|
||||
# =============================================================================
|
||||
# START ALL THREE MODES IN PARALLEL
|
||||
# =============================================================================
|
||||
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo "Starting all three modes in parallel..."
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
|
||||
# Pre-create checkpoint directories
|
||||
mkdir -p "$LOG_DIR/checkpoints_shared"
|
||||
mkdir -p "$LOG_DIR/checkpoints_lora_only"
|
||||
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MODE 1: SHARED_VLLM (GPU 0)
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[SHARED_VLLM] Starting on GPU $SHARED_GPU..."
|
||||
|
||||
# Start run-api for shared_vllm
|
||||
run-api --port $SHARED_API_PORT > "$LOG_DIR/api_shared.log" 2>&1 &
|
||||
|
||||
# Start vLLM with shared weights
|
||||
# NOTE: shared_vllm needs more headroom for optimizer states (~8GB) and gradients
|
||||
# Using 0.5 leaves ~90GB for training operations on a 180GB GPU
|
||||
echo "[SHARED_VLLM] Starting vLLM with shared weights..."
|
||||
VLLM_ENABLE_SHARED_WEIGHTS=1 VLLM_BRIDGE_CONFIG_PATH=$LOG_DIR/vllm_bridge_config_shared.json \
|
||||
CUDA_VISIBLE_DEVICES=$SHARED_GPU python -u example_trainer/vllm_api_server.py \
|
||||
--model "$MODEL" \
|
||||
--port $SHARED_VLLM_PORT \
|
||||
--gpu-memory-utilization 0.50 \
|
||||
--max-model-len $MAX_MODEL_LEN \
|
||||
> "$LOG_DIR/vllm_shared.log" 2>&1 &
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MODE 2: LORA_ONLY (GPU 1)
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."
|
||||
|
||||
# Start run-api for lora_only
|
||||
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &
|
||||
|
||||
# Start vLLM with --enforce-eager for lora_only
|
||||
# LoRA modes need less training memory, but still need headroom at 32k
|
||||
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
|
||||
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
|
||||
--model "$MODEL" \
|
||||
--port $LORA_ONLY_VLLM_PORT \
|
||||
--gpu-memory-utilization 0.70 \
|
||||
--max-model-len $MAX_MODEL_LEN \
|
||||
--enable-lora \
|
||||
--max-lora-rank 64 \
|
||||
--enforce-eager \
|
||||
> "$LOG_DIR/vllm_lora_only.log" 2>&1 &
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MODE 3: LORA_RESTART (GPU 2) - Trainer manages vLLM internally
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
|
||||
|
||||
# Start run-api for lora_restart
|
||||
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
|
||||
|
||||
# =============================================================================
|
||||
# WAIT FOR INFRASTRUCTURE
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Waiting for infrastructure to be ready..."
|
||||
echo " (vLLM at 32k context takes ~2-5 minutes to start)"
|
||||
|
||||
wait_for_api $SHARED_API_PORT "shared_vllm API" || exit 1
|
||||
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
|
||||
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1
|
||||
|
||||
wait_for_health $SHARED_VLLM_PORT "shared_vllm vLLM" 180 || exit 1
|
||||
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 180 || exit 1
|
||||
|
||||
# =============================================================================
|
||||
# START ENVIRONMENTS AND TRAINERS
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting environments and trainers..."
|
||||
|
||||
# Record start time
|
||||
START_TIME=$(date +%s)
|
||||
|
||||
# Build wandb args
|
||||
WANDB_ARGS=""
|
||||
if [ "$USE_WANDB" = "true" ]; then
|
||||
WANDB_ARGS="--use-wandb --wandb-project $WANDB_PROJECT"
|
||||
fi
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# SHARED_VLLM: Start environment and trainer
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[SHARED_VLLM] Starting math_server_zero environment..."
|
||||
MATH_ENV_MODEL="$MODEL" \
|
||||
MATH_ENV_ROLLOUT_URL="http://localhost:${SHARED_API_PORT}" \
|
||||
MATH_ENV_VLLM_URL="http://localhost:${SHARED_VLLM_PORT}/v1" \
|
||||
MATH_ENV_WANDB_NAME="shared-vllm-env" \
|
||||
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
|
||||
MATH_ENV_WORKER_TIMEOUT=1800 \
|
||||
python -u environments/math_server_zero.py serve \
|
||||
--slurm false \
|
||||
2>&1 | tee "$LOG_DIR/env_shared.log" &
|
||||
SHARED_ENV_PID=$!
|
||||
|
||||
echo "[SHARED_VLLM] Starting trainer..."
|
||||
CUDA_VISIBLE_DEVICES=$SHARED_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode shared_vllm \
|
||||
--vllm-port $SHARED_VLLM_PORT \
|
||||
--vllm-config-path "$LOG_DIR/vllm_bridge_config_shared.json" \
|
||||
--atropos-url "http://localhost:${SHARED_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--max-model-len $MAX_MODEL_LEN \
|
||||
--seq-len $MAX_MODEL_LEN \
|
||||
--save-path "$LOG_DIR/checkpoints_shared" \
|
||||
$WANDB_ARGS --wandb-group "shared-vllm" \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer_shared.log" &
|
||||
SHARED_TRAINER_PID=$!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_ONLY: Start environment and trainer
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_ONLY] Starting math_server_zero environment..."
|
||||
MATH_ENV_MODEL="$MODEL" \
|
||||
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_ONLY_API_PORT}" \
|
||||
MATH_ENV_VLLM_URL="http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
|
||||
MATH_ENV_WANDB_NAME="lora-only-env" \
|
||||
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
|
||||
MATH_ENV_WORKER_TIMEOUT=1800 \
|
||||
python -u environments/math_server_zero.py serve \
|
||||
--slurm false \
|
||||
2>&1 | tee "$LOG_DIR/env_lora_only.log" &
|
||||
LORA_ONLY_ENV_PID=$!
|
||||
|
||||
echo "[LORA_ONLY] Starting trainer..."
|
||||
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_only \
|
||||
--vllm-port $LORA_ONLY_VLLM_PORT \
|
||||
--atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--max-model-len $MAX_MODEL_LEN \
|
||||
--seq-len $MAX_MODEL_LEN \
|
||||
--lora-r 16 \
|
||||
--lora-alpha 32 \
|
||||
--vllm-restart-interval 5 \
|
||||
--save-path "$LOG_DIR/checkpoints_lora_only" \
|
||||
$WANDB_ARGS --wandb-group "lora-only" \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer_lora_only.log" &
|
||||
LORA_ONLY_TRAINER_PID=$!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_RESTART: Start trainer (it manages vLLM internally)
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
|
||||
# NOTE: lora_restart shares GPU with trainer's model (~8GB), so use lower vLLM memory
|
||||
# Use unbuffered output (-u) and stdbuf to capture crashes
|
||||
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_restart \
|
||||
--vllm-port $LORA_RESTART_VLLM_PORT \
|
||||
--vllm-gpu-memory-utilization 0.20 \
|
||||
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--max-model-len $MAX_MODEL_LEN \
|
||||
--seq-len $MAX_MODEL_LEN \
|
||||
--lora-r 16 \
|
||||
--lora-alpha 32 \
|
||||
--vllm-restart-interval 5 \
|
||||
--save-path "$LOG_DIR/checkpoints_lora_restart" \
|
||||
$WANDB_ARGS --wandb-group "lora-restart" \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer_lora_restart.log" &
|
||||
LORA_RESTART_TRAINER_PID=$!
|
||||
|
||||
# Wait for lora_restart's internal vLLM to start
|
||||
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
|
||||
echo " NOTE: vLLM at 32k context with CUDA graphs takes 2-5 min"
|
||||
sleep 60
|
||||
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 300 || {
|
||||
echo " Failed - check logs:"
|
||||
tail -50 "$LOG_DIR/trainer_lora_restart.log"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Start environment for lora_restart
|
||||
echo "[LORA_RESTART] Starting math_server_zero environment..."
|
||||
MATH_ENV_MODEL="$MODEL" \
|
||||
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_RESTART_API_PORT}" \
|
||||
MATH_ENV_VLLM_URL="http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
|
||||
MATH_ENV_WANDB_NAME="lora-restart-env" \
|
||||
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
|
||||
MATH_ENV_WORKER_TIMEOUT=1800 \
|
||||
python -u environments/math_server_zero.py serve \
|
||||
--slurm false \
|
||||
2>&1 | tee "$LOG_DIR/env_lora_restart.log" &
|
||||
LORA_RESTART_ENV_PID=$!
|
||||
|
||||
# =============================================================================
|
||||
# WAIT FOR ALL TRAINERS TO COMPLETE
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo "All three trainers running in parallel. Waiting for completion..."
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo ""
|
||||
echo "📊 WANDB: https://wandb.ai (project: $WANDB_PROJECT)"
|
||||
echo ""
|
||||
echo "📋 MONITOR LOGS (in another terminal):"
|
||||
echo ""
|
||||
echo " # Trainer logs:"
|
||||
echo " tail -f $LOG_DIR/trainer_shared.log"
|
||||
echo " tail -f $LOG_DIR/trainer_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
|
||||
echo ""
|
||||
echo " # Environment logs:"
|
||||
echo " tail -f $LOG_DIR/env_shared.log"
|
||||
echo " tail -f $LOG_DIR/env_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/env_lora_restart.log"
|
||||
echo ""
|
||||
echo " # vLLM logs:"
|
||||
echo " tail -f $LOG_DIR/vllm_shared.log"
|
||||
echo " tail -f $LOG_DIR/vllm_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
|
||||
echo ""
|
||||
|
||||
# Wait for trainers
|
||||
SHARED_EXIT=0
|
||||
LORA_ONLY_EXIT=0
|
||||
LORA_RESTART_EXIT=0
|
||||
|
||||
wait $SHARED_TRAINER_PID || SHARED_EXIT=$?
|
||||
SHARED_END=$(date +%s)
|
||||
SHARED_TIME=$((SHARED_END - START_TIME))
|
||||
echo " ✓ shared_vllm finished in ${SHARED_TIME}s (exit: $SHARED_EXIT)"
|
||||
|
||||
wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
|
||||
LORA_ONLY_END=$(date +%s)
|
||||
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
|
||||
echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
|
||||
|
||||
wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
|
||||
LORA_RESTART_END=$(date +%s)
|
||||
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
|
||||
echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
|
||||
|
||||
# =============================================================================
|
||||
# RESULTS
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "COMPARISON RESULTS (Math Zero @ 32k Context)"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
echo "Training Steps: $TRAINING_STEPS"
|
||||
echo "Batch Size: $BATCH_SIZE"
|
||||
echo "Max Context: $MAX_MODEL_LEN"
|
||||
echo ""
|
||||
echo "┌─────────────────┬──────┬──────────────┬────────────────────────────────┐"
|
||||
echo "│ Mode │ GPU │ Total Time │ Notes │"
|
||||
echo "├─────────────────┼──────┼──────────────┼────────────────────────────────┤"
|
||||
printf "│ shared_vllm │ %d │ %10ss │ CUDA IPC zero-copy (~172 TPS) │\n" "$SHARED_GPU" "$SHARED_TIME"
|
||||
printf "│ lora_only │ %d │ %10ss │ --enforce-eager (~13 TPS) │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
|
||||
printf "│ lora_restart │ %d │ %10ss │ no --enforce-eager (~108 TPS) │\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
|
||||
echo "└─────────────────┴──────┴──────────────┴────────────────────────────────┘"
|
||||
echo ""
|
||||
|
||||
# Calculate speedups
|
||||
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
|
||||
RESTART_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
|
||||
echo "lora_restart vs lora_only speedup: ${RESTART_SPEEDUP}x"
|
||||
fi
|
||||
if [ $LORA_ONLY_TIME -gt 0 ] && [ $SHARED_TIME -gt 0 ]; then
|
||||
SHARED_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $SHARED_TIME" | bc)
|
||||
echo "shared_vllm vs lora_only speedup: ${SHARED_SPEEDUP}x"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "📊 BENCHMARK DETAILS:"
|
||||
echo ""
|
||||
echo "━━━ shared_vllm (GPU $SHARED_GPU) ━━━"
|
||||
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_shared.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_shared.log)"
|
||||
echo ""
|
||||
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
|
||||
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
|
||||
echo ""
|
||||
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
|
||||
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
|
||||
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "📁 All logs saved to: $LOG_DIR"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
echo "Log files:"
|
||||
echo " Trainers:"
|
||||
echo " $LOG_DIR/trainer_shared.log"
|
||||
echo " $LOG_DIR/trainer_lora_only.log"
|
||||
echo " $LOG_DIR/trainer_lora_restart.log"
|
||||
echo ""
|
||||
echo " Environments:"
|
||||
echo " $LOG_DIR/env_shared.log"
|
||||
echo " $LOG_DIR/env_lora_only.log"
|
||||
echo " $LOG_DIR/env_lora_restart.log"
|
||||
echo ""
|
||||
echo " vLLM:"
|
||||
echo " $LOG_DIR/vllm_shared.log"
|
||||
echo " $LOG_DIR/vllm_lora_only.log"
|
||||
echo " $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
|
||||
echo ""
|
||||
|
|
@ -1,368 +0,0 @@
|
|||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)
|
||||
# =============================================================================
|
||||
#
|
||||
# Runs both modes IN PARALLEL on separate GPUs for fair comparison:
|
||||
# - GPU 0: lora_only (--enforce-eager, ~13 TPS)
|
||||
# - GPU 1: lora_restart (no --enforce-eager, ~108 TPS)
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/compare_lora_modes.sh [MODEL] [STEPS]
|
||||
#
|
||||
# Example:
|
||||
# ./scripts/compare_lora_modes.sh Qwen/Qwen3-4B-Instruct-2507 20
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
set -e
|
||||
|
||||
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
|
||||
TRAINING_STEPS="${2:-20}"
|
||||
BATCH_SIZE="${BATCH_SIZE:-2}"
|
||||
USE_WANDB="${USE_WANDB:-true}" # Set USE_WANDB=false to disable
|
||||
WANDB_PROJECT="${WANDB_PROJECT:-lora-mode-comparison}"
|
||||
|
||||
# Port allocation (separate ports for each mode)
|
||||
LORA_ONLY_VLLM_PORT=9001
|
||||
LORA_ONLY_API_PORT=8001
|
||||
|
||||
LORA_RESTART_VLLM_PORT=9002
|
||||
LORA_RESTART_API_PORT=8002
|
||||
|
||||
# GPU allocation
|
||||
LORA_ONLY_GPU=0
|
||||
LORA_RESTART_GPU=1
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
REPO_DIR="$(dirname "$TRAINER_DIR")"
|
||||
|
||||
LOG_DIR="${REPO_DIR}/lora_comparison_$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
echo "============================================================"
|
||||
echo "LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)"
|
||||
echo "============================================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Steps: $TRAINING_STEPS"
|
||||
echo "Batch: $BATCH_SIZE"
|
||||
echo "Wandb: $USE_WANDB (project: $WANDB_PROJECT)"
|
||||
echo ""
|
||||
echo "GPU Allocation:"
|
||||
echo " GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
|
||||
echo " GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
|
||||
echo ""
|
||||
echo "Log Dir: $LOG_DIR"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
|
||||
# Cleanup function
|
||||
cleanup() {
|
||||
echo ""
|
||||
echo "Cleaning up all processes..."
|
||||
pkill -u $USER -f "vllm_api_server" 2>/dev/null || true
|
||||
pkill -u $USER -f "gsm8k_server" 2>/dev/null || true
|
||||
pkill -u $USER -f "run-api" 2>/dev/null || true
|
||||
pkill -u $USER -f "grpo" 2>/dev/null || true
|
||||
for port in $LORA_ONLY_VLLM_PORT $LORA_ONLY_API_PORT $LORA_RESTART_VLLM_PORT $LORA_RESTART_API_PORT; do
|
||||
fuser -k ${port}/tcp 2>/dev/null || true
|
||||
done
|
||||
sleep 2
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Initial cleanup
|
||||
cleanup
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
# =============================================================================
|
||||
# Helper functions
|
||||
# =============================================================================
|
||||
|
||||
wait_for_health() {
|
||||
local port=$1
|
||||
local name=$2
|
||||
local max_attempts=${3:-60}
|
||||
local attempt=1
|
||||
|
||||
while [ $attempt -le $max_attempts ]; do
|
||||
if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
|
||||
echo " ✓ $name ready (port $port)"
|
||||
return 0
|
||||
fi
|
||||
sleep 5
|
||||
attempt=$((attempt + 1))
|
||||
done
|
||||
echo " ✗ $name failed to start (port $port)"
|
||||
return 1
|
||||
}
|
||||
|
||||
wait_for_api() {
|
||||
local port=$1
|
||||
local name=$2
|
||||
local max_attempts=${3:-30}
|
||||
local attempt=1
|
||||
|
||||
while [ $attempt -le $max_attempts ]; do
|
||||
if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
|
||||
echo " ✓ $name ready (port $port)"
|
||||
return 0
|
||||
fi
|
||||
sleep 2
|
||||
attempt=$((attempt + 1))
|
||||
done
|
||||
echo " ✗ $name failed to start (port $port)"
|
||||
return 1
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# START BOTH MODES IN PARALLEL
|
||||
# =============================================================================
|
||||
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo "Starting both modes in parallel..."
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_ONLY (GPU 0)
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."
|
||||
|
||||
# Start run-api for lora_only
|
||||
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &
|
||||
LORA_ONLY_API_PID=$!
|
||||
|
||||
# Start vLLM with --enforce-eager for lora_only
|
||||
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
|
||||
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
|
||||
--model "$MODEL" \
|
||||
--port $LORA_ONLY_VLLM_PORT \
|
||||
--gpu-memory-utilization 0.3 \
|
||||
--enable-lora \
|
||||
--max-lora-rank 64 \
|
||||
--enforce-eager \
|
||||
> "$LOG_DIR/vllm_lora_only.log" 2>&1 &
|
||||
LORA_ONLY_VLLM_PID=$!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_RESTART (GPU 1) - Trainer manages vLLM internally
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
|
||||
|
||||
# Pre-create checkpoint directory so vLLM can write its log there
|
||||
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
|
||||
|
||||
# Start run-api for lora_restart
|
||||
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
|
||||
LORA_RESTART_API_PID=$!
|
||||
|
||||
# =============================================================================
|
||||
# WAIT FOR INFRASTRUCTURE
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Waiting for infrastructure to be ready..."
|
||||
|
||||
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
|
||||
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1
|
||||
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 90 || exit 1
|
||||
|
||||
# =============================================================================
|
||||
# START ENVIRONMENTS AND TRAINERS
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting environments and trainers..."
|
||||
|
||||
# Record start time
|
||||
START_TIME=$(date +%s)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_ONLY: Start environment and trainer
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_ONLY] Starting GSM8k environment..."
|
||||
python -u environments/gsm8k_server.py serve \
|
||||
--env.tokenizer_name "$MODEL" \
|
||||
--env.use_wandb=$USE_WANDB \
|
||||
--env.wandb_name "lora-only-env" \
|
||||
--env.rollout_server_url "http://localhost:${LORA_ONLY_API_PORT}" \
|
||||
--openai.model_name "$MODEL" \
|
||||
--openai.base_url "http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
|
||||
--openai.server_type vllm \
|
||||
--slurm false \
|
||||
2>&1 | tee "$LOG_DIR/env_lora_only.log" &
|
||||
LORA_ONLY_ENV_PID=$!
|
||||
|
||||
echo "[LORA_ONLY] Starting trainer..."
|
||||
|
||||
# Build wandb args
|
||||
WANDB_ARGS=""
|
||||
if [ "$USE_WANDB" = "true" ]; then
|
||||
WANDB_ARGS="--use-wandb --wandb-project $WANDB_PROJECT --wandb-group lora-only"
|
||||
fi
|
||||
|
||||
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_only \
|
||||
--vllm-port $LORA_ONLY_VLLM_PORT \
|
||||
--atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--lora-r 16 \
|
||||
--lora-alpha 32 \
|
||||
--vllm-restart-interval 5 \
|
||||
--save-path "$LOG_DIR/checkpoints_lora_only" \
|
||||
$WANDB_ARGS \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer_lora_only.log" &
|
||||
LORA_ONLY_TRAINER_PID=$!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# LORA_RESTART: Start trainer (it manages vLLM internally)
|
||||
# -----------------------------------------------------------------------------
|
||||
echo ""
|
||||
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
|
||||
|
||||
# Build wandb args for lora_restart
|
||||
WANDB_ARGS_RESTART=""
|
||||
if [ "$USE_WANDB" = "true" ]; then
|
||||
WANDB_ARGS_RESTART="--use-wandb --wandb-project $WANDB_PROJECT --wandb-group lora-restart"
|
||||
fi
|
||||
|
||||
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_restart \
|
||||
--vllm-port $LORA_RESTART_VLLM_PORT \
|
||||
--vllm-gpu-memory-utilization 0.3 \
|
||||
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--lora-r 16 \
|
||||
--lora-alpha 32 \
|
||||
--vllm-restart-interval 5 \
|
||||
--save-path "$LOG_DIR/checkpoints_lora_restart" \
|
||||
$WANDB_ARGS_RESTART \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer_lora_restart.log" &
|
||||
LORA_RESTART_TRAINER_PID=$!
|
||||
|
||||
# Wait for lora_restart's internal vLLM to start
|
||||
# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs which takes 1-3 minutes!
|
||||
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
|
||||
echo " NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
|
||||
echo " Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
|
||||
sleep 30 # Give more time for model loading before checking health
|
||||
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
|
||||
echo " Failed - check logs:"
|
||||
echo " Trainer log:"
|
||||
tail -30 "$LOG_DIR/trainer_lora_restart.log"
|
||||
echo ""
|
||||
echo " vLLM internal log (if exists):"
|
||||
tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo " (not found)"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Start GSM8k environment for lora_restart
|
||||
echo "[LORA_RESTART] Starting GSM8k environment..."
|
||||
python -u environments/gsm8k_server.py serve \
|
||||
--env.tokenizer_name "$MODEL" \
|
||||
--env.use_wandb=$USE_WANDB \
|
||||
--env.wandb_name "lora-restart-env" \
|
||||
--env.rollout_server_url "http://localhost:${LORA_RESTART_API_PORT}" \
|
||||
--openai.model_name "$MODEL" \
|
||||
--openai.base_url "http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
|
||||
--openai.server_type vllm \
|
||||
--slurm false \
|
||||
2>&1 | tee "$LOG_DIR/env_lora_restart.log" &
|
||||
LORA_RESTART_ENV_PID=$!
|
||||
|
||||
# =============================================================================
|
||||
# WAIT FOR BOTH TRAINERS TO COMPLETE
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo "Both trainers running in parallel. Waiting for completion..."
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
echo ""
|
||||
echo "📊 WANDB: https://wandb.ai (project: $WANDB_PROJECT)"
|
||||
echo ""
|
||||
echo "📋 MONITOR LOGS (in another terminal):"
|
||||
echo ""
|
||||
echo " # Trainer logs (main output):"
|
||||
echo " tail -f $LOG_DIR/trainer_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
|
||||
echo ""
|
||||
echo " # Environment logs (rollouts, scores):"
|
||||
echo " tail -f $LOG_DIR/env_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/env_lora_restart.log"
|
||||
echo ""
|
||||
echo " # vLLM logs:"
|
||||
echo " tail -f $LOG_DIR/vllm_lora_only.log"
|
||||
echo " tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
|
||||
echo ""
|
||||
echo " # All logs at once:"
|
||||
echo " tail -f $LOG_DIR/*.log"
|
||||
echo ""
|
||||
|
||||
# Wait for trainers
|
||||
LORA_ONLY_EXIT=0
|
||||
LORA_RESTART_EXIT=0
|
||||
|
||||
wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
|
||||
LORA_ONLY_END=$(date +%s)
|
||||
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
|
||||
echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
|
||||
|
||||
wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
|
||||
LORA_RESTART_END=$(date +%s)
|
||||
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
|
||||
echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
|
||||
|
||||
# =============================================================================
|
||||
# RESULTS
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "COMPARISON RESULTS (Parallel Execution)"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
echo "Training Steps: $TRAINING_STEPS"
|
||||
echo "Batch Size: $BATCH_SIZE"
|
||||
echo ""
|
||||
echo "┌─────────────────┬──────┬──────────────┬────────────────────────────┐"
|
||||
echo "│ Mode │ GPU │ Total Time │ Notes │"
|
||||
echo "├─────────────────┼──────┼──────────────┼────────────────────────────┤"
|
||||
printf "│ lora_only │ %d │ %10ss │ --enforce-eager (~13 TPS) │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
|
||||
printf "│ lora_restart │ %d │ %10ss │ no --enforce-eager (~108 TPS)│\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
|
||||
echo "└─────────────────┴──────┴──────────────┴────────────────────────────┘"
|
||||
echo ""
|
||||
|
||||
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
|
||||
SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
|
||||
echo "Speedup: ${SPEEDUP}x (lora_restart vs lora_only)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "📊 BENCHMARK DETAILS:"
|
||||
echo ""
|
||||
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
|
||||
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
|
||||
echo ""
|
||||
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
|
||||
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
|
||||
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "📁 All logs saved to: $LOG_DIR"
|
||||
echo "============================================================"
|
||||
echo ""
|
||||
echo "Log files:"
|
||||
echo " $LOG_DIR/trainer_lora_only.log"
|
||||
echo " $LOG_DIR/trainer_lora_restart.log"
|
||||
echo " $LOG_DIR/vllm_lora_only.log"
|
||||
echo " $LOG_DIR/env_lora_only.log"
|
||||
echo " $LOG_DIR/env_lora_restart.log"
|
||||
echo ""
|
||||
|
|
@ -1,140 +0,0 @@
|
|||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# LoRA Mode GSM8k Training Test
|
||||
# =============================================================================
|
||||
#
|
||||
# Tests the LoRA training pipeline with GSM8k environment.
|
||||
# Uses separate GPUs for vLLM and trainer.
|
||||
#
|
||||
# Usage:
|
||||
# CUDA_VISIBLE_DEVICES=0,1 ./scripts/test_lora_mode.sh [MODEL] [STEPS]
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
set -e
|
||||
|
||||
MODEL="${1:-Qwen/Qwen2.5-3B-Instruct}"
|
||||
TRAINING_STEPS="${2:-50}"
|
||||
BATCH_SIZE=4
|
||||
SAVE_INTERVAL=10
|
||||
|
||||
VLLM_PORT=9001
|
||||
GSM8K_PORT=8001
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
REPO_DIR="$(dirname "$TRAINER_DIR")"
|
||||
|
||||
LOG_DIR="${REPO_DIR}/lora_test_$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
echo "============================================================"
|
||||
echo "LoRA Mode GSM8k Training Test"
|
||||
echo "============================================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Steps: $TRAINING_STEPS"
|
||||
echo "Log Dir: $LOG_DIR"
|
||||
echo "============================================================"
|
||||
|
||||
cleanup() {
|
||||
echo "Cleaning up..."
|
||||
pkill -u $USER -f "vllm_api_server.*port.*${VLLM_PORT}" 2>/dev/null || true
|
||||
pkill -u $USER -f "gsm8k_server" 2>/dev/null || true
|
||||
pkill -u $USER -f "grpo.py" 2>/dev/null || true
|
||||
}
|
||||
trap cleanup EXIT
|
||||
cleanup
|
||||
|
||||
# Clear Triton cache for B200 compatibility
|
||||
rm -rf ~/.triton/cache
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
echo ""
|
||||
echo "[1/4] Starting vLLM with LoRA support..."
|
||||
VLLM_ENABLE_SHARED_WEIGHTS=1 \
|
||||
python -u example_trainer/vllm_api_server.py \
|
||||
--model "$MODEL" \
|
||||
--tensor-parallel-size 1 \
|
||||
--port $VLLM_PORT \
|
||||
--dtype bfloat16 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--enable-lora \
|
||||
--max-loras 2 \
|
||||
--max-lora-rank 64 \
|
||||
--enforce-eager \
|
||||
> "${LOG_DIR}/vllm.log" 2>&1 &
|
||||
|
||||
echo "Waiting for vLLM (45s)..."
|
||||
sleep 45
|
||||
|
||||
curl -s "http://localhost:${VLLM_PORT}/health" && echo " ✓ vLLM ready" || { echo " ✗ vLLM failed"; exit 1; }
|
||||
|
||||
echo ""
|
||||
echo "[2/4] Starting GSM8k environment..."
|
||||
python -u environments/gsm8k_server.py serve \
|
||||
--env.tokenizer_name "$MODEL" \
|
||||
--env.use_wandb=False \
|
||||
--env.rollout_server_url "http://localhost:${GSM8K_PORT}" \
|
||||
--openai.model_name "$MODEL" \
|
||||
--openai.base_url "http://localhost:${VLLM_PORT}/v1" \
|
||||
--openai.server_type vllm \
|
||||
--slurm false \
|
||||
> "${LOG_DIR}/gsm8k.log" 2>&1 &
|
||||
|
||||
echo "Waiting for GSM8k (10s)..."
|
||||
sleep 10
|
||||
|
||||
echo ""
|
||||
echo "[3/4] Baseline test (before training)..."
|
||||
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.1
|
||||
}' | jq '.text[0]' | tee "${LOG_DIR}/baseline_response.txt"
|
||||
|
||||
echo ""
|
||||
echo "[4/4] Starting LoRA trainer..."
|
||||
python -u example_trainer/grpo.py \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_only \
|
||||
--vllm-port $VLLM_PORT \
|
||||
--atropos-url "http://localhost:${GSM8K_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--vllm-restart-interval $SAVE_INTERVAL \
|
||||
--save-path "$LOG_DIR/checkpoints" \
|
||||
--benchmark \
|
||||
2>&1 | tee "${LOG_DIR}/trainer.log"
|
||||
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "Training Complete!"
|
||||
echo "Logs: $LOG_DIR"
|
||||
echo "Checkpoints: $LOG_DIR/checkpoints"
|
||||
echo "============================================================"
|
||||
|
||||
# Post-training test
|
||||
if [ -d "$LOG_DIR/checkpoints" ]; then
|
||||
LATEST_ADAPTER=$(ls -td "$LOG_DIR/checkpoints/adapter_"* 2>/dev/null | head -1)
|
||||
if [ -n "$LATEST_ADAPTER" ]; then
|
||||
echo ""
|
||||
echo "Post-training test with adapter: $LATEST_ADAPTER"
|
||||
|
||||
curl -s -X POST "http://localhost:${VLLM_PORT}/lora/load" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"adapter_path": "'"$LATEST_ADAPTER"'"}' | jq
|
||||
|
||||
echo ""
|
||||
echo "Response after training:"
|
||||
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.1
|
||||
}' | jq '.text[0]' | tee "${LOG_DIR}/trained_response.txt"
|
||||
fi
|
||||
fi
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Quick test for lora_restart mode - just 10 steps with 2 restarts
|
||||
set -e
|
||||
|
||||
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
|
||||
STEPS="${2:-10}"
|
||||
GPU="${3:-0}"
|
||||
PORT_API=8099
|
||||
PORT_VLLM=9099
|
||||
MAX_LEN="${MAX_LEN:-8192}" # Use 8k for quick test, set MAX_LEN=32768 for full test
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
LOG_DIR="./lora_restart_test_$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
echo "=============================================="
|
||||
echo "LORA_RESTART Quick Test"
|
||||
echo "=============================================="
|
||||
echo "Model: $MODEL"
|
||||
echo "Steps: $STEPS"
|
||||
echo "GPU: $GPU"
|
||||
echo "Max Length: $MAX_LEN"
|
||||
echo "Log dir: $LOG_DIR"
|
||||
echo "=============================================="
|
||||
|
||||
# Cleanup
|
||||
cleanup() {
|
||||
echo "Cleaning up..."
|
||||
pkill -9 -f "port $PORT_VLLM" 2>/dev/null || true
|
||||
pkill -9 -f "port $PORT_API" 2>/dev/null || true
|
||||
fuser -k ${PORT_API}/tcp 2>/dev/null || true
|
||||
fuser -k ${PORT_VLLM}/tcp 2>/dev/null || true
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Kill any existing processes
|
||||
cleanup
|
||||
sleep 2
|
||||
|
||||
# Start API server
|
||||
echo ""
|
||||
echo "[1/3] Starting API server on port $PORT_API..."
|
||||
run-api --port $PORT_API > "$LOG_DIR/api.log" 2>&1 &
|
||||
API_PID=$!
|
||||
sleep 3
|
||||
|
||||
# Check API is up
|
||||
if ! curl -s "http://localhost:$PORT_API/info" > /dev/null; then
|
||||
echo "ERROR: API server failed to start"
|
||||
cat "$LOG_DIR/api.log"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ API server ready"
|
||||
|
||||
# Start environment (GSM8K for simplicity)
|
||||
echo ""
|
||||
echo "[2/3] Starting GSM8K environment..."
|
||||
python -u environments/gsm8k_server.py serve \
|
||||
--env.tokenizer_name "$MODEL" \
|
||||
--env.use_wandb=False \
|
||||
--env.rollout_server_url "http://localhost:$PORT_API" \
|
||||
--openai.model_name "$MODEL" \
|
||||
--openai.base_url "http://localhost:$PORT_VLLM/v1" \
|
||||
--openai.server_type vllm \
|
||||
--slurm false \
|
||||
> "$LOG_DIR/env.log" 2>&1 &
|
||||
ENV_PID=$!
|
||||
echo " ✓ Environment started (PID: $ENV_PID)"
|
||||
sleep 5
|
||||
|
||||
# Start trainer
|
||||
echo ""
|
||||
echo "[3/3] Starting LORA_RESTART trainer..."
|
||||
echo " (This will launch vLLM internally and restart every 5 steps)"
|
||||
echo ""
|
||||
|
||||
CUDA_VISIBLE_DEVICES=$GPU python -m example_trainer.grpo \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode lora_restart \
|
||||
--vllm-port $PORT_VLLM \
|
||||
--vllm-gpu-memory-utilization 0.20 \
|
||||
--atropos-url "http://localhost:$PORT_API" \
|
||||
--batch-size 2 \
|
||||
--training-steps $STEPS \
|
||||
--max-model-len $MAX_LEN \
|
||||
--seq-len $MAX_LEN \
|
||||
--lora-r 16 \
|
||||
--lora-alpha 32 \
|
||||
--vllm-restart-interval 5 \
|
||||
--save-path "$LOG_DIR/checkpoints" \
|
||||
--benchmark \
|
||||
2>&1 | tee "$LOG_DIR/trainer.log"
|
||||
|
||||
echo ""
|
||||
echo "=============================================="
|
||||
echo "Test complete! Logs in: $LOG_DIR"
|
||||
echo "=============================================="
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# Single-Copy Mode GSM8k Training Test
|
||||
# =============================================================================
|
||||
#
|
||||
# Tests the single-copy (shared_vllm) training pipeline with GSM8k environment.
|
||||
# vLLM and trainer share the SAME GPU memory - true single-copy architecture.
|
||||
#
|
||||
# Usage:
|
||||
# CUDA_VISIBLE_DEVICES=0 ./scripts/test_single_copy_mode.sh [MODEL] [STEPS]
|
||||
#
|
||||
# Note: Single-copy mode requires tensor-parallel-size=1
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
set -e
|
||||
|
||||
MODEL="${1:-Qwen/Qwen2.5-3B-Instruct}"
|
||||
TRAINING_STEPS="${2:-50}"
|
||||
BATCH_SIZE=4
|
||||
|
||||
VLLM_PORT=9002
|
||||
GSM8K_PORT=8002
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
REPO_DIR="$(dirname "$TRAINER_DIR")"
|
||||
|
||||
LOG_DIR="${REPO_DIR}/single_copy_test_$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
echo "============================================================"
|
||||
echo "Single-Copy Mode GSM8k Training Test"
|
||||
echo "============================================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Steps: $TRAINING_STEPS"
|
||||
echo "Log Dir: $LOG_DIR"
|
||||
echo ""
|
||||
echo "NOTE: vLLM and trainer share the SAME GPU memory!"
|
||||
echo " Weight updates are INSTANT (no copying)."
|
||||
echo "============================================================"
|
||||
|
||||
cleanup() {
|
||||
echo "Cleaning up..."
|
||||
pkill -u $USER -f "vllm_api_server.*port.*${VLLM_PORT}" 2>/dev/null || true
|
||||
pkill -u $USER -f "gsm8k_server.*${GSM8K_PORT}" 2>/dev/null || true
|
||||
pkill -u $USER -f "grpo.py.*shared_vllm" 2>/dev/null || true
|
||||
}
|
||||
trap cleanup EXIT
|
||||
cleanup
|
||||
|
||||
cd "$REPO_DIR"
|
||||
|
||||
echo ""
|
||||
echo "[1/4] Starting vLLM with shared memory enabled..."
|
||||
# NOTE: --enforce-eager is REQUIRED for single-copy mode!
|
||||
# Without it, CUDA graphs freeze weights and updates won't be visible to inference.
|
||||
VLLM_ENABLE_SHARED_WEIGHTS=1 \
|
||||
LOGDIR="$LOG_DIR" \
|
||||
python -u example_trainer/vllm_api_server.py \
|
||||
--model "$MODEL" \
|
||||
--tensor-parallel-size 1 \
|
||||
--port $VLLM_PORT \
|
||||
--dtype bfloat16 \
|
||||
--gpu-memory-utilization 0.5 \
|
||||
--enforce-eager \
|
||||
> "${LOG_DIR}/vllm.log" 2>&1 &
|
||||
|
||||
echo "Waiting for vLLM (45s)..."
|
||||
sleep 45
|
||||
|
||||
curl -s "http://localhost:${VLLM_PORT}/health" && echo " ✓ vLLM ready" || { echo " ✗ vLLM failed"; exit 1; }
|
||||
|
||||
# Verify IPC handles are exported
|
||||
if [ -f "${LOG_DIR}/vllm_bridge_config.json" ]; then
|
||||
echo " ✓ vllm_bridge_config.json created"
|
||||
PARAM_COUNT=$(jq '.ipc_handles | keys | length' "${LOG_DIR}/vllm_bridge_config.json" 2>/dev/null || echo "0")
|
||||
echo " Exported parameters: $PARAM_COUNT"
|
||||
else
|
||||
echo " ✗ vllm_bridge_config.json not found - shared memory may not work"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[2/4] Starting GSM8k environment..."
|
||||
python -u environments/gsm8k_server.py serve \
|
||||
--env.tokenizer_name "$MODEL" \
|
||||
--env.use_wandb=False \
|
||||
--env.rollout_server_url "http://localhost:${GSM8K_PORT}" \
|
||||
--openai.model_name "$MODEL" \
|
||||
--openai.base_url "http://localhost:${VLLM_PORT}/v1" \
|
||||
--openai.server_type vllm \
|
||||
--slurm false \
|
||||
> "${LOG_DIR}/gsm8k.log" 2>&1 &
|
||||
|
||||
echo "Waiting for GSM8k (10s)..."
|
||||
sleep 10
|
||||
|
||||
echo ""
|
||||
echo "[3/4] Baseline test (before training)..."
|
||||
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.1
|
||||
}' | jq '.text[0]' | tee "${LOG_DIR}/baseline_response.txt"
|
||||
|
||||
echo ""
|
||||
echo "[4/4] Starting Single-Copy trainer..."
|
||||
echo "The trainer will attach to vLLM's GPU memory via CUDA IPC."
|
||||
echo ""
|
||||
|
||||
python -u example_trainer/grpo.py \
|
||||
--model-name "$MODEL" \
|
||||
--weight-bridge-mode shared_vllm \
|
||||
--vllm-port $VLLM_PORT \
|
||||
--atropos-url "http://localhost:${GSM8K_PORT}" \
|
||||
--batch-size $BATCH_SIZE \
|
||||
--training-steps $TRAINING_STEPS \
|
||||
--save-path "$LOG_DIR/checkpoints" \
|
||||
--vllm-config-path "${LOG_DIR}/vllm_bridge_config.json" \
|
||||
--benchmark \
|
||||
--debug-loading \
|
||||
2>&1 | tee "${LOG_DIR}/trainer.log"
|
||||
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo "Training Complete!"
|
||||
echo "============================================================"
|
||||
echo "Logs: $LOG_DIR"
|
||||
echo ""
|
||||
echo "Key Metrics:"
|
||||
grep -E "Attached|fused|Step.*Loss" "${LOG_DIR}/trainer.log" | tail -20
|
||||
echo "============================================================"
|
||||
|
||||
# Post-training test
|
||||
echo ""
|
||||
echo "Post-training test (weights are already updated in vLLM):"
|
||||
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0.1
|
||||
}' | jq '.text[0]' | tee "${LOG_DIR}/trained_response.txt"
|
||||
|
|
@ -1,246 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal test for vLLM restart cycle - no training, just launch/terminate/relaunch.
|
||||
Tests whether GPU memory is properly released between restarts.
|
||||
|
||||
Run from atropos directory:
|
||||
python example_trainer/scripts/test_vllm_restart_only.py --restarts 3 --gpu 0
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
import subprocess
|
||||
import signal
|
||||
|
||||
|
||||
def kill_process_on_port(port: int) -> None:
|
||||
"""Kill any process using the specified port."""
|
||||
try:
|
||||
subprocess.run(f"fuser -k {port}/tcp", shell=True, capture_output=True, timeout=10)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def wait_for_vllm_ready(port: int, timeout: int = 300) -> bool:
|
||||
"""Wait for vLLM to be ready on the specified port."""
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
start = time.time()
|
||||
while time.time() - start < timeout:
|
||||
try:
|
||||
req = urllib.request.urlopen(f"http://localhost:{port}/health", timeout=5)
|
||||
if req.status == 200:
|
||||
return True
|
||||
except (urllib.error.URLError, Exception):
|
||||
pass
|
||||
time.sleep(5)
|
||||
elapsed = int(time.time() - start)
|
||||
print(f" Waiting... ({elapsed}s / {timeout}s)")
|
||||
return False
|
||||
|
||||
|
||||
def terminate_vllm(proc, port: int) -> None:
    """Terminate a vLLM server process and wait for GPU memory release.

    Teardown runs in five phases: kill the server's process group, kill
    anything still bound to the port, kill vLLM-related processes by name,
    kill zombie GPU compute processes reported by nvidia-smi, then poll
    until GPU memory is actually freed.

    Args:
        proc: ``subprocess.Popen`` handle for the vLLM server, or ``None``.
        port: TCP port the server was listening on.
    """
    print(f"  Terminating vLLM on port {port}...")

    # nvidia-smi addresses physical devices, so resolve the first visible GPU.
    gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]

    # Phase 1: kill the process group (kills all children too).
    _kill_process_group(proc)

    # Phase 2: kill by port (catches anything still running), then give the
    # OS a moment to reap.
    kill_process_on_port(port)
    time.sleep(2)

    # Phase 3: kill all vLLM-related processes by name.
    _kill_vllm_stragglers(port)

    # Phase 4: kill any compute process nvidia-smi still reports on the GPU.
    _kill_zombie_gpu_processes(gpu_id)

    # Phase 5: wait for GPU memory to actually be released.
    _wait_for_gpu_memory_release()

    print("  ✓ vLLM terminated")


def _kill_process_group(proc) -> None:
    """Phase 1: SIGKILL *proc*'s whole process group, then reap the parent."""
    if proc is None:
        return
    print(f"  Killing process group (PID: {proc.pid})...")
    try:
        # Kill the entire process group — this gets all child processes
        # (the engine workers), not just the launcher.
        os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
    except (ProcessLookupError, PermissionError):
        pass
    try:
        proc.kill()
        proc.wait(timeout=5)
    except Exception as e:
        print(f"    Warning: {e}")


def _kill_vllm_stragglers(port: int) -> None:
    """Phase 3: best-effort kill of vLLM processes by port and by name."""
    print("  Killing all vLLM-related processes...")
    kill_commands = [
        f"fuser -k {port}/tcp",
        "pkill -9 -f 'vllm.*EngineCore'",
        "pkill -9 -f 'vllm_api_server'",
        "pkill -9 -f 'from vllm'",
        "pkill -9 -f 'multiprocessing.spawn'",
    ]
    for cmd in kill_commands:
        try:
            subprocess.run(cmd, shell=True, capture_output=True, timeout=5)
        except Exception:
            pass


def _kill_zombie_gpu_processes(gpu_id: str) -> None:
    """Phase 4: kill compute processes nvidia-smi still lists on *gpu_id*."""
    print(f"  Checking for zombie GPU processes on GPU {gpu_id}...")
    try:
        result = subprocess.run(
            f"nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits -i {gpu_id}",
            shell=True, capture_output=True, text=True, timeout=10
        )
        if result.stdout.strip():
            print(f"    Found GPU processes:\n{result.stdout}")
            for line in result.stdout.strip().split('\n'):
                if not line.strip():
                    continue
                pid = line.split(',')[0].strip()
                # Never kill ourselves — this script can hold a CUDA context.
                if pid and pid != str(os.getpid()):
                    print(f"    Killing zombie GPU process: {pid}")
                    try:
                        subprocess.run(f"kill -9 {pid}", shell=True, timeout=5)
                    except Exception:
                        pass
    except Exception as e:
        print(f"    Warning: nvidia-smi check failed: {e}")


def _wait_for_gpu_memory_release() -> None:
    """Phase 5: poll up to 60s until at least half of GPU memory is free."""
    print("  Waiting for GPU memory release...")
    import torch
    for i in range(12):  # 12 x 5s = 60 seconds total
        time.sleep(5)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            free_mem = torch.cuda.mem_get_info()[0] / 1e9
            total_mem = torch.cuda.mem_get_info()[1] / 1e9
            print(f"    [{(i+1)*5}s] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
            if free_mem > total_mem * 0.5:
                print(f"    ✓ Sufficient memory available ({free_mem:.1f} GB)")
                break

    # Final cleanup and report.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        free_mem = torch.cuda.mem_get_info()[0] / 1e9
        total_mem = torch.cuda.mem_get_info()[1] / 1e9
        print(f"  ✓ Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
|
||||
|
||||
|
||||
def main():
    """Run N launch/terminate cycles of the vLLM server and report GPU memory.

    Returns:
        0 on success, 1 if the server script is missing or a launch fails.
    """
    parser = argparse.ArgumentParser(description="Test vLLM restart cycle")
    parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
    parser.add_argument("--port", type=int, default=9099)
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--memory-util", type=float, default=0.3)
    parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles")
    args = parser.parse_args()

    # Pin the test to a single GPU before torch/CUDA is initialized.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    import torch

    print("=" * 60)
    print("vLLM RESTART CYCLE TEST")
    print("=" * 60)
    print(f"Model: {args.model}")
    print(f"Port: {args.port}")
    print(f"GPU: {args.gpu}")
    print(f"Memory utilization: {args.memory_util}")
    print(f"Restart cycles: {args.restarts}")
    print("=" * 60)

    # Check initial GPU memory
    if torch.cuda.is_available():
        free_mem = torch.cuda.mem_get_info()[0] / 1e9
        total_mem = torch.cuda.mem_get_info()[1] / 1e9
        print(f"\nInitial GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free")

    # Find server script (relative to this script's location).
    script_dir = os.path.dirname(os.path.abspath(__file__))
    server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")

    if not os.path.exists(server_script):
        print(f"ERROR: Cannot find vllm_api_server.py at {server_script}")
        return 1

    log_dir = "/tmp/vllm_restart_test"
    os.makedirs(log_dir, exist_ok=True)

    for cycle in range(args.restarts):
        print(f"\n{'='*60}")
        print(f"CYCLE {cycle + 1}/{args.restarts}")
        print(f"{'='*60}")

        # Check memory before launch — a leak shows up as shrinking free memory.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            free_mem = torch.cuda.mem_get_info()[0] / 1e9
            total_mem = torch.cuda.mem_get_info()[1] / 1e9
            print(f"[Before launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")

        # Launch vLLM
        print(f"\n[{cycle+1}] Launching vLLM...")
        cmd = [
            "python", server_script,
            "--model", args.model,
            "--port", str(args.port),
            "--gpu-memory-utilization", str(args.memory_util),
            "--max-model-len", "4096",
        ]
        print(f"  Command: {' '.join(cmd)}")

        log_file = f"{log_dir}/vllm_cycle_{cycle}.log"
        with open(log_file, "w") as f:
            proc = subprocess.Popen(
                cmd,
                stdout=f,
                stderr=subprocess.STDOUT,
                env=os.environ.copy(),
                start_new_session=True,  # Creates new process group for easy cleanup
            )
        print(f"  PID: {proc.pid} (process group: {os.getpgid(proc.pid)})")
        print(f"  Log: {log_file}")

        # Wait for vLLM to be ready (f-prefix removed: no placeholders).
        print("  Waiting for vLLM to be ready...")
        start_time = time.time()
        if wait_for_vllm_ready(args.port, timeout=300):
            elapsed = time.time() - start_time
            print(f"  ✓ vLLM ready in {elapsed:.1f}s")
        else:
            print("  ✗ vLLM failed to start!")
            print(f"  Check log: tail -50 {log_file}")
            # Kill the whole process group, not just the launcher, so engine
            # child processes don't linger holding GPU memory on failure.
            try:
                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
            except (ProcessLookupError, PermissionError):
                pass
            proc.kill()
            return 1

        # Check memory after launch
        if torch.cuda.is_available():
            free_mem = torch.cuda.mem_get_info()[0] / 1e9
            total_mem = torch.cuda.mem_get_info()[1] / 1e9
            print(f"[After launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")

        # Keep vLLM running for a bit
        print("\n  Letting vLLM run for 5s...")
        time.sleep(5)

        # Terminate vLLM
        print(f"\n[{cycle+1}] Terminating vLLM...")
        terminate_vllm(proc, args.port)

    print("\n" + "=" * 60)
    print("TEST COMPLETE!")
    print("=" * 60)

    if torch.cuda.is_available():
        free_mem = torch.cuda.mem_get_info()[0] / 1e9
        total_mem = torch.cuda.mem_get_info()[1] / 1e9
        print(f"Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")

    return 0
|
||||
|
||||
|
||||
# Script entry point: exit with main()'s return code (0 = success, 1 = failure).
if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue