This commit is contained in:
Jai Suphavadeeprasit 2026-02-13 10:51:21 -05:00
parent 9f6cc64b9e
commit 0ebf3552c9
8 changed files with 388 additions and 1977 deletions


A modular training framework for fine-tuning language models with **Group Relative Policy Optimization (GRPO)**.
```
example_trainer/
├── grpo.py # CLI entry point (dispatches to 4 training modes)
├── run.py # Unified launcher for shared_vllm mode (starts vLLM+trainer)
├── config.py # TrainingConfig Pydantic model (all hyperparameters)
├── cli.py # CLI argument parsing (modular, single source of truth)
├── api.py # Atropos API communication (registration, batch fetching)
├── data.py # Data fetching, preprocessing, logprob alignment
├── model.py # Model loading, CUDA IPC, tensor mapping (QKV/Gate fusion)
├── training.py # GRPO loss (importance sampling, KL penalty, clipping)
├── checkpointing.py # Save models & LoRA adapters (handles fused tensor unfusing)
├── vllm_manager.py # vLLM process lifecycle (launch, health, termination)
├── trainers.py # 4 training mode implementations + optimizer selection
├── vllm_api_server.py # Custom vLLM server with /generate endpoint + LoRA
├── vllm_patching/ # CUDA IPC patches for weight sharing
│ └── patched_gpu_runner.py
└── scripts/ # Helper scripts and benchmarks
    ├── test_lora_mode.sh
    ├── test_single_copy_mode.sh
    └── compare_all_modes_math_zero.sh
```
## GRPO Training Loop
```
1. Generate multiple responses to the same prompt
2. Score each response (reward)
3. Compute ADVANTAGE = reward - mean(rewards)
4. Update the policy toward high-advantage responses (with PPO clipping and a KL penalty)
```
## System Architecture
```
Data Flow:
1. Environment generates prompts → calls vLLM → scores responses
2. Environment sends trajectories to run-api
3. Trainer fetches batches from run-api
4. Trainer updates model weights
5. Weight synchronization:
- shared_vllm: vLLM sees updates immediately via CUDA IPC (zero-copy)
- lora_only: Trainer pushes adapter to vLLM via HTTP (slow)
- lora_restart: Trainer restarts vLLM with new adapter (fast)
- none (legacy): Trainer saves checkpoint and restarts vLLM
```
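Step 3 of the GRPO loop above (per-group advantage, with the normalization described later under "Data Flow Detail") can be sketched in a few lines of Python. This is an illustrative sketch, not the trainer's actual code:

```python
import statistics

def group_advantages(rewards):
    # ADVANTAGE = reward - mean(rewards), then normalized per group
    # (mean 0, std 1), matching the preprocessing in data.py
    mean = statistics.mean(rewards)
    std = statistics.pstdev(rewards)
    if std == 0:
        # Degenerate group: every response scored the same
        return [0.0 for _ in rewards]
    return [(r - mean) / std for r in rewards]
```

Note that a group where every response gets the same reward produces all-zero advantages, so it contributes no policy-gradient signal.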
---
| Mode | Weight Sharing | Memory | Inference Speed | Best For |
|------|----------------|--------|-----------------|----------|
| **shared_vllm** | Single-copy via CUDA IPC | 1x model | ~172 TPS | Same GPU, maximum efficiency |
| **lora_restart** | LoRA + vLLM restarts | 1x + adapter | ~108 TPS | LoRA training with speed |
| **lora_only** | LoRA + HTTP hot-swap | 1x + adapter | ~13 TPS ⚠️ | Debugging only |
| **none** (legacy) | Full model, restart vLLM | 2x model | ~172 TPS | Different GPUs, simple setup |
### ⚠️ IMPORTANT: `lora_only` Performance Warning
The `lora_only` mode requires `--enforce-eager`, which **disables CUDA graphs** and makes inference roughly 8x slower (~13 TPS vs ~108 TPS).
**Use `lora_restart`** when:
- You want LoRA's memory efficiency
- You want fast inference (~108 TPS vs ~13 TPS = 8x speedup)
- You can tolerate ~45s restart overhead every N steps
**Avoid `lora_only`** unless you're debugging - the 8x inference penalty is severe.
**Use `shared_vllm`** for single-GPU training when you need maximum efficiency.
**Use `none` (legacy)** mode when:
- You want the simplest setup without CUDA IPC or LoRA
---
**Terminal 3: Environment**
```bash
# Important: Use server_type=vllm to get logprobs (required for GRPO)
python environments/gsm8k_server.py serve \
--env.group_size 4 \
--env.max_num 200 \
--slurm.num_requests_per_time_interval 16 \
--slurm.time_interval 10 \
--openai.api_key "dummy" \
--openai.base_url "http://localhost:9001/v1" \
--openai.model_name "NousResearch/Hermes-3-Llama-3.1-8B" \
--openai.server_type vllm
```
**Terminal 4: Trainer**
```bash
python -m example_trainer.grpo \
--atropos-url "http://localhost:8002" \
--batch-size 4 \
--gradient-accumulation-steps 4 \
--lr 1e-5 \
--training-steps 30 \
--kl-coef 0.1 \
--clip-eps 0.2 \
--weight-bridge-mode lora_only
```
### Startup Order
```bash
# CRITICAL: Follow this exact order!
# 1. Start API first
run-api --port 8002
# 2. Wait 5s, then start vLLM
# Check health: curl http://localhost:9001/health
python -m example_trainer.vllm_api_server --model ... --enable-lora --enforce-eager
# 3. Wait for vLLM health endpoint to return 200
while ! curl -s http://localhost:9001/health > /dev/null; do sleep 1; done
# 4. Start environment (MUST use --openai.server_type vllm for logprobs)
python environments/gsm8k_server.py serve ...
# 5. Start trainer (will register with API and begin training)
python -m example_trainer.grpo --weight-bridge-mode lora_only ...
```
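Step 3's shell loop can also be written in Python using only the standard library. A sketch, assuming the doc's default port 9001:

```python
import time
from urllib.request import urlopen
from urllib.error import URLError

def wait_for_health(url="http://localhost:9001/health", timeout=300, poll=2.0):
    # Poll the vLLM health endpoint until it returns 200 or the deadline passes
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (URLError, OSError):
            pass
        time.sleep(poll)
    return False
```

Call it between launching vLLM and starting the environment; it returns `False` if the server never comes up within `timeout` seconds.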
---
## Shared vLLM Mode
Single-copy mode shares GPU memory between vLLM and the trainer - zero model duplication!
**Terminal 3: Environment**
```bash
# Important: Use server_type=vllm to get logprobs (required for GRPO)
python environments/gsm8k_server.py serve \
--openai.base_url "http://localhost:9001/v1" \
--openai.model_name "NousResearch/Hermes-3-Llama-3.1-8B" \
--openai.server_type vllm \
--env.group_size 4 \
--slurm.num_requests_per_time_interval 16 \
--slurm.time_interval 10
```
**Terminal 4: Trainer**
```bash
VLLM_ENABLE_SHARED_WEIGHTS=1 python -m example_trainer.run ...
```
## Best Practices & Lessons Learned
### 1. Use `--openai.server_type vllm` for Training
**CRITICAL:** The atropos environment MUST use `server_type=vllm` to get logprobs for proper GRPO training.
Only `server_type=vllm` calls the `/generate` endpoint which returns token-level logprobs. These logprobs serve as the reference policy (π_old) for importance sampling in GRPO.
```bash
# CORRECT - gets logprobs for training (REQUIRED!)
--openai.server_type vllm
# WRONG for training - no logprobs, training will FAIL
--openai.server_type openai
```
**What happens without logprobs:**
- The trainer will raise an error: "GRPO requires inference_logprobs for importance sampling!"
- Without the reference policy, GRPO degenerates to vanilla REINFORCE (leads to reward hacking)
**How logprobs flow through the system:**
1. Environment calls vLLM `/generate` with `logprobs=true`
2. vLLM returns token-level logprobs for each generated token
3. Environment embeds these in trajectory data sent to API
4. Trainer extracts and aligns logprobs with training labels
5. GRPO loss uses logprobs as π_old for importance sampling ratio
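For step 1, the request payload might be built like this. The `logprobs` field name is an assumption based on the flow described above; the real environment code may differ:

```python
import json

def build_generate_payload(prompt, max_tokens=16):
    # logprobs=true asks vLLM's /generate endpoint to return token-level
    # logprobs alongside the generated text (field names assumed from this doc)
    return json.dumps({
        "prompt": prompt,
        "max_tokens": max_tokens,
        "logprobs": True,
    })
```

POSTing this JSON to the `/generate` endpoint should yield the token-level logprobs that later become π_old.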
### 2. KL Coefficient and Clipping Are Essential
**CRITICAL:** Without these hyperparameters, training WILL collapse (reward hacking):
```bash
--kl-coef 0.1 # Prevents policy from drifting too far from reference
--clip-eps 0.2 # Limits importance sampling ratio to [0.8, 1.2]
```
**Why these matter:**
- **KL Penalty** (β): Penalizes the policy for deviating from the reference policy (inference-time policy)
- Uses Schulman's unbiased estimator: `exp(-log_ratio) + log_ratio - 1`
- Higher β = more conservative updates
- Set to 0 to disable (NOT recommended - leads to instability)
- **PPO Clipping** (ε): Clips the importance sampling ratio to `[1-ε, 1+ε]`
- Prevents catastrophically large policy updates
- Takes pessimistic bound (conservative update)
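Put together, the per-token objective described above looks like this. This is a scalar sketch of the math, not the trainer's batched implementation in `training.py`:

```python
import math

def grpo_token_loss(logp_new, logp_old, advantage, clip_eps=0.2, kl_coef=0.1):
    # Importance sampling ratio against the reference (inference-time) policy
    log_ratio = logp_new - logp_old
    ratio = math.exp(log_ratio)
    # PPO clipping: take the pessimistic (more conservative) surrogate
    clipped = max(1.0 - clip_eps, min(1.0 + clip_eps, ratio))
    policy_loss = -min(ratio * advantage, clipped * advantage)
    # Schulman's unbiased, non-negative KL estimator
    kl = math.exp(-log_ratio) + log_ratio - 1.0
    return policy_loss + kl_coef * kl
```

When the policies agree (`logp_new == logp_old`), the ratio is 1, the KL term is exactly 0, and the loss reduces to `-advantage`.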
**Symptoms of missing/misconfigured KL/clipping:**
- Accuracy drops dramatically (e.g., 59% → 7%)
- Loss goes to very negative values (< -10)
- Model outputs become repetitive/degenerate
- `mean_ratio` diverges far from 1.0
- `mean_kl` explodes (> 1.0)
**Healthy training metrics:**
- `mean_ratio`: 0.8 - 1.2 (close to 1.0)
- `mean_kl`: 0.01 - 0.1
- `clipped_fraction`: < 0.3 (< 30% of tokens clipped)
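These ranges can be turned into a quick sanity check; the thresholds below are the ones listed above (a sketch, not part of the trainer):

```python
def check_training_health(metrics):
    # Compare logged training metrics against the healthy ranges above
    warnings = []
    if not 0.8 <= metrics["mean_ratio"] <= 1.2:
        warnings.append("mean_ratio far from 1.0 - policy moved too much")
    if metrics["mean_kl"] > 0.1:
        warnings.append("mean_kl high - policy drifting from reference")
    if metrics["clipped_fraction"] > 0.3:
        warnings.append("clipped_fraction high - try a lower learning rate")
    return warnings
```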
### 3. Memory Budgeting for Large Models
| Model Size | GPU Memory | Recommended Settings |
|------------|------------|----------------------|
| 14B | 80GB | `--gpu-memory-utilization 0.45`, `--batch-size 2` |
| 24B | 192GB (B200) | `--gpu-memory-utilization 0.30`, `--optimizer adafactor` |
### 4. Optimizer Selection
The trainer supports multiple optimizer options to trade off between speed, memory, and precision:
| Optimizer | GPU Memory for States | Speed | Precision | Dependencies |
|-----------|----------------------|-------|-----------|--------------|
| `adamw` | ~32GB (for 8B model) | Fastest | Full FP32 | None |
| `adamw_8bit` (default) | ~8GB | Fast | 8-bit quantized | `bitsandbytes` |
| `adafactor` | ~8GB | Fast | Full (no momentum) | `transformers` |
| `adamw_cpu` | ~0GB (on CPU) | ~2x slower | Full FP32 | None |
**Usage:**
```bash
# 8-bit AdamW (default) - recommended for memory-constrained setups
--optimizer adamw_8bit
# Standard AdamW - full precision
--optimizer adamw
# Adafactor - no momentum states, good for large models
--optimizer adafactor
vLLM exports tensor mappings to `vllm_bridge_config.json`.
## ❓ FAQ
### Q: How do I debug logprob alignment issues?
**A:** Look for these log messages during training:
```
[WARNING] ref_logprobs at generated positions avg 0.85 (should be negative!)
[WARNING] This suggests inference_logprobs alignment is wrong
```
This means inference logprobs aren't being passed correctly. Debug steps:
1. **Check environment server type:**
```bash
# Must be 'vllm', NOT 'openai'
--openai.server_type vllm
```
2. **Verify vLLM returns logprobs:**
```bash
curl -X POST http://localhost:9001/generate \
-H "Content-Type: application/json" \
-d '{"prompt": "Hello", "max_tokens": 5}'
# Response should include "logprobs": [...]
```
3. **Check data.py logs:**
```
[Data] ✓ inference_logprobs found in batch (sample len: 128)
```
4. **Monitor alignment metrics in training logs:**
- `alignment/diff_mean` should be close to 0 at step start
- `alignment/diff_abs_mean` < 0.1 = good alignment
- Large values = weights not properly shared or logprobs misaligned
## Troubleshooting
vLLM version incompatibility. Our server handles this automatically, but make sure you launch through the server module:
```bash
python -m example_trainer.vllm_api_server # NOT direct vllm commands
```
### WandB Logging
### Core Arguments
| Argument | Default | Description |
|----------|---------|-------------|
| `--model-name` or `--model` | (required) | HuggingFace model ID |
| `--weight-bridge-mode` | `none` | `shared_vllm`, `lora_only`, `lora_restart`, or `none` |
| `--training-steps` | 10 | Number of training steps |
| `--batch-size` | 2 | Micro-batch size |
| `--gradient-accumulation-steps` | 32 | Effective batch = batch × accum |
| `--seq-len` | 2048 | Maximum sequence length |
### GRPO Hyperparameters
| Argument | Default | Description |
|----------|---------|-------------|
| `--kl-coef` | 0.1 | KL penalty strength (higher = more conservative) |
| `--clip-eps` | 0.2 | PPO clipping range [1-ε, 1+ε] |
| `--lr` | 1e-5 | Learning rate (NOT --learning-rate) |
### LoRA Arguments
| Argument | Default | Description |
|----------|---------|-------------|
| `--lora-r` | 16 | LoRA rank (dimension of low-rank matrices) |
| `--lora-alpha` | 32 | LoRA alpha scaling factor |
| `--lora-dropout` | 0.05 | LoRA dropout probability |
| `--lora-target-modules` | None | Module names to apply LoRA (default: `q_proj v_proj`) |
### vLLM Arguments
| Argument | Default | Description |
|----------|---------|-------------|
| `--vllm-port` | 9001 | vLLM server port |
| `--vllm-config-path` | auto | Path to bridge config (shared mode) |
| `--gpu-memory-utilization` | 0.45 | vLLM GPU memory fraction |
| `--vllm-gpu` | None | GPU ID for vLLM (None = same as trainer) |
| `--max-model-len` | 4096 | Maximum context length |
| `--dtype` | `bfloat16` | Model dtype: `bfloat16`, `float16`, or `auto` |
| `--vllm-restart-interval` | 3 | Restart vLLM every N steps (legacy/lora_restart) |
---
## Module Reference
| Module | Purpose |
|--------|---------|
| `grpo.py` | CLI entry point, dispatches to training modes (4 modes) |
| `run.py` | Unified launcher for shared_vllm mode (starts vLLM + trainer) |
| `cli.py` | Single source of truth for all CLI arguments (modular builders) |
| `config.py` | `TrainingConfig` Pydantic model with all hyperparameters |
| `api.py` | Communication with Atropos API (registration, batch fetching) |
| `data.py` | Batch preprocessing, padding, logprob extraction and alignment |
| `model.py` | Model loading, CUDA IPC attachment, tensor mapping (QKV/Gate fusion) |
| `training.py` | GRPO loss computation (importance sampling, KL penalty, clipping) |
| `trainers.py` | Mode-specific training loops (4 implementations + optimizer selection) |
| `vllm_api_server.py` | Custom vLLM server with `/generate` endpoint and LoRA support |
| `vllm_manager.py` | vLLM process lifecycle management (launch, health checks, termination) |
| `checkpointing.py` | Save/load checkpoints and adapters (handles fused tensor unfusing) |
---
## Code Execution Flow
### High-Level Flow (All Modes)
```
1. CLI Parsing (cli.py)
2. Config Creation (config.py)
3. Mode Dispatcher (grpo.py or run.py)
4. Trainer Function (trainers.py)
├─ Setup Phase
│ ├─ Initialize W&B (training.py)
│ ├─ Load Model (model.py)
│ ├─ Create Optimizer (trainers.py)
│ ├─ Check Atropos API (api.py)
│ ├─ Register Trainer (api.py)
│ └─ Launch/Connect vLLM (vllm_manager.py or external)
└─ Training Loop
├─ Fetch Batch (api.py → data.py)
│ ├─ Poll /batch endpoint
│ ├─ Pad sequences (data.py)
│ ├─ Extract inference logprobs (data.py)
│ └─ Normalize advantages (data.py)
├─ Training Step (training.py)
│ ├─ For each micro-batch:
│ │ ├─ Forward pass (model)
│ │ ├─ Compute GRPO loss (training.py)
│ │ │ ├─ Temperature scaling
│ │ │ ├─ Compute log probabilities
│ │ │ ├─ Importance sampling ratio (using inference logprobs)
│ │ │ ├─ PPO clipping
│ │ │ ├─ Schulman KL penalty
│ │ │ └─ Return loss + metrics
│ │ └─ Backward pass (accumulate gradients)
│ ├─ Clip gradients (norm=1.0)
│ ├─ Optimizer step
│ └─ Zero gradients
├─ Weight Sync (mode-dependent)
│ ├─ shared_vllm: No sync needed (weights shared via CUDA IPC)
│ ├─ lora_only: HTTP POST to /lora/load
│ ├─ lora_restart: Save adapter + terminate + relaunch vLLM
│ └─ none: Save checkpoint + terminate + relaunch vLLM
├─ Log Metrics (training.py)
│ ├─ Console output
│ └─ W&B logging (if enabled)
└─ Periodic Checkpoint (checkpointing.py)
├─ Ensure tensors are contiguous (unfuse views)
├─ Save state dict
└─ Free GPU memory
```
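The micro-batch split in the training loop above can be sketched as follows. This is an illustrative helper, not the actual `data.py` code:

```python
def make_micro_batches(items, micro_batch_size):
    # Split one fetched batch into micro-batches for gradient accumulation;
    # the last micro-batch may be smaller than micro_batch_size
    return [items[i:i + micro_batch_size]
            for i in range(0, len(items), micro_batch_size)]
```

Each micro-batch gets its own forward/backward pass, and gradients accumulate until the optimizer step.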
### Mode-Specific Details
#### shared_vllm Mode
```python
# Entry: grpo.py → trainers.train_shared_vllm()
1. Model Loading (model.py):
- Find vllm_bridge_config.json
- Load IPC handles (CUDA memory pointers)
- Create empty model on meta device
- Reconstruct tensors from IPC handles
- Map vLLM fused tensors → HF unfused parameters
* qkv_proj → q_proj, k_proj, v_proj (views)
* gate_up_proj → gate_proj, up_proj (views)
- Initialize remaining meta tensors (buffers, etc.)
2. Training Loop:
- optimizer.step() directly modifies vLLM's tensors
- No weight synchronization needed!
- Checkpoints: Unfuse views before saving (checkpointing.py)
3. Tensor Mapping (model.py:_create_vllm_to_hf_mapping):
- Reads actual HF tensor shapes from model.state_dict()
- Creates slice mappings for fused layers
- Example: q_proj = qkv_proj[0:4096, :]
```
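The fused-to-unfused mapping boils down to slicing row ranges out of the fused tensor, as in the `q_proj = qkv_proj[0:4096, :]` example above. A minimal sketch, where the row counts are assumed to come from the HF state-dict shapes:

```python
def split_qkv(qkv_weight, q_rows, k_rows, v_rows):
    # Recreate HF's separate q/k/v projections as slices of vLLM's fused
    # qkv tensor; with torch.Tensor inputs these slices are views (no copy)
    assert len(qkv_weight) == q_rows + k_rows + v_rows
    q = qkv_weight[:q_rows]
    k = qkv_weight[q_rows:q_rows + k_rows]
    v = qkv_weight[q_rows + k_rows:]
    return q, k, v
```

Because slices of a tensor share storage with the original, an optimizer step on `q`/`k`/`v` also updates the fused tensor that vLLM reads.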
#### lora_restart Mode
```python
# Entry: grpo.py → trainers.train_lora_restart()
1. Model Loading (model.py):
- Load base model with PEFT
- Apply LoRA config to target modules
- Freeze base weights, only LoRA trainable
2. vLLM Management:
- Launch: _launch_vllm_with_lora()
* NO --enforce-eager flag (CUDA graphs enabled)
* Pre-load initial adapter
- Periodic Restart:
* Save new adapter (checkpointing.py)
* Terminate vLLM aggressively (_terminate_vllm)
- Kill process group
- Kill by port (fuser)
- Kill by process name patterns
- Wait for GPU memory release (critical!)
* Relaunch with new adapter
3. Performance:
- ~108 TPS (CUDA graphs enabled)
- ~45s restart overhead
- Much faster than lora_only (~8x speedup)
```
#### lora_only Mode
```python
# Entry: grpo.py → trainers.train_lora()
1. Model Loading: Same as lora_restart
2. vLLM: External server (must be pre-started)
- MUST use --enforce-eager (disables CUDA graphs)
- MUST use --enable-lora
3. Weight Sync: _hotswap_lora_adapter()
- Tries /v1/load_lora_adapter (native vLLM)
- Falls back to /lora/load (custom endpoint)
4. Performance:
- ~13 TPS (CUDA graphs disabled)
- No restart overhead
- 8x slower than lora_restart!
```
#### none (legacy) Mode
```python
# Entry: grpo.py → trainers.train_legacy()
1. Model Loading: Full model (model.py)
2. vLLM Management:
- Launch: vllm_manager.launch_vllm_server()
- Periodic Restart:
* Save full checkpoint (checkpointing.py)
* Terminate vLLM (vllm_manager.terminate_vllm_process)
* Relaunch with new checkpoint
3. Use Case:
- Different GPUs for trainer and vLLM
- Simple setup without CUDA IPC or LoRA
```
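The per-step weight-sync behavior of the four modes can be summarized as a small dispatcher. This is a sketch of the descriptions above, not trainer code; `restart_interval` mirrors the `--vllm-restart-interval` default of 3:

```python
def weight_sync_action(mode, step, restart_interval=3):
    # What each weight-bridge mode does after an optimizer step
    if mode == "shared_vllm":
        return "nothing - weights already shared via CUDA IPC"
    if mode == "lora_only":
        return "HTTP hot-swap of the LoRA adapter"
    if mode in ("lora_restart", "none"):
        if step % restart_interval == 0:
            return "save adapter/checkpoint, restart vLLM"
        return "defer until next restart step"
    raise ValueError(f"unknown mode: {mode}")
```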
### Data Flow Detail (data.py)
```python
# api.get_batch() → data.get_data() → data.pad_data_to_good_offset()
1. Batch Structure from API:
{
"batch": [
{
"tokens": [[tok1, tok2, ...], ...], # group_size sequences
"masks": [[mask1, mask2, ...], ...], # -100 for prompt, token_id for generated
"scores": [score1, score2, ...], # rewards
"inference_logprobs": [[lp1, lp2, ...], ...], # CRITICAL for GRPO!
"generation_params": {"temperature": 1.0},
...
}
]
}
2. Preprocessing (pad_data_to_good_offset):
- Normalize advantages (mean=0, std=1 per group)
- Pad sequences to multiple of 64
- Align inference_logprobs with labels:
* 1.0 for prompt tokens (masked)
* Actual negative logprobs for generated tokens
* Shift by 1 for causal alignment
- Extract temperatures (priority: override > generation_params > 1.0)
- Batch into micro-batches
3. Output:
- token_batches: [B, seq_len]
- label_batches: [B, seq_len] # -100 for masked
- advantage_batches: [B, 1]
- temperature_batches: [B, 1, 1]
- inference_logprob_batches: [B, seq_len] # aligned with labels!
```
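The logprob alignment in step 2 can be sketched as follows. This simplified version fills masked prompt positions with 1.0 and omits the shift-by-1 causal alignment that the real `data.py` applies:

```python
def align_inference_logprobs(labels, gen_logprobs, pad_value=1.0):
    # labels: per-position token ids, -100 for prompt/masked positions
    # gen_logprobs: one logprob per *generated* token, in order
    aligned, it = [], iter(gen_logprobs)
    for lab in labels:
        aligned.append(pad_value if lab == -100 else next(it))
    return aligned
```

The 1.0 sentinel at masked positions is what the alignment warnings in the FAQ check for: real logprobs at generated positions must be negative.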
### GRPO Loss Computation (training.py)
```python
# training.compute_grpo_loss()
1. Forward Pass:
- Get logits from model
- Apply temperature scaling (from data)
- Compute log probabilities per token
2. Reference Policy (π_old):
- Extract from inference_logprobs (from vLLM at generation time)
- Already aligned with labels by data.py
3. Importance Sampling:
- log_ratio = log π_new(a|s) - log π_old(a|s)
- ratio = exp(log_ratio)
- Clipped ratio = clip(ratio, 1-ε, 1+ε)
4. Policy Loss:
- surr1 = ratio * advantage
- surr2 = clipped_ratio * advantage
- policy_loss = -min(surr1, surr2) # pessimistic bound
5. KL Penalty (Schulman's estimator):
- kl = exp(-log_ratio) + log_ratio - 1
- Guaranteed non-negative, unbiased
6. Total Loss:
- loss = policy_loss + β * kl_penalty
- Scaled by 1/gradient_accumulation_steps
7. Metrics:
- mean_ratio: Average importance sampling ratio
- mean_kl: Average KL divergence
- clipped_fraction: % of tokens clipped
- alignment/* : Token-level logprob alignment (verifies weight sharing)
```

---

Deleted file: `benchmark_lora_vs_shared.py` (394 lines)
#!/usr/bin/env python3
"""
Benchmark LoRA inference modes to find the fastest approach.
This script tests multiple vLLM configurations to determine:
1. Does --enable-lora force eager mode even without --enforce-eager?
2. What's the actual TPS difference between configurations?
3. Is there ANY way to get fast LoRA inference?
Configurations tested:
- BASE: No LoRA flags (CUDA graphs enabled) - baseline
- LORA_EAGER: --enable-lora --enforce-eager (required for hot-swap)
- LORA_NO_EAGER: --enable-lora only (does vLLM force eager anyway?)
Usage:
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507
python benchmark_lora_vs_shared.py --model Qwen/Qwen3-4B-Instruct-2507 --lora-path ./checkpoints/final_adapter
"""
import argparse
import json
import os
import signal
import subprocess
import sys
import time
from typing import Optional
from pathlib import Path
import requests
# Force unbuffered output for log files
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(line_buffering=True)
if hasattr(sys.stderr, "reconfigure"):
    sys.stderr.reconfigure(line_buffering=True)
def log(msg: str):
"""Print with immediate flush."""
print(msg, flush=True)
# Complex math prompt that requires extended reasoning
BENCHMARK_PROMPT = """You are a mathematics expert. Solve this problem step by step, showing all your work:
A rectangular garden has a perimeter of 56 meters. The length is 4 meters more than twice the width.
1) Set up the equations
2) Solve for width and length
3) Calculate the area
4) If we want to put a circular fountain in the center with radius equal to 1/4 of the width, what area remains for planting?
5) Express the planting area as a percentage of the total garden area
Show all calculations clearly and verify your answer."""
# Longer prompt for extended generation
LONG_PROMPT = """Write a detailed technical explanation of how transformer neural networks work, covering:
1. The attention mechanism - explain self-attention, multi-head attention, and how queries, keys, and values work
2. The encoder-decoder architecture vs decoder-only models
3. Positional encoding - why it's needed and different approaches
4. Layer normalization and residual connections
5. The feed-forward network component
6. How training works with cross-entropy loss and backpropagation through attention
Include mathematical formulas where appropriate and explain the intuition behind each component. This should be comprehensive enough for someone with basic ML knowledge to understand transformers deeply."""
def wait_for_server(port: int, timeout: int = 300) -> bool:
"""Wait for vLLM server to be ready."""
start = time.time()
while time.time() - start < timeout:
try:
resp = requests.get(f"http://localhost:{port}/health", timeout=5)
if resp.status_code == 200:
return True
except Exception:
pass
time.sleep(2)
return False
def start_vllm_server(
model: str,
port: int,
gpu_id: int,
mode: str = "base", # "base", "lora_eager", "lora_no_eager"
max_lora_rank: int = 32,
max_model_len: int = 8192,
log_file: str = "vllm.log",
) -> subprocess.Popen:
"""
Start a vLLM server with different configurations.
Modes:
- base: No LoRA, CUDA graphs enabled (fastest)
- lora_eager: --enable-lora --enforce-eager (slow, but supports hot-swap)
- lora_no_eager: --enable-lora only (test if vLLM forces eager anyway)
"""
# Find the vllm_api_server.py script relative to this script
script_dir = Path(__file__).parent.parent # example_trainer/
vllm_server_path = script_dir / "vllm_api_server.py"
if not vllm_server_path.exists():
log(f"ERROR: vllm_api_server.py not found at {vllm_server_path}")
raise FileNotFoundError(f"vllm_api_server.py not found at {vllm_server_path}")
cmd = [
sys.executable, str(vllm_server_path),
"--model", model,
"--port", str(port),
"--gpu-memory-utilization", "0.70", # Higher for 32k context
"--max-model-len", str(max_model_len),
"--dtype", "bfloat16",
]
if mode == "lora_eager":
cmd.extend([
"--enable-lora",
"--max-lora-rank", str(max_lora_rank),
"--enforce-eager",
])
log(f"Mode: LORA_EAGER (--enable-lora --enforce-eager)")
elif mode == "lora_no_eager":
cmd.extend([
"--enable-lora",
"--max-lora-rank", str(max_lora_rank),
# NOTE: NOT adding --enforce-eager - testing if vLLM forces it anyway
])
log(f"Mode: LORA_NO_EAGER (--enable-lora only, NO --enforce-eager)")
else:
log(f"Mode: BASE (no LoRA flags, CUDA graphs enabled)")
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
log(f"GPU: {gpu_id}")
log(f"Command: {' '.join(cmd)}")
log_f = open(log_file, "w")
proc = subprocess.Popen(
cmd,
env=env,
stdout=log_f,
stderr=subprocess.STDOUT,
)
log(f"Started vLLM PID={proc.pid}, log: {log_file}")
return proc
def load_lora_adapter(port: int, adapter_path: str) -> bool:
"""Load a LoRA adapter into vLLM."""
try:
resp = requests.post(
f"http://localhost:{port}/lora/load",
json={"adapter_path": adapter_path, "adapter_name": "benchmark_adapter"},
timeout=30,
)
return resp.status_code == 200
except Exception as e:
log(f"Failed to load LoRA adapter: {e}")
return False
def benchmark_inference(
port: int,
prompt: str,
max_tokens: int = 2048,
num_runs: int = 3,
) -> dict:
"""Benchmark inference on a vLLM server."""
results = {
"times": [],
"tokens": [],
"tps": [],
}
for i in range(num_runs):
start = time.time()
try:
resp = requests.post(
f"http://localhost:{port}/generate",
json={
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": 0.7,
},
timeout=300,
)
elapsed = time.time() - start
if resp.status_code == 200:
data = resp.json()
output_text = data.get("text", [""])[0]
# Rough token count (words * 1.3)
output_tokens = len(output_text.split()) * 1.3
results["times"].append(elapsed)
results["tokens"].append(output_tokens)
results["tps"].append(output_tokens / elapsed if elapsed > 0 else 0)
log(f" Run {i+1}: {elapsed:.2f}s, ~{output_tokens:.0f} tokens, {output_tokens/elapsed:.1f} TPS")
else:
log(f" Run {i+1}: FAILED ({resp.status_code})")
except Exception as e:
log(f" Run {i+1}: ERROR - {e}")
if results["times"]:
results["avg_time"] = sum(results["times"]) / len(results["times"])
results["avg_tokens"] = sum(results["tokens"]) / len(results["tokens"])
results["avg_tps"] = sum(results["tps"]) / len(results["tps"])
return results
def main():
parser = argparse.ArgumentParser(description="Benchmark LoRA inference configurations")
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Instruct-2507",
help="Model to benchmark")
parser.add_argument("--lora-path", type=str, default=None,
help="Path to LoRA adapter (optional)")
parser.add_argument("--max-tokens", type=int, default=2048,
help="Max tokens to generate")
parser.add_argument("--num-runs", type=int, default=3,
help="Number of benchmark runs per server")
parser.add_argument("--gpu", type=int, default=0,
help="GPU to use (tests run sequentially)")
parser.add_argument("--port", type=int, default=9001,
help="Port for vLLM server")
parser.add_argument("--prompt", type=str, choices=["math", "long"], default="long",
help="Which prompt to use")
parser.add_argument("--max-model-len", type=int, default=8192,
help="Maximum model context length (e.g., 8192, 32768)")
parser.add_argument("--modes", type=str, default="all",
help="Comma-separated modes to test: base,lora_eager,lora_no_eager or 'all'")
args = parser.parse_args()
prompt = LONG_PROMPT if args.prompt == "long" else BENCHMARK_PROMPT
# Parse modes to test
if args.modes == "all":
modes_to_test = ["base", "lora_no_eager", "lora_eager"]
else:
modes_to_test = [m.strip() for m in args.modes.split(",")]
results = {}
current_proc = None
def cleanup():
log("\nCleaning up...")
if current_proc:
try:
current_proc.terminate()
current_proc.wait(timeout=5)
except Exception:
try:
current_proc.kill()
except Exception:
pass
signal.signal(signal.SIGINT, lambda s, f: (cleanup(), sys.exit(0)))
signal.signal(signal.SIGTERM, lambda s, f: (cleanup(), sys.exit(0)))
try:
log("=" * 70)
log("vLLM LoRA Inference Configuration Benchmark")
log("=" * 70)
log(f"Model: {args.model}")
log(f"LoRA adapter: {args.lora_path or 'None'}")
log(f"Max tokens: {args.max_tokens}")
log(f"Max model len: {args.max_model_len}")
log(f"Num runs: {args.num_runs}")
log(f"Modes to test: {modes_to_test}")
log("=" * 70)
log("")
log("QUESTION: Does --enable-lora force eager mode even without --enforce-eager?")
log("=" * 70)
# Test each mode sequentially (same GPU, restart between tests)
for i, mode in enumerate(modes_to_test):
log(f"\n[{i+1}/{len(modes_to_test)}] Testing mode: {mode.upper()}")
log("-" * 70)
# Start server
current_proc = start_vllm_server(
args.model, args.port, args.gpu,
mode=mode, max_model_len=args.max_model_len,
log_file=f"benchmark_{mode}.log"
)
# Wait for ready
log(f" Waiting for server (port {args.port})...")
if not wait_for_server(args.port, timeout=300):
log(f" ✗ Server failed to start! Check benchmark_{mode}.log")
results[mode] = {"error": "Server failed to start"}
current_proc.terminate()
current_proc = None
continue
log(f" ✓ Server ready")
# Load LoRA adapter if provided and mode supports it
if args.lora_path and mode in ["lora_eager", "lora_no_eager"]:
log(f" Loading LoRA adapter...")
if load_lora_adapter(args.port, args.lora_path):
log(f" ✓ Adapter loaded")
else:
log(f" ⚠ Failed to load adapter (continuing anyway)")
# Check the log file for CUDA graph status
log(f" Checking CUDA graph status in log...")
try:
with open(f"benchmark_{mode}.log", "r") as f:
log_content = f.read()
if "Cudagraph is disabled" in log_content:
log(f" ⚠ CUDA GRAPHS DISABLED (eager mode)")
elif "cudagraph" in log_content.lower():
# Look for other cudagraph messages
for line in log_content.split("\n"):
if "cudagraph" in line.lower():
log(f" Log: {line.strip()[:80]}")
else:
log(f" (No cudagraph message found in log)")
except Exception as e:
log(f" (Could not read log: {e})")
# Run benchmark
log(f"\n Running {args.num_runs} inference requests...")
mode_results = benchmark_inference(
args.port, prompt, args.max_tokens, args.num_runs
)
results[mode] = mode_results
# Terminate server
log(f" Stopping server...")
current_proc.terminate()
try:
current_proc.wait(timeout=10)
except Exception:
current_proc.kill()
current_proc = None
# Wait for port to be free
time.sleep(3)
# Print comparison
log("\n" + "=" * 70)
log("RESULTS SUMMARY")
log("=" * 70)
valid_results = {k: v for k, v in results.items() if "avg_tps" in v}
for mode, res in valid_results.items():
log(f"\n{mode.upper()}:")
log(f" Avg time: {res['avg_time']:.2f}s")
log(f" Avg tokens: {res['avg_tokens']:.0f}")
log(f" Avg TPS: {res['avg_tps']:.1f}")
# Compare
if "base" in valid_results:
base_tps = valid_results["base"]["avg_tps"]
log(f"\n" + "-" * 70)
log("COMPARISON TO BASE (CUDA graphs enabled):")
for mode, res in valid_results.items():
if mode != "base":
ratio = res["avg_tps"] / base_tps if base_tps > 0 else 0
slowdown = (1 - ratio) * 100
log(f" {mode}: {res['avg_tps']:.1f} TPS ({ratio:.2f}x base, {slowdown:.1f}% slower)")
# Key finding
log("\n" + "=" * 70)
log("KEY FINDING:")
if "lora_no_eager" in valid_results and "lora_eager" in valid_results:
no_eager_tps = valid_results["lora_no_eager"]["avg_tps"]
eager_tps = valid_results["lora_eager"]["avg_tps"]
if abs(no_eager_tps - eager_tps) < eager_tps * 0.1: # Within 10%
log(" ⚠ --enable-lora FORCES eager mode regardless of --enforce-eager flag!")
log(" ⚠ There is NO WAY to get CUDA graphs with LoRA enabled in vLLM.")
else:
log(" ✓ --enable-lora without --enforce-eager is FASTER!")
log(f" ✓ lora_no_eager: {no_eager_tps:.1f} TPS vs lora_eager: {eager_tps:.1f} TPS")
if "base" in valid_results and "lora_eager" in valid_results:
base_tps = valid_results["base"]["avg_tps"]
lora_tps = valid_results["lora_eager"]["avg_tps"]
log(f"\n Base model (no LoRA): {base_tps:.1f} TPS")
log(f" LoRA enabled: {lora_tps:.1f} TPS")
log(f" Slowdown factor: {base_tps/lora_tps:.1f}x")
log("=" * 70)
finally:
cleanup()
if __name__ == "__main__":
main()


@@ -1,458 +0,0 @@
#!/bin/bash
# =============================================================================
# All Training Modes Comparison on Math Zero (32k context)
# =============================================================================
#
# Compares 3 of the training modes on the math_server_zero environment:
# - GPU 0: shared_vllm (CUDA IPC, zero-copy weight updates)
# - GPU 1: lora_only (--enforce-eager, ~13 TPS, slow)
# - GPU 2: lora_restart (no --enforce-eager, ~108 TPS, fast)
#
# All at 32k context length for proper math reasoning.
#
# Usage:
# ./scripts/compare_all_modes_math_zero.sh [MODEL] [STEPS]
#
# Example:
# ./scripts/compare_all_modes_math_zero.sh Qwen/Qwen3-4B-Instruct-2507 30
#
# =============================================================================
set -e
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
TRAINING_STEPS="${2:-30}"
BATCH_SIZE="${BATCH_SIZE:-2}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
USE_WANDB="${USE_WANDB:-true}"
WANDB_PROJECT="${WANDB_PROJECT:-math-zero-mode-comparison}"
# Port allocation (separate ports for each mode)
# shared_vllm: API 8001, vLLM 9001
# lora_only: API 8002, vLLM 9002
# lora_restart: API 8003, vLLM 9003
SHARED_API_PORT=8001
SHARED_VLLM_PORT=9001
SHARED_GPU=0
LORA_ONLY_API_PORT=8002
LORA_ONLY_VLLM_PORT=9002
LORA_ONLY_GPU=1
LORA_RESTART_API_PORT=8003
LORA_RESTART_VLLM_PORT=9003
LORA_RESTART_GPU=2
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"
LOG_DIR="${REPO_DIR}/math_zero_comparison_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
echo "============================================================"
echo "Math Zero Mode Comparison (32k Context)"
echo "============================================================"
echo "Model: $MODEL"
echo "Steps: $TRAINING_STEPS"
echo "Batch: $BATCH_SIZE"
echo "Max Model Length: $MAX_MODEL_LEN"
echo "Wandb: $USE_WANDB (project: $WANDB_PROJECT)"
echo ""
echo "GPU Allocation:"
echo " GPU $SHARED_GPU: shared_vllm (ports $SHARED_API_PORT, $SHARED_VLLM_PORT)"
echo " GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
echo " GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
echo ""
echo "Log Dir: $LOG_DIR"
echo "============================================================"
echo ""
# Cleanup function
cleanup() {
echo ""
echo "Cleaning up all processes..."
pkill -9 -f "vllm_api_server" 2>/dev/null || true
pkill -9 -f "math_server_zero" 2>/dev/null || true
pkill -9 -f "run-api" 2>/dev/null || true
pkill -9 -f "grpo" 2>/dev/null || true
pkill -9 -f "vllm.*EngineCore" 2>/dev/null || true
for port in $SHARED_API_PORT $SHARED_VLLM_PORT $LORA_ONLY_API_PORT $LORA_ONLY_VLLM_PORT $LORA_RESTART_API_PORT $LORA_RESTART_VLLM_PORT; do
fuser -k ${port}/tcp 2>/dev/null || true
done
sleep 2
}
trap cleanup EXIT
# Initial cleanup
cleanup
# Clear triton cache for clean start
rm -rf ~/.triton/cache 2>/dev/null || true
cd "$REPO_DIR"
# =============================================================================
# Helper functions
# =============================================================================
wait_for_health() {
local port=$1
local name=$2
local max_attempts=${3:-120}
local attempt=1
while [ $attempt -le $max_attempts ]; do
if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
echo "$name ready (port $port)"
return 0
fi
sleep 5
attempt=$((attempt + 1))
done
echo "$name failed to start (port $port)"
return 1
}
wait_for_api() {
local port=$1
local name=$2
local max_attempts=${3:-30}
local attempt=1
while [ $attempt -le $max_attempts ]; do
if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
echo "$name ready (port $port)"
return 0
fi
sleep 2
attempt=$((attempt + 1))
done
echo "$name failed to start (port $port)"
return 1
}
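# Both helpers above share one pattern: poll a local endpoint until it
# responds, bounded by a number of attempts. A generic sketch of that pattern
# (hypothetical helper, not used by this script; the check command is passed
# as arguments, so it can be exercised without a live server):
wait_for_cmd() {
    local max_attempts=$1
    shift
    local attempt=1
    while [ "$attempt" -le "$max_attempts" ]; do
        if "$@" > /dev/null 2>&1; then
            return 0
        fi
        attempt=$((attempt + 1))
    done
    return 1
}
# e.g.: wait_for_cmd 30 curl -s "http://localhost:8001/info"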
# =============================================================================
# START ALL THREE MODES IN PARALLEL
# =============================================================================
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Starting all three modes in parallel..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Pre-create checkpoint directories
mkdir -p "$LOG_DIR/checkpoints_shared"
mkdir -p "$LOG_DIR/checkpoints_lora_only"
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
# -----------------------------------------------------------------------------
# MODE 1: SHARED_VLLM (GPU 0)
# -----------------------------------------------------------------------------
echo ""
echo "[SHARED_VLLM] Starting on GPU $SHARED_GPU..."
# Start run-api for shared_vllm
run-api --port $SHARED_API_PORT > "$LOG_DIR/api_shared.log" 2>&1 &
# Start vLLM with shared weights
# NOTE: shared_vllm needs more headroom for optimizer states (~8GB) and gradients
# Using 0.5 leaves ~90GB for training operations on a 180GB GPU
echo "[SHARED_VLLM] Starting vLLM with shared weights..."
VLLM_ENABLE_SHARED_WEIGHTS=1 VLLM_BRIDGE_CONFIG_PATH=$LOG_DIR/vllm_bridge_config_shared.json \
CUDA_VISIBLE_DEVICES=$SHARED_GPU python -u example_trainer/vllm_api_server.py \
--model "$MODEL" \
--port $SHARED_VLLM_PORT \
--gpu-memory-utilization 0.50 \
--max-model-len $MAX_MODEL_LEN \
> "$LOG_DIR/vllm_shared.log" 2>&1 &
# -----------------------------------------------------------------------------
# MODE 2: LORA_ONLY (GPU 1)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."
# Start run-api for lora_only
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &
# Start vLLM with --enforce-eager for lora_only
# LoRA modes need less training memory, but still need headroom at 32k
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
--model "$MODEL" \
--port $LORA_ONLY_VLLM_PORT \
--gpu-memory-utilization 0.70 \
--max-model-len $MAX_MODEL_LEN \
--enable-lora \
--max-lora-rank 64 \
--enforce-eager \
> "$LOG_DIR/vllm_lora_only.log" 2>&1 &
# -----------------------------------------------------------------------------
# MODE 3: LORA_RESTART (GPU 2) - Trainer manages vLLM internally
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
# Start run-api for lora_restart
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
# =============================================================================
# WAIT FOR INFRASTRUCTURE
# =============================================================================
echo ""
echo "Waiting for infrastructure to be ready..."
echo " (vLLM at 32k context takes ~2-5 minutes to start)"
wait_for_api $SHARED_API_PORT "shared_vllm API" || exit 1
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1
wait_for_health $SHARED_VLLM_PORT "shared_vllm vLLM" 180 || exit 1
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 180 || exit 1
# =============================================================================
# START ENVIRONMENTS AND TRAINERS
# =============================================================================
echo ""
echo "Starting environments and trainers..."
# Record start time
START_TIME=$(date +%s)
# Build wandb args
WANDB_ARGS=""
if [ "$USE_WANDB" = "true" ]; then
WANDB_ARGS="--use-wandb --wandb-project $WANDB_PROJECT"
fi
# -----------------------------------------------------------------------------
# SHARED_VLLM: Start environment and trainer
# -----------------------------------------------------------------------------
echo ""
echo "[SHARED_VLLM] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${SHARED_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${SHARED_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="shared-vllm-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
--slurm false \
2>&1 | tee "$LOG_DIR/env_shared.log" &
SHARED_ENV_PID=$!
echo "[SHARED_VLLM] Starting trainer..."
CUDA_VISIBLE_DEVICES=$SHARED_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode shared_vllm \
--vllm-port $SHARED_VLLM_PORT \
--vllm-config-path "$LOG_DIR/vllm_bridge_config_shared.json" \
--atropos-url "http://localhost:${SHARED_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--max-model-len $MAX_MODEL_LEN \
--seq-len $MAX_MODEL_LEN \
--save-path "$LOG_DIR/checkpoints_shared" \
$WANDB_ARGS --wandb-group "shared-vllm" \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer_shared.log" &
SHARED_TRAINER_PID=$!
# -----------------------------------------------------------------------------
# LORA_ONLY: Start environment and trainer
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_ONLY_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="lora-only-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
--slurm false \
2>&1 | tee "$LOG_DIR/env_lora_only.log" &
LORA_ONLY_ENV_PID=$!
echo "[LORA_ONLY] Starting trainer..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_only \
--vllm-port $LORA_ONLY_VLLM_PORT \
--atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--max-model-len $MAX_MODEL_LEN \
--seq-len $MAX_MODEL_LEN \
--lora-r 16 \
--lora-alpha 32 \
--vllm-restart-interval 5 \
--save-path "$LOG_DIR/checkpoints_lora_only" \
$WANDB_ARGS --wandb-group "lora-only" \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer_lora_only.log" &
LORA_ONLY_TRAINER_PID=$!
# -----------------------------------------------------------------------------
# LORA_RESTART: Start trainer (it manages vLLM internally)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
# NOTE: lora_restart shares GPU with trainer's model (~8GB), so use lower vLLM memory
# Use unbuffered output (-u) and stdbuf to capture crashes
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU PYTHONUNBUFFERED=1 stdbuf -oL -eL python -u -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_restart \
--vllm-port $LORA_RESTART_VLLM_PORT \
--vllm-gpu-memory-utilization 0.20 \
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--max-model-len $MAX_MODEL_LEN \
--seq-len $MAX_MODEL_LEN \
--lora-r 16 \
--lora-alpha 32 \
--vllm-restart-interval 5 \
--save-path "$LOG_DIR/checkpoints_lora_restart" \
$WANDB_ARGS --wandb-group "lora-restart" \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer_lora_restart.log" &
LORA_RESTART_TRAINER_PID=$!
# Wait for lora_restart's internal vLLM to start
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
echo " NOTE: vLLM at 32k context with CUDA graphs takes 2-5 min"
sleep 60
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 300 || {
echo " Failed - check logs:"
tail -50 "$LOG_DIR/trainer_lora_restart.log"
exit 1
}
# Start environment for lora_restart
echo "[LORA_RESTART] Starting math_server_zero environment..."
MATH_ENV_MODEL="$MODEL" \
MATH_ENV_ROLLOUT_URL="http://localhost:${LORA_RESTART_API_PORT}" \
MATH_ENV_VLLM_URL="http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
MATH_ENV_WANDB_NAME="lora-restart-env" \
MATH_ENV_MAX_TOKENS=$MAX_MODEL_LEN \
MATH_ENV_WORKER_TIMEOUT=1800 \
python -u environments/math_server_zero.py serve \
--slurm false \
2>&1 | tee "$LOG_DIR/env_lora_restart.log" &
LORA_RESTART_ENV_PID=$!
# =============================================================================
# WAIT FOR ALL TRAINERS TO COMPLETE
# =============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "All three trainers running in parallel. Waiting for completion..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "📊 WANDB: https://wandb.ai (project: $WANDB_PROJECT)"
echo ""
echo "📋 MONITOR LOGS (in another terminal):"
echo ""
echo " # Trainer logs:"
echo " tail -f $LOG_DIR/trainer_shared.log"
echo " tail -f $LOG_DIR/trainer_lora_only.log"
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
echo ""
echo " # Environment logs:"
echo " tail -f $LOG_DIR/env_shared.log"
echo " tail -f $LOG_DIR/env_lora_only.log"
echo " tail -f $LOG_DIR/env_lora_restart.log"
echo ""
echo " # vLLM logs:"
echo " tail -f $LOG_DIR/vllm_shared.log"
echo " tail -f $LOG_DIR/vllm_lora_only.log"
echo " tail -f $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
echo ""
# Wait for trainers
SHARED_EXIT=0
LORA_ONLY_EXIT=0
LORA_RESTART_EXIT=0
wait $SHARED_TRAINER_PID || SHARED_EXIT=$?
SHARED_END=$(date +%s)
SHARED_TIME=$((SHARED_END - START_TIME))
echo " ✓ shared_vllm finished in ${SHARED_TIME}s (exit: $SHARED_EXIT)"
wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
LORA_ONLY_END=$(date +%s)
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
LORA_RESTART_END=$(date +%s)
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
# =============================================================================
# RESULTS
# =============================================================================
echo ""
echo "============================================================"
echo "COMPARISON RESULTS (Math Zero @ 32k Context)"
echo "============================================================"
echo ""
echo "Training Steps: $TRAINING_STEPS"
echo "Batch Size: $BATCH_SIZE"
echo "Max Context: $MAX_MODEL_LEN"
echo ""
echo "┌─────────────────┬──────┬──────────────┬────────────────────────────────┐"
echo "│ Mode            │ GPU  │ Total Time   │ Notes                          │"
echo "├─────────────────┼──────┼──────────────┼────────────────────────────────┤"
printf "│ shared_vllm     │  %d   │ %10ss  │ CUDA IPC zero-copy (~172 TPS)  │\n" "$SHARED_GPU" "$SHARED_TIME"
printf "│ lora_only       │  %d   │ %10ss  │ --enforce-eager (~13 TPS)      │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
printf "│ lora_restart    │  %d   │ %10ss  │ no --enforce-eager (~108 TPS)  │\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
echo "└─────────────────┴──────┴──────────────┴────────────────────────────────┘"
echo ""
# Calculate speedups
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
RESTART_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
echo "lora_restart vs lora_only speedup: ${RESTART_SPEEDUP}x"
fi
if [ $LORA_ONLY_TIME -gt 0 ] && [ $SHARED_TIME -gt 0 ]; then
SHARED_SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $SHARED_TIME" | bc)
echo "shared_vllm vs lora_only speedup: ${SHARED_SPEEDUP}x"
fi
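# The bc arithmetic above in isolation (hypothetical durations: a 600s slow
# run against a 150s fast run). Assigned to an unused variable so the sketch
# does not change the script's output:
EXAMPLE_SPEEDUP=$(echo "scale=2; 600 / 150" | bc)   # "4.00"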
echo ""
echo "📊 BENCHMARK DETAILS:"
echo ""
echo "━━━ shared_vllm (GPU $SHARED_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_shared.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_shared.log)"
echo ""
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
echo ""
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
echo ""
echo "============================================================"
echo "📁 All logs saved to: $LOG_DIR"
echo "============================================================"
echo ""
echo "Log files:"
echo " Trainers:"
echo " $LOG_DIR/trainer_shared.log"
echo " $LOG_DIR/trainer_lora_only.log"
echo " $LOG_DIR/trainer_lora_restart.log"
echo ""
echo " Environments:"
echo " $LOG_DIR/env_shared.log"
echo " $LOG_DIR/env_lora_only.log"
echo " $LOG_DIR/env_lora_restart.log"
echo ""
echo " vLLM:"
echo " $LOG_DIR/vllm_shared.log"
echo " $LOG_DIR/vllm_lora_only.log"
echo " $LOG_DIR/checkpoints_lora_restart/vllm_restart_*.log"
echo ""


@@ -1,368 +0,0 @@
#!/bin/bash
# =============================================================================
# LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)
# =============================================================================
#
# Runs both modes IN PARALLEL on separate GPUs for fair comparison:
# - GPU 0: lora_only (--enforce-eager, ~13 TPS)
# - GPU 1: lora_restart (no --enforce-eager, ~108 TPS)
#
# Usage:
# ./scripts/compare_lora_modes.sh [MODEL] [STEPS]
#
# Example:
# ./scripts/compare_lora_modes.sh Qwen/Qwen3-4B-Instruct-2507 20
#
# =============================================================================
set -e
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
TRAINING_STEPS="${2:-20}"
BATCH_SIZE="${BATCH_SIZE:-2}"
USE_WANDB="${USE_WANDB:-true}" # Set USE_WANDB=false to disable
WANDB_PROJECT="${WANDB_PROJECT:-lora-mode-comparison}"
# Port allocation (separate ports for each mode)
LORA_ONLY_VLLM_PORT=9001
LORA_ONLY_API_PORT=8001
LORA_RESTART_VLLM_PORT=9002
LORA_RESTART_API_PORT=8002
# GPU allocation
LORA_ONLY_GPU=0
LORA_RESTART_GPU=1
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"
LOG_DIR="${REPO_DIR}/lora_comparison_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
echo "============================================================"
echo "LoRA Mode Comparison: lora_only vs lora_restart (PARALLEL)"
echo "============================================================"
echo "Model: $MODEL"
echo "Steps: $TRAINING_STEPS"
echo "Batch: $BATCH_SIZE"
echo "Wandb: $USE_WANDB (project: $WANDB_PROJECT)"
echo ""
echo "GPU Allocation:"
echo " GPU $LORA_ONLY_GPU: lora_only (ports $LORA_ONLY_API_PORT, $LORA_ONLY_VLLM_PORT)"
echo " GPU $LORA_RESTART_GPU: lora_restart (ports $LORA_RESTART_API_PORT, $LORA_RESTART_VLLM_PORT)"
echo ""
echo "Log Dir: $LOG_DIR"
echo "============================================================"
echo ""
# Cleanup function
cleanup() {
echo ""
echo "Cleaning up all processes..."
pkill -u $USER -f "vllm_api_server" 2>/dev/null || true
pkill -u $USER -f "gsm8k_server" 2>/dev/null || true
pkill -u $USER -f "run-api" 2>/dev/null || true
pkill -u $USER -f "grpo" 2>/dev/null || true
for port in $LORA_ONLY_VLLM_PORT $LORA_ONLY_API_PORT $LORA_RESTART_VLLM_PORT $LORA_RESTART_API_PORT; do
fuser -k ${port}/tcp 2>/dev/null || true
done
sleep 2
}
trap cleanup EXIT
# Initial cleanup
cleanup
cd "$REPO_DIR"
# =============================================================================
# Helper functions
# =============================================================================
wait_for_health() {
local port=$1
local name=$2
local max_attempts=${3:-60}
local attempt=1
while [ $attempt -le $max_attempts ]; do
if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
echo "$name ready (port $port)"
return 0
fi
sleep 5
attempt=$((attempt + 1))
done
echo "$name failed to start (port $port)"
return 1
}
wait_for_api() {
local port=$1
local name=$2
local max_attempts=${3:-30}
local attempt=1
while [ $attempt -le $max_attempts ]; do
if curl -s "http://localhost:$port/info" > /dev/null 2>&1; then
echo "$name ready (port $port)"
return 0
fi
sleep 2
attempt=$((attempt + 1))
done
echo "$name failed to start (port $port)"
return 1
}
# =============================================================================
# START BOTH MODES IN PARALLEL
# =============================================================================
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Starting both modes in parallel..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# -----------------------------------------------------------------------------
# LORA_ONLY (GPU 0)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting on GPU $LORA_ONLY_GPU..."
# Start run-api for lora_only
run-api --port $LORA_ONLY_API_PORT > "$LOG_DIR/api_lora_only.log" 2>&1 &
LORA_ONLY_API_PID=$!
# Start vLLM with --enforce-eager for lora_only
echo "[LORA_ONLY] Starting vLLM with --enable-lora --enforce-eager..."
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -u example_trainer/vllm_api_server.py \
--model "$MODEL" \
--port $LORA_ONLY_VLLM_PORT \
--gpu-memory-utilization 0.3 \
--enable-lora \
--max-lora-rank 64 \
--enforce-eager \
> "$LOG_DIR/vllm_lora_only.log" 2>&1 &
LORA_ONLY_VLLM_PID=$!
# -----------------------------------------------------------------------------
# LORA_RESTART (GPU 1) - Trainer manages vLLM internally
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting on GPU $LORA_RESTART_GPU..."
# Pre-create checkpoint directory so vLLM can write its log there
mkdir -p "$LOG_DIR/checkpoints_lora_restart"
# Start run-api for lora_restart
run-api --port $LORA_RESTART_API_PORT > "$LOG_DIR/api_lora_restart.log" 2>&1 &
LORA_RESTART_API_PID=$!
# =============================================================================
# WAIT FOR INFRASTRUCTURE
# =============================================================================
echo ""
echo "Waiting for infrastructure to be ready..."
wait_for_api $LORA_ONLY_API_PORT "lora_only API" || exit 1
wait_for_api $LORA_RESTART_API_PORT "lora_restart API" || exit 1
wait_for_health $LORA_ONLY_VLLM_PORT "lora_only vLLM" 90 || exit 1
# =============================================================================
# START ENVIRONMENTS AND TRAINERS
# =============================================================================
echo ""
echo "Starting environments and trainers..."
# Record start time
START_TIME=$(date +%s)
# -----------------------------------------------------------------------------
# LORA_ONLY: Start environment and trainer
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_ONLY] Starting GSM8k environment..."
python -u environments/gsm8k_server.py serve \
--env.tokenizer_name "$MODEL" \
--env.use_wandb=$USE_WANDB \
--env.wandb_name "lora-only-env" \
--env.rollout_server_url "http://localhost:${LORA_ONLY_API_PORT}" \
--openai.model_name "$MODEL" \
--openai.base_url "http://localhost:${LORA_ONLY_VLLM_PORT}/v1" \
--openai.server_type vllm \
--slurm false \
2>&1 | tee "$LOG_DIR/env_lora_only.log" &
LORA_ONLY_ENV_PID=$!
echo "[LORA_ONLY] Starting trainer..."
# Build wandb args
WANDB_ARGS=""
if [ "$USE_WANDB" = "true" ]; then
WANDB_ARGS="--use-wandb --wandb-project $WANDB_PROJECT --wandb-group lora-only"
fi
CUDA_VISIBLE_DEVICES=$LORA_ONLY_GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_only \
--vllm-port $LORA_ONLY_VLLM_PORT \
--atropos-url "http://localhost:${LORA_ONLY_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--lora-r 16 \
--lora-alpha 32 \
--vllm-restart-interval 5 \
--save-path "$LOG_DIR/checkpoints_lora_only" \
$WANDB_ARGS \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer_lora_only.log" &
LORA_ONLY_TRAINER_PID=$!
# -----------------------------------------------------------------------------
# LORA_RESTART: Start trainer (it manages vLLM internally)
# -----------------------------------------------------------------------------
echo ""
echo "[LORA_RESTART] Starting trainer (manages vLLM internally)..."
# Build wandb args for lora_restart
WANDB_ARGS_RESTART=""
if [ "$USE_WANDB" = "true" ]; then
WANDB_ARGS_RESTART="--use-wandb --wandb-project $WANDB_PROJECT --wandb-group lora-restart"
fi
CUDA_VISIBLE_DEVICES=$LORA_RESTART_GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_restart \
--vllm-port $LORA_RESTART_VLLM_PORT \
--vllm-gpu-memory-utilization 0.3 \
--atropos-url "http://localhost:${LORA_RESTART_API_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--lora-r 16 \
--lora-alpha 32 \
--vllm-restart-interval 5 \
--save-path "$LOG_DIR/checkpoints_lora_restart" \
$WANDB_ARGS_RESTART \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer_lora_restart.log" &
LORA_RESTART_TRAINER_PID=$!
# Wait for lora_restart's internal vLLM to start
# NOTE: Without --enforce-eager, vLLM compiles CUDA graphs which takes 1-3 minutes!
echo "[LORA_RESTART] Waiting for internal vLLM to start..."
echo " NOTE: vLLM without --enforce-eager compiles CUDA graphs on startup (1-3 min)"
echo " Check progress: tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
sleep 30 # Give more time for model loading before checking health
wait_for_health $LORA_RESTART_VLLM_PORT "lora_restart internal vLLM" 180 || {
echo " Failed - check logs:"
echo " Trainer log:"
tail -30 "$LOG_DIR/trainer_lora_restart.log"
echo ""
echo " vLLM internal log (if exists):"
tail -50 "$LOG_DIR/checkpoints_lora_restart/vllm_internal.log" 2>/dev/null || echo " (not found)"
exit 1
}
# Start GSM8k environment for lora_restart
echo "[LORA_RESTART] Starting GSM8k environment..."
python -u environments/gsm8k_server.py serve \
--env.tokenizer_name "$MODEL" \
--env.use_wandb=$USE_WANDB \
--env.wandb_name "lora-restart-env" \
--env.rollout_server_url "http://localhost:${LORA_RESTART_API_PORT}" \
--openai.model_name "$MODEL" \
--openai.base_url "http://localhost:${LORA_RESTART_VLLM_PORT}/v1" \
--openai.server_type vllm \
--slurm false \
2>&1 | tee "$LOG_DIR/env_lora_restart.log" &
LORA_RESTART_ENV_PID=$!
# =============================================================================
# WAIT FOR BOTH TRAINERS TO COMPLETE
# =============================================================================
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Both trainers running in parallel. Waiting for completion..."
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "📊 WANDB: https://wandb.ai (project: $WANDB_PROJECT)"
echo ""
echo "📋 MONITOR LOGS (in another terminal):"
echo ""
echo " # Trainer logs (main output):"
echo " tail -f $LOG_DIR/trainer_lora_only.log"
echo " tail -f $LOG_DIR/trainer_lora_restart.log"
echo ""
echo " # Environment logs (rollouts, scores):"
echo " tail -f $LOG_DIR/env_lora_only.log"
echo " tail -f $LOG_DIR/env_lora_restart.log"
echo ""
echo " # vLLM logs:"
echo " tail -f $LOG_DIR/vllm_lora_only.log"
echo " tail -f $LOG_DIR/checkpoints_lora_restart/vllm_internal.log"
echo ""
echo " # All logs at once:"
echo " tail -f $LOG_DIR/*.log"
echo ""
# Wait for trainers
LORA_ONLY_EXIT=0
LORA_RESTART_EXIT=0
wait $LORA_ONLY_TRAINER_PID || LORA_ONLY_EXIT=$?
LORA_ONLY_END=$(date +%s)
LORA_ONLY_TIME=$((LORA_ONLY_END - START_TIME))
echo " ✓ lora_only finished in ${LORA_ONLY_TIME}s (exit: $LORA_ONLY_EXIT)"
wait $LORA_RESTART_TRAINER_PID || LORA_RESTART_EXIT=$?
LORA_RESTART_END=$(date +%s)
LORA_RESTART_TIME=$((LORA_RESTART_END - START_TIME))
echo " ✓ lora_restart finished in ${LORA_RESTART_TIME}s (exit: $LORA_RESTART_EXIT)"
# =============================================================================
# RESULTS
# =============================================================================
echo ""
echo "============================================================"
echo "COMPARISON RESULTS (Parallel Execution)"
echo "============================================================"
echo ""
echo "Training Steps: $TRAINING_STEPS"
echo "Batch Size: $BATCH_SIZE"
echo ""
echo "┌─────────────────┬──────┬──────────────┬───────────────────────────────┐"
echo "│ Mode            │ GPU  │ Total Time   │ Notes                         │"
echo "├─────────────────┼──────┼──────────────┼───────────────────────────────┤"
printf "│ lora_only       │  %d   │ %10ss  │ --enforce-eager (~13 TPS)     │\n" "$LORA_ONLY_GPU" "$LORA_ONLY_TIME"
printf "│ lora_restart    │  %d   │ %10ss  │ no --enforce-eager (~108 TPS) │\n" "$LORA_RESTART_GPU" "$LORA_RESTART_TIME"
echo "└─────────────────┴──────┴──────────────┴───────────────────────────────┘"
echo ""
if [ $LORA_ONLY_TIME -gt 0 ] && [ $LORA_RESTART_TIME -gt 0 ]; then
SPEEDUP=$(echo "scale=2; $LORA_ONLY_TIME / $LORA_RESTART_TIME" | bc)
echo "Speedup: ${SPEEDUP}x (lora_restart vs lora_only)"
fi
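The speedup division above relies on `bc`, which is not installed on every host; an equivalent computation in `awk` (the two times below are made-up sample values) would be:

```shell
# Same 2-decimal speedup computation without bc.
# The times here are illustrative placeholders, not measured values.
LORA_ONLY_TIME=630
LORA_RESTART_TIME=210
SPEEDUP=$(awk -v a="$LORA_ONLY_TIME" -v b="$LORA_RESTART_TIME" 'BEGIN { printf "%.2f", a / b }')
echo "Speedup: ${SPEEDUP}x"
```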
echo ""
echo "📊 BENCHMARK DETAILS:"
echo ""
echo "━━━ lora_only (GPU $LORA_ONLY_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_only.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_only.log)"
echo ""
echo "━━━ lora_restart (GPU $LORA_RESTART_GPU) ━━━"
grep -A 15 "BENCHMARK SUMMARY" "$LOG_DIR/trainer_lora_restart.log" 2>/dev/null || echo " (check $LOG_DIR/trainer_lora_restart.log)"
echo ""
echo "============================================================"
echo "📁 All logs saved to: $LOG_DIR"
echo "============================================================"
echo ""
echo "Log files:"
echo " $LOG_DIR/trainer_lora_only.log"
echo " $LOG_DIR/trainer_lora_restart.log"
echo " $LOG_DIR/vllm_lora_only.log"
echo " $LOG_DIR/env_lora_only.log"
echo " $LOG_DIR/env_lora_restart.log"
echo ""


@@ -1,140 +0,0 @@
#!/bin/bash
# =============================================================================
# LoRA Mode GSM8k Training Test
# =============================================================================
#
# Tests the LoRA training pipeline with GSM8k environment.
# Uses separate GPUs for vLLM and trainer.
#
# Usage:
# CUDA_VISIBLE_DEVICES=0,1 ./scripts/test_lora_mode.sh [MODEL] [STEPS]
#
# =============================================================================
set -e
MODEL="${1:-Qwen/Qwen2.5-3B-Instruct}"
TRAINING_STEPS="${2:-50}"
BATCH_SIZE=4
SAVE_INTERVAL=10
VLLM_PORT=9001
GSM8K_PORT=8001
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"
LOG_DIR="${REPO_DIR}/lora_test_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
echo "============================================================"
echo "LoRA Mode GSM8k Training Test"
echo "============================================================"
echo "Model: $MODEL"
echo "Steps: $TRAINING_STEPS"
echo "Log Dir: $LOG_DIR"
echo "============================================================"
cleanup() {
echo "Cleaning up..."
pkill -u $USER -f "vllm_api_server.*port.*${VLLM_PORT}" 2>/dev/null || true
pkill -u $USER -f "gsm8k_server" 2>/dev/null || true
pkill -u $USER -f "grpo.py" 2>/dev/null || true
}
trap cleanup EXIT
cleanup
# Clear Triton cache for B200 compatibility
rm -rf ~/.triton/cache
cd "$REPO_DIR"
echo ""
echo "[1/4] Starting vLLM with LoRA support..."
VLLM_ENABLE_SHARED_WEIGHTS=1 \
python -u example_trainer/vllm_api_server.py \
--model "$MODEL" \
--tensor-parallel-size 1 \
--port $VLLM_PORT \
--dtype bfloat16 \
--gpu-memory-utilization 0.6 \
--enable-lora \
--max-loras 2 \
--max-lora-rank 64 \
--enforce-eager \
> "${LOG_DIR}/vllm.log" 2>&1 &
echo "Waiting for vLLM (up to 5 min)..."
for _ in $(seq 1 60); do
    curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null && break
    sleep 5
done
curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null && echo " ✓ vLLM ready" || { echo " ✗ vLLM failed"; exit 1; }
echo ""
echo "[2/4] Starting GSM8k environment..."
python -u environments/gsm8k_server.py serve \
--env.tokenizer_name "$MODEL" \
--env.use_wandb=False \
--env.rollout_server_url "http://localhost:${GSM8K_PORT}" \
--openai.model_name "$MODEL" \
--openai.base_url "http://localhost:${VLLM_PORT}/v1" \
--openai.server_type vllm \
--slurm false \
> "${LOG_DIR}/gsm8k.log" 2>&1 &
echo "Waiting for GSM8k (10s)..."
sleep 10
echo ""
echo "[3/4] Baseline test (before training)..."
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
-H "Content-Type: application/json" \
-d '{
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
"max_tokens": 100,
"temperature": 0.1
}' | jq '.text[0]' | tee "${LOG_DIR}/baseline_response.txt"
echo ""
echo "[4/4] Starting LoRA trainer..."
python -u example_trainer/grpo.py \
--model-name "$MODEL" \
--weight-bridge-mode lora_only \
--vllm-port $VLLM_PORT \
--atropos-url "http://localhost:${GSM8K_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--vllm-restart-interval $SAVE_INTERVAL \
--save-path "$LOG_DIR/checkpoints" \
--benchmark \
2>&1 | tee "${LOG_DIR}/trainer.log"
echo ""
echo "============================================================"
echo "Training Complete!"
echo "Logs: $LOG_DIR"
echo "Checkpoints: $LOG_DIR/checkpoints"
echo "============================================================"
# Post-training test
if [ -d "$LOG_DIR/checkpoints" ]; then
LATEST_ADAPTER=$(ls -td "$LOG_DIR/checkpoints/adapter_"* 2>/dev/null | head -1)
if [ -n "$LATEST_ADAPTER" ]; then
echo ""
echo "Post-training test with adapter: $LATEST_ADAPTER"
curl -s -X POST "http://localhost:${VLLM_PORT}/lora/load" \
-H "Content-Type: application/json" \
-d '{"adapter_path": "'"$LATEST_ADAPTER"'"}' | jq
echo ""
echo "Response after training:"
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
-H "Content-Type: application/json" \
-d '{
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
"max_tokens": 100,
"temperature": 0.1
}' | jq '.text[0]' | tee "${LOG_DIR}/trained_response.txt"
fi
fi


@@ -1,100 +0,0 @@
#!/bin/bash
# Quick test for lora_restart mode - just 10 steps with 2 restarts
set -e
MODEL="${1:-Qwen/Qwen3-4B-Instruct-2507}"
STEPS="${2:-10}"
GPU="${3:-0}"
PORT_API=8099
PORT_VLLM=9099
MAX_LEN="${MAX_LEN:-8192}" # Use 8k for quick test, set MAX_LEN=32768 for full test
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$REPO_ROOT"
LOG_DIR="./lora_restart_test_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
echo "=============================================="
echo "LORA_RESTART Quick Test"
echo "=============================================="
echo "Model: $MODEL"
echo "Steps: $STEPS"
echo "GPU: $GPU"
echo "Max Length: $MAX_LEN"
echo "Log dir: $LOG_DIR"
echo "=============================================="
# Cleanup
cleanup() {
echo "Cleaning up..."
pkill -9 -f "port $PORT_VLLM" 2>/dev/null || true
pkill -9 -f "port $PORT_API" 2>/dev/null || true
fuser -k ${PORT_API}/tcp 2>/dev/null || true
fuser -k ${PORT_VLLM}/tcp 2>/dev/null || true
}
trap cleanup EXIT
# Kill any existing processes
cleanup
sleep 2
# Start API server
echo ""
echo "[1/3] Starting API server on port $PORT_API..."
run-api --port $PORT_API > "$LOG_DIR/api.log" 2>&1 &
API_PID=$!
sleep 3
# Check API is up
if ! curl -s "http://localhost:$PORT_API/info" > /dev/null; then
echo "ERROR: API server failed to start"
cat "$LOG_DIR/api.log"
exit 1
fi
echo " ✓ API server ready"
# Start environment (GSM8K for simplicity)
echo ""
echo "[2/3] Starting GSM8K environment..."
python -u environments/gsm8k_server.py serve \
--env.tokenizer_name "$MODEL" \
--env.use_wandb=False \
--env.rollout_server_url "http://localhost:$PORT_API" \
--openai.model_name "$MODEL" \
--openai.base_url "http://localhost:$PORT_VLLM/v1" \
--openai.server_type vllm \
--slurm false \
> "$LOG_DIR/env.log" 2>&1 &
ENV_PID=$!
echo " ✓ Environment started (PID: $ENV_PID)"
sleep 5
# Start trainer
echo ""
echo "[3/3] Starting LORA_RESTART trainer..."
echo " (This will launch vLLM internally and restart every 5 steps)"
echo ""
CUDA_VISIBLE_DEVICES=$GPU python -m example_trainer.grpo \
--model-name "$MODEL" \
--weight-bridge-mode lora_restart \
--vllm-port $PORT_VLLM \
--vllm-gpu-memory-utilization 0.20 \
--atropos-url "http://localhost:$PORT_API" \
--batch-size 2 \
--training-steps $STEPS \
--max-model-len $MAX_LEN \
--seq-len $MAX_LEN \
--lora-r 16 \
--lora-alpha 32 \
--vllm-restart-interval 5 \
--save-path "$LOG_DIR/checkpoints" \
--benchmark \
2>&1 | tee "$LOG_DIR/trainer.log"
echo ""
echo "=============================================="
echo "Test complete! Logs in: $LOG_DIR"
echo "=============================================="


@@ -1,145 +0,0 @@
#!/bin/bash
# =============================================================================
# Single-Copy Mode GSM8k Training Test
# =============================================================================
#
# Tests the single-copy (shared_vllm) training pipeline with GSM8k environment.
# vLLM and trainer share the SAME GPU memory - true single-copy architecture.
#
# Usage:
# CUDA_VISIBLE_DEVICES=0 ./scripts/test_single_copy_mode.sh [MODEL] [STEPS]
#
# Note: Single-copy mode requires tensor-parallel-size=1
#
# =============================================================================
set -e
MODEL="${1:-Qwen/Qwen2.5-3B-Instruct}"
TRAINING_STEPS="${2:-50}"
BATCH_SIZE=4
VLLM_PORT=9002
GSM8K_PORT=8002
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TRAINER_DIR="$(dirname "$SCRIPT_DIR")"
REPO_DIR="$(dirname "$TRAINER_DIR")"
LOG_DIR="${REPO_DIR}/single_copy_test_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOG_DIR"
echo "============================================================"
echo "Single-Copy Mode GSM8k Training Test"
echo "============================================================"
echo "Model: $MODEL"
echo "Steps: $TRAINING_STEPS"
echo "Log Dir: $LOG_DIR"
echo ""
echo "NOTE: vLLM and trainer share the SAME GPU memory!"
echo " Weight updates are INSTANT (no copying)."
echo "============================================================"
cleanup() {
echo "Cleaning up..."
pkill -u $USER -f "vllm_api_server.*port.*${VLLM_PORT}" 2>/dev/null || true
pkill -u $USER -f "gsm8k_server.*${GSM8K_PORT}" 2>/dev/null || true
pkill -u $USER -f "grpo.py.*shared_vllm" 2>/dev/null || true
}
trap cleanup EXIT
cleanup
cd "$REPO_DIR"
echo ""
echo "[1/4] Starting vLLM with shared memory enabled..."
# NOTE: --enforce-eager is REQUIRED for single-copy mode!
# Without it, CUDA graphs freeze weights and updates won't be visible to inference.
VLLM_ENABLE_SHARED_WEIGHTS=1 \
LOGDIR="$LOG_DIR" \
python -u example_trainer/vllm_api_server.py \
--model "$MODEL" \
--tensor-parallel-size 1 \
--port $VLLM_PORT \
--dtype bfloat16 \
--gpu-memory-utilization 0.5 \
--enforce-eager \
> "${LOG_DIR}/vllm.log" 2>&1 &
echo "Waiting for vLLM (up to 5 min)..."
for _ in $(seq 1 60); do
    curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null && break
    sleep 5
done
curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null && echo " ✓ vLLM ready" || { echo " ✗ vLLM failed"; exit 1; }
# Verify IPC handles are exported
if [ -f "${LOG_DIR}/vllm_bridge_config.json" ]; then
echo " ✓ vllm_bridge_config.json created"
PARAM_COUNT=$(jq '.ipc_handles | keys | length' "${LOG_DIR}/vllm_bridge_config.json" 2>/dev/null || echo "0")
echo " Exported parameters: $PARAM_COUNT"
else
echo " ✗ vllm_bridge_config.json not found - shared memory may not work"
fi
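The parameter-count check above assumes a `vllm_bridge_config.json` shaped like `{"ipc_handles": {"<param_name>": {...}, ...}}`. A standalone sketch of the same `jq` query against a stand-in file (the parameter names below are made up for illustration):

```shell
# Sanity-check sketch of the jq query, run against a stand-in config file.
CONFIG=$(mktemp)
cat > "$CONFIG" <<'EOF'
{"ipc_handles": {"model.embed_tokens.weight": {}, "lm_head.weight": {}}}
EOF
PARAM_COUNT=$(jq '.ipc_handles | keys | length' "$CONFIG")
jq -r '.ipc_handles | keys | .[0:3][]' "$CONFIG"   # peek at the first few names
echo "Exported parameters: $PARAM_COUNT"
```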
echo ""
echo "[2/4] Starting GSM8k environment..."
python -u environments/gsm8k_server.py serve \
--env.tokenizer_name "$MODEL" \
--env.use_wandb=False \
--env.rollout_server_url "http://localhost:${GSM8K_PORT}" \
--openai.model_name "$MODEL" \
--openai.base_url "http://localhost:${VLLM_PORT}/v1" \
--openai.server_type vllm \
--slurm false \
> "${LOG_DIR}/gsm8k.log" 2>&1 &
echo "Waiting for GSM8k (10s)..."
sleep 10
echo ""
echo "[3/4] Baseline test (before training)..."
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
-H "Content-Type: application/json" \
-d '{
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
"max_tokens": 100,
"temperature": 0.1
}' | jq '.text[0]' | tee "${LOG_DIR}/baseline_response.txt"
echo ""
echo "[4/4] Starting Single-Copy trainer..."
echo "The trainer will attach to vLLM's GPU memory via CUDA IPC."
echo ""
python -u example_trainer/grpo.py \
--model-name "$MODEL" \
--weight-bridge-mode shared_vllm \
--vllm-port $VLLM_PORT \
--atropos-url "http://localhost:${GSM8K_PORT}" \
--batch-size $BATCH_SIZE \
--training-steps $TRAINING_STEPS \
--save-path "$LOG_DIR/checkpoints" \
--vllm-config-path "${LOG_DIR}/vllm_bridge_config.json" \
--benchmark \
--debug-loading \
2>&1 | tee "${LOG_DIR}/trainer.log"
echo ""
echo "============================================================"
echo "Training Complete!"
echo "============================================================"
echo "Logs: $LOG_DIR"
echo ""
echo "Key Metrics:"
grep -E "Attached|fused|Step.*Loss" "${LOG_DIR}/trainer.log" | tail -20
echo "============================================================"
# Post-training test
echo ""
echo "Post-training test (weights are already updated in vLLM):"
curl -s -X POST "http://localhost:${VLLM_PORT}/generate" \
-H "Content-Type: application/json" \
-d '{
"prompt": "<|im_start|>user\nWhat is 123 + 456?<|im_end|>\n<|im_start|>assistant\n",
"max_tokens": 100,
"temperature": 0.1
}' | jq '.text[0]' | tee "${LOG_DIR}/trained_response.txt"


@@ -1,246 +0,0 @@
#!/usr/bin/env python3
"""
Minimal test for vLLM restart cycle - no training, just launch/terminate/relaunch.
Tests whether GPU memory is properly released between restarts.
Run from atropos directory:
python example_trainer/scripts/test_vllm_restart_only.py --restarts 3 --gpu 0
"""
import os
import sys
import time
import argparse
import subprocess
import signal
def kill_process_on_port(port: int) -> None:
"""Kill any process using the specified port."""
try:
subprocess.run(f"fuser -k {port}/tcp", shell=True, capture_output=True, timeout=10)
except Exception:
pass
def wait_for_vllm_ready(port: int, timeout: int = 300) -> bool:
    """Poll vLLM's /health endpoint until it responds or the timeout expires."""
    import urllib.request
    start = time.time()
    while time.time() - start < timeout:
        try:
            req = urllib.request.urlopen(f"http://localhost:{port}/health", timeout=5)
            if req.status == 200:
                return True
        except Exception:  # connection refused / read timeout while the server starts
            pass
        time.sleep(5)
        elapsed = int(time.time() - start)
        print(f"  Waiting... ({elapsed}s / {timeout}s)")
    return False
def terminate_vllm(proc, port: int) -> None:
"""Terminate vLLM process and release GPU memory."""
print(f" Terminating vLLM on port {port}...")
# Get current GPU device
gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
# Phase 1: Kill the process group (kills all children too)
if proc is not None:
print(f" Killing process group (PID: {proc.pid})...")
try:
# Kill entire process group - this gets all child processes
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
try:
proc.kill()
proc.wait(timeout=5)
except Exception as e:
print(f" Warning: {e}")
# Phase 2: Kill by port (catches anything still running)
kill_process_on_port(port)
time.sleep(2)
# Phase 3: Kill ALL vLLM-related processes
print(" Killing all vLLM-related processes...")
kill_commands = [
f"fuser -k {port}/tcp",
"pkill -9 -f 'vllm.*EngineCore'",
"pkill -9 -f 'vllm_api_server'",
"pkill -9 -f 'from vllm'",
"pkill -9 -f 'multiprocessing.spawn'",
]
for cmd in kill_commands:
try:
subprocess.run(cmd, shell=True, capture_output=True, timeout=5)
except Exception:
pass
# Phase 4: Check for zombie GPU processes
print(f" Checking for zombie GPU processes on GPU {gpu_id}...")
try:
result = subprocess.run(
f"nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits -i {gpu_id}",
shell=True, capture_output=True, text=True, timeout=10
)
if result.stdout.strip():
print(f" Found GPU processes:\n{result.stdout}")
for line in result.stdout.strip().split('\n'):
if line.strip():
parts = line.split(',')
if len(parts) >= 1:
pid = parts[0].strip()
if pid and pid != str(os.getpid()):
print(f" Killing zombie GPU process: {pid}")
try:
subprocess.run(f"kill -9 {pid}", shell=True, timeout=5)
except Exception:
pass
except Exception as e:
print(f" Warning: nvidia-smi check failed: {e}")
# Phase 5: Wait for GPU memory release
print(" Waiting for GPU memory release...")
import torch
for i in range(12): # 60 seconds total
time.sleep(5)
if torch.cuda.is_available():
torch.cuda.empty_cache()
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f" [{(i+1)*5}s] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
if free_mem > total_mem * 0.5:
print(f" ✓ Sufficient memory available ({free_mem:.1f} GB)")
break
# Final cleanup
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f" ✓ Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
print(" ✓ vLLM terminated")
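Phase 1 above works because the launcher starts vLLM with `start_new_session=True` (see the `subprocess.Popen` call in `main`), which puts the child in its own session and process group so `os.killpg()` can take down the whole tree in one call. A minimal standalone demo of that pattern, using a harmless `sleep` child in place of vLLM:

```python
# Demo of the launch/terminate pattern: start_new_session=True makes the
# child its own process-group leader, so os.killpg() reaps the whole tree.
import os
import signal
import subprocess

proc = subprocess.Popen(["sleep", "60"], start_new_session=True)
# In a new session, the child's process group ID equals its own PID.
assert os.getpgid(proc.pid) == proc.pid
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
proc.wait(timeout=5)
# A signal-killed child reports the negated signal number as its return code.
print("child return code:", proc.returncode)
```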
def main():
parser = argparse.ArgumentParser(description="Test vLLM restart cycle")
parser.add_argument("--model", default="Qwen/Qwen3-4B-Instruct-2507")
parser.add_argument("--port", type=int, default=9099)
parser.add_argument("--gpu", type=int, default=0)
parser.add_argument("--memory-util", type=float, default=0.3)
parser.add_argument("--restarts", type=int, default=3, help="Number of restart cycles")
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
import torch
print("=" * 60)
print("vLLM RESTART CYCLE TEST")
print("=" * 60)
print(f"Model: {args.model}")
print(f"Port: {args.port}")
print(f"GPU: {args.gpu}")
print(f"Memory utilization: {args.memory_util}")
print(f"Restart cycles: {args.restarts}")
print("=" * 60)
# Check initial GPU memory
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"\nInitial GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free")
# Find server script (relative to this script's location)
script_dir = os.path.dirname(os.path.abspath(__file__))
server_script = os.path.join(os.path.dirname(script_dir), "vllm_api_server.py")
if not os.path.exists(server_script):
print(f"ERROR: Cannot find vllm_api_server.py at {server_script}")
return 1
log_dir = "/tmp/vllm_restart_test"
os.makedirs(log_dir, exist_ok=True)
for cycle in range(args.restarts):
print(f"\n{'='*60}")
print(f"CYCLE {cycle + 1}/{args.restarts}")
print(f"{'='*60}")
# Check memory before launch
if torch.cuda.is_available():
torch.cuda.empty_cache()
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"[Before launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
# Launch vLLM
print(f"\n[{cycle+1}] Launching vLLM...")
cmd = [
"python", server_script,
"--model", args.model,
"--port", str(args.port),
"--gpu-memory-utilization", str(args.memory_util),
"--max-model-len", "4096",
]
print(f" Command: {' '.join(cmd)}")
log_file = f"{log_dir}/vllm_cycle_{cycle}.log"
with open(log_file, "w") as f:
proc = subprocess.Popen(
cmd,
stdout=f,
stderr=subprocess.STDOUT,
env=os.environ.copy(),
start_new_session=True, # Creates new process group for easy cleanup
)
print(f" PID: {proc.pid} (process group: {os.getpgid(proc.pid)})")
print(f" Log: {log_file}")
# Wait for vLLM to be ready
print(f" Waiting for vLLM to be ready...")
start_time = time.time()
if wait_for_vllm_ready(args.port, timeout=300):
elapsed = time.time() - start_time
print(f" ✓ vLLM ready in {elapsed:.1f}s")
else:
print(f" ✗ vLLM failed to start!")
print(f" Check log: tail -50 {log_file}")
proc.kill()
return 1
# Check memory after launch
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"[After launch] GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
# Keep vLLM running for a bit
print(f"\n Letting vLLM run for 5s...")
time.sleep(5)
# Terminate vLLM
print(f"\n[{cycle+1}] Terminating vLLM...")
terminate_vllm(proc, args.port)
print("\n" + "=" * 60)
print("TEST COMPLETE!")
print("=" * 60)
if torch.cuda.is_available():
free_mem = torch.cuda.mem_get_info()[0] / 1e9
total_mem = torch.cuda.mem_get_info()[1] / 1e9
print(f"Final GPU memory: {free_mem:.1f}/{total_mem:.1f} GB free ({100*free_mem/total_mem:.0f}%)")
return 0
if __name__ == "__main__":
sys.exit(main())