#!/usr/bin/env python3
"""
Multi-model test suite for shared_vllm trainer.

Tests the trainer against diverse models to verify robustness.
Supports both parallel (different GPUs) and sequential execution.

With --auto-env, each model gets its own isolated stack:
- run-api (port --atropos-port, default 8002, + offset)
- gsm8k environment (with model-specific tokenizer)
- vLLM server (port 9001 + offset)
- trainer

Usage:
    # RECOMMENDED: Fully automated parallel test (each model gets isolated stack)
    python -m example_trainer.test_multi_model \
        --models qwen3-4b hermes-8b nemotron-14b devstral-24b \
        --parallel \
        --gpus 0 1 2 3 \
        --auto-env

    # Sequential test on one GPU
    python -m example_trainer.test_multi_model \
        --models qwen3-4b hermes-8b \
        --sequential \
        --gpu 0 \
        --auto-env

    # Manual mode (you must start run-api and gsm8k_server yourself)
    # First start: run-api --port 8002 &
    # Then start gsm8k for your model
    python -m example_trainer.test_multi_model \
        --models qwen3-4b \
        --sequential \
        --gpu 0 \
        --atropos-url http://localhost:8002

Port allocation with --auto-env (parallel mode; sequential runs reuse the
Model 0 ports because only one stack is alive at a time):
    Model 0: run-api:8002, vLLM:9001
    Model 1: run-api:8003, vLLM:9002
    Model 2: run-api:8004, vLLM:9003
    ...
"""

import argparse
import json
import os
import re
import signal
import subprocess
import sys
import threading
import time
import traceback
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

# Base port for the vLLM server; the per-model offset is added to it.
VLLM_BASE_PORT = 9001
# Default base port for run-api; the per-model offset is added to it.
DEFAULT_RUN_API_PORT = 8002
# How many trailing characters of a log file to echo when a service fails.
LOG_TAIL_CHARS = 2000


@dataclass
class ModelConfig:
    """Configuration for a test model."""

    name: str
    model_id: str
    gpu_memory_utilization: float = 0.5
    max_model_len: int = 4096
    dtype: str = "bfloat16"
    training_steps: int = 10
    notes: str = ""


# Define test models
# Memory estimates for B200 (183GB):
# - Model weights (bf16): 2 bytes/param
# - Gradients: ~same as weights
# - 8-bit optimizer: ~1 byte/param
# - KV cache: depends on max_model_len
TEST_MODELS: Dict[str, ModelConfig] = {
    "qwen3-4b": ModelConfig(
        name="qwen3-4b",
        model_id="Qwen/Qwen3-4B-Instruct-2507",
        gpu_memory_utilization=0.4,  # ~73GB for vLLM
        max_model_len=8192,  # Plenty of room on B200
        notes="Small 4B model, good baseline test (~8GB weights)",
    ),
    "hermes-8b": ModelConfig(
        name="hermes-8b",
        model_id="NousResearch/Hermes-3-Llama-3.1-8B",
        gpu_memory_utilization=0.45,  # ~82GB for vLLM
        max_model_len=8192,  # 8K context fits well
        notes="Llama 8B architecture (~16GB weights)",
    ),
    "nemotron-14b": ModelConfig(
        name="nemotron-14b",
        model_id="nvidia/Nemotron-Cascade-14B-Thinking",
        gpu_memory_utilization=0.5,  # ~91GB for vLLM
        max_model_len=32768,  # 32K context for thinking
        notes="14B thinking model (~28GB weights), needs room for long CoT",
    ),
    "devstral-24b": ModelConfig(
        name="devstral-24b",
        model_id="mistralai/Devstral-Small-2-24B-Instruct-2512",
        gpu_memory_utilization=0.55,  # ~100GB for vLLM
        max_model_len=16384,  # 16K context (conservative for 24B)
        notes="Large 24B Mistral (~48GB weights), largest model",
    ),
}


def get_test_dir(base_dir: str, model_name: str, timestamp: str) -> Path:
    """Get unique test directory for a model run."""
    return Path(base_dir) / f"{model_name}_{timestamp}"


def start_run_api(
    port: int,
    log_path: Path,
) -> subprocess.Popen:
    """Start a run-api instance on a specific port.

    Args:
        port: TCP port passed to run-api via ``--port``.
        log_path: File that receives the child's combined stdout/stderr.

    Returns:
        The ``Popen`` handle of the spawned process.
    """
    cmd = [sys.executable, "-m", "atroposlib.cli.run_api", "--port", str(port)]
    # The child gets its own copy of the descriptor, so the parent's handle
    # can be closed as soon as the process is spawned (avoids leaking it).
    # ``bufsize`` is intentionally not passed: it only applies to pipe
    # objects created by Popen, and none are created here.
    with open(log_path, "w") as log_file:
        return subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )


def wait_for_run_api(port: int, timeout: int = 60) -> bool:
    """Wait for run-api to be ready.

    Polls the server every two seconds until one of its endpoints answers
    with HTTP 200 or ``timeout`` seconds have elapsed.

    Returns:
        True if the server answered in time, False otherwise.
    """
    import requests

    # run-api uses /status or / endpoint, not /health
    probe_urls = (
        f"http://localhost:{port}/status",
        f"http://localhost:{port}/",  # Fallback to root endpoint
    )
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        for url in probe_urls:
            try:
                if requests.get(url, timeout=5).status_code == 200:
                    return True
            except requests.RequestException:
                # Server not accepting connections yet; try the next probe.
                continue
        time.sleep(2)
    return False


def start_gsm8k_env(
    model_id: str,
    vllm_port: int,
    run_api_port: int,
    log_path: Path,
    atropos_root: Path,
) -> subprocess.Popen:
    """Start a gsm8k environment process for a specific model.

    Args:
        model_id: Model identifier, used both as tokenizer name and as the
            OpenAI-compatible model name.
        vllm_port: Port of the vLLM server the environment sends requests to.
        run_api_port: Port of the run-api instance used as rollout server.
        log_path: File that receives the child's combined stdout/stderr.
        atropos_root: Repository root; used as working directory and to
            locate ``environments/gsm8k_server.py``.

    Returns:
        The ``Popen`` handle of the spawned process.
    """
    gsm8k_script = atropos_root / "environments" / "gsm8k_server.py"

    cmd = [
        sys.executable, "-u", str(gsm8k_script), "serve",
        "--env.rollout_server_url", f"http://localhost:{run_api_port}",
        "--env.tokenizer_name", model_id,
        "--env.use_wandb", "false",
        "--env.total_steps", "10000",
        "--env.batch_size", "64",
        "--env.group_size", "8",
        "--openai.model_name", model_id,
        "--openai.base_url", f"http://localhost:{vllm_port}/v1",
        "--openai.api_key", "x",
        "--openai.server_type", "openai",
    ]

    # Parent handle is closed after spawn; the child keeps its own copy.
    with open(log_path, "w") as log_file:
        return subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            cwd=str(atropos_root),  # Run from atropos root
        )


def parse_trainer_metrics(log_content: str) -> Dict[str, Optional[float]]:
    """Extract summary metrics from the text of a trainer log.

    Returns:
        Dict with ``real_time_alignment`` (first "Mean diff:" value) and
        ``final_gpu_memory`` (last "GPU mem: <x>GB" value). A metric that is
        absent or not parseable as a float is reported as None.
    """
    metrics: Dict[str, Optional[float]] = {
        "real_time_alignment": None,
        "final_gpu_memory": None,
    }

    alignment_match = re.search(r"Mean diff: ([\d.]+)", log_content)
    if alignment_match:
        metrics["real_time_alignment"] = _to_float(alignment_match.group(1))

    memory_matches = re.findall(r"GPU mem: ([\d.]+)GB", log_content)
    if memory_matches:
        metrics["final_gpu_memory"] = _to_float(memory_matches[-1])

    return metrics


def _to_float(text: str) -> Optional[float]:
    """Convert ``text`` to float, returning None when it is not a number."""
    try:
        return float(text)
    except ValueError:
        return None


def _read_log_tail(log_path: Path, max_chars: int = LOG_TAIL_CHARS) -> str:
    """Return the last ``max_chars`` characters of a log file ('' if missing)."""
    if not log_path.exists():
        return ""
    return log_path.read_text(errors="replace")[-max_chars:]


def _stop_process(
    process: Optional[subprocess.Popen],
    description: str,
    model_name: str,
    timeout: int = 10,
) -> None:
    """Terminate a still-running child process, killing it if it does not exit."""
    if process is None or process.poll() is not None:
        return
    print(f"[{model_name}] Terminating {description}...")
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()  # Reap the process so it does not linger as a zombie


def _new_result(
    model_config: ModelConfig,
    gpu_id: int,
    vllm_port: int,
    run_api_port: int,
    test_dir: Path,
) -> Dict:
    """Build the result record for one model run, in its initial state."""
    return {
        "model": model_config.model_id,
        "model_name": model_config.name,
        "gpu": gpu_id,
        "vllm_port": vllm_port,
        "run_api_port": run_api_port,
        "test_dir": str(test_dir),
        "status": "pending",
        "error": None,
        "start_time": None,
        "end_time": None,
        "duration_seconds": None,
        "real_time_alignment": None,
        "final_gpu_memory": None,
    }


def run_model_test(
    model_config: ModelConfig,
    gpu_id: int,
    atropos_url: str,
    atropos_port: int,
    base_dir: str,
    timestamp: str,
    training_steps: int,
    vllm_port_offset: int = 0,
    auto_env: bool = False,
) -> Dict:
    """
    Run a complete training test for a single model.

    Args:
        model_config: Model to test.
        gpu_id: GPU exposed to the trainer via ``CUDA_VISIBLE_DEVICES``.
        atropos_url: run-api URL handed to the trainer. Replaced by this
            model's own run-api URL when ``auto_env`` is set.
        atropos_port: Base run-api port; ``vllm_port_offset`` is added to it.
        base_dir: Base directory for test outputs.
        timestamp: Run timestamp used in the test directory name.
        training_steps: Number of training steps to run.
        vllm_port_offset: Offset added to both the vLLM and run-api base
            ports so that concurrent runs do not collide.
        auto_env: Start a dedicated run-api and gsm8k environment for this
            model before launching the trainer.

    Returns dict with test results. Failures are recorded in the dict
    (``status``/``error``) rather than raised.
    """
    model_name = model_config.name
    test_dir = get_test_dir(base_dir, model_name, timestamp).resolve()  # Make absolute
    test_dir.mkdir(parents=True, exist_ok=True)

    # Unique paths and ports for this model (all paths absolute)
    vllm_port = VLLM_BASE_PORT + vllm_port_offset
    run_api_port = atropos_port + vllm_port_offset
    checkpoint_dir = test_dir / "checkpoints"
    log_dir = test_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    trainer_log = log_dir / "trainer.log"

    result = _new_result(model_config, gpu_id, vllm_port, run_api_port, test_dir)

    print(f"\n{'='*60}")
    print(f"[{model_name}] Starting test on GPU {gpu_id}")
    print(f"[{model_name}] Model: {model_config.model_id}")
    print(f"[{model_name}] vLLM port: {vllm_port}")
    print(f"[{model_name}] Test dir: {test_dir}")
    print(f"{'='*60}\n")

    result["start_time"] = datetime.now().isoformat()
    start_time = time.time()

    env_process = None
    run_api_process = None
    trainer_process = None

    # Get atropos root directory (used for vLLM and gsm8k scripts)
    script_dir = Path(__file__).parent
    atropos_root = script_dir.parent.resolve()

    try:
        if auto_env:
            # === Start run-api ===
            run_api_log = log_dir / "run_api.log"
            print(f"[{model_name}] Starting run-api on port {run_api_port}...")
            run_api_process = start_run_api(run_api_port, run_api_log)

            if not wait_for_run_api(run_api_port, timeout=60):
                # Check if process died
                if run_api_process.poll() is not None:
                    print(f"[{model_name}] run-api process exited with code {run_api_process.returncode}")
                # Print log contents for debugging
                if run_api_log.exists():
                    print(f"[{model_name}] run-api log contents:")
                    print(_read_log_tail(run_api_log))
                raise RuntimeError(f"run-api failed to start on port {run_api_port}")
            print(f"[{model_name}] ✓ run-api ready on port {run_api_port}")

            # Update atropos_url to use this model's run-api
            atropos_url = f"http://localhost:{run_api_port}"

            # === Start gsm8k Environment ===
            env_log = log_dir / "env.log"
            print(f"[{model_name}] Starting gsm8k environment (tokenizer: {model_config.model_id})...")
            env_process = start_gsm8k_env(
                model_config.model_id, vllm_port, run_api_port, env_log, atropos_root
            )
            time.sleep(10)  # Give it time to initialize and connect
            # Fail fast if the environment already died: the trainer below is
            # awaited without a timeout, so continuing would just block.
            if env_process.poll() is not None:
                print(f"[{model_name}] gsm8k environment log contents:")
                print(_read_log_tail(env_log))
                raise RuntimeError(
                    f"gsm8k environment exited early with code {env_process.returncode}"
                )
            print(f"[{model_name}] ✓ gsm8k environment started")

        # === Start Unified vLLM + Trainer (run.py) ===
        # Using run.py ensures vLLM is a CHILD of the trainer process,
        # which is required for CUDA IPC with ptrace_scope=1
        run_script = script_dir / "run.py"

        run_env = os.environ.copy()
        run_env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        run_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

        run_cmd = [
            sys.executable, "-u", str(run_script),
            "--model", model_config.model_id,
            "--vllm-port", str(vllm_port),
            "--gpu-memory-utilization", str(model_config.gpu_memory_utilization),
            "--max-model-len", str(model_config.max_model_len),
            "--dtype", model_config.dtype,
            "--atropos-url", atropos_url,
            "--training-steps", str(training_steps),
            "--optimizer", "adamw_8bit",
            "--save-path", str(checkpoint_dir),
            "--checkpoint-interval", "5",
            "--log-dir", str(log_dir),
        ]

        print(f"[{model_name}] Starting unified trainer (vLLM + GRPO) for {training_steps} steps...")
        with open(trainer_log, "w") as tlog:
            trainer_process = subprocess.Popen(
                run_cmd,
                env=run_env,
                stdout=tlog,
                stderr=subprocess.STDOUT,
                cwd=str(atropos_root),  # Run from atropos root
            )
            trainer_process.wait()

        if trainer_process.returncode != 0:
            raise RuntimeError(f"Unified trainer exited with code {trainer_process.returncode}")

        result["status"] = "success"
        print(f"[{model_name}] ✓ Training completed successfully!")

        # Parse trainer log for metrics (best effort; never fails the test)
        try:
            result.update(parse_trainer_metrics(trainer_log.read_text()))
        except (OSError, ValueError) as e:
            print(f"[{model_name}] Warning: Could not parse log: {e}")

    except Exception as e:
        result["status"] = "failed"
        result["error"] = str(e)
        print(f"[{model_name}] ✗ Test failed: {e}")
        traceback.print_exc()

    finally:
        # The trainer is only still alive here if the wait above was
        # interrupted (e.g. Ctrl-C); stop it so it does not keep the GPU.
        # NOTE(review): vLLM is managed by run.py and is expected to be
        # cleaned up by it when run.py is terminated - confirm.
        _stop_process(trainer_process, "unified trainer", model_name, timeout=30)
        _stop_process(env_process, "gsm8k environment", model_name)
        _stop_process(run_api_process, "run-api", model_name)

        result["end_time"] = datetime.now().isoformat()
        result["duration_seconds"] = time.time() - start_time

    return result


def run_parallel_tests(
    models: List[ModelConfig],
    gpu_ids: List[int],
    atropos_url: str,
    atropos_port: int,
    base_dir: str,
    training_steps: int,
    auto_env: bool = False,
) -> List[Dict]:
    """Run tests for multiple models in parallel.

    Models are paired with ``gpu_ids`` positionally; the pair index is used
    as the port offset. Results are returned in the same order as ``models``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    jobs = list(zip(models, gpu_ids))
    results: List[Optional[Dict]] = [None] * len(jobs)
    result_lock = threading.Lock()

    def run_and_store(index, model, gpu):
        try:
            outcome = run_model_test(
                model, gpu, atropos_url, atropos_port,
                base_dir, timestamp, training_steps,
                vllm_port_offset=index, auto_env=auto_env,
            )
        except Exception as exc:
            # run_model_test can still raise before its own error handling
            # starts (e.g. the test directory cannot be created). Record a
            # failure so the model is not silently missing from the summary.
            traceback.print_exc()
            outcome = _new_result(
                model, gpu, VLLM_BASE_PORT + index, atropos_port + index,
                get_test_dir(base_dir, model.name, timestamp),
            )
            outcome["status"] = "failed"
            outcome["error"] = str(exc)
        with result_lock:
            results[index] = outcome

    # Start threads
    threads = []
    for i, (model, gpu) in enumerate(jobs):
        t = threading.Thread(target=run_and_store, args=(i, model, gpu))
        t.start()
        threads.append(t)
        if i < len(jobs) - 1:
            time.sleep(5)  # Stagger starts slightly

    # Wait for all to complete
    for t in threads:
        t.join()

    return [r for r in results if r is not None]


def run_sequential_tests(
    models: List[ModelConfig],
    gpu_id: int,
    atropos_url: str,
    atropos_port: int,
    base_dir: str,
    training_steps: int,
    auto_env: bool = False,
) -> List[Dict]:
    """Run tests for multiple models sequentially on one GPU.

    Every run uses port offset 0 because only one stack is alive at a time.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results = []

    for i, model in enumerate(models):
        result = run_model_test(
            model, gpu_id, atropos_url, atropos_port,
            base_dir, timestamp, training_steps,
            vllm_port_offset=0, auto_env=auto_env,
        )
        results.append(result)

        # Give GPU time to fully release memory before the next model
        if i < len(models) - 1:
            time.sleep(10)

    return results


def _format_metric(value: Optional[float], spec: str, suffix: str = "") -> str:
    """Format a metric, using 'N/A' only when it is missing (0.0 is valid)."""
    if value is None:
        return "N/A"
    return f"{value:{spec}}{suffix}"


def print_summary(results: List[Dict]):
    """Print summary of test results."""
    print("\n" + "="*80)
    print("TEST SUMMARY")
    print("="*80)

    for r in results:
        status_icon = "✓" if r["status"] == "success" else "✗"
        duration = _format_metric(r["duration_seconds"], ".1f", "s")
        alignment = _format_metric(r["real_time_alignment"], ".4f")
        memory = _format_metric(r["final_gpu_memory"], ".1f", "GB")

        print(f"\n{status_icon} {r['model_name']}")
        print(f"   Model: {r['model']}")
        print(f"   GPU: {r['gpu']}, vLLM port: {r['vllm_port']}, run-api port: {r.get('run_api_port', 'N/A')}")
        print(f"   Status: {r['status']}")
        print(f"   Duration: {duration}")
        print(f"   Real-time alignment: {alignment}")
        print(f"   GPU memory: {memory}")
        if r["error"]:
            print(f"   Error: {r['error']}")
        print(f"   Logs: {r['test_dir']}/logs/")

    # Summary stats
    successes = sum(1 for r in results if r["status"] == "success")
    failures = len(results) - successes

    print(f"\n{'='*80}")
    print(f"TOTAL: {successes} passed, {failures} failed")
    print("="*80)


def main():
    """Parse command-line arguments, run the tests and save a JSON report.

    Exits with status 1 if any model test did not succeed.
    """
    parser = argparse.ArgumentParser(
        description="Multi-model test suite for shared_vllm trainer",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Run all models in parallel (one per GPU)
    python -m example_trainer.test_multi_model --parallel

    # Run specific models
    python -m example_trainer.test_multi_model --models hermes-8b qwen3-4b --parallel

    # Run sequentially on GPU 0
    python -m example_trainer.test_multi_model --sequential --gpu 0

Available models: """ + ", ".join(TEST_MODELS.keys())
    )

    parser.add_argument(
        "--models",
        nargs="+",
        choices=list(TEST_MODELS.keys()),
        default=["qwen3-4b", "hermes-8b"],
        help="Models to test",
    )
    parser.add_argument(
        "--parallel",
        action="store_true",
        help="Run models in parallel on different GPUs",
    )
    parser.add_argument(
        "--sequential",
        action="store_true",
        help="Run models sequentially on one GPU",
    )
    parser.add_argument(
        "--gpus",
        type=int,
        nargs="+",
        default=None,
        help="GPU IDs to use (for parallel mode)",
    )
    parser.add_argument(
        "--gpu",
        type=int,
        default=0,
        help="GPU ID (for sequential mode)",
    )
    parser.add_argument(
        "--atropos-url",
        type=str,
        default="http://localhost:8002",
        help="Atropos API URL",
    )
    parser.add_argument(
        "--atropos-port",
        type=int,
        default=DEFAULT_RUN_API_PORT,
        help="Atropos API port (for spawning multiple if needed)",
    )
    parser.add_argument(
        "--training-steps",
        type=int,
        default=10,
        help="Number of training steps per model",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./multi_model_tests",
        help="Base directory for test outputs",
    )
    parser.add_argument(
        "--auto-env",
        action="store_true",
        help="Automatically start an isolated run-api and gsm8k environment for each model",
    )

    args = parser.parse_args()

    if not args.parallel and not args.sequential:
        args.sequential = True  # Default to sequential

    # Get model configs
    models = [TEST_MODELS[name] for name in args.models]

    print(f"\n{'#'*60}")
    print("# MULTI-MODEL SHARED_VLLM TRAINER TEST SUITE")
    print(f"{'#'*60}")
    print(f"\nModels to test: {[m.name for m in models]}")
    print(f"Mode: {'Parallel' if args.parallel else 'Sequential'}")
    print(f"Training steps per model: {args.training_steps}")
    print(f"Output directory: {args.output_dir}")
    print(f"Atropos URL: {args.atropos_url}")

    # Run tests
    if args.auto_env:
        print("Auto-env: Will start gsm8k environment per model")

    if args.parallel:
        gpus = args.gpus or list(range(len(models)))
        if len(gpus) < len(models):
            print(f"\nWarning: Not enough GPUs ({len(gpus)}) for models ({len(models)})")
            print("Some models will share GPUs")
            # Cycle through the available GPUs until every model has one
            gpus = gpus * (len(models) // len(gpus) + 1)

        print(f"Using GPUs: {gpus[:len(models)]}")
        results = run_parallel_tests(
            models, gpus[:len(models)],
            args.atropos_url, args.atropos_port,
            args.output_dir, args.training_steps,
            auto_env=args.auto_env,
        )
    else:
        print(f"Using GPU: {args.gpu}")
        results = run_sequential_tests(
            models, args.gpu,
            args.atropos_url, args.atropos_port,
            args.output_dir, args.training_steps,
            auto_env=args.auto_env,
        )

    # Print summary
    print_summary(results)

    # Save results to JSON
    results_file = Path(args.output_dir) / f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    results_file.parent.mkdir(parents=True, exist_ok=True)
    with open(results_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_file}")

    # Exit with error code if any failed
    if any(r["status"] != "success" for r in results):
        sys.exit(1)


if __name__ == "__main__":
    main()