group tests

Jai Suphavadeeprasit 2026-02-02 14:15:13 -05:00
parent 7bffa4bfe7
commit bbda8bedb2


@@ -12,23 +12,24 @@ With --auto-env, each model gets its own isolated stack:
- trainer
Usage:
# RECOMMENDED: Fully automated parallel test (each model gets isolated stack)
# RECOMMENDED: Fully automated parallel test with W&B logging
python -m example_trainer.test_multi_model \
--models qwen3-4b hermes-8b nemotron-14b devstral-24b \
--parallel \
--gpus 0 1 2 3 \
--auto-env
--auto-env \
--use-wandb \
--wandb-project multi-model-test
# Sequential test on one GPU
python -m example_trainer.test_multi_model \
--models qwen3-4b hermes-8b \
--sequential \
--gpu 0 \
--auto-env
--auto-env \
--use-wandb
# Manual mode (you must start run-api and gsm8k_server yourself)
# First start: run-api --port 8002 &
# Then start gsm8k for your model
python -m example_trainer.test_multi_model \
--models qwen3-4b \
--sequential \
@@ -36,9 +37,9 @@ Usage:
--atropos-url http://localhost:8002
Port allocation with --auto-env:
Model 0: run-api:8002, vLLM:9001
Model 1: run-api:8003, vLLM:9002
Model 2: run-api:8004, vLLM:9003
Model 0: run-api:8002, vLLM:9001, GPU from --gpus[0]
Model 1: run-api:8003, vLLM:9002, GPU from --gpus[1]
Model 2: run-api:8004, vLLM:9003, GPU from --gpus[2]
...
"""
@@ -195,6 +196,8 @@ def run_model_test(
training_steps: int,
vllm_port_offset: int = 0,
auto_env: bool = False,
use_wandb: bool = False,
wandb_project: str = "multi-model-test",
) -> Dict:
"""
Run a complete training test for a single model.
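Both new parameters default to off, so existing call sites are unaffected; when set, they are forwarded to the trainer command line (see the run_cmd hunk below). A hypothetical direct call, with every argument value a placeholder rather than something taken from the script:

    # Hypothetical invocation showing the new keyword arguments; all
    # values below are placeholders.
    result = run_model_test(
        model, gpu_id, atropos_url, atropos_port, base_dir, timestamp,
        training_steps,
        vllm_port_offset=0,
        auto_env=True,
        use_wandb=True,                    # new: enable W&B logging
        wandb_project="multi-model-test",  # new: W&B project name
    )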
@@ -288,13 +291,15 @@ def run_model_test(
# which is required for CUDA IPC with ptrace_scope=1
run_script = script_dir / "run.py"
# Don't use CUDA_VISIBLE_DEVICES - use --device instead
# run.py sets CUDA_VISIBLE_DEVICES internally based on --device
run_env = os.environ.copy()
run_env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
run_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
run_cmd = [
sys.executable, "-u", str(run_script),
"--model", model_config.model_id,
"--device", f"cuda:{gpu_id}", # This controls GPU selection
"--vllm-port", str(vllm_port),
"--gpu-memory-utilization", str(model_config.gpu_memory_utilization),
"--max-model-len", str(model_config.max_model_len),
@@ -307,6 +312,10 @@ def run_model_test(
"--log-dir", str(log_dir),
]
# Add wandb flags if enabled
if use_wandb:
run_cmd.extend(["--use-wandb", "--wandb-project", wandb_project])
print(f"[{model_name}] Starting unified trainer (vLLM + GRPO) for {training_steps} steps...")
with open(trainer_log, "w") as tlog:
trainer_process = subprocess.Popen(
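The hunk cuts off at the Popen call, so for orientation, here is a self-contained sketch of the launch pattern it follows (copied environment, pinned GPU, per-model log redirection); the literal path, port, and GPU id are illustrative, not taken from the script:

    # Sketch of the trainer launch pattern above; values are illustrative.
    import os
    import subprocess
    import sys

    run_env = os.environ.copy()
    run_env["CUDA_VISIBLE_DEVICES"] = "0"  # one GPU per trainer process
    run_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    with open("trainer.log", "w") as tlog:
        trainer_process = subprocess.Popen(
            [sys.executable, "-u", "run.py", "--vllm-port", "9001"],
            env=run_env,
            stdout=tlog,
            stderr=subprocess.STDOUT,  # fold stderr into the same log file
        )
    # The child keeps its own duplicated file descriptor, so the log stays
    # attached even after the with-block closes the parent's handle.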
@@ -386,6 +395,8 @@ def run_parallel_tests(
base_dir: str,
training_steps: int,
auto_env: bool = False,
use_wandb: bool = False,
wandb_project: str = "multi-model-test",
) -> List[Dict]:
"""Run tests for multiple models in parallel."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -396,7 +407,7 @@ def run_parallel_tests(
def run_and_store(model, gpu, port_offset):
result = run_model_test(
model, gpu, atropos_url, atropos_port, base_dir, timestamp,
training_steps, port_offset, auto_env
training_steps, port_offset, auto_env, use_wandb, wandb_project
)
with result_lock:
results.append(result)
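run_and_store appends under result_lock because, in the parallel path, each model runs on its own thread. A sketch of the fan-out this implies; the Thread wiring is an assumption, only run_and_store and the lock-guarded results list appear in the diff:

    # Assumed thread-per-model fan-out; run_and_store comes from the diff,
    # the placeholder model/GPU lists and Thread wiring do not.
    import threading

    models = ["qwen3-4b", "hermes-8b"]  # placeholder model list
    gpus = [0, 1]                       # one GPU per model

    threads = [
        threading.Thread(target=run_and_store, args=(model, gpu, i))
        for i, (model, gpu) in enumerate(zip(models, gpus))
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until every model's test has finished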
@@ -423,6 +434,8 @@ def run_sequential_tests(
base_dir: str,
training_steps: int,
auto_env: bool = False,
use_wandb: bool = False,
wandb_project: str = "multi-model-test",
) -> List[Dict]:
"""Run tests for multiple models sequentially on one GPU."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -431,7 +444,8 @@ def run_sequential_tests(
for i, model in enumerate(models):
result = run_model_test(
model, gpu_id, atropos_url, atropos_port, base_dir, timestamp,
training_steps, port_offset=0, auto_env=auto_env
training_steps, port_offset=0, auto_env=auto_env,
use_wandb=use_wandb, wandb_project=wandb_project
)
results.append(result)
@@ -548,7 +562,18 @@ Available models: """ + ", ".join(TEST_MODELS.keys())
parser.add_argument(
"--auto-env",
action="store_true",
help="Automatically start gsm8k environment for each model (requires run-api to be running)",
help="Automatically start run-api and gsm8k environment for each model",
)
parser.add_argument(
"--use-wandb",
action="store_true",
help="Enable Weights & Biases logging for training runs",
)
parser.add_argument(
"--wandb-project",
type=str,
default="multi-model-test",
help="W&B project name for logging",
)
args = parser.parse_args()
@@ -580,19 +605,27 @@ Available models: """ + ", ".join(TEST_MODELS.keys())
gpus = gpus * (len(models) // len(gpus) + 1)
print(f"Using GPUs: {gpus[:len(models)]}")
if args.use_wandb:
print(f"W&B logging enabled (project: {args.wandb_project})")
results = run_parallel_tests(
models, gpus[:len(models)],
args.atropos_url, args.atropos_port,
args.output_dir, args.training_steps,
auto_env=args.auto_env
auto_env=args.auto_env,
use_wandb=args.use_wandb,
wandb_project=args.wandb_project,
)
else:
print(f"Using GPU: {args.gpu}")
if args.use_wandb:
print(f"W&B logging enabled (project: {args.wandb_project})")
results = run_sequential_tests(
models, args.gpu,
args.atropos_url, args.atropos_port,
args.output_dir, args.training_steps,
auto_env=args.auto_env
auto_env=args.auto_env,
use_wandb=args.use_wandb,
wandb_project=args.wandb_project,
)
# Print summary
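As a worked example of the GPU-cycling line in the parallel branch above (gpus = gpus * (len(models) // len(gpus) + 1)): with four models and two GPUs, the list is tiled past the model count and then trimmed, so every model still gets an assignment:

    # Worked example of the gpus tiling in the parallel branch.
    models = ["qwen3-4b", "hermes-8b", "nemotron-14b", "devstral-24b"]
    gpus = [0, 1]
    gpus = gpus * (len(models) // len(gpus) + 1)  # [0, 1, 0, 1, 0, 1]
    print(gpus[:len(models)])                     # -> [0, 1, 0, 1]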