From d07ab3e3cedf0d44d111ed80c94915a59b09ac84 Mon Sep 17 00:00:00 2001 From: Jai Suphavadeeprasit Date: Wed, 4 Feb 2026 18:01:59 -0500 Subject: [PATCH] math zero workarounds --- environments/math_server_zero.py | 47 ++++++++++++------- example_trainer/configs/math_zero_lora.yaml | 8 ---- example_trainer/configs/math_zero_shared.yaml | 8 ---- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py index eebbd5ec..bbb116fd 100644 --- a/environments/math_server_zero.py +++ b/environments/math_server_zero.py @@ -4,10 +4,11 @@ Original Repository: https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero """ import asyncio +import os import random import re from concurrent.futures import ProcessPoolExecutor -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import wandb from datasets import load_dataset @@ -24,6 +25,7 @@ from atroposlib.envs.base import ( ScoredDataGroup, ServerBaseline, ) +from atroposlib.envs.server_handling.server_baseline import APIServerConfig prompt_format = ( "A conversation between User and Assistant. The User asks a question, and the Assistant solves it. 
The Assistant " @@ -119,7 +121,7 @@ class MathEnv(BaseEnv): def __init__( self, config: RSConfig, - server_configs: ServerBaseline, + server_configs: Union[ServerBaseline, List[APIServerConfig]], slurm=True, testing=False, ): @@ -137,26 +139,39 @@ class MathEnv(BaseEnv): self.iter = 0 @classmethod - def config_init(cls) -> Tuple[RSConfig, ServerBaseline]: + def config_init(cls) -> Tuple[RSConfig, List[APIServerConfig]]: + # Allow configuration via environment variables for running multiple instances + model_name = os.environ.get("MATH_ENV_MODEL", "Qwen/Qwen3-4B-Instruct-2507") + rollout_url = os.environ.get("MATH_ENV_ROLLOUT_URL", "http://localhost:8000") + vllm_url = os.environ.get("MATH_ENV_VLLM_URL", "http://localhost:9001/v1") + wandb_name = os.environ.get("MATH_ENV_WANDB_NAME", "math-zero-env") + max_token_length = int(os.environ.get("MATH_ENV_MAX_TOKENS", "8192")) + env_config = RSConfig( - tokenizer_name="Qwen/Qwen2.5-7B", - group_size=16, + tokenizer_name=model_name, + group_size=8, use_wandb=True, - rollout_server_url="http://localhost:8000", - total_steps=1000, - batch_size=1024, - steps_per_eval=25, - max_token_length=31000, # 22000 // (2 ** i), - wandb_name="math", + rollout_server_url=rollout_url, + total_steps=120, + batch_size=64, + steps_per_eval=20, + max_token_length=max_token_length, + start_tok_length=max_token_length, + wandb_name=wandb_name, eval_handling=EvalHandlingEnum.LIMIT_TRAIN, eval_limit_ratio=0.1, max_num_workers_per_node=24, ) - server_configs = ServerBaseline( - model_name="Qwen/Qwen2.5-7B", - num_requests_for_eval=256, # since evaling only on one... 
- server_type="vllm", - ) + server_configs = [ + APIServerConfig( + model_name=model_name, + base_url=vllm_url, + api_key="x", + num_requests_for_eval=256, + server_type="vllm", + weight=1.0, + ) + ] return env_config, server_configs diff --git a/example_trainer/configs/math_zero_lora.yaml b/example_trainer/configs/math_zero_lora.yaml index 650d9d82..5480c87e 100644 --- a/example_trainer/configs/math_zero_lora.yaml +++ b/example_trainer/configs/math_zero_lora.yaml @@ -11,11 +11,3 @@ env: wandb_name: "math-zero-lora-env" eval_limit_ratio: 0.1 max_num_workers_per_node: 24 - -openai: - base_url: "http://localhost:9002/v1" - model_name: "Qwen/Qwen3-4B-Instruct-2507" - server_type: "vllm" - api_key: "x" - num_requests_for_eval: 256 - weight: 1.0 diff --git a/example_trainer/configs/math_zero_shared.yaml b/example_trainer/configs/math_zero_shared.yaml index 35979498..e5ee82c7 100644 --- a/example_trainer/configs/math_zero_shared.yaml +++ b/example_trainer/configs/math_zero_shared.yaml @@ -11,11 +11,3 @@ env: wandb_name: "math-zero-shared-env" eval_limit_ratio: 0.1 max_num_workers_per_node: 24 - -openai: - base_url: "http://localhost:9001/v1" - model_name: "Qwen/Qwen3-4B-Instruct-2507" - server_type: "vllm" - api_key: "x" - num_requests_for_eval: 256 - weight: 1.0