diff --git a/environments/infinimath/infinimath_env.py b/environments/infinimath/infinimath_env.py
index e946e7e7..5e89a56c 100644
--- a/environments/infinimath/infinimath_env.py
+++ b/environments/infinimath/infinimath_env.py
@@ -5,22 +5,21 @@ import random
import re
from typing import Any, Dict, List, Optional, Tuple, Union
-from trajectoryhandler.envs.base import (
+from atroposlib.envs.base import (
BaseEnv,
BaseEnvConfig,
OpenaiConfig,
ScoredDataGroup,
)
-from trajectoryhandler.envs.reward_fns import registry
-from trajectoryhandler.envs.reward_fns.combined_reward import CombinedReward
-from trajectoryhandler.utils.tokenize_for_trainer import tokenize_for_trainer
+from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
from .curriculum import MathCurriculum
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
-system_prompt = """You are an expert mathematician. You need to solve the given math problem step-by-step, showing your reasoning clearly. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your final answer in a LaTeX format using \\boxed{your answer here}.
+system_prompt = """You are an expert mathematician that can use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering.
+You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your final answer in a LaTeX format using \\boxed{your answer here}.
The problems will be given in a LaTeX format, so be sure to follow the LaTeX syntax when writing your answer (although no $ delimiters are necessary).
@@ -55,6 +54,8 @@ class InfiniteMathEnvConfig(BaseEnvConfig):
max_attempts_per_problem: int = 3
correct_reward: float = 1.0
incorrect_reward: float = -1.0
+ think_block_bonus: float = 0.2 # Bonus for a well-formed think block
+ boxed_answer_bonus: float = 0.2 # Bonus for a well-formed boxed answer
# Length penalty parameters
apply_length_penalty: bool = True
@@ -66,16 +67,6 @@ class InfiniteMathEnvConfig(BaseEnvConfig):
temperature: float = 0.7
top_p: float = 0.9
- # Reward functions
- reward_functions: List[Union[str, Dict[str, Any]]] = ["accuracy", "format", "boxed"]
- accuracy_reward_weight: float = 1.0 # Weight for the accuracy reward
- format_reward_weight: float = (
- 0.2 # Weight for the format reward relative to correctness
- )
- boxed_reward_weight: float = (
- 0.3 # Weight for the boxed answer reward relative to correctness
- )
-
class InfiniteMathEnv(BaseEnv):
"""Environment for procedurally generated math problems with curriculum advancement."""
@@ -103,70 +94,6 @@ class InfiniteMathEnv(BaseEnv):
# Set the system prompt
self.system_prompt = system_prompt
- # Initialize reward function
- self.reward_function = self._initialize_reward_function()
-
- def _initialize_reward_function(self):
- """Initialize the combined reward function for scoring."""
- if hasattr(self.config, "reward_functions") and self.config.reward_functions:
- # Configure parameters for specific reward functions
- reward_configs = []
-
- for reward_func in self.config.reward_functions:
- if isinstance(reward_func, str):
- # String name case - handle known rewards with custom params
- if reward_func == "accuracy":
- # Configure accuracy reward
- accuracy_config = {
- "type": "accuracy",
- "weight": self.config.accuracy_reward_weight,
- "params": {
- "split_on_think_tag": True, # Only look at what's after </think>
- "tolerance": 1e-6, # Tolerance for number comparisons
- },
- }
- logger.info(f"Adding accuracy reward with config: {accuracy_config}")
- reward_configs.append(accuracy_config)
- elif reward_func == "format":
- # Configure format reward with think tags and explicit weight
- format_config = {
- "type": "format",
- "weight": self.config.format_reward_weight,
- "params": {
- "preferred_tags": ["think"],
- },
- }
- logger.info(f"Adding format reward with config: {format_config}")
- reward_configs.append(format_config)
- elif reward_func == "boxed":
- # Configure boxed reward with proper parameters and explicit weight
- boxed_config = {
- "type": "boxed",
- "weight": self.config.boxed_reward_weight,
- "params": {
- "require_outside_think": True,
- },
- }
- logger.info(f"Adding boxed reward with config: {boxed_config}")
- reward_configs.append(boxed_config)
- else:
- # Pass through other reward functions as is
- logger.info(f"Adding generic reward function: {reward_func}")
- reward_configs.append(reward_func)
- else:
- # Dict case - pass through as is
- logger.info(f"Adding reward config: {reward_func}")
- reward_configs.append(reward_func)
-
- # Create the reward function(s)
- if len(reward_configs) == 1:
- logger.info(f"Creating single reward function: {reward_configs[0]}")
- return registry.create(reward_configs[0])
- else:
- logger.info(f"Creating combined reward function with {len(reward_configs)} rewards")
- # Add explicit normalization to sum to 1.0
- return CombinedReward(rewards=reward_configs, normalization="none")
-
async def setup(self):
"""Initialize the environment and curriculum."""
logger.info("Setting up InfiniteMathEnv")
@@ -340,21 +267,22 @@ class InfiniteMathEnv(BaseEnv):
)
# Log reward function metrics
- if hasattr(self, "reward_function") and self.wandb:
- if hasattr(self.reward_function, "set_wandb_logger"):
- self.reward_function.set_wandb_logger(self.wandb)
+ # REMOVED: Specific reward function config logging as it's not used anymore
+ # if hasattr(self, "reward_function") and self.wandb:
+ # if hasattr(self.reward_function, "set_wandb_logger"):
+ # self.reward_function.set_wandb_logger(self.wandb)
- # Log the reward configurations
- if isinstance(self.config.reward_functions, list) and self.config.reward_functions:
- # Log the reward configuration
- wandb_metrics["reward/format_reward_enabled"] = "format" in self.config.reward_functions
- wandb_metrics["reward/boxed_reward_enabled"] = "boxed" in self.config.reward_functions
+ # # Log the reward configurations
+ # if isinstance(self.config.reward_functions, list) and self.config.reward_functions:
+ # # Log the reward configuration
+ # wandb_metrics["reward/format_reward_enabled"] = "format" in self.config.reward_functions
+ # wandb_metrics["reward/boxed_reward_enabled"] = "boxed" in self.config.reward_functions
- if hasattr(self.config, "format_reward_weight"):
- wandb_metrics["reward/format_reward_weight"] = self.config.format_reward_weight
+ # if hasattr(self.config, "format_reward_weight"):
+ # wandb_metrics["reward/format_reward_weight"] = self.config.format_reward_weight
- if hasattr(self.config, "boxed_reward_weight"):
- wandb_metrics["reward/boxed_reward_weight"] = self.config.boxed_reward_weight
+ # if hasattr(self.config, "boxed_reward_weight"):
+ # wandb_metrics["reward/boxed_reward_weight"] = self.config.boxed_reward_weight
# Add eval metrics
for item in self.eval_metrics:
@@ -502,7 +430,7 @@ class InfiniteMathEnv(BaseEnv):
)
# Extract the boxed answer if present
- boxed_answer = self.extract_boxed_answer(after_think_part)
+ boxed_answer = self._extract_boxed_answer(after_think_part)
if not boxed_answer:
# Try to find the answer in the last line
lines = after_think_part.strip().split("\n")
@@ -510,15 +438,15 @@ class InfiniteMathEnv(BaseEnv):
boxed_answer = lines[-1].strip()
# Clean up answers for comparison (remove spaces, convert to lowercase)
- model_clean = self.clean_for_comparison(
+ model_clean = self._clean_for_comparison(
boxed_answer if boxed_answer else after_think_part
)
- solution_clean = self.clean_for_comparison(solution)
+ solution_clean = self._clean_for_comparison(solution)
# Check if they match
return model_clean == solution_clean
- def extract_boxed_answer(self, text: str) -> Optional[str]:
+ def _extract_boxed_answer(self, text: str) -> Optional[str]:
"""Extract answer from a LaTeX boxed expression."""
# Try to find boxed content
boxed_match = re.search(r"\\boxed{([^}]*)}", text)
@@ -526,7 +454,7 @@ class InfiniteMathEnv(BaseEnv):
return boxed_match.group(1)
return None
- def clean_for_comparison(self, text: str) -> str:
+ def _clean_for_comparison(self, text: str) -> str:
"""Clean text for comparison."""
# Remove LaTeX commands, spaces, commas, and convert to lowercase
cleaned = re.sub(r"\\[a-zA-Z]+", "", text)
@@ -604,86 +532,54 @@ class InfiniteMathEnv(BaseEnv):
scored_data["scores"] = []
scored_data["messages"] = []
- # Format completions for reward function evaluation
- format_completions = []
-
# Process each item in the rollout data
- for messages, solution, generator_id, level in rollout_group_data:
- # Extract the model's answer
- model_answer = messages[-1]["content"]
-
- # Add to format completions list for reward function
- format_completions.append([{"role": "assistant", "content": model_answer}])
-
- # Record performance in curriculum based on the answer and solution
- # This will be updated after the reward functions are applied
-
- # Apply all reward functions
- reward_scores = []
- unweighted_scores = []
- if hasattr(self, "reward_function") and self.reward_function:
- try:
- # Apply the reward function (which may be a combined reward)
- reward_scores = self.reward_function(format_completions, solution=solution)
- logger.info(f"Reward scores: {reward_scores}")
-
- # Debug individual rewards if it's a combined reward
- if hasattr(self.reward_function, "rewards"):
- logger.info(f"Combined reward with {len(self.reward_function.rewards)} components")
- for i, reward in enumerate(self.reward_function.rewards):
- if hasattr(reward, "compute"):
- # Get raw unweighted scores
- raw_scores = reward.compute(format_completions, solution=solution)
- if hasattr(reward, "weight"):
- logger.info(f"Reward {i} ({type(reward).__name__}): raw={raw_scores}, weight={reward.weight}")
- else:
- logger.info(f"Reward {i} ({type(reward).__name__}): raw={raw_scores}")
- else:
- logger.info(f"Using single reward: {type(self.reward_function).__name__}")
-
- except Exception as e:
- logger.error(f"Error applying reward functions: {e}")
- logger.exception(e)
- reward_scores = [0.0] * len(format_completions)
-
- # Now update curriculum based on accuracy reward results
for i, (messages, solution, generator_id, level) in enumerate(rollout_group_data):
- # Extract accuracy from the combined reward if available
- is_correct = False
- if reward_scores and hasattr(self.reward_function, "rewards"):
- for reward in self.reward_function.rewards:
- if type(reward).__name__ == "AccuracyReward":
- # Get raw scores from accuracy reward
- accuracy_scores = reward.compute(format_completions, solution=solution)
- is_correct = accuracy_scores[i] > 0
- break
+ model_answer = messages[-1]["content"]
+ current_score = 0.0
+
+ # 1. Accuracy Check
+ is_correct = self.check_answer(model_answer, solution)
+ if is_correct:
+ current_score += self.config.correct_reward
+ else:
+ current_score += self.config.incorrect_reward
- # Record answer correctness for tracking
+ # Record answer correctness for tracking and curriculum
self.percent_correct_buffer.append(1 if is_correct else 0)
if level is not None:
self.level_correct_buffer[level].append(1 if is_correct else 0)
-
- # Record performance in curriculum
self.curriculum.record_performance(generator_id, is_correct)
-
- # Combine scores and add to scored data
- for i, (messages, _, _, _) in enumerate(rollout_group_data):
- # Use the reward score directly (all weights are applied)
- combined_score = reward_scores[i] if reward_scores else 0.0
-
- logger.info(f"Final score for item {i}: {combined_score}")
+
+ # 2. Thinking Block Check
+ think_match = re.search(r"<think>(.*?)</think>", model_answer, re.DOTALL)
+ if think_match:
+ think_content = think_match.group(1).strip()
+ if think_content: # Check if there's actual content
+ current_score += self.config.think_block_bonus
+ # else: penalty for empty think block, or neutral
+ # else: penalty for missing think block, or neutral
+
+ # 3. Boxed Answer Check
+ # Extract the part after the thinking block for boxed answer validation
+ after_think_part = model_answer.split("</think>")[-1].strip() if "</think>" in model_answer else model_answer
+ boxed_answer_content = self._extract_boxed_answer(after_think_part)
+ if boxed_answer_content is not None: # Check if \boxed{} is present and has content
+ current_score += self.config.boxed_answer_bonus
+ # else: penalty for missing/malformed boxed answer, or neutral
+ logger.info(f"Item {i}: Correct: {is_correct}, Think Bonus: {self.config.think_block_bonus if think_match and think_match.group(1).strip() else 0}, Boxed Bonus: {self.config.boxed_answer_bonus if boxed_answer_content is not None else 0}, Final Score: {current_score}")
+
# Tokenize for the trainer
tokens_dict = tokenize_for_trainer(
self.tokenizer,
- messages,
- None,
+ messages, # These are the full messages including system, user, assistant
+ None, # Not used by this tokenizer function apparently
)
# Add to scored data
scored_data["tokens"].append(tokens_dict["tokens"])
scored_data["masks"].append(tokens_dict["masks"])
- scored_data["scores"].append(combined_score)
+ scored_data["scores"].append(current_score)
scored_data["messages"].append(messages)
# Advance difficulty if criteria met
diff --git a/environments/infinimath/infinimath_server.py b/environments/infinimath/infinimath_server.py
index 08cd0efa..de9b9e3c 100644
--- a/environments/infinimath/infinimath_server.py
+++ b/environments/infinimath/infinimath_server.py
@@ -86,10 +86,6 @@ async def main():
length_threshold_ratio=raw_config.get("infinimath", {}).get("length_threshold_ratio", 0.6),
temperature=raw_config.get("infinimath", {}).get("temperature", 0.7),
top_p=raw_config.get("infinimath", {}).get("top_p", 0.9),
- reward_functions=raw_config.get("infinimath", {}).get("reward_functions", ["accuracy", "format", "boxed"]),
- accuracy_reward_weight=raw_config.get("infinimath", {}).get("accuracy_reward_weight", 1.0),
- format_reward_weight=raw_config.get("infinimath", {}).get("format_reward_weight", 0.2),
- boxed_reward_weight=raw_config.get("infinimath", {}).get("boxed_reward_weight", 0.3),
)
# Server configuration from config file or defaults