diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..3c941b35 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "environments/intern_bootcamp/internbootcamp_lib"] + path = environments/intern_bootcamp/internbootcamp_lib + url = https://github.com/InternLM/InternBootcamp.git diff --git a/environments/intern_bootcamp/README.md b/environments/intern_bootcamp/README.md new file mode 100644 index 00000000..043d190f --- /dev/null +++ b/environments/intern_bootcamp/README.md @@ -0,0 +1,272 @@ +# InternBootcamp RL Training Environment + +## Overview + +The InternBootcamp RL Training Environment is a flexible and extensible framework for training large reasoning models using reinforcement learning on verifiable reasoning tasks. Based on the [InternBootcamp](https://github.com/InternLM/InternBootcamp) library, this environment provides a seamless integration between InternBootcamp's comprehensive collection of reasoning tasks and the Atropos RL training infrastructure. + +## How InternBootcamp Works + +InternBootcamp is a library that provides: + +1. **Standardized Task Interface**: Each task (called a "bootcamp") implements three core methods: + - `case_generator()`: Generates problem instances with controllable difficulty + - `prompt_func()`: Converts problem instances into natural language prompts + - `verify_score()`: Verifies and scores model responses + +2. **Diverse Task Coverage**: Over 1,000 verifiable reasoning tasks including: + - Logic puzzles (e.g., Game24, Sudoku, N-Queens) + - Mathematical problems (algebra, geometry, calculus) + - Algorithm challenges (sorting, searching, optimization) + - Game-based reasoning (chess, Go, strategic games) + - Pattern recognition and sequence problems + +3. 
**Automatic Task Generation**: Tasks can generate unlimited problem instances with: + - Controllable difficulty parameters + - Consistent verification methods + - Scalable complexity + +## Architecture + +``` +InternBootcamp RL Environment +├── Task Selection Layer +│ ├── Single Task Mode (train on one specific bootcamp) +│ ├── Multi-Task Mode (train on multiple bootcamps - TBD) +│ └── Curriculum Mode (progressive difficulty - TBD) +│ +├── InternBootcamp Integration +│ ├── Bootcamp Registry (dynamic task discovery) +│ ├── Bootcamp Instance Management +│ ├── Problem Generation Pipeline +│ └── Response Verification System +│ +├── RL Training Loop +│ ├── Trajectory Collection +│ ├── Reward Calculation +│ └── Policy Updates +│ +└── Atropos Base Environment + ├── Server Management + ├── Batch Processing + └── Wandb Logging +``` + +## Key Features + +### 1. Dynamic Task Discovery +The environment automatically discovers all available bootcamp tasks (1000+) without manual imports: + +```python +from environments.intern_bootcamp.bootcamp_registry import get_available_bootcamps + +# List all available tasks +tasks = get_available_bootcamps() +print(f"Found {len(tasks)} bootcamp tasks") +# Output: Found 1069 bootcamp tasks +``` + +### 2. Simple Task Selection +Train on any available bootcamp task by name: + +```python +# Train on Game24 +env = InternBootcampEnv(task_name="Game24bootcamp", task_params={"num_numbers": 4}) + +# Train on Sudoku +env = InternBootcampEnv(task_name="Sudokubootcamp") + +# Train on Maze solving +env = InternBootcampEnv(task_name="Mazebootcamp") +``` + +### 3. Automatic Problem Generation +Each training step: +1. Instantiates the selected bootcamp with specified parameters +2. Generates a new problem instance using `case_generator()` +3. Converts it to a natural language prompt via `prompt_func()` +4. Collects model responses +5. Verifies correctness using `verify_score()` + +### 4. 
Flexible Reward System +- **Base rewards**: Correct/incorrect responses (configurable) +- **Format bonuses**: Proper answer formatting (e.g., `\boxed{}` for math) +- **Reasoning bonuses**: Quality of step-by-step explanations +- **Task-specific scoring**: Each bootcamp can define its own scoring logic + +## Installation + +1. Clone the repository and navigate to the environment: +```bash +cd environments/intern_bootcamp +``` + +2. Install InternBootcamp (already included as a submodule): +```bash +cd internbootcamp_lib && uv pip install -e . +``` + +## Usage Examples + +### 1. Single Task Training +Train on Game24 puzzles with specific difficulty: + +```bash +python -m environments.intern_bootcamp serve \ + --env--task_name "Game24bootcamp" \ + --env--task_params '{"num_numbers": 4, "range_max": 100}' \ + --env--group_size 8 \ + --env--total_steps 10000 +``` + +### 2. Exploring Available Tasks +List all available bootcamp tasks: + +```python +from environments.intern_bootcamp.bootcamp_registry import get_available_bootcamps + +tasks = get_available_bootcamps() +for task in tasks[:20]: # Show first 20 + print(task) +``` + +### 3. Custom Configuration File +Use a YAML configuration for training: + +```yaml +# config/intern_bootcamp_game24.yaml +env: + task_name: "Game24bootcamp" + task_params: + num_numbers: 4 + range_max: 50 + target_max: 50 + + correct_reward: 1.0 + incorrect_reward: -0.5 + format_bonus: 0.2 + + group_size: 8 + total_steps: 10000 + steps_per_eval: 100 + +openai: + model_name: "gpt-4" + temperature: 0.7 + max_tokens: 2048 +``` + +Run with config: +```bash +python -m environments.intern_bootcamp serve --config config/intern_bootcamp_game24.yaml +``` + +## Available Bootcamp Tasks + +The environment supports over 1000 bootcamp tasks. 
Some examples include: + +- **Math & Logic**: Game24bootcamp, Sudokubootcamp, Kakurobootcamp +- **Algorithms**: Mazebootcamp, Slitherlinkbootcamp, Bridgesbootcamp +- **Games**: InternGObootcamp, Chessbootcamp +- **Pattern Recognition**: Arcbootcamp, Nonogramsbootcamp +- **Code Generation**: CodeIObootcamp, BigCodeBenchbootcamp +- **Language Tasks**: Cipherbootcamp, WordSortingbootcamp + +Use `get_available_bootcamps()` to see the full list. + +## Implementation Details + +### Environment Configuration + +```python +class InternBootcampEnvConfig(BaseEnvConfig): + # Task selection + task_name: str = "Game24bootcamp" # Bootcamp task name + task_params: Dict[str, Any] = {} # Task-specific parameters + + # Reward configuration + correct_reward: float = 1.0 + incorrect_reward: float = -0.5 + format_bonus: float = 0.2 + + # Training parameters + require_reasoning: bool = True + min_reasoning_length: int = 50 + temperature: float = 0.7 + top_p: float = 0.9 +``` + +### Bootcamp Registry + +The environment uses a dynamic registry system to discover and manage bootcamp tasks: + +```python +from environments.intern_bootcamp.bootcamp_registry import ( + create_bootcamp, + get_available_bootcamps, + bootcamp_registry +) + +# Create a bootcamp instance +bootcamp = create_bootcamp("Game24bootcamp", num_numbers=4, range_max=50) + +# Get information about a bootcamp +info = bootcamp_registry.get_bootcamp_info("Game24bootcamp") +print(info["parameters"]) # Shows accepted parameters +``` + +## Evaluation and Metrics + +The environment tracks comprehensive metrics: + +### Performance Metrics +- **Task accuracy**: Success rate on the specific bootcamp task +- **Format compliance**: Rate of properly formatted responses +- **Reasoning quality**: Length and coherence of explanations + +### Training Metrics +- **Reward statistics**: Mean, std, min, max rewards +- **Problem diversity**: Variety of generated problems +- **Learning progress**: Improvement over time + +## Troubleshooting + +### 
Common Issues + +1. **Task Not Found** + ``` + ValueError: Unknown bootcamp: XYZBootcamp + ``` + Solution: Check available tasks with `get_available_bootcamps()` + +2. **Import Errors** + ``` + ImportError: No module named 'internbootcamp' + ``` + Solution: Install InternBootcamp: `cd internbootcamp_lib && pip install -e .` + +3. **Parameter Errors** + ``` + TypeError: __init__() got an unexpected keyword argument + ``` + Solution: Check accepted parameters with `bootcamp_registry.get_bootcamp_info(task_name)` + +## Future Enhancements + +1. **Multi-Task Training**: Train on multiple bootcamps simultaneously +2. **Curriculum Learning**: Progressive difficulty advancement +3. **Task Composition**: Combine multiple bootcamps into complex reasoning chains +4. **Custom Bootcamps**: Easy integration of new reasoning tasks + +## Contributing + +To add new features or improvements: + +1. Fork the repository +2. Create a feature branch +3. Implement your changes following the existing patterns +4. Add tests for new functionality +5. Submit a pull request with a clear description + +## License + +This environment follows the same license as the Atropos framework and InternBootcamp library. 
"""
Bootcamp Registry for InternBootcamp Environment

This module provides a registry system for dynamically discovering and managing
InternBootcamp tasks without having to manually import each one.
"""

import importlib
import inspect
import logging
import random
from typing import Any, Dict, List, Type

logger = logging.getLogger(__name__)


class BootcampRegistry:
    """Registry for InternBootcamp tasks with dynamic discovery.

    Discovery is lazy: the ``internbootcamp`` package is only imported the
    first time a lookup is performed, so merely importing this module does
    not require InternBootcamp to be installed.
    """

    def __init__(self):
        # Maps bootcamp class name -> class object; populated by discover_bootcamps().
        self._registry: Dict[str, Type] = {}
        self._discovered = False

    def discover_bootcamps(self) -> None:
        """Dynamically discover all available bootcamp classes from InternBootcamp.

        Scans ``internbootcamp.bootcamp`` for public classes whose name ends
        with ``bootcamp`` and that expose the standard task interface
        (``case_generator`` / ``prompt_func`` / ``verify_score``).
        Idempotent: repeat calls are no-ops.

        Raises:
            ImportError: if ``internbootcamp.bootcamp`` cannot be imported.
        """
        if self._discovered:
            return

        try:
            bootcamp_module = importlib.import_module("internbootcamp.bootcamp")
        except ImportError as e:
            logger.error(f"Failed to import internbootcamp.bootcamp: {e}")
            raise

        for name in dir(bootcamp_module):
            if not name.endswith("bootcamp") or name.startswith("_"):
                continue
            try:
                obj = getattr(bootcamp_module, name)
                # Only register classes implementing the full task interface.
                if (
                    inspect.isclass(obj)
                    and hasattr(obj, "case_generator")
                    and hasattr(obj, "prompt_func")
                    and hasattr(obj, "verify_score")
                ):
                    self._registry[name] = obj
                    logger.debug(f"Registered bootcamp: {name}")
            except Exception as e:
                # Best-effort discovery: one broken bootcamp must not block the rest.
                logger.warning(f"Failed to register {name}: {e}")

        self._discovered = True
        logger.info(f"Discovered {len(self._registry)} bootcamp tasks")

    def get_bootcamp_class(self, name: str) -> Type:
        """Get a bootcamp class by name, triggering discovery if needed.

        Raises:
            ValueError: if *name* is not a known bootcamp.
        """
        if not self._discovered:
            self.discover_bootcamps()

        if name not in self._registry:
            available = self.list_available_bootcamps()
            raise ValueError(
                f"Unknown bootcamp: {name}. "
                f"Available bootcamps: {', '.join(available[:10])}..."
                f" ({len(available)} total)"
            )

        return self._registry[name]

    def create_bootcamp_instance(self, name: str, **params) -> Any:
        """Create an instance of a bootcamp with given parameters.

        Keyword arguments the bootcamp's constructor does not accept are
        dropped with a warning, so a shared parameter dict can be reused
        across heterogeneous bootcamps. If construction with the filtered
        parameters fails, falls back to a no-argument construction; if that
        also fails, the fallback error is raised chained to the original.
        """
        bootcamp_class = self.get_bootcamp_class(name)

        try:
            sig = inspect.signature(bootcamp_class.__init__)
            # A constructor taking **kwargs accepts any keyword argument.
            accepts_var_kw = any(
                p.kind is inspect.Parameter.VAR_KEYWORD
                for p in sig.parameters.values()
            )

            valid_params = {}
            for param_name, param_value in params.items():
                if accepts_var_kw or param_name in sig.parameters:
                    valid_params[param_name] = param_value
                else:
                    logger.warning(
                        f"Parameter '{param_name}' not accepted by {name}, ignoring"
                    )

            return bootcamp_class(**valid_params)

        except Exception as e:
            logger.error(f"Failed to create instance of {name}: {e}")
            # Try with no parameters as fallback; keep the original failure
            # attached via exception chaining instead of discarding it.
            try:
                return bootcamp_class()
            except Exception as fallback_error:
                raise fallback_error from e

    def list_available_bootcamps(self) -> List[str]:
        """List all available bootcamp names, sorted alphabetically."""
        if not self._discovered:
            self.discover_bootcamps()
        return sorted(self._registry)

    def get_bootcamp_info(self, name: str) -> Dict[str, Any]:
        """Get information about a specific bootcamp.

        Returns a dict with the bootcamp's name, class object, docstring,
        and a mapping of constructor parameter name -> {default, annotation}.
        """
        bootcamp_class = self.get_bootcamp_class(name)

        info = {
            "name": name,
            "class": bootcamp_class,
            "docstring": inspect.getdoc(bootcamp_class) or "No documentation available",
            "parameters": {},
        }

        try:
            sig = inspect.signature(bootcamp_class.__init__)
            for param_name, param in sig.parameters.items():
                if param_name == "self":
                    continue
                info["parameters"][param_name] = {
                    "default": (
                        param.default
                        if param.default is not inspect.Parameter.empty
                        else None
                    ),
                    "annotation": (
                        str(param.annotation)
                        if param.annotation is not inspect.Parameter.empty
                        else None
                    ),
                }
        except Exception as e:
            logger.warning(f"Could not inspect parameters for {name}: {e}")

        return info


class RandomTask:
    """Special bootcamp that randomly selects from available bootcamps on each call.

    Presents the same interface as a regular bootcamp (``case_generator`` /
    ``prompt_func`` / ``verify_score``) but delegates each call to a randomly
    chosen concrete bootcamp. The chosen bootcamp's name is stored in the
    generated case under the ``_bootcamp_name`` key so that prompting and
    scoring can be routed back to the same task later.
    """

    def __init__(self, **params):
        self.registry = BootcampRegistry()
        self.registry.discover_bootcamps()
        # Exclude base/template entries that are not runnable tasks.
        self.available_bootcamps = [
            name
            for name in self.registry.list_available_bootcamps()
            if not any(x in name.lower() for x in ["base", "template", "{puzzlename}"])
        ]
        # Forwarded to every delegated bootcamp constructor (unknown keys are
        # filtered out per-bootcamp by the registry).
        self.params = params
        self.current_bootcamp = None
        self.current_bootcamp_name = None
        logger.info(
            f"RandomTask initialized with {len(self.available_bootcamps)} available bootcamps"
        )

    def _ensure_bootcamp(self, bootcamp_name) -> None:
        """(Re)create the cached bootcamp instance if the active name changed."""
        if not self.current_bootcamp or self.current_bootcamp_name != bootcamp_name:
            self.current_bootcamp_name = bootcamp_name
            self.current_bootcamp = self.registry.create_bootcamp_instance(
                bootcamp_name, **self.params
            )

    @staticmethod
    def _unwrap_identity(identity) -> Any:
        """Strip the tracking key and undo the wrapping added by case_generator."""
        identity_copy = dict(identity)
        identity_copy.pop("_bootcamp_name", None)
        # Non-dict cases were wrapped as {"data": case, "_bootcamp_name": ...};
        # unwrap them back to the raw case object.
        if "data" in identity_copy and len(identity_copy) == 1:
            identity_copy = identity_copy["data"]
        return identity_copy

    def case_generator(self) -> object:
        """Generate a case by randomly selecting a bootcamp."""
        self.current_bootcamp_name = random.choice(self.available_bootcamps)
        self.current_bootcamp = self.registry.create_bootcamp_instance(
            self.current_bootcamp_name, **self.params
        )

        case = self.current_bootcamp.case_generator()

        # Record which bootcamp produced the case so prompt_func/verify_score
        # can be routed to the same task, wrapping non-dict cases.
        if isinstance(case, dict):
            case["_bootcamp_name"] = self.current_bootcamp_name
        else:
            case = {"data": case, "_bootcamp_name": self.current_bootcamp_name}

        return case

    def prompt_func(self, identity) -> str:
        """Generate a prompt by delegating to the bootcamp that made *identity*."""
        bootcamp_name = identity.get("_bootcamp_name", self.current_bootcamp_name)
        self._ensure_bootcamp(bootcamp_name)
        return self.current_bootcamp.prompt_func(self._unwrap_identity(identity))

    @classmethod
    def extract_output(cls, output):
        """Not supported: RandomTask always delegates to a concrete bootcamp."""
        raise NotImplementedError(
            "RandomTask does not implement extract_output directly"
        )

    @classmethod
    def _verify_correction(cls, solution, identity):
        """Not supported: RandomTask always delegates to a concrete bootcamp."""
        raise NotImplementedError(
            "RandomTask does not implement _verify_correction directly"
        )

    def verify_score(
        self,
        model_output,
        identity,
        format_score=0,
        short_penalty=True,
        short_threshold=100,
        format_penalty=True,
    ) -> float:
        """Verify and score *model_output* via the bootcamp that made *identity*."""
        bootcamp_name = identity.get("_bootcamp_name", self.current_bootcamp_name)
        self._ensure_bootcamp(bootcamp_name)
        return self.current_bootcamp.verify_score(
            model_output,
            self._unwrap_identity(identity),
            format_score,
            short_penalty,
            short_threshold,
            format_penalty,
        )


# Global registry instance shared by the module-level helpers below.
bootcamp_registry = BootcampRegistry()


def get_available_bootcamps() -> List[str]:
    """Get a list of all available bootcamp names."""
    return bootcamp_registry.list_available_bootcamps()


def create_bootcamp(name: str, **params) -> Any:
    """Create a bootcamp instance by name with parameters.

    ``"RandomTask"`` is handled specially and returns the multi-task
    delegating bootcamp instead of a registry lookup.
    """
    if name == "RandomTask":
        return RandomTask(**params)
    return bootcamp_registry.create_bootcamp_instance(name, **params)
You may use " + "extremely long chains of thought to deeply consider the problem and " + "deliberate with yourself via systematic reasoning processes to help come " + "to a correct solution.\n\n" + "You should enclose your thoughts and internal monologue inside " + " tags, and then provide your solution or response to the problem. " + "Please think in English, even if the problem is presented in another " + "language.\n\n" + "When solving problems:\n" + "1. Think step by step through the problem inside tags\n" + "2. Show your work clearly in your thinking\n" + "3. Verify your answer before finalizing\n" + "4. Follow the specific answer format requested in the problem\n\n" + "Pay close attention to how the problem asks you to format your answer - " + "some may require specific tags, notations, or formats." +) + + +class InternBootcampEnvConfig(BaseEnvConfig): + """Configuration for the InternBootcamp environment.""" + + # Task selection + task_name: str = "RandomTask" # Random task selection mode + + # Task-specific parameters + task_params: Dict[str, Any] = {} + + # Reward configuration + correct_reward: float = 1.0 + incorrect_reward: float = -0.5 + format_bonus: float = 0.2 + + # Training parameters + require_reasoning: bool = True + min_reasoning_length: int = 50 + temperature: float = 0.7 + top_p: float = 0.9 + + +class InternBootcampEnv(BaseEnv): + """Environment for training on InternBootcamp reasoning tasks.""" + + name = "intern_bootcamp" + + def __init__( + self, + config: InternBootcampEnvConfig, + server_configs: Union[List[APIServerConfig], APIServerConfig], + slurm=True, + testing=False, + ): + super().__init__(config, server_configs, slurm, testing) + self.config = config + + # Task tracking + self.bootcamp_instance = None + self.current_task_name = config.task_name + + # Performance tracking + self.task_correct_buffer = [] + self.format_correct_buffer = [] + self.eval_metrics = [] + + self.system_prompt = SYSTEM_PROMPT + + async def setup(self): + 
"""Initialize the environment and bootcamp task.""" + logger.info(f"Setting up InternBootcampEnv with task: {self.config.task_name}") + + # Log available bootcamps + available = get_available_bootcamps() + logger.info(f"Found {len(available)} available bootcamp tasks") + logger.debug(f"Available tasks (first 20): {available[:20]}") + + # Initialize the bootcamp task + self._initialize_bootcamp() + + # Generate some test problems to verify setup + try: + for i in range(3): + identity = self.bootcamp_instance.case_generator() + prompt = self.bootcamp_instance.prompt_func(identity) + logger.info(f"Test problem {i+1}: {prompt[:100]}...") + except Exception as e: + logger.error(f"Failed to generate test problems: {e}") + raise + + def _initialize_bootcamp(self): + """Initialize the bootcamp instance based on task name.""" + try: + # Create bootcamp instance using the registry + self.bootcamp_instance = create_bootcamp( + self.config.task_name, **self.config.task_params + ) + logger.info( + f"Initialized {self.config.task_name} with params: {self.config.task_params}" + ) + except ValueError as e: + # If task not found, list available tasks + available = get_available_bootcamps() + logger.error(f"Task '{self.config.task_name}' not found!") + logger.error(f"Available tasks (showing first 20): {available[:20]}") + raise e + except Exception as e: + logger.error(f"Failed to initialize bootcamp: {e}") + raise + + async def get_next_item(self) -> Tuple[Any, Dict]: + """Get the next problem from the bootcamp.""" + # Generate a new problem + identity = self.bootcamp_instance.case_generator() + prompt = self.bootcamp_instance.prompt_func(identity) + + # Log which bootcamp is being used if RandomTask + if ( + self.config.task_name == "RandomTask" + and isinstance(identity, dict) + and "_bootcamp_name" in identity + ): + logger.info(f"RandomTask selected: {identity['_bootcamp_name']}") + + # Create the message format expected by Atropos + messages = [ + {"role": "system", 
"content": self.system_prompt}, + {"role": "user", "content": prompt}, + ] + + # Return item with metadata + return ( + messages, + { + "identity": identity, + "task_name": self.current_task_name, + "raw_prompt": prompt, + }, + ) + + async def collect_trajectories(self, item) -> Tuple[List, List]: + """Collect trajectories for the current item.""" + messages, metadata = item + logger.info(f"Collecting trajectories for item: {messages}") + + # Get completions from the model using chat_completion + completions = await self.server.chat_completion( + messages=messages, + n=self.config.group_size, + max_tokens=self.config.max_token_length, + temperature=self.config.temperature, + top_p=self.config.top_p, + ) + + to_score = [] + + for i, completion in enumerate(completions.choices): + model_response = completion.message.content + + # Create full conversation for scoring + full_messages = messages + [ + {"role": "assistant", "content": model_response} + ] + + to_score.append((full_messages, metadata, model_response)) + + # Score the trajectories immediately and return a ScoredDataGroup + scored_data = await self.score(to_score) + backlog = [] # No backlog items for now + + return scored_data, backlog + + async def score(self, rollout_group_data) -> ScoredDataGroup: + """Score the collected trajectories using bootcamp verification.""" + scored_data = ScoredDataGroup() + scored_data["tokens"] = [] + scored_data["masks"] = [] + scored_data["scores"] = [] + scored_data["messages"] = [] + + for messages, metadata, model_response in rollout_group_data: + # Verify the response using the bootcamp + identity = metadata["identity"] + + # Calculate base score from bootcamp verification + base_score = self.bootcamp_instance.verify_score( + model_response, + identity, + format_score=self.config.format_bonus, + short_penalty=self.config.require_reasoning, + short_threshold=self.config.min_reasoning_length, + ) + + # Apply reward scaling + if base_score >= 1.0: + # Correct answer with 
format + final_score = self.config.correct_reward + self.task_correct_buffer.append(1) + self.format_correct_buffer.append(1) + elif base_score > 0: + # Correct format but wrong answer + final_score = self.config.incorrect_reward + base_score + self.task_correct_buffer.append(0) + self.format_correct_buffer.append(1) + else: + # Wrong answer and/or format + final_score = self.config.incorrect_reward + self.task_correct_buffer.append(0) + self.format_correct_buffer.append(0) + + # Log the scoring details + logger.debug( + f"Scored response: base_score={base_score}, " + f"final_score={final_score}, " + f"identity={identity}" + ) + + # Tokenize for trainer + tokens_dict = tokenize_for_trainer( + self.tokenizer, + messages, + None, + ) + + scored_data["tokens"].append(tokens_dict["tokens"]) + scored_data["masks"].append(tokens_dict["masks"]) + scored_data["scores"].append(final_score) + scored_data["messages"].append(messages) + + return scored_data + + async def evaluate(self, *args, **kwargs): + """Evaluate the model on test problems.""" + logger.info(f"Starting evaluation for {self.current_task_name}") + + eval_tasks = [] + num_eval_problems = 20 # Number of problems to evaluate on + + # Generate evaluation problems + for i in range(num_eval_problems): + eval_tasks.append(self.evaluate_single_problem()) + + # Run evaluations in parallel + results = await asyncio.gather(*eval_tasks) + + # Calculate metrics + correct_count = sum(1 for is_correct, _ in results if is_correct) + format_count = sum(1 for _, has_format in results if has_format) + total_count = len(results) + + accuracy = correct_count / total_count if total_count > 0 else 0 + format_rate = format_count / total_count if total_count > 0 else 0 + + logger.info( + f"Evaluation complete: accuracy={accuracy:.2%}, " + f"format_rate={format_rate:.2%} " + f"({correct_count}/{total_count} correct)" + ) + + # Store metrics for wandb logging + self.eval_metrics.append((f"eval/{self.current_task_name}_accuracy", 
accuracy)) + self.eval_metrics.append( + (f"eval/{self.current_task_name}_format_rate", format_rate) + ) + self.eval_metrics.append(("eval/overall_accuracy", accuracy)) + + return self.eval_metrics + + async def evaluate_single_problem(self) -> Tuple[bool, bool]: + """Evaluate a single problem.""" + try: + # Generate a problem + identity = self.bootcamp_instance.case_generator() + prompt = self.bootcamp_instance.prompt_func(identity) + + # Create messages + messages = [ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt}, + ] + + # Get model response using chat_completion + completion = await self.server.chat_completion( + messages=messages, + n=1, + max_tokens=self.config.max_token_length, + temperature=0.0, # Deterministic for evaluation + top_p=1.0, + split="eval", + ) + + model_response = completion.choices[0].message.content + + # Score the response + score = self.bootcamp_instance.verify_score( + model_response, + identity, + format_score=self.config.format_bonus, + short_penalty=False, # Don't penalize short responses in eval + ) + + is_correct = score >= 1.0 + has_format = score > 0 + + return is_correct, has_format + + except Exception as e: + logger.error(f"Error evaluating problem: {e}") + return False, False + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log metrics to wandb.""" + if wandb_metrics is None: + wandb_metrics = {} + + # Add training metrics + if self.task_correct_buffer: + wandb_metrics[f"train/{self.current_task_name}_accuracy"] = sum( + self.task_correct_buffer + ) / len(self.task_correct_buffer) + + if self.format_correct_buffer: + wandb_metrics[f"train/{self.current_task_name}_format_rate"] = sum( + self.format_correct_buffer + ) / len(self.format_correct_buffer) + + # Add evaluation metrics + for metric_name, value in self.eval_metrics: + wandb_metrics[metric_name] = value + + # Clear buffers + self.task_correct_buffer = [] + self.format_correct_buffer = [] + 
self.eval_metrics = [] + + await super().wandb_log(wandb_metrics) + + @classmethod + def config_init(cls) -> Tuple[InternBootcampEnvConfig, List[APIServerConfig]]: + """Initialize environment and server configurations.""" + env_config = InternBootcampEnvConfig( + tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview", + group_size=8, + use_wandb=True, + max_num_workers=64, + rollout_server_url="http://localhost:8000", + total_steps=10000, + batch_size=1024, + steps_per_eval=100, + max_token_length=16384, + inference_weight=1.0, + wandb_name="intern_bootcamp_random_tasks", + data_path_to_save_groups="data/intern_bootcamp_random_tasks.jsonl", + # Task configuration + task_name="RandomTask", + task_params={}, + # Reward configuration + correct_reward=1.0, + incorrect_reward=-0.5, + format_bonus=0.2, + # Training parameters + require_reasoning=True, + min_reasoning_length=50, + temperature=0.7, + top_p=0.9, + ) + + server_configs = [ + APIServerConfig( + model_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview", + base_url="http://localhost:9004/v1", + api_key="x", + num_requests_for_eval=64, + ) + ] + + return env_config, server_configs + + +if __name__ == "__main__": + InternBootcampEnv.cli() diff --git a/environments/intern_bootcamp/intern_bootcamp_local_test.py b/environments/intern_bootcamp/intern_bootcamp_local_test.py new file mode 100644 index 00000000..6b2292fb --- /dev/null +++ b/environments/intern_bootcamp/intern_bootcamp_local_test.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Local testing script for InternBootcamp environment with RandomTask +""" + +import asyncio +import logging +import os + +from dotenv import load_dotenv + +from atroposlib.envs.base import APIServerConfig, EvalHandlingEnum +from environments.intern_bootcamp.intern_bootcamp_env import ( + InternBootcampEnv, + InternBootcampEnvConfig, +) + +load_dotenv() + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def main(): + 
logger.info("Starting InternBootcamp environment local test runner with RandomTask") + + # Test configuration - using RandomTask for multitask curriculum + env_config = InternBootcampEnvConfig( + tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview", + group_size=2, # Small group for testing + use_wandb=False, + wandb_name="intern_bootcamp_random_test", + max_num_workers=1, + rollout_server_url="http://localhost:8000", + total_steps=1, + batch_size=2, + steps_per_eval=0, + max_token_length=2048, # Increased for diverse tasks + inference_weight=1.0, + data_path_to_save_groups=None, + eval_handling=EvalHandlingEnum.NONE, + eval_limit_ratio=0.0, + # InternBootcamp specific settings - using RandomTask + task_name="RandomTask", + task_params={}, # RandomTask doesn't need specific params + correct_reward=1.0, + incorrect_reward=-0.5, + format_bonus=0.2, + require_reasoning=True, + min_reasoning_length=20, + temperature=0.7, + top_p=0.9, + ) + + server_configs = [ + APIServerConfig( + model_name="gpt-4o-mini", + base_url="https://api.openai.com/v1", + api_key=os.getenv("OPENAI_API_KEY"), + num_requests_for_eval=0, + ) + ] + + logger.info("Using RandomTask configuration for multitask curriculum") + logger.debug(f"Env Config: {env_config}") + logger.debug(f"Server Configs: {server_configs}") + + try: + env = InternBootcampEnv( + config=env_config, server_configs=server_configs, slurm=False + ) + except Exception as e: + logger.exception(f"Failed to initialize InternBootcampEnv: {e}") + return + + logger.info("Running RandomTask tests") + try: + await env.setup() + + # Test 1: Generate multiple random problems to show variety + logger.info("\n========== Test 1: Multiple Random Problems ==========") + + for i in range(5): + logger.info(f"\n--- Random Problem {i+1} ---") + item = await env.get_next_item() + prompt_tuple, metadata = item + + # Extract bootcamp name from identity if available + bootcamp_name = "Unknown" + if ( + isinstance(metadata["identity"], dict) + 
and "_bootcamp_name" in metadata["identity"] + ): + bootcamp_name = metadata["identity"]["_bootcamp_name"] + + logger.info(f" Selected Bootcamp: {bootcamp_name}") + logger.info(f" Task: {metadata['task_name']}") + logger.info(f" Prompt preview: {metadata['raw_prompt'][:150]}...") + + # Test 2: Collect and score trajectories from a random problem + logger.info("\n========== Test 2: Trajectory Collection & Scoring ==========") + item = await env.get_next_item() + prompt_tuple, metadata = item + + # Extract bootcamp name + bootcamp_name = "Unknown" + if ( + isinstance(metadata["identity"], dict) + and "_bootcamp_name" in metadata["identity"] + ): + bootcamp_name = metadata["identity"]["_bootcamp_name"] + + logger.info(f"Testing with bootcamp: {bootcamp_name}") + logger.info(f"Problem: {metadata['raw_prompt'][:200]}...") + + # Collect trajectories + scored_data, backlog = await env.collect_trajectories(item) + logger.info(f"Collected and scored {len(scored_data['scores'])} responses") + + for i, score in enumerate(scored_data["scores"]): + response_preview = ( + scored_data["messages"][i][-1]["content"][:100] + if scored_data["messages"][i] + else "No response" + ) + logger.info( + f" Response {i+1}: Score={score:.2f}, Preview: {response_preview}..." 
+            )
+
+        # Test 3: Quick evaluation with random tasks
+        logger.info("\n========== Test 3: Random Task Evaluation ==========")
+
+        async def quick_evaluate(*args, **kwargs):
+            logger.info("Starting evaluation with random tasks")
+            eval_tasks = []
+            bootcamp_names = []
+
+            for i in range(3):  # Only 3 problems for testing
+                logger.info(f"Starting evaluation problem {i+1}/3")
+
+                # Generate a problem to see which bootcamp is selected
+                test_item = await env.get_next_item()
+                _, test_metadata = test_item
+                if (
+                    isinstance(test_metadata["identity"], dict)
+                    and "_bootcamp_name" in test_metadata["identity"]
+                ):
+                    bootcamp_name = test_metadata["identity"]["_bootcamp_name"]
+                    bootcamp_names.append(bootcamp_name)
+                    logger.info(f"  Evaluation problem {i+1} using: {bootcamp_name}")
+
+                eval_tasks.append(env.evaluate_single_problem())
+
+            results = await asyncio.gather(*eval_tasks)
+
+            # Calculate metrics
+            correct_count = sum(1 for is_correct, _ in results if is_correct)
+            format_count = sum(1 for _, has_format in results if has_format)
+            total_count = len(results)
+
+            accuracy = correct_count / total_count if total_count > 0 else 0
+            format_rate = format_count / total_count if total_count > 0 else 0
+
+            logger.info("Evaluation complete:")
+            logger.info(f"  Bootcamps used: {bootcamp_names}")
+            logger.info(f"  Accuracy: {accuracy:.2%}")
+            logger.info(f"  Format rate: {format_rate:.2%}")
+
+            return [("eval/random_tasks_accuracy", accuracy)]
+
+        env.evaluate = quick_evaluate
+        await env.evaluate()
+
+        # Test 4: Test specific bootcamp fallback
+        logger.info("\n========== Test 4: Specific Bootcamp Test ==========")
+
+        # Test with a specific bootcamp to ensure single-task mode still works.
+        # Merge the overrides into the dumped config before unpacking:
+        # model_dump() already contains "task_name" and "task_params", so
+        # passing them again as explicit keywords would raise
+        # "TypeError: got multiple values for keyword argument 'task_name'".
+        specific_config = InternBootcampEnvConfig(
+            **{
+                **env_config.model_dump(),
+                "task_name": "Game24bootcamp",
+                "task_params": {
+                    "num_numbers": 4,
+                    "range_max": 20,
+                    "target_max": 30,
+                },
+            }
+        )
+
+        try:
+            specific_env = InternBootcampEnv(
+                config=specific_config,
+                server_configs=server_configs,
slurm=False, + testing=True, + ) + + await specific_env.setup() + item = await specific_env.get_next_item() + _, metadata = item + + logger.info("Specific bootcamp test (Game24bootcamp):") + logger.info(f" Task: {metadata['task_name']}") + logger.info(f" Problem: {metadata['identity']}") + logger.info(f" Prompt preview: {metadata['raw_prompt'][:100]}...") + + except Exception as e: + logger.error(f"Failed to test specific bootcamp: {e}") + + # Test 5: Show bootcamp registry info + logger.info("\n========== Test 5: Bootcamp Registry Info ==========") + from environments.intern_bootcamp.bootcamp_registry import ( + get_available_bootcamps, + ) + + available = get_available_bootcamps() + logger.info(f"Total available bootcamps: {len(available)}") + logger.info(f"Sample bootcamps: {available[:10]}") + + # Show some variety in bootcamp names + math_bootcamps = [ + name + for name in available + if any(x in name.lower() for x in ["math", "game", "number"]) + ] + logic_bootcamps = [ + name + for name in available + if any(x in name.lower() for x in ["logic", "puzzle", "cipher"]) + ] + + logger.info(f"Math-related bootcamps (sample): {math_bootcamps[:5]}") + logger.info(f"Logic-related bootcamps (sample): {logic_bootcamps[:5]}") + + logger.info("\n========== All Tests Complete ==========") + logger.info("RandomTask multitask curriculum is working correctly!") + + except Exception as e: + logger.exception(f"An error occurred during testing: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/environments/intern_bootcamp/internbootcamp_lib b/environments/intern_bootcamp/internbootcamp_lib new file mode 160000 index 00000000..7b218f8e --- /dev/null +++ b/environments/intern_bootcamp/internbootcamp_lib @@ -0,0 +1 @@ +Subproject commit 7b218f8e38c148d1aa87f5d92ba4b7e137946fb8 diff --git a/environments/intern_bootcamp/run_intern_bootcamp.py b/environments/intern_bootcamp/run_intern_bootcamp.py new file mode 100644 index 00000000..b510ed86 --- /dev/null +++ 
b/environments/intern_bootcamp/run_intern_bootcamp.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +""" +Standalone entry point for InternBootcamp environment. +This script avoids relative import issues when running directly. +""" + +import os +import sys + +# Add the atropos root directory to Python path +atropos_root = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) +sys.path.insert(0, atropos_root) + +# Now import with absolute imports +from environments.intern_bootcamp.intern_bootcamp_env import ( # noqa: E402 + InternBootcampEnv, +) + +if __name__ == "__main__": + InternBootcampEnv.cli()