mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-27 17:23:08 +00:00
Intern bootcamp env (#146)
* Created registry and started off the env * Local testing works * process working but error in gen * removed old code * adding debug, it's still not progressing to collect trajectories * linting * removed redundant settings
This commit is contained in:
parent
ea304892ee
commit
283877dd88
8 changed files with 1218 additions and 0 deletions
241
environments/intern_bootcamp/intern_bootcamp_local_test.py
Normal file
241
environments/intern_bootcamp/intern_bootcamp_local_test.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
#!/usr/bin/env python3
"""
Local testing script for InternBootcamp environment with RandomTask
"""

import asyncio
import logging
import os

from dotenv import load_dotenv

from atroposlib.envs.base import APIServerConfig, EvalHandlingEnum
from environments.intern_bootcamp.intern_bootcamp_env import (
    InternBootcampEnv,
    InternBootcampEnvConfig,
)

# Pull local environment variables (e.g. OPENAI_API_KEY, read below via
# os.getenv) from a .env file before any configuration is built.
load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def main():
    """Exercise the InternBootcampEnv end-to-end against a live API server.

    Runs five manual smoke tests: problem-generation variety with
    RandomTask, trajectory collection/scoring, a short evaluation pass,
    a single-bootcamp (Game24bootcamp) configuration, and a dump of the
    bootcamp registry. Requires OPENAI_API_KEY in the environment (or a
    .env file) and a rollout server; logs results instead of returning.
    """
    logger.info("Starting InternBootcamp environment local test runner with RandomTask")

    # Test configuration - using RandomTask for multitask curriculum
    env_config = InternBootcampEnvConfig(
        tokenizer_name="NousResearch/DeepHermes-3-Llama-3-8B-Preview",
        group_size=2,  # Small group for testing
        use_wandb=False,
        wandb_name="intern_bootcamp_random_test",
        max_num_workers=1,
        rollout_server_url="http://localhost:8000",
        total_steps=1,
        batch_size=2,
        steps_per_eval=0,
        max_token_length=2048,  # Increased for diverse tasks
        inference_weight=1.0,
        data_path_to_save_groups=None,
        eval_handling=EvalHandlingEnum.NONE,
        eval_limit_ratio=0.0,
        # InternBootcamp specific settings - using RandomTask
        task_name="RandomTask",
        task_params={},  # RandomTask doesn't need specific params
        correct_reward=1.0,
        incorrect_reward=-0.5,
        format_bonus=0.2,
        require_reasoning=True,
        min_reasoning_length=20,
        temperature=0.7,
        top_p=0.9,
    )

    server_configs = [
        APIServerConfig(
            model_name="gpt-4o-mini",
            base_url="https://api.openai.com/v1",
            api_key=os.getenv("OPENAI_API_KEY"),
            num_requests_for_eval=0,
        )
    ]

    logger.info("Using RandomTask configuration for multitask curriculum")
    logger.debug(f"Env Config: {env_config}")
    logger.debug(f"Server Configs: {server_configs}")

    try:
        env = InternBootcampEnv(
            config=env_config, server_configs=server_configs, slurm=False
        )
    except Exception as e:
        logger.exception(f"Failed to initialize InternBootcampEnv: {e}")
        return

    logger.info("Running RandomTask tests")
    try:
        await env.setup()

        # Test 1: Generate multiple random problems to show variety
        logger.info("\n========== Test 1: Multiple Random Problems ==========")

        for i in range(5):
            logger.info(f"\n--- Random Problem {i+1} ---")
            item = await env.get_next_item()
            prompt_tuple, metadata = item

            # Extract bootcamp name from identity if available
            bootcamp_name = "Unknown"
            if (
                isinstance(metadata["identity"], dict)
                and "_bootcamp_name" in metadata["identity"]
            ):
                bootcamp_name = metadata["identity"]["_bootcamp_name"]

            logger.info(f"  Selected Bootcamp: {bootcamp_name}")
            logger.info(f"  Task: {metadata['task_name']}")
            logger.info(f"  Prompt preview: {metadata['raw_prompt'][:150]}...")

        # Test 2: Collect and score trajectories from a random problem
        logger.info("\n========== Test 2: Trajectory Collection & Scoring ==========")
        item = await env.get_next_item()
        prompt_tuple, metadata = item

        # Extract bootcamp name
        bootcamp_name = "Unknown"
        if (
            isinstance(metadata["identity"], dict)
            and "_bootcamp_name" in metadata["identity"]
        ):
            bootcamp_name = metadata["identity"]["_bootcamp_name"]

        logger.info(f"Testing with bootcamp: {bootcamp_name}")
        logger.info(f"Problem: {metadata['raw_prompt'][:200]}...")

        # Collect trajectories
        scored_data, backlog = await env.collect_trajectories(item)
        logger.info(f"Collected and scored {len(scored_data['scores'])} responses")

        for i, score in enumerate(scored_data["scores"]):
            response_preview = (
                scored_data["messages"][i][-1]["content"][:100]
                if scored_data["messages"][i]
                else "No response"
            )
            logger.info(
                f"  Response {i+1}: Score={score:.2f}, Preview: {response_preview}..."
            )

        # Test 3: Quick evaluation with random tasks
        logger.info("\n========== Test 3: Random Task Evaluation ==========")

        async def quick_evaluate(*args, **kwargs):
            # Lightweight stand-in for env.evaluate: runs only 3 problems
            # and reports accuracy / format-compliance rates.
            logger.info("Starting evaluation with random tasks")
            eval_tasks = []
            bootcamp_names = []

            for i in range(3):  # Only 3 problems for testing
                logger.info(f"Starting evaluation problem {i+1}/3")

                # Generate a problem to see which bootcamp is selected
                test_item = await env.get_next_item()
                _, test_metadata = test_item
                if (
                    isinstance(test_metadata["identity"], dict)
                    and "_bootcamp_name" in test_metadata["identity"]
                ):
                    bootcamp_name = test_metadata["identity"]["_bootcamp_name"]
                    bootcamp_names.append(bootcamp_name)
                    logger.info(f"  Evaluation problem {i+1} using: {bootcamp_name}")

                eval_tasks.append(env.evaluate_single_problem())

            # NOTE(review): assumes evaluate_single_problem returns an
            # (is_correct, has_format) pair — confirm against the env.
            results = await asyncio.gather(*eval_tasks)

            # Calculate metrics
            correct_count = sum(1 for is_correct, _ in results if is_correct)
            format_count = sum(1 for _, has_format in results if has_format)
            total_count = len(results)

            accuracy = correct_count / total_count if total_count > 0 else 0
            format_rate = format_count / total_count if total_count > 0 else 0

            logger.info("Evaluation complete:")
            logger.info(f"  Bootcamps used: {bootcamp_names}")
            logger.info(f"  Accuracy: {accuracy:.2%}")
            logger.info(f"  Format rate: {format_rate:.2%}")

            return [("eval/random_tasks_accuracy", accuracy)]

        env.evaluate = quick_evaluate
        await env.evaluate()

        # Test 4: Test specific bootcamp fallback
        logger.info("\n========== Test 4: Specific Bootcamp Test ==========")

        # Test with a specific bootcamp to ensure single-task mode still works.
        # BUG FIX: the previous code passed **env_config.model_dump() together
        # with explicit task_name=/task_params= keywords; model_dump() already
        # contains those keys, so the constructor raised
        # "TypeError: got multiple values for keyword argument 'task_name'".
        # Override the keys in the dict first, then unpack once.
        specific_kwargs = env_config.model_dump()
        specific_kwargs.update(
            task_name="Game24bootcamp",
            task_params={
                "num_numbers": 4,
                "range_max": 20,
                "target_max": 30,
            },
        )
        specific_config = InternBootcampEnvConfig(**specific_kwargs)

        try:
            specific_env = InternBootcampEnv(
                config=specific_config,
                server_configs=server_configs,
                slurm=False,
                testing=True,
            )

            await specific_env.setup()
            item = await specific_env.get_next_item()
            _, metadata = item

            logger.info("Specific bootcamp test (Game24bootcamp):")
            logger.info(f"  Task: {metadata['task_name']}")
            logger.info(f"  Problem: {metadata['identity']}")
            logger.info(f"  Prompt preview: {metadata['raw_prompt'][:100]}...")

        except Exception as e:
            logger.error(f"Failed to test specific bootcamp: {e}")

        # Test 5: Show bootcamp registry info
        logger.info("\n========== Test 5: Bootcamp Registry Info ==========")
        from environments.intern_bootcamp.bootcamp_registry import (
            get_available_bootcamps,
        )

        available = get_available_bootcamps()
        logger.info(f"Total available bootcamps: {len(available)}")
        logger.info(f"Sample bootcamps: {available[:10]}")

        # Show some variety in bootcamp names
        math_bootcamps = [
            name
            for name in available
            if any(x in name.lower() for x in ["math", "game", "number"])
        ]
        logic_bootcamps = [
            name
            for name in available
            if any(x in name.lower() for x in ["logic", "puzzle", "cipher"])
        ]

        logger.info(f"Math-related bootcamps (sample): {math_bootcamps[:5]}")
        logger.info(f"Logic-related bootcamps (sample): {logic_bootcamps[:5]}")

        logger.info("\n========== All Tests Complete ==========")
        logger.info("RandomTask multitask curriculum is working correctly!")

    except Exception as e:
        logger.exception(f"An error occurred during testing: {e}")
# Script entry point: drive the async test runner with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue