mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
Port many benchmarks into atropos
This commit is contained in:
parent
405efa8302
commit
ef9c0c3699
37 changed files with 22878 additions and 308 deletions
|
|
@ -25,6 +25,11 @@ from atroposlib.envs.base import (
|
|||
ScoredDataGroup,
|
||||
)
|
||||
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
|
||||
from eval_helpers import (
|
||||
get_default_thinking_prompt,
|
||||
create_system_content,
|
||||
save_eval_results,
|
||||
)
|
||||
|
||||
|
||||
class RefusalBenchConfig(BaseEnvConfig):
|
||||
|
|
@ -69,8 +74,8 @@ class RefusalBenchConfig(BaseEnvConfig):
|
|||
)
|
||||
|
||||
judge_max_tokens: int = Field(
|
||||
default=2048,
|
||||
description="Maximum tokens for judge completions.",
|
||||
default=0,
|
||||
description="Maximum tokens for judge completions (0 = use model default).",
|
||||
)
|
||||
|
||||
# Retry configuration for judge calls
|
||||
|
|
@ -140,8 +145,8 @@ class RefusalBenchConfig(BaseEnvConfig):
|
|||
)
|
||||
|
||||
eval_max_tokens: int = Field(
|
||||
default=32000,
|
||||
description="Maximum tokens for evaluation completions.",
|
||||
default=0,
|
||||
description="Maximum tokens for evaluation completions (0 = use model default).",
|
||||
)
|
||||
|
||||
train_max_tokens: int = Field(
|
||||
|
|
@ -269,14 +274,7 @@ class RefusalBenchEnv(BaseEnv):
|
|||
|
||||
def _get_thinking_prompt(self) -> str:
|
||||
"""Get thinking system prompt."""
|
||||
return (
|
||||
self.config.custom_thinking_prompt
|
||||
if self.config.custom_thinking_prompt
|
||||
else "You are a helpful AI assistant. You may use extremely long chains of thought to deeply consider the "
|
||||
"problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
|
||||
"solution prior to answering. You should enclose your thoughts and internal monologue inside <think> "
|
||||
"</think> tags, and then provide your solution or response to the problem."
|
||||
)
|
||||
return get_default_thinking_prompt(self.config.custom_thinking_prompt)
|
||||
|
||||
# "Enable deep thinking subroutine."
|
||||
def _get_system_prompt(self) -> Optional[str]:
|
||||
|
|
@ -361,7 +359,7 @@ class RefusalBenchEnv(BaseEnv):
|
|||
batch_size=1024,
|
||||
steps_per_eval=25,
|
||||
train_max_tokens=1024 * 16,
|
||||
eval_max_tokens=8192, # DeepSeek's maximum
|
||||
eval_max_tokens=0, # Use model default
|
||||
inference_weight=1.0,
|
||||
wandb_name="refusal_bench",
|
||||
eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
|
||||
|
|
@ -615,12 +613,15 @@ class RefusalBenchEnv(BaseEnv):
|
|||
async def _judge_api_call_raw(self, messages: List[Dict]):
|
||||
"""Make a raw API call to the judge model without retry logic."""
|
||||
try:
|
||||
result = await self.judge_client.chat.completions.create(
|
||||
model=self.config.judge_model_name,
|
||||
messages=messages,
|
||||
temperature=self.config.judge_temperature,
|
||||
max_tokens=self.config.judge_max_tokens,
|
||||
)
|
||||
kwargs = {
|
||||
"model": self.config.judge_model_name,
|
||||
"messages": messages,
|
||||
"temperature": self.config.judge_temperature,
|
||||
}
|
||||
if self.config.judge_max_tokens > 0:
|
||||
kwargs["max_tokens"] = self.config.judge_max_tokens
|
||||
|
||||
result = await self.judge_client.chat.completions.create(**kwargs)
|
||||
print(" ✅ Judge API call successful")
|
||||
return result
|
||||
except Exception as e:
|
||||
|
|
@ -706,12 +707,12 @@ Your answer:"""
|
|||
|
||||
try:
|
||||
# Log full debug request for judge
|
||||
debug_params = {"temperature": self.config.judge_temperature}
|
||||
if self.config.judge_max_tokens > 0:
|
||||
debug_params["max_tokens"] = self.config.judge_max_tokens
|
||||
self._log_full_debug_request(
|
||||
messages,
|
||||
{
|
||||
"temperature": self.config.judge_temperature,
|
||||
"max_tokens": self.config.judge_max_tokens,
|
||||
},
|
||||
debug_params,
|
||||
"JUDGE_REFUSAL",
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue