mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-25 17:10:42 +00:00
Port many benchmarks into atropos
This commit is contained in:
parent
405efa8302
commit
ef9c0c3699
37 changed files with 22878 additions and 308 deletions
|
|
@ -20,6 +20,10 @@ from atroposlib.envs.base import (
|
|||
ScoredDataGroup,
|
||||
)
|
||||
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
|
||||
from eval_helpers import (
|
||||
get_default_thinking_prompt,
|
||||
create_system_content,
|
||||
)
|
||||
|
||||
|
||||
class RewardBenchCategory(str, Enum):
|
||||
|
|
@ -69,8 +73,8 @@ class PairwiseJudgementConfig(BaseEnvConfig):
|
|||
)
|
||||
|
||||
eval_max_tokens: int = Field(
|
||||
default=1024 * 16,
|
||||
description="Maximum tokens for evaluation completions.",
|
||||
default=0,
|
||||
description="Maximum tokens for evaluation completions (0 = use model default).",
|
||||
)
|
||||
|
||||
train_max_tokens: int = Field(
|
||||
|
|
@ -193,14 +197,7 @@ class PairwiseJudgementEnv(BaseEnv):
|
|||
|
||||
def _get_thinking_prompt(self) -> str:
|
||||
"""Get thinking system prompt."""
|
||||
return (
|
||||
self.config.custom_thinking_prompt
|
||||
if self.config.custom_thinking_prompt
|
||||
else "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the "
|
||||
"problem and deliberate with yourself via systematic reasoning processes to help come to a correct "
|
||||
"solution prior to answering. You should enclose your thoughts and internal monologue inside <think> "
|
||||
"</think> tags, and then provide your solution or response to the problem."
|
||||
)
|
||||
return get_default_thinking_prompt(self.config.custom_thinking_prompt)
|
||||
|
||||
def _get_judgment_prompt(self) -> str:
|
||||
"""Get judgment system prompt."""
|
||||
|
|
@ -360,7 +357,7 @@ class PairwiseJudgementEnv(BaseEnv):
|
|||
batch_size=1024,
|
||||
steps_per_eval=25,
|
||||
train_max_tokens=1024 * 16,
|
||||
eval_max_tokens=1024 * 16,
|
||||
eval_max_tokens=0, # Use model default
|
||||
inference_weight=1.0,
|
||||
wandb_name="pairwise_judgment",
|
||||
eval_handling=EvalHandlingEnum.LIMIT_TRAIN,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue