mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-28 17:29:30 +00:00
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
This commit is contained in:
parent
ea6db6fe92
commit
1d4275d441
2 changed files with 19 additions and 12 deletions
|
|
@@ -988,18 +988,17 @@ def extract_all_boxed(latex_str: str) -> List[str]:
|
|||
|
||||
# Count braces to find matching closing brace
|
||||
while j < len(latex_str) and depth > 0:
|
||||
if latex_str[j] == '{':
|
||||
if latex_str[j] == "{":
|
||||
depth += 1
|
||||
elif latex_str[j] == '}':
|
||||
elif latex_str[j] == "}":
|
||||
depth -= 1
|
||||
j += 1
|
||||
|
||||
if depth == 0:
|
||||
# Extract content between braces
|
||||
content = latex_str[start:j-1].strip()
|
||||
content = latex_str[start : j - 1].strip()
|
||||
results.append(content)
|
||||
|
||||
i = j
|
||||
|
||||
return results
|
||||
|
||||
|
|
|
|||
|
|
@@ -176,7 +176,6 @@ class PHYBenchEvalConfig(BaseEnvConfig):
|
|||
)
|
||||
|
||||
|
||||
|
||||
class PHYBenchEvalEnv(BaseEnv):
|
||||
"""
|
||||
PHYBench Evaluation Environment.
|
||||
|
|
@@ -496,7 +495,10 @@ class PHYBenchEvalEnv(BaseEnv):
|
|||
async def rollout_and_score_eval(self, item: Dict) -> Optional[Dict]:
|
||||
"""Run evaluation on a single item and return the result."""
|
||||
if self.config.full_debug:
|
||||
print(f"[DEBUG] Starting eval for item: {item.get('id', 'unknown')}", flush=True)
|
||||
print(
|
||||
f"[DEBUG] Starting eval for item: {item.get('id', 'unknown')}",
|
||||
flush=True,
|
||||
)
|
||||
prompt = self._format_prompt(item)
|
||||
system_content = self._create_system_content()
|
||||
|
||||
|
|
@@ -519,9 +521,17 @@ class PHYBenchEvalEnv(BaseEnv):
|
|||
for attempt in range(self.config.max_retries):
|
||||
try:
|
||||
if self.config.full_debug:
|
||||
print(f" Making API request (attempt {attempt + 1}/{self.config.max_retries})...", flush=True)
|
||||
print(f" Temperature: {self.config.eval_temperature}", flush=True)
|
||||
print(f" Max tokens: {self.config.eval_max_tokens if self.config.eval_max_tokens > 0 else 'model default'}", flush=True)
|
||||
print(
|
||||
f" Making API request (attempt {attempt + 1}/{self.config.max_retries})...",
|
||||
flush=True,
|
||||
)
|
||||
print(
|
||||
f" Temperature: {self.config.eval_temperature}", flush=True
|
||||
)
|
||||
print(
|
||||
f" Max tokens: {self.config.eval_max_tokens if self.config.eval_max_tokens > 0 else 'model default'}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
response = await self.server.chat_completion(**kwargs)
|
||||
response_text = response.choices[0].message.content or ""
|
||||
|
|
@@ -610,9 +620,7 @@ class PHYBenchEvalEnv(BaseEnv):
|
|||
print(f"{'='*60}\n")
|
||||
|
||||
# Create evaluation tasks
|
||||
eval_tasks = [
|
||||
self.rollout_and_score_eval(item) for item in self.eval_items
|
||||
]
|
||||
eval_tasks = [self.rollout_and_score_eval(item) for item in self.eval_items]
|
||||
|
||||
# Run with progress bar
|
||||
results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating PHYBench")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue