[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
2026-04-28 17:29:30 +00:00 · 2025-12-28 04:12:13 +00:00 · 2025-12-28 04:12:13 +00:00 · 1d4275d441
commit 1d4275d441
parent ea6db6fe92
2 changed files with 19 additions and 12 deletions
--- a/environments/eval_environments/eed_score.py
+++ b/environments/eval_environments/eed_score.py
@@ -988,18 +988,17 @@ def extract_all_boxed(latex_str: str) -> List[str]:

        # Count braces to find matching closing brace
        while j < len(latex_str) and depth > 0:
-            if latex_str[j] == '{':
+            if latex_str[j] == "{":
                depth += 1
-            elif latex_str[j] == '}':
+            elif latex_str[j] == "}":
                depth -= 1
            j += 1

        if depth == 0:
            # Extract content between braces
-            content = latex_str[start:j-1].strip()
+            content = latex_str[start : j - 1].strip()
            results.append(content)

        i = j

    return results
-
--- a/environments/eval_environments/phybench_eval.py
+++ b/environments/eval_environments/phybench_eval.py
@@ -176,7 +176,6 @@ class PHYBenchEvalConfig(BaseEnvConfig):
    )


-
 class PHYBenchEvalEnv(BaseEnv):
    """
    PHYBench Evaluation Environment.
@@ -496,7 +495,10 @@ class PHYBenchEvalEnv(BaseEnv):
    async def rollout_and_score_eval(self, item: Dict) -> Optional[Dict]:
        """Run evaluation on a single item and return the result."""
        if self.config.full_debug:
-            print(f"[DEBUG] Starting eval for item: {item.get('id', 'unknown')}", flush=True)
+            print(
+                f"[DEBUG] Starting eval for item: {item.get('id', 'unknown')}",
+                flush=True,
+            )
        prompt = self._format_prompt(item)
        system_content = self._create_system_content()

@@ -519,9 +521,17 @@ class PHYBenchEvalEnv(BaseEnv):
        for attempt in range(self.config.max_retries):
            try:
                if self.config.full_debug:
-                    print(f"  Making API request (attempt {attempt + 1}/{self.config.max_retries})...", flush=True)
-                    print(f"    Temperature: {self.config.eval_temperature}", flush=True)
-                    print(f"    Max tokens: {self.config.eval_max_tokens if self.config.eval_max_tokens > 0 else 'model default'}", flush=True)
+                    print(
+                        f"  Making API request (attempt {attempt + 1}/{self.config.max_retries})...",
+                        flush=True,
+                    )
+                    print(
+                        f"    Temperature: {self.config.eval_temperature}", flush=True
+                    )
+                    print(
+                        f"    Max tokens: {self.config.eval_max_tokens if self.config.eval_max_tokens > 0 else 'model default'}",
+                        flush=True,
+                    )

                response = await self.server.chat_completion(**kwargs)
                response_text = response.choices[0].message.content or ""
@@ -610,9 +620,7 @@ class PHYBenchEvalEnv(BaseEnv):
        print(f"{'='*60}\n")

        # Create evaluation tasks
-        eval_tasks = [
-            self.rollout_and_score_eval(item) for item in self.eval_items
-        ]
+        eval_tasks = [self.rollout_and_score_eval(item) for item in self.eval_items]

        # Run with progress bar
        results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating PHYBench")