diff --git a/environments/eval_environments/eed_score.py b/environments/eval_environments/eed_score.py
index 1c2a25fb..4a342e6b 100644
--- a/environments/eval_environments/eed_score.py
+++ b/environments/eval_environments/eed_score.py
@@ -691,7 +691,9 @@ def master_convert(s: str):
         Various exceptions if conversion fails
     """
     if not EED_AVAILABLE:
-        raise ImportError("latex2sympy2_extended and sympy are required for EED scoring")
+        raise ImportError(
+            "latex2sympy2_extended and sympy are required for EED scoring"
+        )
 
     preprocessed_stage1 = first_preprocess(s)
     preprocessed_stage2 = second_preprocess(preprocessed_stage1)
@@ -723,7 +725,9 @@ def sympy_to_tree(expr) -> TreeNode:
         ValueError: If expression contains unsupported types
     """
     # Numbers and constants
-    if isinstance(expr, (Integer, Pi, Exp1, Float, Rational, Infinity, NegativeInfinity)):
+    if isinstance(
+        expr, (Integer, Pi, Exp1, Float, Rational, Infinity, NegativeInfinity)
+    ):
         return TreeNode(label=f"number_{expr}", children=[])
 
     # Symbols
diff --git a/environments/eval_environments/phybench_eval.py b/environments/eval_environments/phybench_eval.py
index b179cc83..b9566bda 100644
--- a/environments/eval_environments/phybench_eval.py
+++ b/environments/eval_environments/phybench_eval.py
@@ -58,7 +58,6 @@ from atroposlib.envs.base import (
     EvalHandlingEnum,
 )
 
-
 # Physics domain tags in PHYBench
 PHYBENCH_TAGS = [
     "MECHANICS",
@@ -260,7 +259,9 @@ class PHYBenchEvalEnv(BaseEnv):
         print(f" Thinking mode: {self.config.thinking_mode}")
         print(f" EED Score enabled: {self.config.compute_eed_score and EED_AVAILABLE}")
         if self.config.thinking_mode:
-            thinking_prompt = get_default_thinking_prompt(self.config.custom_thinking_prompt)
+            thinking_prompt = get_default_thinking_prompt(
+                self.config.custom_thinking_prompt
+            )
             print(f" Thinking prompt: {thinking_prompt[:80]}...")
         if self.config.tags_filter:
             print(f" Tags filter: {self.config.tags_filter}")
@@ -315,13 +316,15 @@ class PHYBenchEvalEnv(BaseEnv):
             # Track tag distribution
             tag_counts[tag] = tag_counts.get(tag, 0) + 1
 
-            self.eval_items.append({
-                "id": problem_id,
-                "tag": tag,
-                "content": content,
-                "solution": solution,
-                "answer": answer,
-            })
+            self.eval_items.append(
+                {
+                    "id": problem_id,
+                    "tag": tag,
+                    "content": content,
+                    "solution": solution,
+                    "answer": answer,
+                }
+            )
 
         # Shuffle with seed for reproducibility
         random.seed(self.config.shuffle_seed)
@@ -356,7 +359,9 @@ class PHYBenchEvalEnv(BaseEnv):
             or ""
        )
 
-    def _extract_answer(self, response: str, debug: bool = False) -> Tuple[Optional[str], str]:
+    def _extract_answer(
+        self, response: str, debug: bool = False
+    ) -> Tuple[Optional[str], str]:
         """
         Extract the answer from the model's response.
 
@@ -382,7 +387,9 @@ class PHYBenchEvalEnv(BaseEnv):
 
         if len(boxed_answers) > 1:
             if debug:
-                print(f" Multiple \\boxed{{}} found ({len(boxed_answers)}), using last one")
+                print(
+                    f" Multiple \\boxed{{}} found ({len(boxed_answers)}), using last one"
+                )
             return boxed_answers[-1], "boxed_last"
         return boxed_answers[0], "boxed"
 
@@ -420,7 +427,9 @@ class PHYBenchEvalEnv(BaseEnv):
         # Try EED Score - if score is 100, they're equivalent
         if self.config.compute_eed_score and EED_AVAILABLE:
             try:
-                score, _, _, _ = compute_eed_score(gold_clean, pred_clean, debug_mode=False)
+                score, _, _, _ = compute_eed_score(
+                    gold_clean, pred_clean, debug_mode=False
+                )
                 if score == 100:
                     return True, "sympy_equivalent"
             except Exception:
@@ -537,7 +546,9 @@ class PHYBenchEvalEnv(BaseEnv):
 
         # Extract thinking content if present
         thinking_content = (
-            extract_thinking_content(response_text) if self.config.thinking_mode else None
+            extract_thinking_content(response_text)
+            if self.config.thinking_mode
+            else None
         )
 
         # Get content for answer extraction
@@ -557,12 +568,16 @@ class PHYBenchEvalEnv(BaseEnv):
 
         # Compute scores
         gold_answer = item["answer"]
-        scores = self._compute_scores(extracted_answer, gold_answer, debug=self.config.full_debug)
+        scores = self._compute_scores(
+            extracted_answer, gold_answer, debug=self.config.full_debug
+        )
 
         if self.config.full_debug:
             status = "✓" if scores["is_correct"] else "✗"
             eed = scores["eed_score"]
-            print(f" [{status}] {item['tag']}: EED={eed:.1f}, gold={gold_answer[:50]}...")
+            print(
+                f" [{status}] {item['tag']}: EED={eed:.1f}, gold={gold_answer[:50]}..."
+            )
 
         return {
             "item_id": item["id"],
@@ -648,11 +663,15 @@ class PHYBenchEvalEnv(BaseEnv):
         # Format compliance and thinking utilization
         format_valid = sum(1 for r in valid_results if r.get("format_valid", True))
         has_thinking = sum(1 for r in valid_results if r.get("has_thinking", False))
-        has_boxed = sum(1 for r in valid_results if r.get("extracted_answer") is not None)
+        has_boxed = sum(
+            1 for r in valid_results if r.get("extracted_answer") is not None
+        )
 
         # Average response length
         response_lengths = [r.get("response_length", 0) for r in valid_results]
-        avg_response_length = sum(response_lengths) / len(response_lengths) if response_lengths else 0
+        avg_response_length = (
+            sum(response_lengths) / len(response_lengths) if response_lengths else 0
+        )
 
         metrics = {
             "accuracy": accuracy,
@@ -714,7 +733,9 @@ class PHYBenchEvalEnv(BaseEnv):
             "phybench/total_evaluated": metrics.get("total_evaluated", 0),
             "phybench/has_boxed_rate": metrics.get("has_boxed_rate", 0),
             "phybench/format_compliance_rate": metrics.get("format_compliance_rate", 0),
-            "phybench/thinking_utilization_rate": metrics.get("thinking_utilization_rate", 0),
+            "phybench/thinking_utilization_rate": metrics.get(
+                "thinking_utilization_rate", 0
+            ),
             "phybench/avg_response_length": metrics.get("avg_response_length", 0),
         }
 
@@ -742,4 +763,3 @@ class PHYBenchEvalEnv(BaseEnv):
 
 if __name__ == "__main__":
     PHYBenchEvalEnv.cli()
-