Revert "rm hardcoded same score check"

This reverts commit f02c24204d.
This commit is contained in:
Partho Das 2026-03-10 01:42:44 +05:30
parent cdc23ba5dc
commit 632ab0161c
22 changed files with 85 additions and 0 deletions

View file

@@ -3653,6 +3653,14 @@ class AnswerFormatEnv(BaseEnv):
)
await self._save_failed_rollouts_to_jsonl()
# Check if all scores are the same (no learning signal)
if all(group_scores[0] == score for score in group_scores):
if self.debug_logging:
self.logger.debug(
"All scores are identical, returning None for learning signal"
)
return None
# Track successful groups for equivalent ratio enforcement
if self.ensure_equivalent_ratios:
# Count this as a successful group if we have any successful examples

View file

@@ -285,6 +285,8 @@ class GSM8kEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, we return None
return scores
else:
# If the gold solution is not parseable, we return None

View file

@@ -503,6 +503,11 @@ class DynastAIEnv(BaseEnv):
for score in scores["scores"]:
self.percent_correct_buffer.append(max(score, 0))
# Check if all the same
if all([score == scores["scores"][0] for score in scores["scores"]]):
print("[DYNASTAI] All scores identical, returning None")
return None # If all the same, we return None
return scores
async def get_next_item(self) -> DynastAIRow:

View file

@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
# Check if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def get_next_item(self):

View file

@@ -433,6 +433,10 @@ class OptionsIVPrediction(BaseEnv):
if len(scores["tokens"]) >= self.config.group_size:
break
# Return None if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def rollout_and_score_eval(self, test_item):

View file

@@ -777,6 +777,10 @@ End your evaluation with \\boxed{{score}} where score is your numerical rating.
if hasattr(self, "last_agent_card_feedback"):
self.last_agent_card_feedback = agent_card_feedback
# Ensure we have different scores for training signal
if len(set(scores["scores"])) == 1:
return None
return scores
def _extract_score_from_agent_card(self, agent_card_response: str) -> float:

View file

@@ -409,6 +409,9 @@ class PhysicalEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, we return None
return scores
async def get_next_item(self) -> PhysicalRow:

View file

@@ -248,6 +248,10 @@ class RegexEnv(BaseEnv):
1.0 if s >= self.config.score_threshold else 0.0
)
# If all scores identical, no learning signal
if len(set(scores["scores"])) == 1:
return None
return scores
async def rollout_and_score_eval(self, problem: dict) -> dict:

View file

@@ -377,6 +377,8 @@ class SolitaireEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, we return None
return scores
else:
# If the gold solution is not parseable, we return None

View file

@@ -408,6 +408,9 @@ class SQLQueryEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, return None
return scores
async def get_next_item(self) -> WikiSQLRow:

View file

@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
# Check if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def get_next_item(self):

View file

@@ -1112,6 +1112,10 @@ class PairwiseJudgementEnv(BaseEnv):
for score in scores["scores"]:
self.percent_correct_buffer.append(max(score, 0))
# Return None if all scores are the same (no learning signal)
if len(set(scores["scores"])) == 1:
return None
return scores
except Exception as e:

View file

@@ -402,6 +402,10 @@ class FundamentalPredictionEnv(BaseEnv):
if len(scores["tokens"]) >= self.config.group_size:
break
# Return None if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def rollout_and_score_eval(self, test_item):

View file

@@ -352,6 +352,8 @@ class GSM8kEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, we return None
return scores
else:
# If the gold solution is not parseable, we return None

View file

@@ -283,6 +283,8 @@ class GSM8kEnv(BaseEnv):
percentage_of_range = min(percentage_of_range, 1.0)
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
if all([scores["scores"][0] == score for score in scores["scores"]]):
return None # If all the same, we return None
return scores
else:
# If the gold solution is not parseable, we return None

View file

@@ -1248,6 +1248,13 @@ class LetterCountingEnv(BaseEnv):
)
return None
# Skip if all scores are identical (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
self.logger.debug(
f"All scores identical ({scores['scores'][0]:.2f}) - skipping group"
)
return None
return scores
async def rollout_and_score_eval(self, eval_item: Dict) -> Tuple[int, int]:

View file

@@ -534,6 +534,8 @@ class MathEnv(BaseEnv):
return None, to_backlog
else:
return None, to_backlog
else:
return None, to_backlog
else:
self.normal_rollouts.append(
(
@@ -1165,6 +1167,8 @@ class MathEnv(BaseEnv):
"Max message delta is less than 0.1 * shortest message, no length penalty"
)
return None, []
elif all([score == scores["scores"][0] for score in scores["scores"]]):
return None, []
if len(for_table) > 0:
self.judge_rollouts.append(for_table)
if len(self.judge_rollouts) >= self.config.num_rollouts_to_keep:

View file

@@ -383,6 +383,10 @@ class MathEnv(BaseEnv):
to_postprocess = await self.score(to_score)
if to_postprocess is None:
return None, to_backlog
if all(
[to_postprocess["scores"][0] == score for score in to_postprocess["scores"]]
):
return None, to_backlog
self.normal_rollouts.append(
(
prompt_format.format(prompt=problem_format.format(problem=item[0])),

View file

@@ -347,6 +347,9 @@ class McpEnv(BaseEnv):
for score in scores["scores"]:
self.percent_correct_buffer.append(max(score, 0))
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def get_next_item(self):

View file

@@ -373,6 +373,10 @@ class MCQAThinkingEnv(BaseEnv):
for score in scores["scores"]:
self.percent_correct_buffer.append(max(score, 0))
# Return None if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def rollout_and_score_eval(self, test_item):

View file

@@ -770,6 +770,10 @@ class TextReversalEnv(BaseEnv):
for score in scores["scores"]:
self.percent_correct_buffer.append(max(score, 0))
# Return None if all scores are the same (no learning signal)
if len(set(scores["scores"])) == 1:
return None
return scores
except Exception as e:

View file

@@ -413,6 +413,10 @@ class SingleToolCallingEnv(BaseEnv):
# Apply linear penalty scaling from 1.0 down to 0.0
scores["scores"].append(1.0 - percentage_of_range)
# Check if all scores are the same (no learning signal)
if all(scores["scores"][0] == score for score in scores["scores"]):
return None
return scores
async def get_next_item(self):