diff --git a/reasoning_gym/code/bf.py b/reasoning_gym/code/bf.py index 5ada391e..973a41a6 100644 --- a/reasoning_gym/code/bf.py +++ b/reasoning_gym/code/bf.py @@ -117,36 +117,6 @@ int main() {{ # bf = Minify.minify(bf) # Is this necessary? return bf - def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: - """Determine if the solution provided solves the BF task. - - The function awards 1.0 for a correct answer. - - Args: - answer (Optional[str]): The user's answer. - entry (dict[str, Any]): The original dataset entry containing the correct answer. - - Returns: - float: The computed score between 0.0 and 1.0. - """ - - if not isinstance(answer, str): - return 0.0 - - if answer == entry["answer"]: - return 1.0 # Yay - - if entry["answer"] in answer.splitlines(): - # We can be quite confident that the correct answer was given - # It was likely just given alongside an explanation - return max(0.9 * len(answer) / len(entry["answer"]), 0.1) - - if entry["answer"] in answer: - # Since answers are English words, some risk of the response coincidentally containing the answer - return max(0.5 * len(answer) / len(entry["answer"]), 0.1) - - return 0.0 - class BFCurriculum(BaseCurriculum): def __init__(self):