def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
    """Score a candidate answer against the entry's reference answer.

    Both strings are compared after stripping surrounding whitespace and
    lower-casing, so the match is case-insensitive and ignores padding.

    Args:
        answer: The user's answer, or ``None`` if no answer was given.
        entry: The dataset entry; its ``"answer"`` key holds the reference
            answer string.

    Returns:
        ``0.0`` when ``answer`` is ``None``, ``0.01`` when an answer was
        given but does not match the reference, and ``1.0`` on a match.
    """
    # NOTE(review): the ``any`` in the annotation above is the builtin
    # function, not ``typing.Any``; switch to ``Dict[str, Any]`` once ``Any``
    # is imported at the top of the module.

    # Identity check: ``None`` is a singleton, and ``==`` may be overridden.
    if answer is None:
        return 0.0

    expected = entry["answer"].strip().lower()
    if answer.strip().lower() != expected:
        # Any non-matching attempt earns a small non-zero score,
        # distinguishing it from a missing answer.
        return 0.01
    return 1.0