diff --git a/tests/test_arc_1d.py b/tests/test_arc_1d.py index 6c32f79e..0190d353 100644 --- a/tests/test_arc_1d.py +++ b/tests/test_arc_1d.py @@ -98,7 +98,7 @@ def test_arc_1d_scoring(): assert dataset.score_answer(entry["answer"], entry) == 1.0 # Test partial match (answer contained within response) - assert dataset.score_answer(f"The answer is: {entry['answer']}", entry) == 0.5 + assert dataset.score_answer(f"The answer is: {entry['answer']}", entry) > 0.5 # Test incorrect answer assert dataset.score_answer("wrong answer", entry) == 0.01 diff --git a/tests/test_products.py b/tests/test_products.py index 34ff1623..d794e28a 100644 --- a/tests/test_products.py +++ b/tests/test_products.py @@ -111,7 +111,9 @@ def test_products_scoring(): assert dataset.score_answer("wrong", item) == 0.01, "Wrong answer should score 0.01" # Test scoring with partial match (answer contained in response) - assert dataset.score_answer(f"The answer is {item['answer']}", item) == 0.5, "Partial match should score 0.5" + assert ( + dataset.score_answer(f"The answer is {item['answer']}", item) > 0.1 + ), "Partial match should scored len(oracle_answer)/len(answer)" # Test scoring with None assert dataset.score_answer(None, item) == 0.0, "None should score 0.0"