mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
add ArcAgiDataset class, fix score_entry() metadata params
This commit is contained in:
parent
2ad0965fdc
commit
4e49806d22
20 changed files with 194 additions and 93 deletions
|
|
@ -52,30 +52,30 @@ def test_complex_arithmetic_scoring():
|
|||
dataset = ComplexArithmeticDataset(config)
|
||||
|
||||
# Test case with answer 3 + 2i
|
||||
metadata = {"result": (3.0, 2.0)}
|
||||
entry = {"metadata": {"result": (3.0, 2.0)}}
|
||||
|
||||
# Test exact matches (should get score of 1.0)
|
||||
assert dataset.score_answer("3 + 2i", metadata) == 1.0
|
||||
assert dataset.score_answer("3+2i", metadata) == 1.0
|
||||
assert dataset.score_answer("3.0 + 2.0i", metadata) == 1.0
|
||||
assert dataset.score_answer("3 + 2i", entry) == 1.0
|
||||
assert dataset.score_answer("3+2i", entry) == 1.0
|
||||
assert dataset.score_answer("3.0 + 2.0i", entry) == 1.0
|
||||
|
||||
# Test answers with small errors (should get high but < 1.0 scores)
|
||||
print(dataset.score_answer("3.1 + 2i", metadata))
|
||||
assert 0.9 < dataset.score_answer("3.1 + 2i", metadata) < 1.0
|
||||
assert 0.9 < dataset.score_answer("3 + 2.1i", metadata) < 1.0
|
||||
assert 0.7 < dataset.score_answer("3.1 + 2.1i", metadata) < 0.95
|
||||
print(dataset.score_answer("3.1 + 2i", entry))
|
||||
assert 0.9 < dataset.score_answer("3.1 + 2i", entry) < 1.0
|
||||
assert 0.9 < dataset.score_answer("3 + 2.1i", entry) < 1.0
|
||||
assert 0.7 < dataset.score_answer("3.1 + 2.1i", entry) < 0.95
|
||||
|
||||
# Test answers with moderate errors (should get medium scores)
|
||||
assert 0.3 < dataset.score_answer("4 + 2i", metadata) < 0.4
|
||||
assert 0.3 < dataset.score_answer("3 + 3i", metadata) < 0.4
|
||||
assert 0.3 < dataset.score_answer("4 + 2i", entry) < 0.4
|
||||
assert 0.3 < dataset.score_answer("3 + 3i", entry) < 0.4
|
||||
|
||||
# Test answers with large errors (should get very low scores)
|
||||
assert dataset.score_answer("10 + 10i", metadata) < 0.01
|
||||
assert dataset.score_answer("10 + 10i", entry) < 0.01
|
||||
|
||||
# Test invalid answers (should get 0.0)
|
||||
assert dataset.score_answer("invalid", metadata) == 0.0
|
||||
assert dataset.score_answer(None, metadata) == 0.0
|
||||
assert dataset.score_answer("inf + 2i", metadata) == 0.0
|
||||
assert dataset.score_answer("invalid", entry) == 0.0
|
||||
assert dataset.score_answer(None, entry) == 0.0
|
||||
assert dataset.score_answer("inf + 2i", entry) == 0.0
|
||||
|
||||
|
||||
def test_complex_arithmetic_division_by_zero():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue