mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-29 17:35:16 +00:00
updated test score board
This commit is contained in:
parent
b6b33a1d04
commit
bd2788ad2a
1 changed file with 16 additions and 17 deletions
|
|
@@ -61,32 +61,32 @@ def test_score_aggregation():
|
||||||
aggregated = experiment.score_board.aggregate()
|
aggregated = experiment.score_board.aggregate()
|
||||||
|
|
||||||
# Verify we have scores grouped by difficulty parameters
|
# Verify we have scores grouped by difficulty parameters
|
||||||
assert len(aggregated.scores) > 0
|
assert len(aggregated["leg_counting"].scores.keys()) > 0
|
||||||
|
|
||||||
# Each key should be a tuple of tuples containing difficulty parameters
|
# Each key should be a tuple of tuples containing difficulty parameters
|
||||||
for key in aggregated.scores:
|
for key in aggregated["leg_counting"].scores:
|
||||||
assert isinstance(key, tuple)
|
assert isinstance(key, tuple)
|
||||||
# Each inner tuple should be (param_name, value) or (param_name, (min_value, max_value))
|
# Each inner tuple should be (param_name, value) or (param_name, (min_value, max_value))
|
||||||
for param in key:
|
for param in key:
|
||||||
assert isinstance(param, tuple)
|
assert isinstance(param, tuple)
|
||||||
assert param[0] in ("source", "idx", "num_animals", "num_instances")
|
assert param[0] in ("source", "num_animals", "num_instances")
|
||||||
|
|
||||||
# Test aggregation with last_n
|
# Test aggregation with last_n
|
||||||
last_3 = experiment.score_board.aggregate(last_n=3)
|
last_3 = experiment.score_board.aggregate(last_n=3)
|
||||||
assert len(last_3.scores) > 0
|
assert len(last_3["leg_counting"].scores) > 0
|
||||||
|
|
||||||
# Verify total scores count
|
# Verify total scores count
|
||||||
assert last_3.total_scores == 3
|
assert last_3["leg_counting"].total_scores == 3
|
||||||
|
|
||||||
# Verify conversation tracking
|
# Verify conversation tracking
|
||||||
assert len(experiment.score_board.conversations) == 5
|
assert len(experiment.score_board.conversations["leg_counting"]) == 5
|
||||||
for conv in experiment.score_board.conversations:
|
for conv in experiment.score_board.conversations["leg_counting"]:
|
||||||
assert len(conv) == 2 # user question and assistant response
|
assert len(conv) == 2 # user question and assistant response
|
||||||
assert conv[0]["role"] == "user"
|
assert conv[0]["role"] == "user"
|
||||||
assert conv[1]["role"] == "assistant"
|
assert conv[1]["role"] == "assistant"
|
||||||
|
|
||||||
# Test stats calculation
|
# Test stats calculation
|
||||||
stats = aggregated.stats()
|
stats = aggregated["leg_counting"].stats()
|
||||||
|
|
||||||
for key, values in stats.scores.items():
|
for key, values in stats.scores.items():
|
||||||
assert isinstance(values, tuple)
|
assert isinstance(values, tuple)
|
||||||
|
|
@@ -107,11 +107,11 @@ def test_score_aggregation():
|
||||||
assert all(math.isnan(v) for v in stats_tuple[1:]) # stats should be NaN
|
assert all(math.isnan(v) for v in stats_tuple[1:]) # stats should be NaN
|
||||||
|
|
||||||
# Test clear functionality
|
# Test clear functionality
|
||||||
experiment.score_board.clear()
|
experiment.score_board.clear("leg_counting")
|
||||||
assert len(experiment.score_board.scores) == 0
|
assert len(experiment.score_board.scores["leg_counting"]) == 0
|
||||||
assert len(experiment.score_board.metadata) == 0
|
assert len(experiment.score_board.metadata["leg_counting"]) == 0
|
||||||
assert len(experiment.score_board.conversations) == 0
|
assert len(experiment.score_board.conversations["leg_counting"]) == 0
|
||||||
assert len(experiment.score_board.aggregate().scores) == 0
|
assert len(experiment.score_board.aggregate()["leg_counting"].scores) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_experiment_with_composite():
|
def test_experiment_with_composite():
|
||||||
|
|
@@ -147,15 +147,14 @@ def test_experiment_with_composite():
|
||||||
|
|
||||||
# Test aggregation
|
# Test aggregation
|
||||||
aggregated = experiment.score_board.aggregate()
|
aggregated = experiment.score_board.aggregate()
|
||||||
assert len(aggregated.scores) > 0
|
assert len(aggregated["leg_counting"].scores) > 0
|
||||||
|
|
||||||
# Verify source dataset info is first in keys
|
# Verify source dataset info is first in keys
|
||||||
for key in aggregated.scores:
|
for key in aggregated["leg_counting"].scores:
|
||||||
assert key[0][0] == "source" # First tuple should be ("source", dataset_name)
|
assert key[0][0] == "source" # First tuple should be ("source", dataset_name)
|
||||||
assert key[1][0] == "idx" # Second tuple should be ("idx", index)
|
|
||||||
|
|
||||||
# Test stats
|
# Test stats
|
||||||
stats = aggregated.stats()
|
stats = aggregated["leg_counting"].stats()
|
||||||
for key, values in stats.scores.items():
|
for key, values in stats.scores.items():
|
||||||
assert isinstance(values, tuple)
|
assert isinstance(values, tuple)
|
||||||
assert len(values) == 5 # (count, mean, std, min, max)
|
assert len(values) == 5 # (count, mean, std, min, max)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue