mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
include ranges rather than sampled values in difficulty metadata dicts (#387)
* update difficulty metadata for logic datasets
* update difficulty metadata for graph datasets
* update difficulty metadata for geometry datasets
* update difficulty metadata for games datasets
* update difficulty metadata for cognition datasets
* update difficulty metadata for arithmetic datasets
* update difficulty metadata for arc datasets
* update difficulty metadata for algorithmic datasets
* update difficulty metadata for algebra datasets
* use tuples
* update tests
* update tests
This commit is contained in:
parent
b69c35818a
commit
7475a20700
80 changed files with 304 additions and 126 deletions
|
|
@ -43,8 +43,8 @@ def test_boxnet_items():
|
|||
assert "initial_state" in item["metadata"]
|
||||
|
||||
# Verify row_num and column_num are within limits
|
||||
row_num = item["metadata"]["difficulty"]["row_num"]
|
||||
column_num = item["metadata"]["difficulty"]["column_num"]
|
||||
row_num = item["metadata"]["row_num"]
|
||||
column_num = item["metadata"]["column_num"]
|
||||
assert 1 <= row_num <= 2, f"row_num {row_num} outside valid range"
|
||||
assert 1 <= column_num <= 2, f"column_num {column_num} outside valid range"
|
||||
|
||||
|
|
@ -78,8 +78,8 @@ def test_boxnet_grid_sizes():
|
|||
|
||||
for i in range(len(dataset)):
|
||||
item = dataset[i]
|
||||
row_num = item["metadata"]["difficulty"]["row_num"]
|
||||
column_num = item["metadata"]["difficulty"]["column_num"]
|
||||
row_num = item["metadata"]["row_num"]
|
||||
column_num = item["metadata"]["column_num"]
|
||||
|
||||
rows_set.add(row_num)
|
||||
columns_set.add(column_num)
|
||||
|
|
|
|||
|
|
@ -53,11 +53,15 @@ def test_coach_with_chain_sum():
|
|||
# Each key should be a tuple of tuples containing difficulty parameters
|
||||
for key in aggregated.scores:
|
||||
assert isinstance(key, tuple)
|
||||
# Each inner tuple should be (param_name, value)
|
||||
# Each inner tuple should be (param_name, value) or (param_name, (min_value, max_value))
|
||||
for param in key:
|
||||
assert isinstance(param, tuple)
|
||||
assert param[0] in ("num_terms", "num_digits")
|
||||
assert isinstance(param[1], int)
|
||||
assert (
|
||||
isinstance(param[1], int)
|
||||
or (isinstance(param[1], tuple) and len(param[1]) == 2)
|
||||
and all(isinstance(v, int) for v in param[1])
|
||||
)
|
||||
|
||||
# Test aggregation with last_n
|
||||
last_3 = coach.score_board.aggregate(last_n=3)
|
||||
|
|
@ -171,7 +175,7 @@ def test_coach_with_composite():
|
|||
item = coach[i + 5] # Use different indices
|
||||
if "chain_sum" in item["metadata"]["source_dataset"]:
|
||||
metadata = item["metadata"]
|
||||
assert metadata["difficulty"]["num_terms"] >= 4
|
||||
assert metadata["num_terms"] >= 4
|
||||
|
||||
|
||||
def test_grouped_scores_str():
|
||||
|
|
|
|||
|
|
@ -38,12 +38,12 @@ def test_rearc_items():
|
|||
assert "input" in meta
|
||||
assert "output" in meta
|
||||
assert "task_id" in meta
|
||||
assert "rng" in meta["difficulty"]
|
||||
assert "pso" in meta["difficulty"]
|
||||
assert "rng" in meta
|
||||
assert "pso" in meta
|
||||
|
||||
# Validate difficulty bounds
|
||||
assert config.diff_lb <= meta["difficulty"]["rng"] <= config.diff_ub
|
||||
assert config.diff_lb <= meta["difficulty"]["pso"] <= config.diff_ub
|
||||
assert config.diff_lb <= meta["rng"] <= config.diff_ub
|
||||
assert config.diff_lb <= meta["pso"] <= config.diff_ub
|
||||
|
||||
|
||||
def test_rearc_solution_validation():
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ def test_score_answer():
|
|||
|
||||
# test optimal score for answers, patching each entry
|
||||
for x in dataset:
|
||||
assert len(x["metadata"]["board"]) == x["metadata"]["difficulty"]["board_size"]
|
||||
assert len(x["metadata"]["board"]) == x["metadata"]["board_size"]
|
||||
assert dataset.score_answer(x["answer"], entry=x) == 1.0
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue