diff --git a/GALLERY.md b/GALLERY.md index 07642aa7..e7ba9238 100644 --- a/GALLERY.md +++ b/GALLERY.md @@ -5,6 +5,7 @@ This gallery shows examples from all available datasets using their default conf - [advanced_geometry](#advanced_geometry) - [aiw](#aiw) - [arc_1d](#arc_1d) +- [arc_agi](#arc_agi) - [base_conversion](#base_conversion) - [basic_arithmetic](#basic_arithmetic) - [bf](#bf) @@ -24,6 +25,7 @@ This gallery shows examples from all available datasets using their default conf - [gsm_symbolic](#gsm_symbolic) - [intermediate_integration](#intermediate_integration) - [isomorphic_strings](#isomorphic_strings) +- [knight_swap](#knight_swap) - [largest_island](#largest_island) - [lcm](#lcm) - [leg_counting](#leg_counting) @@ -230,6 +232,421 @@ Metadata: {'task_name': 'two_points_and_fill_inv', 'size': 26, 'train_examples': ```` +### arc_agi +Default configuration: +```python +use_train = True +use_eval = True +board_format_opts = BoardFormattingOptions(alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], col_delimiter=' ', row_delimiter='\n', array_brackets=False) +rotations = ['90', '180', '270'] +mirrors = ['horizontal', 'vertical', 'diagonal', 'counterdiagonal'] +use_color_permutation = True +seed = 42 +size = 500 +``` + +Example tasks: +```` +Example 1: +Question: Find the common rule that maps an input grid to an output grid, given the examples below. 
+ +Example 1: + +Input: +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 6 3 6 7 7 7 7 7 7 7 7 7 7 +7 6 6 3 7 6 6 6 7 7 6 3 7 7 +7 7 7 7 7 6 3 6 7 7 6 6 7 7 +7 7 7 7 7 6 6 3 7 7 7 7 7 7 +7 7 7 7 7 3 6 6 7 7 7 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 3 6 +7 6 6 3 7 7 7 7 7 7 7 6 6 6 +7 3 6 6 7 7 7 7 7 7 7 7 7 7 +7 6 6 6 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 3 6 6 7 7 7 7 +7 7 7 7 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +Output: +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 6 3 7 7 +7 7 7 7 7 7 7 7 7 7 6 6 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 3 6 +7 7 7 7 7 7 7 7 7 7 7 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 3 6 6 7 7 7 7 +7 7 7 7 7 7 7 6 6 6 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 + +Example 2: + +Input: +7 7 7 7 7 6 3 6 7 7 7 6 6 7 +7 7 7 7 7 6 6 6 7 7 7 6 6 7 +6 6 6 6 7 6 6 6 7 7 7 6 6 7 +6 3 6 6 7 7 7 7 7 7 7 7 7 7 +6 6 6 6 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 6 6 3 6 7 +7 7 7 7 7 7 7 7 7 6 3 6 6 7 +7 7 7 6 6 6 6 7 7 6 6 6 3 7 +7 7 7 6 6 3 6 7 7 7 7 7 7 7 +7 7 7 6 3 6 6 7 7 7 7 7 7 7 +7 7 7 6 6 6 6 7 7 7 6 3 6 6 +7 7 7 7 7 7 7 7 7 7 6 6 6 3 +7 7 7 7 7 7 7 7 7 7 6 3 3 6 +7 7 7 7 7 7 7 7 7 7 6 6 6 6 +Output: +7 7 7 7 7 6 3 6 7 7 7 6 6 7 +7 7 7 7 7 6 6 6 7 7 7 6 6 7 +6 6 6 6 7 6 6 6 7 7 7 6 6 7 +6 3 6 6 7 7 7 7 7 7 7 7 7 7 +6 6 6 6 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 + +Example 3: + +Input: +7 7 7 7 7 6 6 6 6 7 7 3 6 7 7 +6 6 6 6 7 3 6 6 3 7 7 6 3 7 7 +6 3 6 6 7 6 6 6 6 7 7 7 7 7 7 +6 6 6 6 7 6 6 3 6 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 3 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 +7 7 6 6 3 6 6 7 7 7 7 7 7 7 7 +7 7 6 6 6 3 6 7 7 7 7 7 7 7 7 +7 7 6 3 6 6 6 7 7 7 7 7 7 7 7 +7 7 7 
7 7 7 7 7 7 6 6 3 7 7 7 +7 7 6 6 6 6 7 7 7 6 3 6 7 7 7 +7 7 6 6 6 6 7 7 7 6 6 6 7 7 7 +7 7 6 6 6 6 7 7 7 3 6 3 7 7 7 +Output: +7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 +6 3 6 6 7 7 7 7 7 7 7 7 7 7 7 +6 6 6 6 7 7 7 7 7 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 3 6 6 +7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +7 7 6 6 6 6 7 7 7 7 7 7 7 7 7 +7 7 6 6 6 6 7 7 7 7 7 7 7 7 7 +7 7 6 6 6 6 7 7 7 7 7 7 7 7 7 + + +Below is a test input grid. Predict the corresponding output grid by applying the rule you found. +Your final answer should just be the text output grid itself. + +Input: +7 7 7 7 7 7 7 7 6 3 6 6 +6 6 6 7 7 7 7 7 6 6 6 6 +3 6 6 7 7 7 7 7 6 3 6 3 +6 6 6 7 3 6 6 7 7 7 7 7 +7 7 7 7 6 6 6 7 7 7 7 7 +7 7 7 7 6 6 3 7 7 7 7 7 +7 7 7 7 6 6 6 7 6 6 6 6 +7 7 7 7 7 7 7 7 6 6 3 6 +7 6 6 6 6 6 6 7 6 6 6 6 +7 6 6 6 6 3 6 7 6 6 6 6 +7 6 3 6 6 6 6 7 7 7 7 7 +7 6 6 6 6 6 6 7 6 6 6 7 +7 7 7 7 7 7 7 7 6 6 6 7 + +Answer: 7 7 7 7 7 7 7 7 7 7 7 7 +6 6 6 7 7 7 7 7 7 7 7 7 +3 6 6 7 7 7 7 7 7 7 7 7 +6 6 6 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 6 6 3 6 +7 7 7 7 7 7 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 6 6 6 6 +7 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 6 6 6 7 +7 7 7 7 7 7 7 7 6 6 6 7 +Metadata: {'input': ((7, 7, 7, 7, 7, 7, 7, 7, 6, 3, 6, 6), (6, 6, 6, 7, 7, 7, 7, 7, 6, 6, 6, 6), (3, 6, 6, 7, 7, 7, 7, 7, 6, 3, 6, 3), (6, 6, 6, 7, 3, 6, 6, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 6, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 3, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 3, 6), (7, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6), (7, 6, 6, 6, 6, 3, 6, 7, 6, 6, 6, 6), (7, 6, 3, 6, 6, 6, 6, 7, 7, 7, 7, 7), (7, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7)), 'output': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7), (3, 6, 6, 7, 7, 7, 7, 7, 7, 7, 
7, 7), (6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 3, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7)), 'task_id': 'a934301b'} + +Example 2: +Question: Find the common rule that maps an input grid to an output grid, given the examples below. + +Example 1: + +Input: +2 8 8 8 8 8 8 8 8 9 +2 8 8 0 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 0 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 0 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +Output: +2 8 8 8 8 8 8 8 8 9 +2 8 8 2 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 9 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 9 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 +2 8 8 8 8 8 8 8 8 9 + +Example 2: + +Input: +6 6 6 6 6 6 6 6 6 6 +8 8 8 8 8 8 8 8 8 8 +8 8 0 8 8 8 8 8 0 8 +8 8 8 8 8 8 0 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 0 8 8 8 8 +8 0 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +1 1 1 1 1 1 1 1 1 1 +Output: +6 6 6 6 6 6 6 6 6 6 +8 8 8 8 8 8 8 8 8 8 +8 8 6 8 8 8 8 8 6 8 +8 8 8 8 8 8 6 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 1 8 8 8 8 +8 1 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +1 1 1 1 1 1 1 1 1 1 + +Example 3: + +Input: +5 5 5 5 5 5 5 5 5 5 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 0 8 8 8 8 +8 8 0 8 8 8 8 8 0 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 0 8 8 8 8 0 8 +8 8 8 8 8 8 0 8 8 8 +8 8 8 8 8 8 8 8 8 8 +7 7 7 7 7 7 7 7 7 7 +Output: +5 5 5 5 5 5 5 5 5 5 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 5 8 8 8 8 +8 8 5 8 8 8 8 8 5 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 8 8 8 8 8 8 8 +8 8 8 7 8 8 8 8 7 8 +8 8 8 8 8 8 7 8 8 8 +8 8 8 8 8 8 8 8 8 8 +7 7 7 7 7 7 7 7 7 7 + + +Below is a test input grid. Predict the corresponding output grid by applying the rule you found. 
+Your final answer should just be the text output grid itself. + +Input: +6 8 8 8 8 8 8 8 0 4 +6 0 8 8 0 8 8 8 8 4 +6 8 8 8 8 8 8 8 8 4 +6 8 8 8 8 8 0 8 8 4 +6 8 8 0 8 8 8 8 8 4 +6 8 8 8 8 8 0 8 8 4 +6 8 8 8 8 8 8 8 8 4 +6 8 8 8 8 0 8 8 8 4 +6 8 8 0 8 8 8 0 8 4 +6 8 8 8 8 8 8 8 8 4 + +Answer: 6 8 8 8 8 8 8 8 4 4 +6 6 8 8 6 8 8 8 8 4 +6 8 8 8 8 8 8 8 8 4 +6 8 8 8 8 8 4 8 8 4 +6 8 8 6 8 8 8 8 8 4 +6 8 8 8 8 8 4 8 8 4 +6 8 8 8 8 8 8 8 8 4 +6 8 8 8 8 4 8 8 8 4 +6 8 8 6 8 8 8 4 8 4 +6 8 8 8 8 8 8 8 8 4 +Metadata: {'input': ((6, 8, 8, 8, 8, 8, 8, 8, 0, 4), (6, 0, 8, 8, 0, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 0, 8, 8, 4), (6, 8, 8, 0, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 0, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 0, 8, 8, 8, 4), (6, 8, 8, 0, 8, 8, 8, 0, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4)), 'output': ((6, 8, 8, 8, 8, 8, 8, 8, 4, 4), (6, 6, 8, 8, 6, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 4, 8, 8, 4), (6, 8, 8, 6, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 4, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 4, 8, 8, 8, 4), (6, 8, 8, 6, 8, 8, 8, 4, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4)), 'task_id': '2204b7a8'} + +Example 3: +Question: Find the common rule that maps an input grid to an output grid, given the examples below. 
+ +Example 1: + +Input: +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 +5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 5 5 5 5 5 +5 5 8 8 8 8 5 5 5 5 5 8 8 8 8 5 5 5 5 5 +2 5 8 8 8 8 5 5 5 5 5 8 8 8 8 5 5 5 5 2 +5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 8 8 8 8 8 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5 +Output: +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 8 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 8 5 +5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 2 2 2 2 5 5 5 5 5 +5 5 2 2 2 2 5 2 5 5 5 2 2 2 2 5 5 5 5 5 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +5 5 2 2 2 2 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 2 2 2 2 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5 + +Example 2: + +Input: +5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 5 5 5 5 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 8 8 8 8 8 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 8 8 8 8 8 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 8 8 8 8 
8 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +Output: +5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 5 5 2 5 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 5 5 5 2 5 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 2 2 2 2 2 5 8 8 8 5 5 5 5 8 8 8 8 +5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 2 2 2 2 2 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 2 2 2 2 2 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 + +Example 3: + +Input: +5 8 8 8 8 8 5 2 5 5 5 5 5 5 +5 8 8 8 8 8 5 5 5 5 5 8 8 8 +5 5 5 5 5 5 5 5 5 5 5 8 8 8 +5 5 5 5 8 8 8 8 8 8 5 8 8 8 +5 5 5 5 8 8 8 8 8 8 5 8 8 8 +5 5 5 5 8 8 8 8 8 8 5 8 8 8 +8 8 5 5 8 8 8 8 8 8 5 5 5 5 +8 8 5 5 8 8 8 8 8 8 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 5 5 8 8 8 5 5 5 5 +2 5 8 8 8 5 5 8 8 8 5 5 5 2 +5 5 8 8 8 5 5 5 5 5 5 5 5 5 +5 5 8 8 8 5 5 2 5 5 5 5 5 5 +Output: +5 8 8 8 8 8 5 2 5 5 5 5 5 5 +5 8 8 8 8 8 5 2 5 5 5 8 8 8 +5 5 5 5 5 5 5 2 5 5 5 8 8 8 +5 5 5 5 2 2 2 2 2 2 5 8 8 8 +5 5 5 5 2 2 2 2 2 2 5 8 8 8 +5 5 5 5 2 2 2 2 2 2 5 8 8 8 +8 8 5 5 2 2 2 2 2 2 5 5 5 5 +8 8 5 5 2 2 2 2 2 2 5 5 5 5 +5 5 5 5 5 5 5 2 5 5 5 5 5 5 +5 5 2 2 2 5 5 2 2 2 5 5 5 5 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 +5 5 2 2 2 5 5 2 5 5 5 5 5 5 +5 5 2 2 2 5 5 2 5 5 5 5 5 5 + + +Below is a test input grid. Predict the corresponding output grid by applying the rule you found. +Your final answer should just be the text output grid itself. 
+ +Input: +5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5 +5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 8 8 5 5 8 8 8 5 +5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 8 8 8 5 +5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 8 8 8 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +2 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 2 +5 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 8 8 5 +5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 8 8 5 +5 5 5 5 8 8 8 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 +2 5 5 5 8 8 8 5 5 5 5 5 5 8 8 8 8 8 5 8 8 8 8 5 2 +5 5 5 5 8 8 8 5 5 8 8 8 5 8 8 8 8 8 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 5 5 5 8 8 8 8 5 5 +5 5 5 5 5 2 5 5 5 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 + +Answer: 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 8 8 5 5 5 5 5 5 +5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 8 8 5 5 8 8 8 5 +5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 8 8 8 5 +5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 8 8 8 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +5 2 2 2 2 2 5 5 5 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 5 5 5 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 2 2 2 2 2 5 5 
5 5 8 8 5 +5 5 5 5 5 2 5 5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 8 8 5 +5 5 5 5 2 2 2 5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +5 5 5 5 2 2 2 5 5 2 2 2 5 2 2 2 2 2 5 2 2 2 2 5 5 +5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 2 2 2 2 5 5 +5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 2 2 2 2 5 5 +5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 +Metadata: {'input': ((5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (2, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2), (5, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5), (2, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 5, 2), (5, 5, 5, 5, 8, 8, 8, 5, 5, 8, 8, 8, 5, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 
5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), 'output': ((5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 8, 8, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), (5, 2, 2, 2, 2, 2, 5, 5, 5, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5), (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), (5, 5, 5, 5, 2, 2, 2, 5, 5, 2, 2, 2, 5, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), 'task_id': '0d87d2a6'} + +```` + ### base_conversion 
Generates base conversion tasks @@ -1171,6 +1588,144 @@ Metadata: {'words': ['hogtytyof', 'kgqwfwfgh'], 'solution': True, 'solvable': Tr ```` +### knight_swap +Generates Knight Swap puzzles with configurable parameters. + +Default configuration: +```python +min_nodes = 6 +max_nodes = 9 +min_pieces = 2 +max_pieces = 2 +min_steps = 4 +max_steps = 20 +max_attempts = 100 +seed = 42 +size = 5 +impossible_ratio = 0.2 +``` + +Example tasks: +```` +Example 1: +Question: Knight Swap Challenge: + +``` + A B C D + ---------------- +3 | | . | | . | + ---------------- +2 | B | w | | | + ---------------- +1 | | | B | w | + ---------------- +``` + +Legend: +- 'w' = White Knight +- 'B' = Black Knight +- Empty squares are marked with '.' + +Objective: +Swap the positions of all white knights with all black knights through valid moves. + +Rules: +1. Knights move in L-shape (2 squares + 1 square perpendicular) +2. Knights can only move to empty squares +3. w moves first, then players alternate +4. All knights must reach their target positions (white ↔ black) + +Question: +Is it possible to swap all knights' positions? If yes, list the moves. + +Answer Format: +- For impossible puzzles: "No" +- For possible puzzles: List moves as ["color,from,to", ...] + Example: ["w,A1,B3"] means white knight moves A1→B3 + +Answer: No +Metadata: {'board': {'C1': ['A2', 'B3', 'D3'], 'A2': ['C1'], 'B3': ['C1'], 'D1': ['B2'], 'B2': ['D1', 'D3'], 'D3': ['B2', 'C1']}, 'pieces': {'C1': 'B', 'A2': 'B', 'B3': None, 'D1': 'w', 'B2': 'w', 'D3': None}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None} + +Example 2: +Question: Knight Swap Challenge: + +``` + A B C D + ---------------- +3 | | w | . | | + ---------------- +2 | w | | | B | + ---------------- +1 | | | . | B | + ---------------- +``` + +Legend: +- 'w' = White Knight +- 'B' = Black Knight +- Empty squares are marked with '.' 
+ +Objective: +Swap the positions of all white knights with all black knights through valid moves. + +Rules: +1. Knights move in L-shape (2 squares + 1 square perpendicular) +2. Knights can only move to empty squares +3. w moves first, then players alternate +4. All knights must reach their target positions (white ↔ black) + +Question: +Is it possible to swap all knights' positions? If yes, list the moves. + +Answer Format: +- For impossible puzzles: "No" +- For possible puzzles: List moves as ["color,from,to", ...] + Example: ["w,A1,B3"] means white knight moves A1→B3 + +Answer: No +Metadata: {'board': {'B3': ['C1'], 'D1': ['C3'], 'C3': ['A2', 'D1'], 'C1': ['A2', 'B3'], 'D2': [], 'A2': ['C1', 'C3']}, 'pieces': {'B3': 'w', 'D1': 'B', 'C3': None, 'C1': None, 'D2': 'B', 'A2': 'w'}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None} + +Example 3: +Question: Knight Swap Challenge: + +``` + A B C + ------------ +3 | . | | B | + ------------ +2 | w | | . | + ------------ +1 | | w | B | + ------------ +``` + +Legend: +- 'w' = White Knight +- 'B' = Black Knight +- Empty squares are marked with '.' + +Objective: +Swap the positions of all white knights with all black knights through valid moves. + +Rules: +1. Knights move in L-shape (2 squares + 1 square perpendicular) +2. Knights can only move to empty squares +3. w moves first, then players alternate +4. All knights must reach their target positions (white ↔ black) + +Question: +Is it possible to swap all knights' positions? If yes, list the moves. + +Answer Format: +- For impossible puzzles: "No" +- For possible puzzles: List moves as ["color,from,to", ...] 
+ Example: ["w,A1,B3"] means white knight moves A1→B3 + +Answer: No +Metadata: {'board': {'B1': ['A3'], 'A3': ['B1', 'C2'], 'A2': ['C1', 'C3'], 'C3': ['A2'], 'C1': ['A2'], 'C2': ['A3']}, 'pieces': {'B1': 'w', 'A3': None, 'A2': 'w', 'C3': 'B', 'C1': 'B', 'C2': None}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None} + +```` + ### largest_island Generates Largest Island exercises with configurable difficulty @@ -2097,7 +2652,11 @@ Input: 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 -Answer: ((3, 9, 3, 9, 3, 9, 3, 9), (3, 9, 3, 9, 3, 9, 3, 3), (3, 9, 3, 9, 3, 9, 9, 9), (3, 9, 3, 9, 3, 3, 3, 3), (3, 9, 3, 9, 9, 9, 9, 9)) +Answer: 3 9 3 9 3 9 3 9 +3 9 3 9 3 9 3 3 +3 9 3 9 3 9 9 9 +3 9 3 9 3 3 3 3 +3 9 3 9 9 9 9 9 Metadata: {'input': ((3, 3, 3, 3, 3, 3, 3, 9), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3)), 'output': ((3, 9, 3, 9, 3, 9, 3, 9), (3, 9, 3, 9, 3, 9, 3, 3), (3, 9, 3, 9, 3, 9, 9, 9), (3, 9, 3, 9, 3, 3, 3, 3), (3, 9, 3, 9, 9, 9, 9, 9)), 'task_id': 'd22278a0', 'difficulty': {'rng': 0.07173948707162241, 'pso': 0.12314814814814816}} Example 2: @@ -2232,7 +2791,14 @@ Input: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -Answer: ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7)) +Answer: 7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 8 7 7 7 +7 7 7 7 7 7 8 7 8 7 7 +7 7 7 7 7 8 7 8 7 8 7 +7 7 7 7 7 7 8 7 8 7 7 +7 7 7 7 7 7 7 8 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 +7 7 7 7 7 7 7 7 7 7 7 Metadata: {'input': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 
7, 7, 7)), 'output': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7)), 'task_id': '11852cab', 'difficulty': {'rng': 0.09651305327452808, 'pso': 0.15228956228956228}} Example 3: @@ -2276,7 +2842,10 @@ Input: 1 1 1 1 1 1 1 1 1 1 -Answer: ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1)) +Answer: 1 1 1 1 1 +1 1 1 1 1 +1 1 1 1 1 +1 1 1 1 1 Metadata: {'input': ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1)), 'output': ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1)), 'task_id': '8be77c9e', 'difficulty': {'rng': 0.09322002370336528, 'pso': 0.0638888888888889}} ```` @@ -2993,7 +3562,7 @@ Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6, Example 2: Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM. Answer: 02:38 -Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 8, 9, 44), 'end_time': datetime.datetime(2025, 2, 8, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} +Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 9, 9, 44), 'end_time': datetime.datetime(2025, 2, 9, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} Example 3: Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days. 
diff --git a/pyproject.toml b/pyproject.toml index 794077d3..eb0bbc4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "reasoning_gym" -version = "0.1.5" +version = "0.1.6" authors = [ { name = "Open-Thought community", email = "andreas.koepf@xamla.com" }, ] @@ -21,6 +21,7 @@ dependencies = [ "pytz>=2024.1", "tabulate==0.9.0", "pyyaml>=6.0.2", + "arckit==0.1.0", ] classifiers = [ "Programming Language :: Python :: 3", diff --git a/reasoning_gym/__init__.py b/reasoning_gym/__init__.py index 48be1fa9..37783e4e 100644 --- a/reasoning_gym/__init__.py +++ b/reasoning_gym/__init__.py @@ -5,7 +5,7 @@ Reasoning Gym - A library of procedural dataset generators for training reasonin from . import algebra, algorithmic, arc, arithmetic, code, cognition, data, games, geometry, graphs, logic from .factory import create_dataset, register_dataset -__version__ = "0.1.5" +__version__ = "0.1.6" __all__ = [ "arc", "algebra", diff --git a/reasoning_gym/algebra/complex_arithmetic.py b/reasoning_gym/algebra/complex_arithmetic.py index 7c749eaa..c55e1d09 100644 --- a/reasoning_gym/algebra/complex_arithmetic.py +++ b/reasoning_gym/algebra/complex_arithmetic.py @@ -127,11 +127,12 @@ class ComplexArithmeticDataset(ProceduralDataset): return student_result - def score_answer(self, answer: str, metadata: dict) -> float: + def score_answer(self, answer: Optional[str], entry: dict) -> float: """Score the answer using exponential distance-based scoring.""" if answer is None: return 0.0 + metadata = entry["metadata"] try: student_result = self.parse_string_to_complex(answer) expected_result = complex(*metadata["result"]) diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py index 5d0b139c..6335b6b7 100644 --- a/reasoning_gym/algebra/intermediate_integration.py +++ b/reasoning_gym/algebra/intermediate_integration.py @@ -235,9 +235,10 @@ class 
IntermediateIntegrationDataset(ProceduralDataset): }, } - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: """Determine if the solution provided solves the problem""" reward = 0.0 + metadata = entry["metadata"] if answer is not None: try: var = metadata["variable"] diff --git a/reasoning_gym/algebra/polynomial_multiplication.py b/reasoning_gym/algebra/polynomial_multiplication.py index eff42a86..9a74679f 100644 --- a/reasoning_gym/algebra/polynomial_multiplication.py +++ b/reasoning_gym/algebra/polynomial_multiplication.py @@ -138,8 +138,9 @@ class PolynomialMultiplicationDataset(ProceduralDataset): return polynomial_expr - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: reward = 0.0 + metadata = entry["metadata"] if answer is not None: try: predicted_poly = sp.parse_expr(answer) diff --git a/reasoning_gym/algebra/simple_integration.py b/reasoning_gym/algebra/simple_integration.py index 1da32004..a8ca3be2 100644 --- a/reasoning_gym/algebra/simple_integration.py +++ b/reasoning_gym/algebra/simple_integration.py @@ -80,9 +80,10 @@ class SimpleIntegrationDataset(ProceduralDataset): }, } - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: """Determine if the solution provided solves the problem""" reward = 0.0 + metadata = entry["metadata"] if answer is not None: try: var = metadata["variable"] diff --git a/reasoning_gym/algorithmic/palindrome_generation.py b/reasoning_gym/algorithmic/palindrome_generation.py index e7ab21e3..a663770b 100644 --- a/reasoning_gym/algorithmic/palindrome_generation.py +++ b/reasoning_gym/algorithmic/palindrome_generation.py @@ -81,7 +81,7 @@ class PalindromeDataset(ProceduralDataset): """Return the 
palindrome string from the letter set.""" return "".join(letters) - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: """Determine if the solution provided is a valid palindrome. The answer is expected to be a single string @@ -98,6 +98,7 @@ class PalindromeDataset(ProceduralDataset): if answer == "": return 0.01 + metadata = entry["metadata"] answer = answer.strip().lower() expected_letters = metadata["letters"] diff --git a/reasoning_gym/arc/__init__.py b/reasoning_gym/arc/__init__.py index e9bf7086..c7c48b94 100644 --- a/reasoning_gym/arc/__init__.py +++ b/reasoning_gym/arc/__init__.py @@ -1,4 +1,5 @@ from .arc_1d import Arc1DConfig, Arc1DDataset +from .arc_agi import ArcAgiConfig, ArcAgiDataset from .rearc import ReArcConfig, ReArcDataset -__all__ = ["Arc1DConfig", "Arc1DDataset", "ReArcDataset", "ReArcConfig"] +__all__ = ["Arc1DConfig", "Arc1DDataset", "ArcAgiConfig", "ArcAgiDataset", "ReArcDataset", "ReArcConfig"] diff --git a/reasoning_gym/arc/arc_agi.py b/reasoning_gym/arc/arc_agi.py new file mode 100644 index 00000000..b96698bb --- /dev/null +++ b/reasoning_gym/arc/arc_agi.py @@ -0,0 +1,202 @@ +from dataclasses import dataclass, field +from random import Random +from typing import Any, Callable, Optional + +import arckit + +from reasoning_gym.arc.board_format import ( + ARC_PROMPT_TEMPLATE, + BoardFormattingOptions, + format_board, + format_board_pair, + parse_board, +) +from reasoning_gym.dataset import ProceduralDataset +from reasoning_gym.factory import register_dataset + + +@dataclass +class ArcAgiConfig: + use_train: bool = True + use_eval: bool = True + board_format_opts: BoardFormattingOptions = field(default_factory=lambda: BoardFormattingOptions()) + + # Augmentation options + rotations: list[str] = field(default_factory=lambda: ["90", "180", "270"]) # empty list for no rotations + mirrors: list[str] = field( + default_factory=lambda: 
# A board is a rectangular grid of colour indices stored row by row.
Board = list[list[int]]


def identity(board: Board) -> Board:
    """Return the board unchanged (the "no augmentation" choice)."""
    return board


def rot90(board: Board) -> Board:
    """Quarter clockwise rotation.

    ``zip`` yields tuples, so every row is converted back to a list to honour
    the declared ``Board`` return type.
    """
    return [list(row) for row in zip(*board[::-1])]


def rot180(board: Board) -> Board:
    """Half rotation: reverse the row order and each row."""
    return [row[::-1] for row in board[::-1]]


def rot270(board: Board) -> Board:
    """Quarter anticlockwise rotation.

    Equivalent to rotating clockwise by 90 degrees and then by 180 degrees;
    rows are returned as lists.
    """
    return [list(row[::-1]) for row in zip(*board[::-1])][::-1]


def hmirror(board: Board) -> Board:
    """Mirror along the horizontal axis (reverse the row order)."""
    return board[::-1]


def vmirror(board: Board) -> Board:
    """Mirror along the vertical axis (reverse every row)."""
    return [row[::-1] for row in board]


def dmirror(board: Board) -> Board:
    """Mirror along the main diagonal (transpose); rows are returned as lists."""
    return [list(row) for row in zip(*board)]


def cmirror(board: Board) -> Board:
    """Mirror along the counterdiagonal (transpose of the half rotation)."""
    return [list(row) for row in zip(*[r[::-1] for r in board[::-1]])]


def cmap(board: Board, colors: list[int]) -> Board:
    """Recolour the board: each cell value ``c`` is replaced by ``colors[c]``."""
    return [[colors[c] for c in row] for row in board]


ROTATION_AUGMENTATIONS = [identity, rot90, rot180, rot270]
MIRROR_AUGMENTATIONS = [identity, hmirror, vmirror, dmirror, cmirror]
eval_set = arckit.load_data() + if config.use_train: + for x in train_set: + self._tasks[x.id] = x.to_dict() + if config.use_eval: + for x in eval_set: + self._tasks[x.id] = x.to_dict() + self._task_ids = list(self._tasks.keys()) + + def _create_augmentation_fn(self, rng: Random) -> Callable[[Board], Board]: + """Create a composite augmentation function from enabled options""" + fns = [] + + # Map rotation strings to functions + rotation_map = {"90": rot90, "180": rot180, "270": rot270} + if self.config.rotations: + chosen_rot = rng.choice([identity] + [rotation_map[r] for r in self.config.rotations]) + fns.append(chosen_rot) + + # Map mirror strings to functions + mirror_map = {"horizontal": hmirror, "vertical": vmirror, "diagonal": dmirror, "counterdiagonal": cmirror} + if self.config.mirrors: + chosen_mirror = rng.choice([identity] + [mirror_map[m] for m in self.config.mirrors]) + fns.append(chosen_mirror) + + if self.config.use_color_permutation: + color_table = list(range(10)) + rng.shuffle(color_table) + fns.append(lambda x: cmap(x, color_table)) + + def composite_fn(board: Board) -> Board: + result = board + for fn in fns: + result = fn(result) + return result + + return composite_fn + + def __getitem__(self, idx: int) -> dict: + """ + Generate a single ARC-AGI-1 task + """ + rng = Random(self.seed + idx) + + task_id = rng.choice(self._task_ids) + task = self._tasks[task_id] + + # Create augmentation function to be used for all examples + augment = self._create_augmentation_fn(rng) + + train = task["train"] + test = task["test"][0] + + # Apply augmentation to all train examples + augmented_train = [] + for p in train: + augmented_train.append({"input": augment(p["input"]), "output": augment(p["output"])}) + + examples = [ + format_board_pair(i + 1, p, formatting_options=self.config.board_format_opts) + for i, p in enumerate(augmented_train) + ] + examples = "".join(examples) + + # Apply augmentation to test example + augmented_test_input = 
augment(test["input"]) + augmented_test_output = augment(test["output"]) + + test_input = format_board(augmented_test_input, self.board_format_opts) + test_output = format_board(augmented_test_output, self.board_format_opts) + + input_prompt = self._prompt_templates.format(examples=examples, input_grid=test_input) + + def totuple(board: list[list[int]]) -> tuple[tuple[int, ...], ...]: + return tuple(tuple(r) for r in board) + + return { + "question": input_prompt, + "answer": test_output, + "metadata": { + "input": totuple(augmented_test_input), + "output": totuple(augmented_test_output), + "task_id": task_id, + }, + } + + def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float: + reward = 0.0 + metadata = entry["metadata"] + if answer is not None: + try: + answer_board = parse_board(answer, self.board_format_opts) + if answer_board == metadata["output"]: + reward = 1.0 + else: + reward = 0.05 + except: + reward = 0.01 + return reward + + +register_dataset("arc_agi", ArcAgiDataset, ArcAgiConfig) diff --git a/reasoning_gym/arc/board_format.py b/reasoning_gym/arc/board_format.py index 1360fcd1..554ba299 100644 --- a/reasoning_gym/arc/board_format.py +++ b/reasoning_gym/arc/board_format.py @@ -1,6 +1,16 @@ from dataclasses import dataclass, field from typing import List, Tuple +ARC_PROMPT_TEMPLATE = """Find the common rule that maps an input grid to an output grid, given the examples below. + +{examples} +Below is a test input grid. Predict the corresponding output grid by applying the rule you found. +Your final answer should just be the text output grid itself. 
+ +Input: +{input_grid} +""" + @dataclass class BoardFormattingOptions: @@ -10,26 +20,6 @@ class BoardFormattingOptions: array_brackets: bool = False -def format_arc_task( - input_grid: Tuple[Tuple[int, ...], ...], output_grid: Tuple[Tuple[int, ...], ...], options: BoardFormattingOptions -) -> str: - """ - Format an ARC task as a string - """ - - buffer = [] - if options.task_identifier: - buffer.append(f"ARC Task: {options.task_identifier}") - - buffer.append("\nInput Grid:") - buffer.append(format_board(input_grid, options)) - - buffer.append("\n\nOutput Grid:") - buffer.append(format_board(output_grid, options)) - - return "\n".join(buffer) - - def format_board( board: List[List[int]], formatting_options: BoardFormattingOptions, with_board_shape: bool = False ) -> str: diff --git a/reasoning_gym/arc/rearc.py b/reasoning_gym/arc/rearc.py index ace4c3f9..ac362f9a 100644 --- a/reasoning_gym/arc/rearc.py +++ b/reasoning_gym/arc/rearc.py @@ -3,17 +3,7 @@ from random import Random from typing import Any, Callable, Dict, Optional from ..factory import ProceduralDataset, register_dataset -from .board_format import BoardFormattingOptions, format_board, format_board_pair, parse_board - -_REARC_PROMPT_TEMPLATES = """Find the common rule that maps an input grid to an output grid, given the examples below. - -{examples} -Below is a test input grid. Predict the corresponding output grid by applying the rule you found. -Your final answer should just be the text output grid itself. 
- -Input: -{input_grid} -""" +from .board_format import ARC_PROMPT_TEMPLATE, BoardFormattingOptions, format_board, format_board_pair, parse_board @dataclass @@ -37,7 +27,7 @@ class ReArcDataset(ProceduralDataset): def __init__(self, config: ReArcConfig): super().__init__(config=config, seed=config.seed, size=config.size) self.board_format_opts = config.board_format_opts - self._prompt_templates = _REARC_PROMPT_TEMPLATES + self._prompt_templates = ARC_PROMPT_TEMPLATE self.diff_lb = config.diff_lb self.diff_ub = config.diff_ub @@ -89,10 +79,11 @@ class ReArcDataset(ProceduralDataset): rng_difficulty = self.get_rng_difficulty(rng) pso_difficulty = self.get_pso_difficulty(task) input_prompt = self.format_rearc_input(rng, task, generator) + answer = format_board(task["output"], self.board_format_opts) return { "question": input_prompt, - "answer": task["output"], + "answer": answer, "metadata": { "input": task["input"], "output": task["output"], @@ -104,12 +95,13 @@ class ReArcDataset(ProceduralDataset): }, } - def score_answer(self, answer: str, metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: str, entry: Dict[str, Any]) -> float: reward = 0.0 + metadata = entry["metadata"] if answer is not None: try: - formatted_answer = parse_board(answer, self.board_format_opts) - if formatted_answer == metadata["output"]: + answer_board = parse_board(answer, self.board_format_opts) + if answer_board == metadata["output"]: reward = 1.0 else: reward = 0.05 diff --git a/reasoning_gym/games/__init__.py b/reasoning_gym/games/__init__.py index 295f6cdf..dd1ed898 100644 --- a/reasoning_gym/games/__init__.py +++ b/reasoning_gym/games/__init__.py @@ -8,6 +8,7 @@ Game tasks for training reasoning capabilities: from .countdown import CountdownConfig, CountdownDataset from .game_of_life import GameOfLifeConfig, GameOfLifeDataset +from .knight_swap import KnightSwapConfig, KnightSwapDataset from .maze import MazeConfig, MazeDataset from .mini_sudoku import MiniSudokuConfig, 
MiniSudokuDataset from .n_queens import NQueensDataset @@ -34,4 +35,6 @@ __all__ = [ "NQueensDataset", "TsumegoConfig", "TsumegoDataset", + "KnightSwapConfig", + "KnightSwapDataset", ] diff --git a/reasoning_gym/games/countdown.py b/reasoning_gym/games/countdown.py index 38a60c4f..169fc5ee 100644 --- a/reasoning_gym/games/countdown.py +++ b/reasoning_gym/games/countdown.py @@ -159,9 +159,10 @@ class CountdownDataset(ProceduralDataset): raise ValueError(f"Failed to generate valid expression after {max_attempts} attempts") - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: """Determine if the solution provided solves the problem""" reward = 0.0 + metadata = entry["metadata"] if answer is not None: try: user_answer = int(parse_expr(answer)) diff --git a/reasoning_gym/games/knight_swap.py b/reasoning_gym/games/knight_swap.py new file mode 100644 index 00000000..8e8c1167 --- /dev/null +++ b/reasoning_gym/games/knight_swap.py @@ -0,0 +1,396 @@ +import collections +import json +from dataclasses import dataclass +from random import Random +from typing import Dict, FrozenSet, List, Optional, Set, Tuple + +from ..factory import ProceduralDataset, register_dataset + +QUESTION_TEMPLATE = """Knight Swap Challenge: + +``` +{board} +``` + +Legend: +- 'w' = White Knight +- 'B' = Black Knight +- Empty squares are marked with '.' + +Objective: +Swap the positions of all white knights with all black knights through valid moves. + +Rules: +1. Knights move in L-shape (2 squares + 1 square perpendicular) +2. Knights can only move to empty squares +3. {start_turn} moves first, then players alternate +4. All knights must reach their target positions (white ↔ black) + +Question: +Is it possible to swap all knights' positions? If yes, list the moves. + +Answer Format: +- For impossible puzzles: "No" +- For possible puzzles: List moves as ["color,from,to", ...] 
+ Example: ["w,A1,B3"] means white knight moves A1→B3 +""" + + +@dataclass +class KnightSwapConfig: + """Configuration for Knight Swap puzzle generation. + + A Knight Swap puzzle involves moving white and black knights on a chess-like board + where each move must be a valid knight's move. The goal is to swap the positions + of white and black knights. + """ + + min_nodes: int = 6 # Minimum number of squares on the board + max_nodes: int = 9 # Maximum number of squares on the board + min_pieces: int = 2 # Minimum number of pieces per color + max_pieces: int = 2 # Maximum number of pieces per color + min_steps: int = 4 # Minimum solution length + max_steps: int = 20 # Maximum solution length + max_attempts: int = 100 # Maximum attempts for board generation and puzzle creation + seed: Optional[int] = None + size: int = 5 # Virtual dataset size + impossible_ratio: float = 0.2 # Ratio of puzzles that should be impossible + + def validate(self): + """Validate configuration parameters""" + assert self.min_nodes >= 6, "min_nodes must be >= 6" + assert self.max_nodes >= self.min_nodes, "max_nodes must be >= min_nodes" + assert self.min_pieces >= 1, "min_pieces must be >= 1" + assert self.max_pieces >= self.min_pieces, "max_pieces must be >= min_pieces" + assert self.min_steps >= 1, "min_steps must be >= 1" + assert self.max_steps >= self.min_steps, "max_steps must be >= min_steps" + assert self.max_attempts >= 1, "max_attempts must be >= 1" + assert 0 <= self.impossible_ratio <= 1, "impossible_ratio must be between 0 and 1" + + +class KnightSwapLogic: + """Core game logic for Knight Swap puzzles.""" + + @staticmethod + def is_knight_move(a: str, b: str) -> bool: + """Check if moving from square 'a' to square 'b' is a legal knight move.""" + a_col = ord(a[0].upper()) - ord("A") + 1 + a_row = int(a[1:]) + b_col = ord(b[0].upper()) - ord("A") + 1 + b_row = int(b[1:]) + return {abs(a_col - b_col), abs(a_row - b_row)} == {1, 2} + + @staticmethod + def is_connected(graph: 
Dict[str, List[str]]) -> bool: + """Check if a graph is connected (all nodes reachable from any starting node).""" + if not graph: + return True + start = next(iter(graph)) + visited = set() + queue = collections.deque([start]) + while queue: + node = queue.popleft() + if node not in visited: + visited.add(node) + for neighbor in graph[node]: + if neighbor not in visited: + queue.append(neighbor) + return len(visited) == len(graph) + + @staticmethod + def generate_board(num_nodes: int, rng: Random, max_attempts: int = 1000) -> Dict[str, List[str]]: + """Generate a random connected board where edges represent valid knight moves.""" + candidates = ["A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3", "D1", "D2", "D3"] + attempts = 0 + while True: + attempts += 1 + nodes = rng.sample(candidates, num_nodes) + graph = {node: [] for node in nodes} + for i in range(len(nodes)): + for j in range(i + 1, len(nodes)): + if KnightSwapLogic.is_knight_move(nodes[i], nodes[j]): + graph[nodes[i]].append(nodes[j]) + graph[nodes[j]].append(nodes[i]) + for node in graph: + graph[node].sort() + if KnightSwapLogic.is_connected(graph): + return graph + if attempts > max_attempts: + raise Exception(f"Failed to generate connected board after {max_attempts} attempts") + + @staticmethod + def solve_swap( + board: Dict[str, List[str]], pieces: Dict[str, str], start_turn: str = "w" + ) -> Optional[List[Tuple[str, str, str]]]: + """Find a sequence of moves to swap white and black pieces positions.""" + + @dataclass(frozen=True) + class GameState: + white_set: FrozenSet[str] + black_set: FrozenSet[str] + turn: str + + initial_white = frozenset(pos for pos, piece in pieces.items() if piece == "w") + initial_black = frozenset(pos for pos, piece in pieces.items() if piece == "B") + initial_state = GameState(initial_white, initial_black, start_turn) + + queue = collections.deque([initial_state]) + visited = {initial_state} + predecessors = {initial_state: (None, None)} + + while queue: + state = 
queue.popleft() + if state.white_set == initial_black and state.black_set == initial_white: + moves = [] + cur_state = state + while predecessors[cur_state][0] is not None: + prev_state, move = predecessors[cur_state] + moves.append(move) + cur_state = prev_state + moves.reverse() + return moves + + current_positions = state.white_set if state.turn == "w" else state.black_set + for pos in current_positions: + for neighbor in board[pos]: + if neighbor in state.white_set or neighbor in state.black_set: + continue + if state.turn == "w": + new_white = frozenset(p if p != pos else neighbor for p in state.white_set) + new_black = state.black_set + else: + new_black = frozenset(p if p != pos else neighbor for p in state.black_set) + new_white = state.white_set + next_turn = "B" if state.turn == "w" else "w" + new_state = GameState(new_white, new_black, next_turn) + if new_state not in visited: + visited.add(new_state) + predecessors[new_state] = (state, (state.turn, pos, neighbor)) + queue.append(new_state) + return None + + +class KnightSwapDataset(ProceduralDataset): + """Generates Knight Swap puzzles with configurable parameters.""" + + def __init__(self, config: KnightSwapConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + self.game_logic = KnightSwapLogic() + + def _format_board(self, board: Dict[str, List[str]], pieces: Dict[str, str]) -> str: + """Format the board state as a string.""" + positions = list(board.keys()) + if not positions: + return "" + + columns = sorted(set(pos[0] for pos in positions)) + rows = sorted(set(int(pos[1:]) for pos in positions), reverse=True) + + lines = [] + # Header + lines.append(" " + " ".join(columns)) + lines.append(" " + "----" * len(columns)) + + # Board rows + for row in rows: + line = f"{row} |" + for col in columns: + pos = col + str(row) + if pos in pieces: + piece = pieces[pos] if pieces[pos] is not None else "." 
+ line += f" {piece} |" + else: + line += " |" + lines.append(line) + lines.append(" " + "----" * len(columns)) + + return "\n".join(lines) + + def _format_moves(self, moves: List[Tuple[str, str, str]]) -> str: + """Format the solution moves as a string.""" + if not moves: + return "No" + return json.dumps([f"{color},{start},{end}" for color, start, end in moves]) + + def __getitem__(self, idx: int) -> Dict: + """Generate a single Knight Swap puzzle.""" + rng = Random(self.seed + idx) + + # Keep trying with new boards until we succeed + board_attempts = 0 + while board_attempts < self.config.max_attempts: + try: + # Generate a new board + num_nodes = rng.randint(self.config.min_nodes, self.config.max_nodes) + board = self.game_logic.generate_board(num_nodes, rng, max_attempts=self.config.max_attempts) + positions = list(board.keys()) + + # Decide if this should be an impossible puzzle + make_impossible = rng.random() < self.config.impossible_ratio + + # Try different piece placements on this board + for _ in range(50): # Reduced attempts per board since we try multiple boards + # Use fixed number of pieces for more reliable generation + num_pieces = self.config.min_pieces + white_positions = rng.sample(positions, num_pieces) + remaining = [p for p in positions if p not in white_positions] + black_positions = rng.sample(remaining, num_pieces) + + pieces = {pos: None for pos in positions} + for pos in white_positions: + pieces[pos] = "w" + for pos in black_positions: + pieces[pos] = "B" + + # For impossible puzzles, try a simpler approach: just remove some key connections + board_copy = {k: list(v) for k, v in board.items()} # Make a copy of the board + if make_impossible: + # Remove critical edges that would make the puzzle impossible + critical_edges = [] + for w_pos in white_positions: + for b_pos in black_positions: + if b_pos in board_copy[w_pos]: + critical_edges.append((w_pos, b_pos)) + + if critical_edges: # Only proceed if we found critical edges + # Remove 
a random critical edge + w_pos, b_pos = rng.choice(critical_edges) + board_copy[w_pos].remove(b_pos) + board_copy[b_pos].remove(w_pos) + + # Try both starting turns + for start_turn in ["w", "B"]: + solution = self.game_logic.solve_swap(board_copy, pieces, start_turn) + + # Accept solutions with more flexible length requirements + if (make_impossible and solution is None) or ( + not make_impossible + and solution is not None + and self.config.min_steps <= len(solution) <= self.config.max_steps + ): + board_str = self._format_board(board_copy, pieces) + solution_str = self._format_moves(solution) if solution else "No" + + # Generate board states for solvable puzzles + board_states = [] + if solution is not None: + current_pieces = dict(pieces) + board_states.append(dict(current_pieces)) # Initial state + + for color, start, end in solution: + current_pieces[end] = current_pieces[start] + current_pieces[start] = None + board_states.append(dict(current_pieces)) + + return { + "question": QUESTION_TEMPLATE.format(board=board_str, start_turn=start_turn), + "answer": solution_str, + "metadata": { + "board": board_copy, + "pieces": pieces, + "start_turn": start_turn, + "solution": solution, + "is_possible": solution is not None, + "num_steps": len(solution) if solution else 0, + "board_states": board_states if solution is not None else None, + }, + } + + except Exception: + pass # If board generation fails, we'll try again with a new board + + board_attempts += 1 + + raise ValueError(f"Failed to generate valid puzzle after trying {self.config.max_attempts} different boards") + + def score_answer(self, answer: Optional[str], entry: Dict) -> float: + """Score the user's solution for the Knight Swap puzzle. + + The answer should be either: + - "No" if the puzzle is impossible + - A JSON list of moves in format ["color,start,end", ...] 
where color is 'w' or 'B' + + Returns: + - 1.0 for correct answer (either "No" for impossible puzzles or valid solution of optimal length) + - A proportional score for correct but longer solutions + - 0.05 for valid moves that don't solve the puzzle + - 0.01 for invalid format + - 0.0 for None + """ + if answer is None: + return 0.0 + + answer = answer.strip() + if not answer: + return 0.01 + + # Handle impossible puzzles + if not entry["metadata"]["is_possible"]: + return 1.0 if answer.lower() == "no" else 0.01 + + # Handle "No" answer for possible puzzles + if answer.lower() == "no": + return 0.01 + + try: + # Parse moves from JSON list + move_list = json.loads(answer) + if not isinstance(move_list, list): + return 0.01 + + # Parse moves + moves = [] + for move_str in move_list: + color, start, end = move_str.split(",") + if color not in ("w", "B"): + return 0.01 + moves.append((color, start, end)) + + # Validate and apply moves + board = entry["metadata"]["board"] + pieces = dict(entry["metadata"]["pieces"]) + current_turn = entry["metadata"]["start_turn"] + + # Track board states after each move + board_states = [] + board_states.append(dict(pieces)) # Initial state + + for color, start, end in moves: + if color != current_turn: + return 0.01 + if start not in pieces or pieces[start] != color: + return 0.01 + if end not in board[start]: + return 0.01 + if end in pieces and pieces[end] is not None: + return 0.01 + + # Apply move + pieces[end] = pieces[start] + pieces[start] = None + current_turn = "B" if current_turn == "w" else "w" + + # Store board state after this move + board_states.append(dict(pieces)) + + # Check if solved + white_positions = {pos for pos, piece in pieces.items() if piece == "w"} + black_positions = {pos for pos, piece in pieces.items() if piece == "B"} + initial_white = {pos for pos, piece in entry["metadata"]["pieces"].items() if piece == "w"} + initial_black = {pos for pos, piece in entry["metadata"]["pieces"].items() if piece == "B"} + 
+ if white_positions == initial_black and black_positions == initial_white: + optimal_moves = len(entry["metadata"]["solution"]) + # Add board states to metadata if solution is valid + entry["metadata"]["board_states"] = board_states + if len(moves) <= optimal_moves: + return 1.0 + else: + return optimal_moves / len(moves) + return 0.05 + + except Exception: + return 0.01 + + +register_dataset("knight_swap", KnightSwapDataset, KnightSwapConfig) diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py index 7e33c236..e3adab5b 100644 --- a/reasoning_gym/games/tower_of_hanoi.py +++ b/reasoning_gym/games/tower_of_hanoi.py @@ -368,7 +368,7 @@ class HanoiDataset(ProceduralDataset): to_peg = int(match.group(3)) return disk, from_peg, to_peg - def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float: + def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float: """ Score the user's solution for the Tower of Hanoi puzzle. @@ -398,6 +398,7 @@ class HanoiDataset(ProceduralDataset): return 0.0 # Build the initial peg state from metadata. 
+ metadata = entry["metadata"] num_disks = metadata["num_disks"] num_pegs = metadata["num_pegs"] start_peg = metadata["start_peg"] diff --git a/tests/test_arc_agi.py b/tests/test_arc_agi.py new file mode 100644 index 00000000..da43e6ab --- /dev/null +++ b/tests/test_arc_agi.py @@ -0,0 +1,139 @@ +import pytest + +from reasoning_gym.arc.arc_agi import ArcAgiConfig, ArcAgiDataset + + +def test_arc_agi_config_validation(): + """Test validation of ArcAgi configuration parameters""" + with pytest.raises(AssertionError): + ArcAgiConfig(size=0).validate() + + with pytest.raises(AssertionError): + ArcAgiConfig(rotations=["invalid"]).validate() + + with pytest.raises(AssertionError): + ArcAgiConfig(mirrors=["invalid"]).validate() + + # Valid configs should not raise + config = ArcAgiConfig(size=10, seed=42) + config.validate() + + config = ArcAgiConfig(rotations=["90", "180"], mirrors=["horizontal", "diagonal"]) + config.validate() + + # Empty lists should be valid (no augmentations) + config = ArcAgiConfig(rotations=[], mirrors=[]) + config.validate() + + +def test_arc_agi_deterministic(): + """Test dataset reproducibility with fixed seed""" + config = ArcAgiConfig(seed=42, size=10) + ds1 = ArcAgiDataset(config) + ds2 = ArcAgiDataset(config) + + for i in range(len(ds1)): + assert ds1[i] == ds2[i], "ArcAgi datasets with same seed should match exactly" + + +def test_arc_agi_items(): + """Test basic structure and metadata of generated items""" + config = ArcAgiConfig(seed=42, size=10) + dataset = ArcAgiDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + meta = item["metadata"] + assert "input" in meta + assert "output" in meta + assert "task_id" in meta + + # Verify input/output are tuples of tuples (board format) + assert isinstance(meta["input"], tuple) + assert isinstance(meta["output"], tuple) + assert all(isinstance(row, tuple) for row in meta["input"]) + assert 
all(isinstance(row, tuple) for row in meta["output"]) + + # Verify task_id is a string + assert isinstance(meta["task_id"], str) + + +def test_arc_agi_augmentations(): + """Test that augmentations can be selectively enabled/disabled""" + # Test with all augmentations disabled + config = ArcAgiConfig(seed=42, size=10, rotations=[], mirrors=[], use_color_permutation=False) + base_dataset = ArcAgiDataset(config) + base_items = list(base_dataset) + + # Test with specific rotation only + rot_config = ArcAgiConfig(seed=42, size=10, rotations=["90"], mirrors=[], use_color_permutation=False) + rot_dataset = ArcAgiDataset(rot_config) + rot_items = list(rot_dataset) + + # Items should differ with rotation enabled + assert any( + base_items[i]["metadata"]["input"] != rot_items[i]["metadata"]["input"] for i in range(len(base_items)) + ), "90-degree rotation augmentation had no effect" + + # Test with specific mirror only + mirror_config = ArcAgiConfig(seed=42, size=10, rotations=[], mirrors=["horizontal"], use_color_permutation=False) + mirror_dataset = ArcAgiDataset(mirror_config) + mirror_items = list(mirror_dataset) + + # Items should differ with mirror enabled + assert any( + base_items[i]["metadata"]["input"] != mirror_items[i]["metadata"]["input"] for i in range(len(base_items)) + ), "Horizontal mirror augmentation had no effect" + + # Test with color permutation only + color_config = ArcAgiConfig(seed=42, size=10, rotations=[], mirrors=[], use_color_permutation=True) + color_dataset = ArcAgiDataset(color_config) + color_items = list(color_dataset) + + # Items should differ with color permutation enabled + assert any( + base_items[i]["metadata"]["input"] != color_items[i]["metadata"]["input"] for i in range(len(base_items)) + ), "Color permutation had no effect" + + +def test_arc_agi_scoring(): + """Test solution verification and scoring""" + config = ArcAgiConfig(size=10, seed=123) + dataset = ArcAgiDataset(config) + + for item in dataset: + # Test correct solution + 
assert dataset.score_answer(item["answer"], entry=item) == 1.0 + + # Test invalid format + assert dataset.score_answer("invalid grid format", entry=item) == 0.01 + + # Test None answer + assert dataset.score_answer(None, entry=item) == 0.0 + + # Test wrong but valid grid format + wrong_answer = "1 0 0 0\n0 0 0 1" + assert dataset.score_answer(wrong_answer, entry=item) == 0.05 + + +def test_arc_agi_dataset_modes(): + """Test dataset behavior with different train/eval configurations""" + # Test train-only mode + train_config = ArcAgiConfig(use_train=True, use_eval=False, size=10, seed=42) + train_ds = ArcAgiDataset(train_config) + assert len(train_ds._task_ids) > 0 + + # Test eval-only mode + eval_config = ArcAgiConfig(use_train=False, use_eval=True, size=10, seed=42) + eval_ds = ArcAgiDataset(eval_config) + assert len(eval_ds._task_ids) > 0 + + # Test both modes + both_config = ArcAgiConfig(use_train=True, use_eval=True, size=10, seed=42) + both_ds = ArcAgiDataset(both_config) + assert len(both_ds._task_ids) > len(train_ds._task_ids) + assert len(both_ds._task_ids) > len(eval_ds._task_ids) diff --git a/tests/test_complex_arithmetic.py b/tests/test_complex_arithmetic.py index 0d369fc1..7c300b52 100644 --- a/tests/test_complex_arithmetic.py +++ b/tests/test_complex_arithmetic.py @@ -52,30 +52,30 @@ def test_complex_arithmetic_scoring(): dataset = ComplexArithmeticDataset(config) # Test case with answer 3 + 2i - metadata = {"result": (3.0, 2.0)} + entry = {"metadata": {"result": (3.0, 2.0)}} # Test exact matches (should get score of 1.0) - assert dataset.score_answer("3 + 2i", metadata) == 1.0 - assert dataset.score_answer("3+2i", metadata) == 1.0 - assert dataset.score_answer("3.0 + 2.0i", metadata) == 1.0 + assert dataset.score_answer("3 + 2i", entry) == 1.0 + assert dataset.score_answer("3+2i", entry) == 1.0 + assert dataset.score_answer("3.0 + 2.0i", entry) == 1.0 # Test answers with small errors (should get high but < 1.0 scores) - print(dataset.score_answer("3.1 
+ 2i", metadata)) - assert 0.9 < dataset.score_answer("3.1 + 2i", metadata) < 1.0 - assert 0.9 < dataset.score_answer("3 + 2.1i", metadata) < 1.0 - assert 0.7 < dataset.score_answer("3.1 + 2.1i", metadata) < 0.95 + print(dataset.score_answer("3.1 + 2i", entry)) + assert 0.9 < dataset.score_answer("3.1 + 2i", entry) < 1.0 + assert 0.9 < dataset.score_answer("3 + 2.1i", entry) < 1.0 + assert 0.7 < dataset.score_answer("3.1 + 2.1i", entry) < 0.95 # Test answers with moderate errors (should get medium scores) - assert 0.3 < dataset.score_answer("4 + 2i", metadata) < 0.4 - assert 0.3 < dataset.score_answer("3 + 3i", metadata) < 0.4 + assert 0.3 < dataset.score_answer("4 + 2i", entry) < 0.4 + assert 0.3 < dataset.score_answer("3 + 3i", entry) < 0.4 # Test answers with large errors (should get very low scores) - assert dataset.score_answer("10 + 10i", metadata) < 0.01 + assert dataset.score_answer("10 + 10i", entry) < 0.01 # Test invalid answers (should get 0.0) - assert dataset.score_answer("invalid", metadata) == 0.0 - assert dataset.score_answer(None, metadata) == 0.0 - assert dataset.score_answer("inf + 2i", metadata) == 0.0 + assert dataset.score_answer("invalid", entry) == 0.0 + assert dataset.score_answer(None, entry) == 0.0 + assert dataset.score_answer("inf + 2i", entry) == 0.0 def test_complex_arithmetic_division_by_zero(): diff --git a/tests/test_countdown.py b/tests/test_countdown.py index e78a69ab..2bd20f4f 100644 --- a/tests/test_countdown.py +++ b/tests/test_countdown.py @@ -66,13 +66,13 @@ def test_countdown_game_items(): expr = item["metadata"]["expression"] # check score - assert dataset.score_answer(answer=expr, metadata=item["metadata"]) == 1.0 # correct answer - assert dataset.score_answer(answer="45+2", metadata=item["metadata"]) == 0.05 # wrong answer but an attempt + assert dataset.score_answer(answer=expr, entry=item) == 1.0 # correct answer + assert dataset.score_answer(answer="45+2", entry=item) == 0.05 # wrong answer but an attempt assert ( - 
dataset.score_answer(answer="a wrong solution", metadata=item["metadata"]) == 0.01 + dataset.score_answer(answer="a wrong solution", entry=item) == 0.01 ) # wrong answer but incorrectly formatted - assert dataset.score_answer(answer="", metadata=item["metadata"]) == 0.01 # wrong answer but empty string - assert dataset.score_answer(answer=None, metadata=item["metadata"]) == 0.0 # no answer + assert dataset.score_answer(answer="", entry=item) == 0.01 # wrong answer but empty string + assert dataset.score_answer(answer=None, entry=item) == 0.0 # no answer try: result = eval(expr) # Safe here since we control expression generation diff --git a/tests/test_intermediate_integration.py b/tests/test_intermediate_integration.py index df62ea76..f4393e2f 100644 --- a/tests/test_intermediate_integration.py +++ b/tests/test_intermediate_integration.py @@ -100,7 +100,7 @@ def test_verify_answer(): dataset = IntermediateIntegrationDataset(config) for i in range(len(dataset)): item = dataset[i] - score = dataset.score_answer(item["answer"], item["metadata"]) + score = dataset.score_answer(answer=item["answer"], entry=item) assert score == 1.0 @@ -140,5 +140,6 @@ def test_score_answer_cases(): ] for answer, metadata, expected in test_cases: - score = dataset.score_answer(answer, metadata) + dummy_entry = {"metadata": metadata} + score = dataset.score_answer(answer, entry=dummy_entry) assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}" diff --git a/tests/test_knight_swap.py b/tests/test_knight_swap.py new file mode 100644 index 00000000..74b606d5 --- /dev/null +++ b/tests/test_knight_swap.py @@ -0,0 +1,162 @@ +import pytest + +from reasoning_gym.games.knight_swap import KnightSwapConfig, KnightSwapDataset, KnightSwapLogic + + +def test_default_config_validation(): + """Test that default configuration is valid""" + config = KnightSwapConfig() + config.validate() # Should not raise any exceptions + + +def test_invalid_config(): + """Test that invalid 
configurations raise appropriate errors""" + with pytest.raises(AssertionError): + config = KnightSwapConfig(min_nodes=4) # Too few nodes + config.validate() + + with pytest.raises(AssertionError): + config = KnightSwapConfig(max_nodes=5, min_nodes=6) # max < min + config.validate() + + +def test_board_connectivity(): + """Test that generated boards are connected""" + config = KnightSwapConfig(min_nodes=6, max_nodes=6) + dataset = KnightSwapDataset(config) + attempts = 10 + # Try multiple puzzles since generation is random + found_connected = False + for i in range(attempts): + board = dataset[i]["metadata"]["board"] + if KnightSwapLogic.is_connected(board): + found_connected = True + break + # Print debug info for failing boards + print(f"\nBoard {i} not connected:") + print(f"Nodes: {list(board.keys())}") + print(f"Edges: {board}") + + assert found_connected, f"Could not find a connected board after {attempts} attempts" + + +def test_known_connected_board(): + """Test connectivity check with a known connected board""" + # Create a simple connected board with valid knight moves + board = { + "A1": ["B3", "C2"], + "B3": ["A1", "C1"], + "C1": ["B3", "A2"], + "A2": ["C1", "B4"], + "B4": ["A2", "C2"], + "C2": ["A1", "B4"], + } + assert KnightSwapLogic.is_connected(board), "Known connected board should be identified as connected" + + +def test_valid_knight_moves(): + """Test that all edges in generated board represent valid knight moves""" + config = KnightSwapConfig(min_nodes=6, max_nodes=6) + dataset = KnightSwapDataset(config) + + board = dataset[0]["metadata"]["board"] + for start, neighbors in board.items(): + for end in neighbors: + assert KnightSwapLogic.is_knight_move(start, end) + + +def test_knight_move_validation(): + """Test basic knight move validation""" + assert KnightSwapLogic.is_knight_move("A1", "B3") # Valid move + assert KnightSwapLogic.is_knight_move("B3", "A1") # Valid move reverse + assert not KnightSwapLogic.is_knight_move("A1", "A2") # Invalid 
move + assert not KnightSwapLogic.is_knight_move("A1", "B2") # Invalid move + + +def test_simple_solvable_puzzle(): + """Test a minimal solvable puzzle with one piece each""" + config = KnightSwapConfig(min_nodes=6, max_nodes=6, min_pieces=1, max_pieces=1, impossible_ratio=0.0) + dataset = KnightSwapDataset(config) + + # Try to find a solvable puzzle + for i in range(5): # Try a few times since generation is random + puzzle = dataset[i] + if puzzle["metadata"]["is_possible"]: + assert puzzle["answer"] != "No" + assert isinstance(eval(puzzle["answer"]), list) + return + + pytest.fail("Could not find a solvable puzzle") + + +def test_impossible_puzzle(): + """Test that impossible puzzles are correctly identified""" + config = KnightSwapConfig(min_nodes=6, max_nodes=6, min_pieces=2, max_pieces=2, impossible_ratio=1.0) + dataset = KnightSwapDataset(config) + + puzzle = dataset[0] + assert puzzle["metadata"]["is_possible"] is False + assert puzzle["answer"] == "No" + + +def test_alternating_turns(): + """Test that solutions follow alternating turns rule""" + config = KnightSwapConfig(impossible_ratio=0.0) + dataset = KnightSwapDataset(config) + + # Find a solvable puzzle + for i in range(5): + puzzle = dataset[i] + if puzzle["metadata"]["is_possible"]: + moves = eval(puzzle["answer"]) + current_turn = puzzle["metadata"]["start_turn"] + for move in moves: + color = move.split(",")[0] + assert color == current_turn + current_turn = "B" if current_turn == "w" else "w" + return + + pytest.fail("Could not find a solvable puzzle") + + +def test_solution_validation(): + """Test that solutions reach the target state""" + config = KnightSwapConfig(impossible_ratio=0.0) + dataset = KnightSwapDataset(config) + + # Find a solvable puzzle + for i in range(5): + puzzle = dataset[i] + if puzzle["metadata"]["is_possible"]: + # Get initial positions + initial_white = {pos for pos, piece in puzzle["metadata"]["pieces"].items() if piece == "w"} + initial_black = {pos for pos, piece in 
puzzle["metadata"]["pieces"].items() if piece == "B"} + + # Get final positions from board states + final_state = puzzle["metadata"]["board_states"][-1] + final_white = {pos for pos, piece in final_state.items() if piece == "w"} + final_black = {pos for pos, piece in final_state.items() if piece == "B"} + + # Check that positions are swapped + assert final_white == initial_black + assert final_black == initial_white + return + + pytest.fail("Could not find a solvable puzzle") + + +def test_score_calculation(): + """Test scoring for different answer types""" + config = KnightSwapConfig() + dataset = KnightSwapDataset(config) + + # Get a sample puzzle + puzzle = dataset[0] + + # Test invalid answers + assert dataset.score_answer(None, puzzle) == 0.0 + assert dataset.score_answer("", puzzle) == 0.01 + assert dataset.score_answer("Invalid", puzzle) == 0.01 + + # Test correct answer + assert dataset.score_answer(puzzle["answer"], puzzle) == 1.0 diff --git a/tests/test_palindrome.py b/tests/test_palindrome.py index e2844267..49e23786 100644 --- a/tests/test_palindrome.py +++ b/tests/test_palindrome.py @@ -72,21 +72,20 @@ def test_score_answer(): for item in dataset: correct_answer = item["answer"] - metadata = item["metadata"] # Correct answer should score 1.0 - assert dataset.score_answer(correct_answer, metadata) == 1.0 + assert dataset.score_answer(correct_answer, entry=item) == 1.0 # Incorrect answer (palindrome, but not correct one) should score 0.05 pal_letters = "racecar" if "racecar" != correct_answer else "aba" - assert dataset.score_answer(pal_letters, metadata) == 0.05 + assert dataset.score_answer(pal_letters, entry=item) == 0.05 # Incorrect answer (not palindrome) should score 0.02 wrong_letters = "abcd" if "abcd" != correct_answer else "efgh" - assert dataset.score_answer(wrong_letters, metadata) == 0.02 + assert dataset.score_answer(wrong_letters, entry=item) == 0.02 # Empty String input should score 0.01 - assert dataset.score_answer("", metadata) == 0.01 
+ assert dataset.score_answer("", entry=item) == 0.01 # Empty input should score 0.0 - assert dataset.score_answer(None, metadata) == 0.0 + assert dataset.score_answer(None, entry=item) == 0.0 diff --git a/tests/test_polynomial_multiplication.py b/tests/test_polynomial_multiplication.py index a27bd6bf..7f357173 100644 --- a/tests/test_polynomial_multiplication.py +++ b/tests/test_polynomial_multiplication.py @@ -137,10 +137,10 @@ def test_score_function(): seed=42, ) - assert ds.score_answer(None, ds[0]["metadata"]) == 0.00 - assert ds.score_answer("6*x**4 + 9*x**3 - 6*x**2 - 39*x - 45", ds[0]["metadata"]) == 1 - assert ds.score_answer("Not a polynomial", ds[0]["metadata"]) == 0.01 - assert ds.score_answer("x**4", ds[0]["metadata"]) == 0.05 + assert ds.score_answer(None, ds[0]) == 0.00 + assert ds.score_answer("6*x**4 + 9*x**3 - 6*x**2 - 39*x - 45", ds[0]) == 1 + assert ds.score_answer("Not a polynomial", ds[0]) == 0.01 + assert ds.score_answer("x**4", ds[0]) == 0.05 def test_multivariate_score_function(): @@ -160,7 +160,7 @@ def test_multivariate_score_function(): seed=42, ) - assert ds.score_answer(None, ds[0]["metadata"]) == 0.00 - assert ds.score_answer("-27*a**3*c - 27*a**3 + 144*a*c + 144*a", ds[0]["metadata"]) == 1 - assert ds.score_answer("Not a polynomial", ds[0]["metadata"]) == 0.01 - assert ds.score_answer("x**4", ds[0]["metadata"]) == 0.05 + assert ds.score_answer(None, ds[0]) == 0.00 + assert ds.score_answer("-27*a**3*c - 27*a**3 + 144*a*c + 144*a", ds[0]) == 1 + assert ds.score_answer("Not a polynomial", ds[0]) == 0.01 + assert ds.score_answer("x**4", ds[0]) == 0.05 diff --git a/tests/test_rearc.py b/tests/test_rearc.py index aa43e64d..17de2241 100644 --- a/tests/test_rearc.py +++ b/tests/test_rearc.py @@ -54,7 +54,7 @@ def test_rearc_solution_validation(): for item in dataset: # Test correct solution correct = format_board(item["metadata"]["output"], dataset.board_format_opts) - assert dataset.score_answer(correct, item["metadata"]) == 1.0 + assert 
dataset.score_answer(correct, entry=item) == 1.0 # Test invalid format invalid_grid = """ @@ -63,10 +63,10 @@ def test_rearc_solution_validation(): 7 8 7 0 0 0 """ - assert dataset.score_answer(invalid_grid, item["metadata"]) == 0.05 + assert dataset.score_answer(invalid_grid, entry=item) == 0.05 # Test empty answer - assert dataset.score_answer(None, item["metadata"]) == 0.0 + assert dataset.score_answer(None, entry=item) == 0.0 def test_rearc_scoring_edge_cases(): @@ -77,11 +77,11 @@ def test_rearc_scoring_edge_cases(): for item in dataset: # Partial match partial = format_board([[0, 0], [0, 0]], dataset.board_format_opts) - assert 0.0 < dataset.score_answer(partial, item["metadata"]) < 1.0 + assert 0.0 < dataset.score_answer(partial, entry=item) < 1.0 # Malformed answer - assert dataset.score_answer("[[invalid", item["metadata"]) == 0.01 + assert dataset.score_answer("[[invalid", entry=item) == 0.01 # Case sensitivity answer = format_board(item["metadata"]["output"], dataset.board_format_opts).lower() - assert dataset.score_answer(answer, item["metadata"]) == 1.0 + assert dataset.score_answer(answer, entry=item) == 1.0 diff --git a/tests/test_simple_integration.py b/tests/test_simple_integration.py index 0de8ab36..726b14be 100644 --- a/tests/test_simple_integration.py +++ b/tests/test_simple_integration.py @@ -73,7 +73,7 @@ def test_verify_answer(): dataset = SimpleIntegrationDataset(config) for i in range(len(dataset)): item = dataset[i] - score = dataset.score_answer(item["answer"], item["metadata"]) + score = dataset.score_answer(item["answer"], item) assert score == 1.0 @@ -113,5 +113,6 @@ def test_score_answer_cases(): ] for answer, metadata, expected in test_cases: - score = dataset.score_answer(answer, metadata) + dummy_entry = {"metadata": metadata} + score = dataset.score_answer(answer=answer, entry=dummy_entry) assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}" diff --git a/tests/test_tower_of_hanoi.py 
b/tests/test_tower_of_hanoi.py index 9bf83dca..2d870078 100644 --- a/tests/test_tower_of_hanoi.py +++ b/tests/test_tower_of_hanoi.py @@ -245,27 +245,26 @@ def test_score_answer(): dataset = HanoiDataset(config) # Pick one instance from the dataset for testing. item = dataset[0] - metadata = item["metadata"] correct_answer = item["answer"] # 1. Correct answer should yield full reward. - score_correct = dataset.score_answer(answer=correct_answer, metadata=metadata) + score_correct = dataset.score_answer(answer=correct_answer, entry=item) assert score_correct == 1.0, f"Correct answer score {score_correct} is not 1.0." # 2. A badly formatted answer should yield minimal reward (0.01). - score_bad_format = dataset.score_answer(answer="a wrong solution", metadata=metadata) + score_bad_format = dataset.score_answer(answer="a wrong solution", entry=item) assert score_bad_format == 0.01, f"Badly formatted answer score {score_bad_format} is not 0.01." # 3. An answer that is validly formatted but unsolved. # For example, remove the last move from the correct answer. unfinished_answer = correct_answer[:-1] - score_unsolved = dataset.score_answer(answer=unfinished_answer, metadata=metadata) + score_unsolved = dataset.score_answer(answer=unfinished_answer, entry=item) assert score_unsolved == 0.05, f"Unsolved answer score {score_unsolved} is not 0.05." # 4. An empty answer should yield 0.01. - score_empty = dataset.score_answer(answer="", metadata=metadata) + score_empty = dataset.score_answer(answer="", entry=item) assert score_empty == 0.01, f"Empty answer score {score_empty} is not 0.01." # 5. A None answer should yield 0.0. - score_none = dataset.score_answer(answer=None, metadata=metadata) + score_none = dataset.score_answer(answer=None, entry=item) assert score_none == 0.0, f"None answer score {score_none} is not 0.0."