mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Merge branch 'main' of https://github.com/rishabhranawat/reasoning-gym into poly-reward
This commit is contained in:
commit
adfcf52bca
26 changed files with 1561 additions and 99 deletions
577
GALLERY.md
577
GALLERY.md
|
|
@ -5,6 +5,7 @@ This gallery shows examples from all available datasets using their default conf
|
|||
- [advanced_geometry](#advanced_geometry)
|
||||
- [aiw](#aiw)
|
||||
- [arc_1d](#arc_1d)
|
||||
- [arc_agi](#arc_agi)
|
||||
- [base_conversion](#base_conversion)
|
||||
- [basic_arithmetic](#basic_arithmetic)
|
||||
- [bf](#bf)
|
||||
|
|
@ -24,6 +25,7 @@ This gallery shows examples from all available datasets using their default conf
|
|||
- [gsm_symbolic](#gsm_symbolic)
|
||||
- [intermediate_integration](#intermediate_integration)
|
||||
- [isomorphic_strings](#isomorphic_strings)
|
||||
- [knight_swap](#knight_swap)
|
||||
- [largest_island](#largest_island)
|
||||
- [lcm](#lcm)
|
||||
- [leg_counting](#leg_counting)
|
||||
|
|
@ -230,6 +232,421 @@ Metadata: {'task_name': 'two_points_and_fill_inv', 'size': 26, 'train_examples':
|
|||
|
||||
````
|
||||
|
||||
### arc_agi
|
||||
Default configuration:
|
||||
```python
|
||||
use_train = True
|
||||
use_eval = True
|
||||
board_format_opts = BoardFormattingOptions(alphabet=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], col_delimiter=' ', row_delimiter='\n', array_brackets=False)
|
||||
rotations = ['90', '180', '270']
|
||||
mirrors = ['horizontal', 'vertical', 'diagonal', 'counterdiagonal']
|
||||
use_color_permutation = True
|
||||
seed = 42
|
||||
size = 500
|
||||
```
|
||||
|
||||
Example tasks:
|
||||
````
|
||||
Example 1:
|
||||
Question: Find the common rule that maps an input grid to an output grid, given the examples below.
|
||||
|
||||
Example 1:
|
||||
|
||||
Input:
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 6 3 6 7 7 7 7 7 7 7 7 7 7
|
||||
7 6 6 3 7 6 6 6 7 7 6 3 7 7
|
||||
7 7 7 7 7 6 3 6 7 7 6 6 7 7
|
||||
7 7 7 7 7 6 6 3 7 7 7 7 7 7
|
||||
7 7 7 7 7 3 6 6 7 7 7 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 3 6
|
||||
7 6 6 3 7 7 7 7 7 7 7 6 6 6
|
||||
7 3 6 6 7 7 7 7 7 7 7 7 7 7
|
||||
7 6 6 6 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 3 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
Output:
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 6 3 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 6 6 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 3 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 3 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 6 6 6 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
|
||||
Example 2:
|
||||
|
||||
Input:
|
||||
7 7 7 7 7 6 3 6 7 7 7 6 6 7
|
||||
7 7 7 7 7 6 6 6 7 7 7 6 6 7
|
||||
6 6 6 6 7 6 6 6 7 7 7 6 6 7
|
||||
6 3 6 6 7 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 6 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 6 6 3 6 7
|
||||
7 7 7 7 7 7 7 7 7 6 3 6 6 7
|
||||
7 7 7 6 6 6 6 7 7 6 6 6 3 7
|
||||
7 7 7 6 6 3 6 7 7 7 7 7 7 7
|
||||
7 7 7 6 3 6 6 7 7 7 7 7 7 7
|
||||
7 7 7 6 6 6 6 7 7 7 6 3 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 6 6 6 3
|
||||
7 7 7 7 7 7 7 7 7 7 6 3 3 6
|
||||
7 7 7 7 7 7 7 7 7 7 6 6 6 6
|
||||
Output:
|
||||
7 7 7 7 7 6 3 6 7 7 7 6 6 7
|
||||
7 7 7 7 7 6 6 6 7 7 7 6 6 7
|
||||
6 6 6 6 7 6 6 6 7 7 7 6 6 7
|
||||
6 3 6 6 7 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 6 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
|
||||
Example 3:
|
||||
|
||||
Input:
|
||||
7 7 7 7 7 6 6 6 6 7 7 3 6 7 7
|
||||
6 6 6 6 7 3 6 6 3 7 7 6 3 7 7
|
||||
6 3 6 6 7 6 6 6 6 7 7 7 7 7 7
|
||||
6 6 6 6 7 6 6 3 6 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 3 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 6 6 3 6 6 7 7 7 7 7 7 7 7
|
||||
7 7 6 6 6 3 6 7 7 7 7 7 7 7 7
|
||||
7 7 6 3 6 6 6 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 6 6 3 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 6 3 6 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 6 6 6 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 3 6 3 7 7 7
|
||||
Output:
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 6 7 7 7 7 7 7 7 7 7 7 7
|
||||
6 3 6 6 7 7 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 6 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 3 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 7 7 7 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 7 7 7 7 7 7
|
||||
7 7 6 6 6 6 7 7 7 7 7 7 7 7 7
|
||||
|
||||
|
||||
Below is a test input grid. Predict the corresponding output grid by applying the rule you found.
|
||||
Your final answer should just be the text output grid itself.
|
||||
|
||||
Input:
|
||||
7 7 7 7 7 7 7 7 6 3 6 6
|
||||
6 6 6 7 7 7 7 7 6 6 6 6
|
||||
3 6 6 7 7 7 7 7 6 3 6 3
|
||||
6 6 6 7 3 6 6 7 7 7 7 7
|
||||
7 7 7 7 6 6 6 7 7 7 7 7
|
||||
7 7 7 7 6 6 3 7 7 7 7 7
|
||||
7 7 7 7 6 6 6 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 6 6 3 6
|
||||
7 6 6 6 6 6 6 7 6 6 6 6
|
||||
7 6 6 6 6 3 6 7 6 6 6 6
|
||||
7 6 3 6 6 6 6 7 7 7 7 7
|
||||
7 6 6 6 6 6 6 7 6 6 6 7
|
||||
7 7 7 7 7 7 7 7 6 6 6 7
|
||||
|
||||
Answer: 7 7 7 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 7 7 7 7 7 7 7 7 7
|
||||
3 6 6 7 7 7 7 7 7 7 7 7
|
||||
6 6 6 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 6 6 3 6
|
||||
7 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 6 6 6 6
|
||||
7 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 6 6 6 7
|
||||
7 7 7 7 7 7 7 7 6 6 6 7
|
||||
Metadata: {'input': ((7, 7, 7, 7, 7, 7, 7, 7, 6, 3, 6, 6), (6, 6, 6, 7, 7, 7, 7, 7, 6, 6, 6, 6), (3, 6, 6, 7, 7, 7, 7, 7, 6, 3, 6, 3), (6, 6, 6, 7, 3, 6, 6, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 6, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 3, 7, 7, 7, 7, 7), (7, 7, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 3, 6), (7, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6), (7, 6, 6, 6, 6, 3, 6, 7, 6, 6, 6, 6), (7, 6, 3, 6, 6, 6, 6, 7, 7, 7, 7, 7), (7, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7)), 'output': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7), (3, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7), (6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 3, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7), (7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7)), 'task_id': 'a934301b'}
|
||||
|
||||
Example 2:
|
||||
Question: Find the common rule that maps an input grid to an output grid, given the examples below.
|
||||
|
||||
Example 1:
|
||||
|
||||
Input:
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 0 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 0 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 0 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
Output:
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 2 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 9 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 9 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
2 8 8 8 8 8 8 8 8 9
|
||||
|
||||
Example 2:
|
||||
|
||||
Input:
|
||||
6 6 6 6 6 6 6 6 6 6
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 0 8 8 8 8 8 0 8
|
||||
8 8 8 8 8 8 0 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 0 8 8 8 8
|
||||
8 0 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
1 1 1 1 1 1 1 1 1 1
|
||||
Output:
|
||||
6 6 6 6 6 6 6 6 6 6
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 6 8 8 8 8 8 6 8
|
||||
8 8 8 8 8 8 6 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 1 8 8 8 8
|
||||
8 1 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
1 1 1 1 1 1 1 1 1 1
|
||||
|
||||
Example 3:
|
||||
|
||||
Input:
|
||||
5 5 5 5 5 5 5 5 5 5
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 0 8 8 8 8
|
||||
8 8 0 8 8 8 8 8 0 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 0 8 8 8 8 0 8
|
||||
8 8 8 8 8 8 0 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
7 7 7 7 7 7 7 7 7 7
|
||||
Output:
|
||||
5 5 5 5 5 5 5 5 5 5
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 5 8 8 8 8
|
||||
8 8 5 8 8 8 8 8 5 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
8 8 8 7 8 8 8 8 7 8
|
||||
8 8 8 8 8 8 7 8 8 8
|
||||
8 8 8 8 8 8 8 8 8 8
|
||||
7 7 7 7 7 7 7 7 7 7
|
||||
|
||||
|
||||
Below is a test input grid. Predict the corresponding output grid by applying the rule you found.
|
||||
Your final answer should just be the text output grid itself.
|
||||
|
||||
Input:
|
||||
6 8 8 8 8 8 8 8 0 4
|
||||
6 0 8 8 0 8 8 8 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
6 8 8 8 8 8 0 8 8 4
|
||||
6 8 8 0 8 8 8 8 8 4
|
||||
6 8 8 8 8 8 0 8 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
6 8 8 8 8 0 8 8 8 4
|
||||
6 8 8 0 8 8 8 0 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
|
||||
Answer: 6 8 8 8 8 8 8 8 4 4
|
||||
6 6 8 8 6 8 8 8 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
6 8 8 8 8 8 4 8 8 4
|
||||
6 8 8 6 8 8 8 8 8 4
|
||||
6 8 8 8 8 8 4 8 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
6 8 8 8 8 4 8 8 8 4
|
||||
6 8 8 6 8 8 8 4 8 4
|
||||
6 8 8 8 8 8 8 8 8 4
|
||||
Metadata: {'input': ((6, 8, 8, 8, 8, 8, 8, 8, 0, 4), (6, 0, 8, 8, 0, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 0, 8, 8, 4), (6, 8, 8, 0, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 0, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 0, 8, 8, 8, 4), (6, 8, 8, 0, 8, 8, 8, 0, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4)), 'output': ((6, 8, 8, 8, 8, 8, 8, 8, 4, 4), (6, 6, 8, 8, 6, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 4, 8, 8, 4), (6, 8, 8, 6, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 8, 4, 8, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4), (6, 8, 8, 8, 8, 4, 8, 8, 8, 4), (6, 8, 8, 6, 8, 8, 8, 4, 8, 4), (6, 8, 8, 8, 8, 8, 8, 8, 8, 4)), 'task_id': '2204b7a8'}
|
||||
|
||||
Example 3:
|
||||
Question: Find the common rule that maps an input grid to an output grid, given the examples below.
|
||||
|
||||
Example 1:
|
||||
|
||||
Input:
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5
|
||||
5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 5 5 5 5 5
|
||||
5 5 8 8 8 8 5 5 5 5 5 8 8 8 8 5 5 5 5 5
|
||||
2 5 8 8 8 8 5 5 5 5 5 8 8 8 8 5 5 5 5 2
|
||||
5 5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 8 8 8 8 8 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
Output:
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 8 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 8 5
|
||||
5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 8 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 2 2 2 2 5 5 5 5 5
|
||||
5 5 2 2 2 2 5 2 5 5 5 2 2 2 2 5 5 5 5 5
|
||||
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
|
||||
5 5 2 2 2 2 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 2 2 2 2 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
|
||||
Example 2:
|
||||
|
||||
Input:
|
||||
5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 5 5 5 5 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 8 8 8 8 8 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 8 8 8 8 8 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 8 8 8 8 8 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
Output:
|
||||
5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 5 5 2 5 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 5 5 5 2 5 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 2 2 2 2 2 5 8 8 8 5 5 5 5 8 8 8 8
|
||||
5 5 5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 2 2 2 2 2 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 2 2 2 2 2 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
|
||||
Example 3:
|
||||
|
||||
Input:
|
||||
5 8 8 8 8 8 5 2 5 5 5 5 5 5
|
||||
5 8 8 8 8 8 5 5 5 5 5 8 8 8
|
||||
5 5 5 5 5 5 5 5 5 5 5 8 8 8
|
||||
5 5 5 5 8 8 8 8 8 8 5 8 8 8
|
||||
5 5 5 5 8 8 8 8 8 8 5 8 8 8
|
||||
5 5 5 5 8 8 8 8 8 8 5 8 8 8
|
||||
8 8 5 5 8 8 8 8 8 8 5 5 5 5
|
||||
8 8 5 5 8 8 8 8 8 8 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 5 5 8 8 8 5 5 5 5
|
||||
2 5 8 8 8 5 5 8 8 8 5 5 5 2
|
||||
5 5 8 8 8 5 5 5 5 5 5 5 5 5
|
||||
5 5 8 8 8 5 5 2 5 5 5 5 5 5
|
||||
Output:
|
||||
5 8 8 8 8 8 5 2 5 5 5 5 5 5
|
||||
5 8 8 8 8 8 5 2 5 5 5 8 8 8
|
||||
5 5 5 5 5 5 5 2 5 5 5 8 8 8
|
||||
5 5 5 5 2 2 2 2 2 2 5 8 8 8
|
||||
5 5 5 5 2 2 2 2 2 2 5 8 8 8
|
||||
5 5 5 5 2 2 2 2 2 2 5 8 8 8
|
||||
8 8 5 5 2 2 2 2 2 2 5 5 5 5
|
||||
8 8 5 5 2 2 2 2 2 2 5 5 5 5
|
||||
5 5 5 5 5 5 5 2 5 5 5 5 5 5
|
||||
5 5 2 2 2 5 5 2 2 2 5 5 5 5
|
||||
2 2 2 2 2 2 2 2 2 2 2 2 2 2
|
||||
5 5 2 2 2 5 5 2 5 5 5 5 5 5
|
||||
5 5 2 2 2 5 5 2 5 5 5 5 5 5
|
||||
|
||||
|
||||
Below is a test input grid. Predict the corresponding output grid by applying the rule you found.
|
||||
Your final answer should just be the text output grid itself.
|
||||
|
||||
Input:
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5
|
||||
5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 8 8 5 5 8 8 8 5
|
||||
5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 8 8 8 5
|
||||
5 5 5 5 5 8 8 8 8 8 8 8 8 8 5 5 5 5 5 5 5 8 8 8 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 8 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
2 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 2
|
||||
5 8 8 8 8 8 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 8 8 5
|
||||
5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 8 8 5
|
||||
5 5 5 5 8 8 8 5 5 5 5 5 5 8 8 8 8 8 5 5 5 5 5 5 5
|
||||
2 5 5 5 8 8 8 5 5 5 5 5 5 8 8 8 8 8 5 8 8 8 8 5 2
|
||||
5 5 5 5 8 8 8 5 5 8 8 8 5 8 8 8 8 8 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 5 5 5 5 8 8 8 5 5 5 5 5 5 5 8 8 8 8 5 5
|
||||
5 5 5 5 5 2 5 5 5 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
|
||||
Answer: 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 8 8 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 8 8 5 5 8 8 8 5
|
||||
5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 8 8 8 5
|
||||
5 5 5 5 5 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5 8 8 8 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 2 2 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
|
||||
5 2 2 2 2 2 5 5 5 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 5 5 5 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 8 8 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 8 8 5
|
||||
5 5 5 5 5 2 5 5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 8 8 5
|
||||
5 5 5 5 2 2 2 5 5 5 5 5 5 2 2 2 2 2 5 5 5 5 5 5 5
|
||||
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
|
||||
5 5 5 5 2 2 2 5 5 2 2 2 5 2 2 2 2 2 5 2 2 2 2 5 5
|
||||
5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 2 2 2 2 5 5
|
||||
5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 2 2 2 2 5 5
|
||||
5 5 5 5 5 2 5 5 5 2 2 2 5 5 5 5 5 5 5 5 5 5 5 5 5
|
||||
Metadata: {'input': ((5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (2, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2), (5, 8, 8, 8, 8, 8, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5), (2, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 5, 2), (5, 5, 5, 5, 8, 8, 8, 5, 5, 8, 8, 8, 5, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 5, 5), (5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), 'output': ((5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 8, 8, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 8, 8, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), (5, 2, 2, 2, 2, 2, 5, 5, 5, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 8, 8, 5), (5, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5), (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), (5, 5, 5, 5, 2, 2, 2, 5, 5, 2, 2, 2, 5, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 5, 5), (5, 5, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), 'task_id': '0d87d2a6'}
|
||||
|
||||
````
|
||||
|
||||
### base_conversion
|
||||
Generates base conversion tasks
|
||||
|
||||
|
|
@ -1171,6 +1588,144 @@ Metadata: {'words': ['hogtytyof', 'kgqwfwfgh'], 'solution': True, 'solvable': Tr
|
|||
|
||||
````
|
||||
|
||||
### knight_swap
|
||||
Generates Knight Swap puzzles with configurable parameters.
|
||||
|
||||
Default configuration:
|
||||
```python
|
||||
min_nodes = 6
|
||||
max_nodes = 9
|
||||
min_pieces = 2
|
||||
max_pieces = 2
|
||||
min_steps = 4
|
||||
max_steps = 20
|
||||
max_attempts = 100
|
||||
seed = 42
|
||||
size = 5
|
||||
impossible_ratio = 0.2
|
||||
```
|
||||
|
||||
Example tasks:
|
||||
````
|
||||
Example 1:
|
||||
Question: Knight Swap Challenge:
|
||||
|
||||
```
|
||||
A B C D
|
||||
----------------
|
||||
3 | | . | | . |
|
||||
----------------
|
||||
2 | B | w | | |
|
||||
----------------
|
||||
1 | | | B | w |
|
||||
----------------
|
||||
```
|
||||
|
||||
Legend:
|
||||
- 'w' = White Knight
|
||||
- 'B' = Black Knight
|
||||
- Empty squares are marked with '.'
|
||||
|
||||
Objective:
|
||||
Swap the positions of all white knights with all black knights through valid moves.
|
||||
|
||||
Rules:
|
||||
1. Knights move in L-shape (2 squares + 1 square perpendicular)
|
||||
2. Knights can only move to empty squares
|
||||
3. w moves first, then players alternate
|
||||
4. All knights must reach their target positions (white ↔ black)
|
||||
|
||||
Question:
|
||||
Is it possible to swap all knights' positions? If yes, list the moves.
|
||||
|
||||
Answer Format:
|
||||
- For impossible puzzles: "No"
|
||||
- For possible puzzles: List moves as ["color,from,to", ...]
|
||||
Example: ["w,A1,B3"] means white knight moves A1→B3
|
||||
|
||||
Answer: No
|
||||
Metadata: {'board': {'C1': ['A2', 'B3', 'D3'], 'A2': ['C1'], 'B3': ['C1'], 'D1': ['B2'], 'B2': ['D1', 'D3'], 'D3': ['B2', 'C1']}, 'pieces': {'C1': 'B', 'A2': 'B', 'B3': None, 'D1': 'w', 'B2': 'w', 'D3': None}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None}
|
||||
|
||||
Example 2:
|
||||
Question: Knight Swap Challenge:
|
||||
|
||||
```
|
||||
A B C D
|
||||
----------------
|
||||
3 | | w | . | |
|
||||
----------------
|
||||
2 | w | | | B |
|
||||
----------------
|
||||
1 | | | . | B |
|
||||
----------------
|
||||
```
|
||||
|
||||
Legend:
|
||||
- 'w' = White Knight
|
||||
- 'B' = Black Knight
|
||||
- Empty squares are marked with '.'
|
||||
|
||||
Objective:
|
||||
Swap the positions of all white knights with all black knights through valid moves.
|
||||
|
||||
Rules:
|
||||
1. Knights move in L-shape (2 squares + 1 square perpendicular)
|
||||
2. Knights can only move to empty squares
|
||||
3. w moves first, then players alternate
|
||||
4. All knights must reach their target positions (white ↔ black)
|
||||
|
||||
Question:
|
||||
Is it possible to swap all knights' positions? If yes, list the moves.
|
||||
|
||||
Answer Format:
|
||||
- For impossible puzzles: "No"
|
||||
- For possible puzzles: List moves as ["color,from,to", ...]
|
||||
Example: ["w,A1,B3"] means white knight moves A1→B3
|
||||
|
||||
Answer: No
|
||||
Metadata: {'board': {'B3': ['C1'], 'D1': ['C3'], 'C3': ['A2', 'D1'], 'C1': ['A2', 'B3'], 'D2': [], 'A2': ['C1', 'C3']}, 'pieces': {'B3': 'w', 'D1': 'B', 'C3': None, 'C1': None, 'D2': 'B', 'A2': 'w'}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None}
|
||||
|
||||
Example 3:
|
||||
Question: Knight Swap Challenge:
|
||||
|
||||
```
|
||||
A B C
|
||||
------------
|
||||
3 | . | | B |
|
||||
------------
|
||||
2 | w | | . |
|
||||
------------
|
||||
1 | | w | B |
|
||||
------------
|
||||
```
|
||||
|
||||
Legend:
|
||||
- 'w' = White Knight
|
||||
- 'B' = Black Knight
|
||||
- Empty squares are marked with '.'
|
||||
|
||||
Objective:
|
||||
Swap the positions of all white knights with all black knights through valid moves.
|
||||
|
||||
Rules:
|
||||
1. Knights move in L-shape (2 squares + 1 square perpendicular)
|
||||
2. Knights can only move to empty squares
|
||||
3. w moves first, then players alternate
|
||||
4. All knights must reach their target positions (white ↔ black)
|
||||
|
||||
Question:
|
||||
Is it possible to swap all knights' positions? If yes, list the moves.
|
||||
|
||||
Answer Format:
|
||||
- For impossible puzzles: "No"
|
||||
- For possible puzzles: List moves as ["color,from,to", ...]
|
||||
Example: ["w,A1,B3"] means white knight moves A1→B3
|
||||
|
||||
Answer: No
|
||||
Metadata: {'board': {'B1': ['A3'], 'A3': ['B1', 'C2'], 'A2': ['C1', 'C3'], 'C3': ['A2'], 'C1': ['A2'], 'C2': ['A3']}, 'pieces': {'B1': 'w', 'A3': None, 'A2': 'w', 'C3': 'B', 'C1': 'B', 'C2': None}, 'start_turn': 'w', 'solution': None, 'is_possible': False, 'num_steps': 0, 'board_states': None}
|
||||
|
||||
````
|
||||
|
||||
### largest_island
|
||||
Generates Largest Island exercises with configurable difficulty
|
||||
|
||||
|
|
@ -2097,7 +2652,11 @@ Input:
|
|||
3 3 3 3 3 3 3 3
|
||||
3 3 3 3 3 3 3 3
|
||||
|
||||
Answer: ((3, 9, 3, 9, 3, 9, 3, 9), (3, 9, 3, 9, 3, 9, 3, 3), (3, 9, 3, 9, 3, 9, 9, 9), (3, 9, 3, 9, 3, 3, 3, 3), (3, 9, 3, 9, 9, 9, 9, 9))
|
||||
Answer: 3 9 3 9 3 9 3 9
|
||||
3 9 3 9 3 9 3 3
|
||||
3 9 3 9 3 9 9 9
|
||||
3 9 3 9 3 3 3 3
|
||||
3 9 3 9 9 9 9 9
|
||||
Metadata: {'input': ((3, 3, 3, 3, 3, 3, 3, 9), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3), (3, 3, 3, 3, 3, 3, 3, 3)), 'output': ((3, 9, 3, 9, 3, 9, 3, 9), (3, 9, 3, 9, 3, 9, 3, 3), (3, 9, 3, 9, 3, 9, 9, 9), (3, 9, 3, 9, 3, 3, 3, 3), (3, 9, 3, 9, 9, 9, 9, 9)), 'task_id': 'd22278a0', 'difficulty': {'rng': 0.07173948707162241, 'pso': 0.12314814814814816}}
|
||||
|
||||
Example 2:
|
||||
|
|
@ -2232,7 +2791,14 @@ Input:
|
|||
7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7
|
||||
|
||||
Answer: ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7))
|
||||
Answer: 7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 8 7 7 7
|
||||
7 7 7 7 7 7 8 7 8 7 7
|
||||
7 7 7 7 7 8 7 8 7 8 7
|
||||
7 7 7 7 7 7 8 7 8 7 7
|
||||
7 7 7 7 7 7 7 8 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7
|
||||
7 7 7 7 7 7 7 7 7 7 7
|
||||
Metadata: {'input': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7)), 'output': ((7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 8, 7, 8, 7, 8, 7), (7, 7, 7, 7, 7, 7, 8, 7, 8, 7, 7), (7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7)), 'task_id': '11852cab', 'difficulty': {'rng': 0.09651305327452808, 'pso': 0.15228956228956228}}
|
||||
|
||||
Example 3:
|
||||
|
|
@ -2276,7 +2842,10 @@ Input:
|
|||
1 1 1 1 1
|
||||
1 1 1 1 1
|
||||
|
||||
Answer: ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1))
|
||||
Answer: 1 1 1 1 1
|
||||
1 1 1 1 1
|
||||
1 1 1 1 1
|
||||
1 1 1 1 1
|
||||
Metadata: {'input': ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1)), 'output': ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1)), 'task_id': '8be77c9e', 'difficulty': {'rng': 0.09322002370336528, 'pso': 0.0638888888888889}}
|
||||
|
||||
````
|
||||
|
|
@ -2993,7 +3562,7 @@ Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6,
|
|||
Example 2:
|
||||
Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM.
|
||||
Answer: 02:38
|
||||
Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 8, 9, 44), 'end_time': datetime.datetime(2025, 2, 8, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'}
|
||||
Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 9, 9, 44), 'end_time': datetime.datetime(2025, 2, 9, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'}
|
||||
|
||||
Example 3:
|
||||
Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days.
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|||
|
||||
[project]
|
||||
name = "reasoning_gym"
|
||||
version = "0.1.5"
|
||||
version = "0.1.6"
|
||||
authors = [
|
||||
{ name = "Open-Thought community", email = "andreas.koepf@xamla.com" },
|
||||
]
|
||||
|
|
@ -21,6 +21,7 @@ dependencies = [
|
|||
"pytz>=2024.1",
|
||||
"tabulate==0.9.0",
|
||||
"pyyaml>=6.0.2",
|
||||
"arckit==0.1.0",
|
||||
]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ Reasoning Gym - A library of procedural dataset generators for training reasonin
|
|||
from . import algebra, algorithmic, arc, arithmetic, code, cognition, data, games, geometry, graphs, logic
|
||||
from .factory import create_dataset, register_dataset
|
||||
|
||||
__version__ = "0.1.5"
|
||||
__version__ = "0.1.6"
|
||||
__all__ = [
|
||||
"arc",
|
||||
"algebra",
|
||||
|
|
|
|||
|
|
@ -127,11 +127,12 @@ class ComplexArithmeticDataset(ProceduralDataset):
|
|||
|
||||
return student_result
|
||||
|
||||
def score_answer(self, answer: str, metadata: dict) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: dict) -> float:
|
||||
"""Score the answer using exponential distance-based scoring."""
|
||||
if answer is None:
|
||||
return 0.0
|
||||
|
||||
metadata = entry["metadata"]
|
||||
try:
|
||||
student_result = self.parse_string_to_complex(answer)
|
||||
expected_result = complex(*metadata["result"])
|
||||
|
|
|
|||
|
|
@ -235,9 +235,10 @@ class IntermediateIntegrationDataset(ProceduralDataset):
|
|||
},
|
||||
}
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
"""Determine if the solution provided solves the problem"""
|
||||
reward = 0.0
|
||||
metadata = entry["metadata"]
|
||||
if answer is not None:
|
||||
try:
|
||||
var = metadata["variable"]
|
||||
|
|
|
|||
|
|
@ -138,8 +138,9 @@ class PolynomialMultiplicationDataset(ProceduralDataset):
|
|||
|
||||
return polynomial_expr
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
reward = 0.0
|
||||
metadata = entry["metadata"]
|
||||
if answer is not None:
|
||||
try:
|
||||
predicted_poly = sp.parse_expr(answer)
|
||||
|
|
|
|||
|
|
@ -80,9 +80,10 @@ class SimpleIntegrationDataset(ProceduralDataset):
|
|||
},
|
||||
}
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
"""Determine if the solution provided solves the problem"""
|
||||
reward = 0.0
|
||||
metadata = entry["metadata"]
|
||||
if answer is not None:
|
||||
try:
|
||||
var = metadata["variable"]
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ class PalindromeDataset(ProceduralDataset):
|
|||
"""Return the palindrome string from the letter set."""
|
||||
return "".join(letters)
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
"""Determine if the solution provided is a valid palindrome.
|
||||
The answer is expected to be a single string
|
||||
|
||||
|
|
@ -98,6 +98,7 @@ class PalindromeDataset(ProceduralDataset):
|
|||
if answer == "":
|
||||
return 0.01
|
||||
|
||||
metadata = entry["metadata"]
|
||||
answer = answer.strip().lower()
|
||||
expected_letters = metadata["letters"]
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from .arc_1d import Arc1DConfig, Arc1DDataset
|
||||
from .arc_agi import ArcAgiConfig, ArcAgiDataset
|
||||
from .rearc import ReArcConfig, ReArcDataset
|
||||
|
||||
__all__ = ["Arc1DConfig", "Arc1DDataset", "ReArcDataset", "ReArcConfig"]
|
||||
__all__ = ["Arc1DConfig", "Arc1DDataset", "ArcAgiConfig", "ArcAgiDataset", "ReArcDataset", "ReArcConfig"]
|
||||
|
|
|
|||
202
reasoning_gym/arc/arc_agi.py
Normal file
202
reasoning_gym/arc/arc_agi.py
Normal file
|
|
@ -0,0 +1,202 @@
|
|||
from dataclasses import dataclass, field
|
||||
from random import Random
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import arckit
|
||||
|
||||
from reasoning_gym.arc.board_format import (
|
||||
ARC_PROMPT_TEMPLATE,
|
||||
BoardFormattingOptions,
|
||||
format_board,
|
||||
format_board_pair,
|
||||
parse_board,
|
||||
)
|
||||
from reasoning_gym.dataset import ProceduralDataset
|
||||
from reasoning_gym.factory import register_dataset
|
||||
|
||||
|
||||
@dataclass
class ArcAgiConfig:
    """Configuration for the ARC-AGI dataset.

    Controls which task splits are loaded, how boards are rendered as text and
    which augmentations may be applied to every board of a sampled task.
    """

    use_train: bool = True  # include tasks from the first split returned by arckit.load_data()
    use_eval: bool = True  # include tasks from the second split returned by arckit.load_data()
    board_format_opts: BoardFormattingOptions = field(default_factory=lambda: BoardFormattingOptions())

    # Augmentation options
    rotations: list[str] = field(default_factory=lambda: ["90", "180", "270"])  # empty list for no rotations
    mirrors: list[str] = field(
        default_factory=lambda: ["horizontal", "vertical", "diagonal", "counterdiagonal"]
    )  # empty list for no mirrors
    use_color_permutation: bool = True

    seed: Optional[int] = None
    size: int = 500

    def validate(self):
        """Check the configuration and raise ``AssertionError`` on invalid settings."""
        assert self.size > 0, "Size of dataset must be positive."
        # With both splits disabled the dataset would hold no tasks and fail
        # later, when a task is sampled from an empty list.
        assert self.use_train or self.use_eval, "At least one of use_train or use_eval must be enabled."
        valid_rotations = ["90", "180", "270"]
        valid_mirrors = ["horizontal", "vertical", "diagonal", "counterdiagonal"]
        for rot in self.rotations:
            assert rot in valid_rotations, f"Invalid rotation option: {rot}"
        for mirror in self.mirrors:
            assert mirror in valid_mirrors, f"Invalid mirror option: {mirror}"
|
||||
|
||||
|
||||
Board = list[list[int]]


def identity(board: Board) -> Board:
    """Return the board unchanged (no-op augmentation)."""
    return board


def rot90(board: Board) -> Board:
    """quarter clockwise rotation"""
    # zip() yields tuples; convert so the result really is a list of lists.
    return [list(row) for row in zip(*board[::-1])]


def rot180(board: Board) -> Board:
    """half rotation"""
    return [list(row[::-1]) for row in board[::-1]]


def rot270(board: Board) -> Board:
    """quarter anticlockwise rotation"""
    # Transpose, then flip the row order.
    return [list(row) for row in zip(*board)][::-1]


def hmirror(board: Board) -> Board:
    """mirroring along horizontal"""
    return [list(row) for row in board[::-1]]


def vmirror(board: Board) -> Board:
    """mirroring along vertical"""
    return [list(row[::-1]) for row in board]


def dmirror(board: Board) -> Board:
    """mirroring along diagonal"""
    return [list(row) for row in zip(*board)]


def cmirror(board: Board) -> Board:
    """mirroring along counterdiagonal"""
    # Half rotation followed by a transpose.
    return [list(row) for row in zip(*[r[::-1] for r in board[::-1]])]


def cmap(board: Board, colors: list[int]) -> Board:
    """Replace every cell value ``c`` with ``colors[c]``."""
    return [[colors[c] for c in row] for row in board]


ROTATION_AUGMENTATIONS = [identity, rot90, rot180, rot270]
MIRROR_AUGMENTATIONS = [identity, hmirror, vmirror, dmirror, cmirror]
|
||||
|
||||
|
||||
class ArcAgiDataset(ProceduralDataset):
    """Dataset of ARC-AGI-1 tasks loaded through ``arckit``, with optional board augmentations."""

    def __init__(self, config: ArcAgiConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)
        self.board_format_opts = config.board_format_opts
        self._prompt_templates = ARC_PROMPT_TEMPLATE

        # Collect the enabled splits into one id -> task-dict mapping.
        self._tasks = {}
        train_set, eval_set = arckit.load_data()
        if config.use_train:
            for x in train_set:
                self._tasks[x.id] = x.to_dict()
        if config.use_eval:
            for x in eval_set:
                self._tasks[x.id] = x.to_dict()
        self._task_ids = list(self._tasks.keys())

    def _create_augmentation_fn(self, rng: Random) -> Callable[[Board], Board]:
        """Create a composite augmentation function from enabled options.

        One rotation and one mirror are drawn (the identity is always among the
        candidates), followed by an optional random permutation of the 10 colors.
        The returned function applies them in that order.
        """
        fns = []

        # Map rotation strings to functions
        rotation_map = {"90": rot90, "180": rot180, "270": rot270}
        if self.config.rotations:
            chosen_rot = rng.choice([identity] + [rotation_map[r] for r in self.config.rotations])
            fns.append(chosen_rot)

        # Map mirror strings to functions
        mirror_map = {"horizontal": hmirror, "vertical": vmirror, "diagonal": dmirror, "counterdiagonal": cmirror}
        if self.config.mirrors:
            chosen_mirror = rng.choice([identity] + [mirror_map[m] for m in self.config.mirrors])
            fns.append(chosen_mirror)

        if self.config.use_color_permutation:
            color_table = list(range(10))
            rng.shuffle(color_table)
            fns.append(lambda x: cmap(x, color_table))

        def composite_fn(board: Board) -> Board:
            result = board
            for fn in fns:
                result = fn(result)
            return result

        return composite_fn

    def __getitem__(self, idx: int) -> dict:
        """
        Generate a single ARC-AGI-1 task

        A task is sampled with an RNG seeded from ``self.seed + idx``; the same
        augmentation is applied to all train pairs and to the first test pair.
        """
        rng = Random(self.seed + idx)

        task_id = rng.choice(self._task_ids)
        task = self._tasks[task_id]

        # Create augmentation function to be used for all examples
        augment = self._create_augmentation_fn(rng)

        train = task["train"]
        test = task["test"][0]  # only the first test pair is used

        # Apply augmentation to all train examples
        augmented_train = [{"input": augment(p["input"]), "output": augment(p["output"])} for p in train]

        examples = "".join(
            format_board_pair(i + 1, p, formatting_options=self.config.board_format_opts)
            for i, p in enumerate(augmented_train)
        )

        # Apply augmentation to test example
        augmented_test_input = augment(test["input"])
        augmented_test_output = augment(test["output"])

        test_input = format_board(augmented_test_input, self.board_format_opts)
        test_output = format_board(augmented_test_output, self.board_format_opts)

        input_prompt = self._prompt_templates.format(examples=examples, input_grid=test_input)

        def totuple(board: list[list[int]]) -> tuple[tuple[int, ...], ...]:
            return tuple(tuple(r) for r in board)

        return {
            "question": input_prompt,
            "answer": test_output,
            "metadata": {
                "input": totuple(augmented_test_input),
                "output": totuple(augmented_test_output),
                "task_id": task_id,
            },
        }

    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
        """Score a textual board answer against ``entry["metadata"]["output"]``.

        Returns 1.0 when the parsed board equals the expected output, 0.05 when
        it parses but differs, 0.01 when parsing raises, and 0.0 for ``None``.
        """
        reward = 0.0
        metadata = entry["metadata"]
        if answer is not None:
            try:
                answer_board = parse_board(answer, self.board_format_opts)
                if answer_board == metadata["output"]:
                    reward = 1.0
                else:
                    reward = 0.05
            except Exception:
                # Malformed answers get a small reward; unlike a bare ``except``
                # this lets KeyboardInterrupt / SystemExit propagate.
                reward = 0.01
        return reward
|
||||
|
||||
|
||||
register_dataset("arc_agi", ArcAgiDataset, ArcAgiConfig)
|
||||
|
|
@ -1,6 +1,16 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import List, Tuple
|
||||
|
||||
ARC_PROMPT_TEMPLATE = """Find the common rule that maps an input grid to an output grid, given the examples below.
|
||||
|
||||
{examples}
|
||||
Below is a test input grid. Predict the corresponding output grid by applying the rule you found.
|
||||
Your final answer should just be the text output grid itself.
|
||||
|
||||
Input:
|
||||
{input_grid}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class BoardFormattingOptions:
|
||||
|
|
@ -10,26 +20,6 @@ class BoardFormattingOptions:
|
|||
array_brackets: bool = False
|
||||
|
||||
|
||||
def format_arc_task(
|
||||
input_grid: Tuple[Tuple[int, ...], ...], output_grid: Tuple[Tuple[int, ...], ...], options: BoardFormattingOptions
|
||||
) -> str:
|
||||
"""
|
||||
Format an ARC task as a string
|
||||
"""
|
||||
|
||||
buffer = []
|
||||
if options.task_identifier:
|
||||
buffer.append(f"ARC Task: {options.task_identifier}")
|
||||
|
||||
buffer.append("\nInput Grid:")
|
||||
buffer.append(format_board(input_grid, options))
|
||||
|
||||
buffer.append("\n\nOutput Grid:")
|
||||
buffer.append(format_board(output_grid, options))
|
||||
|
||||
return "\n".join(buffer)
|
||||
|
||||
|
||||
def format_board(
|
||||
board: List[List[int]], formatting_options: BoardFormattingOptions, with_board_shape: bool = False
|
||||
) -> str:
|
||||
|
|
|
|||
|
|
@ -3,17 +3,7 @@ from random import Random
|
|||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from ..factory import ProceduralDataset, register_dataset
|
||||
from .board_format import BoardFormattingOptions, format_board, format_board_pair, parse_board
|
||||
|
||||
_REARC_PROMPT_TEMPLATES = """Find the common rule that maps an input grid to an output grid, given the examples below.
|
||||
|
||||
{examples}
|
||||
Below is a test input grid. Predict the corresponding output grid by applying the rule you found.
|
||||
Your final answer should just be the text output grid itself.
|
||||
|
||||
Input:
|
||||
{input_grid}
|
||||
"""
|
||||
from .board_format import ARC_PROMPT_TEMPLATE, BoardFormattingOptions, format_board, format_board_pair, parse_board
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -37,7 +27,7 @@ class ReArcDataset(ProceduralDataset):
|
|||
def __init__(self, config: ReArcConfig):
|
||||
super().__init__(config=config, seed=config.seed, size=config.size)
|
||||
self.board_format_opts = config.board_format_opts
|
||||
self._prompt_templates = _REARC_PROMPT_TEMPLATES
|
||||
self._prompt_templates = ARC_PROMPT_TEMPLATE
|
||||
self.diff_lb = config.diff_lb
|
||||
self.diff_ub = config.diff_ub
|
||||
|
||||
|
|
@ -89,10 +79,11 @@ class ReArcDataset(ProceduralDataset):
|
|||
rng_difficulty = self.get_rng_difficulty(rng)
|
||||
pso_difficulty = self.get_pso_difficulty(task)
|
||||
input_prompt = self.format_rearc_input(rng, task, generator)
|
||||
answer = format_board(task["output"], self.board_format_opts)
|
||||
|
||||
return {
|
||||
"question": input_prompt,
|
||||
"answer": task["output"],
|
||||
"answer": answer,
|
||||
"metadata": {
|
||||
"input": task["input"],
|
||||
"output": task["output"],
|
||||
|
|
@ -104,12 +95,13 @@ class ReArcDataset(ProceduralDataset):
|
|||
},
|
||||
}
|
||||
|
||||
def score_answer(self, answer: str, metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: str, entry: Dict[str, Any]) -> float:
|
||||
reward = 0.0
|
||||
metadata = entry["metadata"]
|
||||
if answer is not None:
|
||||
try:
|
||||
formatted_answer = parse_board(answer, self.board_format_opts)
|
||||
if formatted_answer == metadata["output"]:
|
||||
answer_board = parse_board(answer, self.board_format_opts)
|
||||
if answer_board == metadata["output"]:
|
||||
reward = 1.0
|
||||
else:
|
||||
reward = 0.05
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ Game tasks for training reasoning capabilities:
|
|||
|
||||
from .countdown import CountdownConfig, CountdownDataset
|
||||
from .game_of_life import GameOfLifeConfig, GameOfLifeDataset
|
||||
from .knight_swap import KnightSwapConfig, KnightSwapDataset
|
||||
from .maze import MazeConfig, MazeDataset
|
||||
from .mini_sudoku import MiniSudokuConfig, MiniSudokuDataset
|
||||
from .n_queens import NQueensDataset
|
||||
|
|
@ -34,4 +35,6 @@ __all__ = [
|
|||
"NQueensDataset",
|
||||
"TsumegoConfig",
|
||||
"TsumegoDataset",
|
||||
"KnightSwapConfig",
|
||||
"KnightSwapDataset",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -159,9 +159,10 @@ class CountdownDataset(ProceduralDataset):
|
|||
|
||||
raise ValueError(f"Failed to generate valid expression after {max_attempts} attempts")
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
"""Determine if the solution provided solves the problem"""
|
||||
reward = 0.0
|
||||
metadata = entry["metadata"]
|
||||
if answer is not None:
|
||||
try:
|
||||
user_answer = int(parse_expr(answer))
|
||||
|
|
|
|||
396
reasoning_gym/games/knight_swap.py
Normal file
396
reasoning_gym/games/knight_swap.py
Normal file
|
|
@ -0,0 +1,396 @@
|
|||
import collections
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from random import Random
|
||||
from typing import Dict, FrozenSet, List, Optional, Set, Tuple
|
||||
|
||||
from ..factory import ProceduralDataset, register_dataset
|
||||
|
||||
QUESTION_TEMPLATE = """Knight Swap Challenge:
|
||||
|
||||
```
|
||||
{board}
|
||||
```
|
||||
|
||||
Legend:
|
||||
- 'w' = White Knight
|
||||
- 'B' = Black Knight
|
||||
- Empty squares are marked with '.'
|
||||
|
||||
Objective:
|
||||
Swap the positions of all white knights with all black knights through valid moves.
|
||||
|
||||
Rules:
|
||||
1. Knights move in L-shape (2 squares + 1 square perpendicular)
|
||||
2. Knights can only move to empty squares
|
||||
3. {start_turn} moves first, then players alternate
|
||||
4. All knights must reach their target positions (white ↔ black)
|
||||
|
||||
Question:
|
||||
Is it possible to swap all knights' positions? If yes, list the moves.
|
||||
|
||||
Answer Format:
|
||||
- For impossible puzzles: "No"
|
||||
- For possible puzzles: List moves as ["color,from,to", ...]
|
||||
Example: ["w,A1,B3"] means white knight moves A1→B3
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
class KnightSwapConfig:
    """Configuration for Knight Swap puzzle generation.

    A Knight Swap puzzle involves moving white and black knights on a chess-like board
    where each move must be a valid knight's move. The goal is to swap the positions
    of white and black knights.
    """

    min_nodes: int = 6  # Minimum number of squares on the board
    max_nodes: int = 9  # Maximum number of squares on the board
    min_pieces: int = 2  # Minimum number of pieces per color
    max_pieces: int = 2  # Maximum number of pieces per color
    min_steps: int = 4  # Minimum solution length
    max_steps: int = 20  # Maximum solution length
    max_attempts: int = 100  # Maximum attempts for board generation and puzzle creation
    seed: Optional[int] = None
    size: int = 5  # Virtual dataset size
    impossible_ratio: float = 0.2  # Ratio of puzzles that should be impossible

    def validate(self):
        """Validate configuration parameters; raises ``AssertionError`` when invalid."""
        assert self.min_nodes >= 6, "min_nodes must be >= 6"
        assert self.max_nodes >= self.min_nodes, "max_nodes must be >= min_nodes"
        # Boards are sampled from a fixed pool of 12 candidate squares (A1..D3).
        assert self.max_nodes <= 12, "max_nodes must be <= 12"
        assert self.min_pieces >= 1, "min_pieces must be >= 1"
        assert self.max_pieces >= self.min_pieces, "max_pieces must be >= min_pieces"
        # Both colors need their own squares, so the largest board must hold them all.
        assert 2 * self.min_pieces <= self.max_nodes, "2 * min_pieces must be <= max_nodes"
        assert self.min_steps >= 1, "min_steps must be >= 1"
        assert self.max_steps >= self.min_steps, "max_steps must be >= min_steps"
        assert self.max_attempts >= 1, "max_attempts must be >= 1"
        assert 0 <= self.impossible_ratio <= 1, "impossible_ratio must be between 0 and 1"
|
||||
|
||||
|
||||
class KnightSwapLogic:
    """Core game logic for Knight Swap puzzles."""

    @staticmethod
    def is_knight_move(a: str, b: str) -> bool:
        """Check if moving from square 'a' to square 'b' is a legal knight move.

        Squares are written as a column letter followed by a row number (e.g. "B3").
        """
        a_col = ord(a[0].upper()) - ord("A") + 1
        a_row = int(a[1:])
        b_col = ord(b[0].upper()) - ord("A") + 1
        b_row = int(b[1:])
        # A knight move changes one coordinate by 1 and the other by 2.
        return {abs(a_col - b_col), abs(a_row - b_row)} == {1, 2}

    @staticmethod
    def is_connected(graph: Dict[str, List[str]]) -> bool:
        """Check if a graph is connected (all nodes reachable from any starting node).

        An empty graph counts as connected.
        """
        if not graph:
            return True
        start = next(iter(graph))
        visited = set()
        queue = collections.deque([start])
        while queue:
            node = queue.popleft()
            if node in visited:
                continue
            visited.add(node)
            queue.extend(neighbor for neighbor in graph[node] if neighbor not in visited)
        return len(visited) == len(graph)

    @staticmethod
    def generate_board(num_nodes: int, rng: Random, max_attempts: int = 1000) -> Dict[str, List[str]]:
        """Generate a random connected board where edges represent valid knight moves.

        Args:
            num_nodes: Number of squares to sample from the 12 candidates A1..D3.
            rng: Random generator used for sampling.
            max_attempts: Retry budget; a failure is raised once more than this
                many samples have been tried.

        Returns:
            Adjacency mapping square -> sorted list of knight-move neighbors.

        Raises:
            RuntimeError: If no connected board was found within the budget.
            ValueError: From ``rng.sample`` when ``num_nodes`` exceeds the 12 candidates.
        """
        candidates = ["A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3", "D1", "D2", "D3"]
        attempts = 0
        while True:
            attempts += 1
            nodes = rng.sample(candidates, num_nodes)
            graph = {
                node: sorted(other for other in nodes if KnightSwapLogic.is_knight_move(node, other))
                for node in nodes
            }
            if KnightSwapLogic.is_connected(graph):
                return graph
            if attempts > max_attempts:
                raise RuntimeError(f"Failed to generate connected board after {max_attempts} attempts")

    @staticmethod
    def solve_swap(
        board: Dict[str, List[str]], pieces: Dict[str, str], start_turn: str = "w"
    ) -> Optional[List[Tuple[str, str, str]]]:
        """Find a sequence of moves to swap white and black pieces positions.

        Runs a breadth-first search over (white squares, black squares, side to
        move) states with strictly alternating turns, so the result has minimal
        length.

        Args:
            board: Adjacency mapping square -> reachable squares.
            pieces: Mapping square -> "w", "B" or any other value for empty.
            start_turn: Side that moves first ("w"; anything else moves black).

        Returns:
            List of ``(color, from, to)`` moves, or ``None`` when no swap exists.
        """
        initial_white = frozenset(pos for pos, piece in pieces.items() if piece == "w")
        initial_black = frozenset(pos for pos, piece in pieces.items() if piece == "B")
        initial_state = (initial_white, initial_black, start_turn)

        queue = collections.deque([initial_state])
        # Doubles as the visited set: state -> (previous state, move) or None for the root.
        predecessors = {initial_state: None}

        while queue:
            state = queue.popleft()
            white, black, turn = state
            if white == initial_black and black == initial_white:
                moves = []
                link = predecessors[state]
                while link is not None:
                    prev_state, move = link
                    moves.append(move)
                    link = predecessors[prev_state]
                moves.reverse()
                return moves

            movers = white if turn == "w" else black
            next_turn = "B" if turn == "w" else "w"
            # Sort the squares: set order of strings varies with per-process hash
            # randomization, which would make the chosen shortest path (and so
            # the generated answer) differ between runs with the same seed.
            for pos in sorted(movers):
                for neighbor in board[pos]:
                    if neighbor in white or neighbor in black:
                        continue
                    moved = (movers - {pos}) | {neighbor}
                    if turn == "w":
                        new_state = (moved, black, next_turn)
                    else:
                        new_state = (white, moved, next_turn)
                    if new_state not in predecessors:
                        predecessors[new_state] = (state, (turn, pos, neighbor))
                        queue.append(new_state)
        return None
|
||||
|
||||
|
||||
class KnightSwapDataset(ProceduralDataset):
    """Generates Knight Swap puzzles with configurable parameters."""

    def __init__(self, config: KnightSwapConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)
        self.game_logic = KnightSwapLogic()

    def _format_board(self, board: Dict[str, List[str]], pieces: Dict[str, str]) -> str:
        """Format the board state as a string.

        Columns and rows are derived from the squares present in ``board``; rows
        are printed from the highest number down.
        """
        # NOTE(review): the spacing inside the literals below is reproduced as it
        # appears in the reviewed text; confirm column padding against the repo.
        positions = list(board.keys())
        if not positions:
            return ""

        columns = sorted(set(pos[0] for pos in positions))
        rows = sorted(set(int(pos[1:]) for pos in positions), reverse=True)

        lines = []
        # Header
        lines.append(" " + " ".join(columns))
        lines.append(" " + "----" * len(columns))

        # Board rows
        for row in rows:
            line = f"{row} |"
            for col in columns:
                pos = col + str(row)
                if pos in pieces:
                    # Squares that exist on the board but hold no knight are shown as '.'
                    piece = pieces[pos] if pieces[pos] is not None else "."
                    line += f" {piece} |"
                else:
                    line += " |"
            lines.append(line)
            lines.append(" " + "----" * len(columns))

        return "\n".join(lines)

    def _format_moves(self, moves: List[Tuple[str, str, str]]) -> str:
        """Format the solution moves as a JSON list of "color,from,to" strings ("No" if empty)."""
        if not moves:
            return "No"
        return json.dumps([f"{color},{start},{end}" for color, start, end in moves])

    def __getitem__(self, idx: int) -> Dict:
        """Generate a single Knight Swap puzzle.

        Boards and piece placements are retried (up to ``config.max_attempts``
        boards, 50 placements each) until a puzzle matching the requested
        solvability and solution length is found.

        Raises:
            ValueError: If no valid puzzle was produced within the attempt budget.
        """
        rng = Random(self.seed + idx)

        # Keep trying with new boards until we succeed
        board_attempts = 0
        while board_attempts < self.config.max_attempts:
            try:
                # Generate a new board
                num_nodes = rng.randint(self.config.min_nodes, self.config.max_nodes)
                board = self.game_logic.generate_board(num_nodes, rng, max_attempts=self.config.max_attempts)
                positions = list(board.keys())

                # Decide if this should be an impossible puzzle
                make_impossible = rng.random() < self.config.impossible_ratio

                # Try different piece placements on this board
                for _ in range(50):  # Reduced attempts per board since we try multiple boards
                    # Use fixed number of pieces for more reliable generation
                    num_pieces = self.config.min_pieces
                    white_positions = rng.sample(positions, num_pieces)
                    remaining = [p for p in positions if p not in white_positions]
                    black_positions = rng.sample(remaining, num_pieces)

                    pieces = {pos: None for pos in positions}
                    for pos in white_positions:
                        pieces[pos] = "w"
                    for pos in black_positions:
                        pieces[pos] = "B"

                    # For impossible puzzles, try a simpler approach: just remove some key connections
                    board_copy = {k: list(v) for k, v in board.items()}  # Make a copy of the board
                    if make_impossible:
                        # Edges directly joining a white and a black square
                        critical_edges = [
                            (w_pos, b_pos)
                            for w_pos in white_positions
                            for b_pos in black_positions
                            if b_pos in board_copy[w_pos]
                        ]

                        if critical_edges:  # Only proceed if we found critical edges
                            # Remove a random critical edge
                            w_pos, b_pos = rng.choice(critical_edges)
                            board_copy[w_pos].remove(b_pos)
                            board_copy[b_pos].remove(w_pos)

                    # Try both starting turns
                    for start_turn in ["w", "B"]:
                        solution = self.game_logic.solve_swap(board_copy, pieces, start_turn)

                        # Accept unsolvable boards for impossible puzzles, and solutions
                        # within the configured length range otherwise
                        accepted = (make_impossible and solution is None) or (
                            not make_impossible
                            and solution is not None
                            and self.config.min_steps <= len(solution) <= self.config.max_steps
                        )
                        if not accepted:
                            continue

                        board_str = self._format_board(board_copy, pieces)
                        solution_str = self._format_moves(solution) if solution else "No"

                        # Generate board states for solvable puzzles
                        board_states = []
                        if solution is not None:
                            current_pieces = dict(pieces)
                            board_states.append(dict(current_pieces))  # Initial state

                            for color, start, end in solution:
                                current_pieces[end] = current_pieces[start]
                                current_pieces[start] = None
                                board_states.append(dict(current_pieces))

                        return {
                            "question": QUESTION_TEMPLATE.format(board=board_str, start_turn=start_turn),
                            "answer": solution_str,
                            "metadata": {
                                "board": board_copy,
                                "pieces": pieces,
                                "start_turn": start_turn,
                                "solution": solution,
                                "is_possible": solution is not None,
                                "num_steps": len(solution) if solution else 0,
                                "board_states": board_states if solution is not None else None,
                            },
                        }

            except Exception:
                pass  # If board generation fails, we'll try again with a new board

            board_attempts += 1

        raise ValueError(f"Failed to generate valid puzzle after trying {self.config.max_attempts} different boards")

    def score_answer(self, answer: Optional[str], entry: Dict) -> float:
        """Score the user's solution for the Knight Swap puzzle.

        The answer should be either:
        - "No" if the puzzle is impossible
        - A JSON list of moves in format ["color,start,end", ...] where color is 'w' or 'B'
          (whitespace around the three fields is ignored)

        Returns:
        - 1.0 for correct answer (either "No" for impossible puzzles or valid solution of optimal length)
        - A proportional score for correct but longer solutions
        - 0.05 for valid moves that don't solve the puzzle
        - 0.01 for invalid format
        - 0.0 for None

        Side effect: on a correct solution the replayed states are stored in
        ``entry["metadata"]["board_states"]``.
        """
        if answer is None:
            return 0.0

        answer = answer.strip()
        if not answer:
            return 0.01

        # Handle impossible puzzles
        if not entry["metadata"]["is_possible"]:
            return 1.0 if answer.lower() == "no" else 0.01

        # Handle "No" answer for possible puzzles
        if answer.lower() == "no":
            return 0.01

        try:
            # Parse moves from JSON list
            move_list = json.loads(answer)
            if not isinstance(move_list, list):
                return 0.01

            # Parse moves
            moves = []
            for move_str in move_list:
                # Tolerate spaces after the commas, e.g. "w, A1, B3"
                color, start, end = (part.strip() for part in move_str.split(","))
                if color not in ("w", "B"):
                    return 0.01
                moves.append((color, start, end))

            # Validate and apply moves
            board = entry["metadata"]["board"]
            pieces = dict(entry["metadata"]["pieces"])
            current_turn = entry["metadata"]["start_turn"]

            # Track board states after each move
            board_states = []
            board_states.append(dict(pieces))  # Initial state

            for color, start, end in moves:
                if color != current_turn:
                    return 0.01
                if start not in pieces or pieces[start] != color:
                    return 0.01
                if end not in board[start]:
                    return 0.01
                if end in pieces and pieces[end] is not None:
                    return 0.01

                # Apply move
                pieces[end] = pieces[start]
                pieces[start] = None
                current_turn = "B" if current_turn == "w" else "w"

                # Store board state after this move
                board_states.append(dict(pieces))

            # Check if solved
            white_positions = {pos for pos, piece in pieces.items() if piece == "w"}
            black_positions = {pos for pos, piece in pieces.items() if piece == "B"}
            initial_white = {pos for pos, piece in entry["metadata"]["pieces"].items() if piece == "w"}
            initial_black = {pos for pos, piece in entry["metadata"]["pieces"].items() if piece == "B"}

            if white_positions == initial_black and black_positions == initial_white:
                optimal_moves = len(entry["metadata"]["solution"])
                # Add board states to metadata if solution is valid
                entry["metadata"]["board_states"] = board_states
                if len(moves) <= optimal_moves:
                    return 1.0
                else:
                    return optimal_moves / len(moves)
            return 0.05

        except Exception:
            # Anything that fails to parse or replay counts as invalid format
            return 0.01
|
||||
|
||||
|
||||
register_dataset("knight_swap", KnightSwapDataset, KnightSwapConfig)
|
||||
|
|
@ -368,7 +368,7 @@ class HanoiDataset(ProceduralDataset):
|
|||
to_peg = int(match.group(3))
|
||||
return disk, from_peg, to_peg
|
||||
|
||||
def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
|
||||
def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
|
||||
"""
|
||||
Score the user's solution for the Tower of Hanoi puzzle.
|
||||
|
||||
|
|
@ -398,6 +398,7 @@ class HanoiDataset(ProceduralDataset):
|
|||
return 0.0
|
||||
|
||||
# Build the initial peg state from metadata.
|
||||
metadata = entry["metadata"]
|
||||
num_disks = metadata["num_disks"]
|
||||
num_pegs = metadata["num_pegs"]
|
||||
start_peg = metadata["start_peg"]
|
||||
|
|
|
|||
139
tests/test_arc_agi.py
Normal file
139
tests/test_arc_agi.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
import pytest
|
||||
|
||||
from reasoning_gym.arc.arc_agi import ArcAgiConfig, ArcAgiDataset
|
||||
|
||||
|
||||
def test_arc_agi_config_validation():
    """Invalid ArcAgi settings must fail validation; valid ones must pass."""
    rejected_settings = (
        {"size": 0},
        {"rotations": ["invalid"]},
        {"mirrors": ["invalid"]},
    )
    for kwargs in rejected_settings:
        with pytest.raises(AssertionError):
            ArcAgiConfig(**kwargs).validate()

    # Valid configs should not raise; empty lists mean "no augmentations"
    accepted_settings = (
        {"size": 10, "seed": 42},
        {"rotations": ["90", "180"], "mirrors": ["horizontal", "diagonal"]},
        {"rotations": [], "mirrors": []},
    )
    for kwargs in accepted_settings:
        config = ArcAgiConfig(**kwargs)
        config.validate()
|
||||
|
||||
|
||||
def test_arc_agi_deterministic():
    """Two datasets built from one seeded config must yield identical items."""
    config = ArcAgiConfig(seed=42, size=10)
    first_ds = ArcAgiDataset(config)
    second_ds = ArcAgiDataset(config)

    index = 0
    while index < len(first_ds):
        assert first_ds[index] == second_ds[index], "ArcAgi datasets with same seed should match exactly"
        index += 1
|
||||
|
||||
|
||||
def test_arc_agi_items():
    """Every generated item has the expected keys and board-shaped metadata."""
    dataset = ArcAgiDataset(ArcAgiConfig(seed=42, size=10))

    for item in dataset:
        assert isinstance(item, dict)
        for key in ("question", "answer", "metadata"):
            assert key in item

        meta = item["metadata"]
        for key in ("input", "output", "task_id"):
            assert key in meta

        # Boards are stored as tuples of row tuples
        for board_key in ("input", "output"):
            assert isinstance(meta[board_key], tuple)
        for board_key in ("input", "output"):
            assert all(isinstance(row, tuple) for row in meta[board_key])

        # Task ids are strings
        assert isinstance(meta["task_id"], str)
|
||||
|
||||
|
||||
def test_arc_agi_augmentations():
    """Each augmentation, enabled alone, must change at least one input board.

    A baseline dataset is generated with every augmentation switched off.
    It is then compared against datasets that enable exactly one of:
    a 90-degree rotation, a horizontal mirror, or colour permutation.
    """

    def generate(rotations, mirrors, use_color_permutation):
        # Same seed and size for every variant so only the augmentation differs.
        cfg = ArcAgiConfig(
            seed=42,
            size=10,
            rotations=rotations,
            mirrors=mirrors,
            use_color_permutation=use_color_permutation,
        )
        return list(ArcAgiDataset(cfg))

    # Baseline: all augmentations disabled.
    base_items = generate([], [], False)

    def differs_from_base(variant_items):
        # True when at least one input board differs from the baseline.
        return any(
            base_items[idx]["metadata"]["input"] != variant_items[idx]["metadata"]["input"]
            for idx in range(len(base_items))
        )

    # Rotation only.
    rot_items = generate(["90"], [], False)
    assert differs_from_base(rot_items), "90-degree rotation augmentation had no effect"

    # Mirror only.
    mirror_items = generate([], ["horizontal"], False)
    assert differs_from_base(mirror_items), "Horizontal mirror augmentation had no effect"

    # Colour permutation only.
    color_items = generate([], [], True)
    assert differs_from_base(color_items), "Color permutation had no effect"
|
||||
|
||||
|
||||
def test_arc_agi_scoring():
    """Check the score returned for correct, malformed, missing and wrong answers."""
    dataset = ArcAgiDataset(ArcAgiConfig(size=10, seed=123))

    # A well-formed grid that is not the expected solution.
    wrong_answer = "1 0 0 0\n0 0 0 1"

    for entry in dataset:
        # The reference answer earns the full score.
        full = dataset.score_answer(entry["answer"], entry=entry)
        assert full == 1.0

        # Text that is not a grid.
        malformed = dataset.score_answer("invalid grid format", entry=entry)
        assert malformed == 0.01

        # No answer at all.
        missing = dataset.score_answer(None, entry=entry)
        assert missing == 0.0

        # Valid grid format, wrong content.
        incorrect = dataset.score_answer(wrong_answer, entry=entry)
        assert incorrect == 0.05
|
||||
|
||||
|
||||
def test_arc_agi_dataset_modes():
    """Check task counts for train-only, eval-only and combined configurations."""
    common = {"size": 10, "seed": 42}

    # Train split only.
    train_ds = ArcAgiDataset(ArcAgiConfig(use_train=True, use_eval=False, **common))
    train_count = len(train_ds._task_ids)
    assert train_count > 0

    # Eval split only.
    eval_ds = ArcAgiDataset(ArcAgiConfig(use_train=False, use_eval=True, **common))
    eval_count = len(eval_ds._task_ids)
    assert eval_count > 0

    # Both splits together must hold more tasks than either split alone.
    both_ds = ArcAgiDataset(ArcAgiConfig(use_train=True, use_eval=True, **common))
    both_count = len(both_ds._task_ids)
    assert both_count > train_count
    assert both_count > eval_count
|
||||
|
|
@ -52,30 +52,30 @@ def test_complex_arithmetic_scoring():
|
|||
dataset = ComplexArithmeticDataset(config)
|
||||
|
||||
# Test case with answer 3 + 2i
|
||||
metadata = {"result": (3.0, 2.0)}
|
||||
entry = {"metadata": {"result": (3.0, 2.0)}}
|
||||
|
||||
# Test exact matches (should get score of 1.0)
|
||||
assert dataset.score_answer("3 + 2i", metadata) == 1.0
|
||||
assert dataset.score_answer("3+2i", metadata) == 1.0
|
||||
assert dataset.score_answer("3.0 + 2.0i", metadata) == 1.0
|
||||
assert dataset.score_answer("3 + 2i", entry) == 1.0
|
||||
assert dataset.score_answer("3+2i", entry) == 1.0
|
||||
assert dataset.score_answer("3.0 + 2.0i", entry) == 1.0
|
||||
|
||||
# Test answers with small errors (should get high but < 1.0 scores)
|
||||
print(dataset.score_answer("3.1 + 2i", metadata))
|
||||
assert 0.9 < dataset.score_answer("3.1 + 2i", metadata) < 1.0
|
||||
assert 0.9 < dataset.score_answer("3 + 2.1i", metadata) < 1.0
|
||||
assert 0.7 < dataset.score_answer("3.1 + 2.1i", metadata) < 0.95
|
||||
print(dataset.score_answer("3.1 + 2i", entry))
|
||||
assert 0.9 < dataset.score_answer("3.1 + 2i", entry) < 1.0
|
||||
assert 0.9 < dataset.score_answer("3 + 2.1i", entry) < 1.0
|
||||
assert 0.7 < dataset.score_answer("3.1 + 2.1i", entry) < 0.95
|
||||
|
||||
# Test answers with moderate errors (should get medium scores)
|
||||
assert 0.3 < dataset.score_answer("4 + 2i", metadata) < 0.4
|
||||
assert 0.3 < dataset.score_answer("3 + 3i", metadata) < 0.4
|
||||
assert 0.3 < dataset.score_answer("4 + 2i", entry) < 0.4
|
||||
assert 0.3 < dataset.score_answer("3 + 3i", entry) < 0.4
|
||||
|
||||
# Test answers with large errors (should get very low scores)
|
||||
assert dataset.score_answer("10 + 10i", metadata) < 0.01
|
||||
assert dataset.score_answer("10 + 10i", entry) < 0.01
|
||||
|
||||
# Test invalid answers (should get 0.0)
|
||||
assert dataset.score_answer("invalid", metadata) == 0.0
|
||||
assert dataset.score_answer(None, metadata) == 0.0
|
||||
assert dataset.score_answer("inf + 2i", metadata) == 0.0
|
||||
assert dataset.score_answer("invalid", entry) == 0.0
|
||||
assert dataset.score_answer(None, entry) == 0.0
|
||||
assert dataset.score_answer("inf + 2i", entry) == 0.0
|
||||
|
||||
|
||||
def test_complex_arithmetic_division_by_zero():
|
||||
|
|
|
|||
|
|
@ -66,13 +66,13 @@ def test_countdown_game_items():
|
|||
expr = item["metadata"]["expression"]
|
||||
|
||||
# check score
|
||||
assert dataset.score_answer(answer=expr, metadata=item["metadata"]) == 1.0 # correct answer
|
||||
assert dataset.score_answer(answer="45+2", metadata=item["metadata"]) == 0.05 # wrong answer but an attempt
|
||||
assert dataset.score_answer(answer=expr, entry=item) == 1.0 # correct answer
|
||||
assert dataset.score_answer(answer="45+2", entry=item) == 0.05 # wrong answer but an attempt
|
||||
assert (
|
||||
dataset.score_answer(answer="a wrong solution", metadata=item["metadata"]) == 0.01
|
||||
dataset.score_answer(answer="a wrong solution", entry=item) == 0.01
|
||||
) # wrong answer but incorrectly formatted
|
||||
assert dataset.score_answer(answer="", metadata=item["metadata"]) == 0.01 # wrong answer but empty string
|
||||
assert dataset.score_answer(answer=None, metadata=item["metadata"]) == 0.0 # no answer
|
||||
assert dataset.score_answer(answer="", entry=item) == 0.01 # wrong answer but empty string
|
||||
assert dataset.score_answer(answer=None, entry=item) == 0.0 # no answer
|
||||
|
||||
try:
|
||||
result = eval(expr) # Safe here since we control expression generation
|
||||
|
|
|
|||
|
|
@ -100,7 +100,7 @@ def test_verify_answer():
|
|||
dataset = IntermediateIntegrationDataset(config)
|
||||
for i in range(len(dataset)):
|
||||
item = dataset[i]
|
||||
score = dataset.score_answer(item["answer"], item["metadata"])
|
||||
score = dataset.score_answer(answer=item["answer"], entry=item)
|
||||
assert score == 1.0
|
||||
|
||||
|
||||
|
|
@ -140,5 +140,6 @@ def test_score_answer_cases():
|
|||
]
|
||||
|
||||
for answer, metadata, expected in test_cases:
|
||||
score = dataset.score_answer(answer, metadata)
|
||||
dummy_entry = {"metadata": metadata}
|
||||
score = dataset.score_answer(answer, entry=dummy_entry)
|
||||
assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}"
|
||||
|
|
|
|||
162
tests/test_knight_swap.py
Normal file
162
tests/test_knight_swap.py
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
import pytest
|
||||
|
||||
from reasoning_gym.games.knight_swap import KnightSwapConfig, KnightSwapDataset, KnightSwapLogic
|
||||
|
||||
|
||||
def test_default_config_validation():
    """A config built purely from defaults must pass validate() without raising."""
    KnightSwapConfig().validate()
|
||||
|
||||
|
||||
def test_invalid_config():
    """Each invalid parameter combination must fail with AssertionError."""
    invalid_settings = (
        {"min_nodes": 4},  # Too few nodes
        {"max_nodes": 5, "min_nodes": 6},  # max < min
    )

    for kwargs in invalid_settings:
        # Construction stays inside the context manager, as in the original test.
        with pytest.raises(AssertionError):
            KnightSwapConfig(**kwargs).validate()
|
||||
|
||||
|
||||
def test_board_connectivity():
    """At least one of the first few generated boards must be connected.

    Generation is random, so several puzzles are inspected. Debug output is
    printed for every board that is found not to be connected.
    """
    dataset = KnightSwapDataset(KnightSwapConfig(min_nodes=6, max_nodes=6))
    attempts = 10

    connected_at = None
    for idx in range(attempts):
        graph = dataset[idx]["metadata"]["board"]
        if not KnightSwapLogic.is_connected(graph):
            # Print debug info for failing boards
            print(f"\nBoard {idx} not connected:")
            print(f"Nodes: {list(graph.keys())}")
            print(f"Edges: {graph}")
            continue
        connected_at = idx
        break

    assert connected_at is not None, f"Could not find a connected board after {attempts} attempts"
|
||||
|
||||
|
||||
def test_known_connected_board():
    """The connectivity check must accept a hand-built connected board."""
    # Adjacency map of six squares; every listed edge is a knight move.
    ring = {
        "A1": ["B3", "C2"],
        "B3": ["A1", "C1"],
        "C1": ["B3", "A2"],
        "A2": ["C1", "B4"],
        "B4": ["A2", "C2"],
        "C2": ["A1", "B4"],
    }
    is_linked = KnightSwapLogic.is_connected(ring)
    assert is_linked, "Known connected board should be identified as connected"
|
||||
|
||||
|
||||
def test_valid_knight_moves():
    """Every edge of the first generated board must be a valid knight move."""
    dataset = KnightSwapDataset(KnightSwapConfig(min_nodes=6, max_nodes=6))
    adjacency = dataset[0]["metadata"]["board"]

    # Walk every (square, neighbour) pair of the adjacency map.
    for origin in adjacency:
        for target in adjacency[origin]:
            assert KnightSwapLogic.is_knight_move(origin, target)
|
||||
|
||||
|
||||
def test_knight_move_validation():
    """Check is_knight_move on a few hand-picked legal and illegal pairs."""
    legal_pairs = [
        ("A1", "B3"),  # Valid move
        ("B3", "A1"),  # Valid move reverse
    ]
    illegal_pairs = [
        ("A1", "A2"),  # Invalid move
        ("A1", "B2"),  # Invalid move
    ]

    for src, dst in legal_pairs:
        assert KnightSwapLogic.is_knight_move(src, dst)
    for src, dst in illegal_pairs:
        assert not KnightSwapLogic.is_knight_move(src, dst)
|
||||
|
||||
|
||||
def test_simple_solvable_puzzle():
    """Test a minimal solvable puzzle with one piece each.

    Looks through the first few generated puzzles for one flagged as
    possible. Its answer must not be "No" and must parse as a list.
    The test fails if none of the inspected puzzles is solvable.
    """
    # Local import keeps this edit self-contained.
    import ast

    config = KnightSwapConfig(min_nodes=6, max_nodes=6, min_pieces=1, max_pieces=1, impossible_ratio=0.0)
    dataset = KnightSwapDataset(config)

    # Try a few times since generation is random.
    for i in range(5):
        puzzle = dataset[i]
        if puzzle["metadata"]["is_possible"]:
            assert puzzle["answer"] != "No"
            # literal_eval parses the list literal without executing code,
            # unlike eval(), so a malformed answer cannot run arbitrary code.
            assert isinstance(ast.literal_eval(puzzle["answer"]), list)
            return

    pytest.fail("Could not find a solvable puzzle")
|
||||
|
||||
|
||||
def test_impossible_puzzle():
    """With impossible_ratio=1.0 the first puzzle must be flagged impossible."""
    settings = KnightSwapConfig(
        min_nodes=6,
        max_nodes=6,
        min_pieces=2,
        max_pieces=2,
        impossible_ratio=1.0,
    )
    first_puzzle = KnightSwapDataset(settings)[0]

    # The metadata flag and the textual answer must agree.
    assert first_puzzle["metadata"]["is_possible"] is False
    assert first_puzzle["answer"] == "No"
|
||||
|
||||
|
||||
def test_alternating_turns():
    """Test that solutions follow alternating turns rule.

    Takes the first solvable puzzle among the first few generated. Starting
    from the recorded start turn, each move's colour prefix (the text before
    the first comma) must match the side to move, alternating "w" and "B".
    The test fails if none of the inspected puzzles is solvable.
    """
    # Local import keeps this edit self-contained.
    import ast

    config = KnightSwapConfig(impossible_ratio=0.0)
    dataset = KnightSwapDataset(config)

    # Find a solvable puzzle
    for i in range(5):
        puzzle = dataset[i]
        if puzzle["metadata"]["is_possible"]:
            # literal_eval parses the move list without executing code,
            # unlike eval().
            moves = ast.literal_eval(puzzle["answer"])
            current_turn = puzzle["metadata"]["start_turn"]
            for move in moves:
                color = move.split(",")[0]
                assert color == current_turn
                current_turn = "B" if current_turn == "w" else "w"
            return

    pytest.fail("Could not find a solvable puzzle")
|
||||
|
||||
|
||||
def test_solution_validation():
    """The last recorded board state must have the two colours swapped.

    Uses the first solvable puzzle among the first few generated and fails
    if none of them is solvable.
    """

    def squares_of(placement, colour):
        # Set of positions whose piece equals the given colour marker.
        return {pos for pos, piece in placement.items() if piece == colour}

    dataset = KnightSwapDataset(KnightSwapConfig(impossible_ratio=0.0))

    for idx in range(5):
        puzzle = dataset[idx]
        details = puzzle["metadata"]
        if not details["is_possible"]:
            continue

        # Starting squares per colour.
        start_white = squares_of(details["pieces"], "w")
        start_black = squares_of(details["pieces"], "B")

        # Squares per colour in the last board state.
        last_state = details["board_states"][-1]
        end_white = squares_of(last_state, "w")
        end_black = squares_of(last_state, "B")

        # White must end where black began, and vice versa.
        assert end_white == start_black
        assert end_black == start_white
        return

    pytest.fail("Could not find a solvable puzzle")
|
||||
|
||||
|
||||
def test_score_calculation():
    """Check score_answer for missing, empty, unparseable and correct answers."""
    dataset = KnightSwapDataset(KnightSwapConfig())
    sample = dataset[0]

    # Invalid answers, paired with the score each is expected to receive.
    invalid_cases = [
        (None, 0.0),
        ("", 0.01),
        ("Invalid", 0.01),
    ]
    for candidate, expected in invalid_cases:
        assert dataset.score_answer(candidate, sample) == expected

    # The puzzle's own answer earns the full score.
    assert dataset.score_answer(sample["answer"], sample) == 1.0
|
||||
|
|
@ -72,21 +72,20 @@ def test_score_answer():
|
|||
|
||||
for item in dataset:
|
||||
correct_answer = item["answer"]
|
||||
metadata = item["metadata"]
|
||||
|
||||
# Correct answer should score 1.0
|
||||
assert dataset.score_answer(correct_answer, metadata) == 1.0
|
||||
assert dataset.score_answer(correct_answer, entry=item) == 1.0
|
||||
|
||||
# Incorrect answer (palindrome, but not correct one) should score 0.05
|
||||
pal_letters = "racecar" if "racecar" != correct_answer else "aba"
|
||||
assert dataset.score_answer(pal_letters, metadata) == 0.05
|
||||
assert dataset.score_answer(pal_letters, entry=item) == 0.05
|
||||
|
||||
# Incorrect answer (not palindrome) should score 0.02
|
||||
wrong_letters = "abcd" if "abcd" != correct_answer else "efgh"
|
||||
assert dataset.score_answer(wrong_letters, metadata) == 0.02
|
||||
assert dataset.score_answer(wrong_letters, entry=item) == 0.02
|
||||
|
||||
# Empty String input should score 0.01
|
||||
assert dataset.score_answer("", metadata) == 0.01
|
||||
assert dataset.score_answer("", entry=item) == 0.01
|
||||
|
||||
# Empty input should score 0.0
|
||||
assert dataset.score_answer(None, metadata) == 0.0
|
||||
assert dataset.score_answer(None, entry=item) == 0.0
|
||||
|
|
|
|||
|
|
@ -137,10 +137,10 @@ def test_score_function():
|
|||
seed=42,
|
||||
)
|
||||
|
||||
assert ds.score_answer(None, ds[0]["metadata"]) == 0.00
|
||||
assert ds.score_answer("6*x**4 + 9*x**3 - 6*x**2 - 39*x - 45", ds[0]["metadata"]) == 1
|
||||
assert ds.score_answer("Not a polynomial", ds[0]["metadata"]) == 0.01
|
||||
assert ds.score_answer("x**4", ds[0]["metadata"]) == 0.05
|
||||
assert ds.score_answer(None, ds[0]) == 0.00
|
||||
assert ds.score_answer("6*x**4 + 9*x**3 - 6*x**2 - 39*x - 45", ds[0]) == 1
|
||||
assert ds.score_answer("Not a polynomial", ds[0]) == 0.01
|
||||
assert ds.score_answer("x**4", ds[0]) == 0.05
|
||||
|
||||
|
||||
def test_multivariate_score_function():
|
||||
|
|
@ -160,7 +160,7 @@ def test_multivariate_score_function():
|
|||
seed=42,
|
||||
)
|
||||
|
||||
assert ds.score_answer(None, ds[0]["metadata"]) == 0.00
|
||||
assert ds.score_answer("-27*a**3*c - 27*a**3 + 144*a*c + 144*a", ds[0]["metadata"]) == 1
|
||||
assert ds.score_answer("Not a polynomial", ds[0]["metadata"]) == 0.01
|
||||
assert ds.score_answer("x**4", ds[0]["metadata"]) == 0.05
|
||||
assert ds.score_answer(None, ds[0]) == 0.00
|
||||
assert ds.score_answer("-27*a**3*c - 27*a**3 + 144*a*c + 144*a", ds[0]) == 1
|
||||
assert ds.score_answer("Not a polynomial", ds[0]) == 0.01
|
||||
assert ds.score_answer("x**4", ds[0]) == 0.05
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ def test_rearc_solution_validation():
|
|||
for item in dataset:
|
||||
# Test correct solution
|
||||
correct = format_board(item["metadata"]["output"], dataset.board_format_opts)
|
||||
assert dataset.score_answer(correct, item["metadata"]) == 1.0
|
||||
assert dataset.score_answer(correct, entry=item) == 1.0
|
||||
|
||||
# Test invalid format
|
||||
invalid_grid = """
|
||||
|
|
@ -63,10 +63,10 @@ def test_rearc_solution_validation():
|
|||
7 8 7
|
||||
0 0 0
|
||||
"""
|
||||
assert dataset.score_answer(invalid_grid, item["metadata"]) == 0.05
|
||||
assert dataset.score_answer(invalid_grid, entry=item) == 0.05
|
||||
|
||||
# Test empty answer
|
||||
assert dataset.score_answer(None, item["metadata"]) == 0.0
|
||||
assert dataset.score_answer(None, entry=item) == 0.0
|
||||
|
||||
|
||||
def test_rearc_scoring_edge_cases():
|
||||
|
|
@ -77,11 +77,11 @@ def test_rearc_scoring_edge_cases():
|
|||
for item in dataset:
|
||||
# Partial match
|
||||
partial = format_board([[0, 0], [0, 0]], dataset.board_format_opts)
|
||||
assert 0.0 < dataset.score_answer(partial, item["metadata"]) < 1.0
|
||||
assert 0.0 < dataset.score_answer(partial, entry=item) < 1.0
|
||||
|
||||
# Malformed answer
|
||||
assert dataset.score_answer("[[invalid", item["metadata"]) == 0.01
|
||||
assert dataset.score_answer("[[invalid", entry=item) == 0.01
|
||||
|
||||
# Case sensitivity
|
||||
answer = format_board(item["metadata"]["output"], dataset.board_format_opts).lower()
|
||||
assert dataset.score_answer(answer, item["metadata"]) == 1.0
|
||||
assert dataset.score_answer(answer, entry=item) == 1.0
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ def test_verify_answer():
|
|||
dataset = SimpleIntegrationDataset(config)
|
||||
for i in range(len(dataset)):
|
||||
item = dataset[i]
|
||||
score = dataset.score_answer(item["answer"], item["metadata"])
|
||||
score = dataset.score_answer(item["answer"], item)
|
||||
assert score == 1.0
|
||||
|
||||
|
||||
|
|
@ -113,5 +113,6 @@ def test_score_answer_cases():
|
|||
]
|
||||
|
||||
for answer, metadata, expected in test_cases:
|
||||
score = dataset.score_answer(answer, metadata)
|
||||
dummy_entry = {"metadata": metadata}
|
||||
score = dataset.score_answer(answer=answer, entry=dummy_entry)
|
||||
assert score == expected, f"Failed case: {answer} | Expected {expected}, got {score}"
|
||||
|
|
|
|||
|
|
@ -245,27 +245,26 @@ def test_score_answer():
|
|||
dataset = HanoiDataset(config)
|
||||
# Pick one instance from the dataset for testing.
|
||||
item = dataset[0]
|
||||
metadata = item["metadata"]
|
||||
correct_answer = item["answer"]
|
||||
|
||||
# 1. Correct answer should yield full reward.
|
||||
score_correct = dataset.score_answer(answer=correct_answer, metadata=metadata)
|
||||
score_correct = dataset.score_answer(answer=correct_answer, entry=item)
|
||||
assert score_correct == 1.0, f"Correct answer score {score_correct} is not 1.0."
|
||||
|
||||
# 2. A badly formatted answer should yield minimal reward (0.01).
|
||||
score_bad_format = dataset.score_answer(answer="a wrong solution", metadata=metadata)
|
||||
score_bad_format = dataset.score_answer(answer="a wrong solution", entry=item)
|
||||
assert score_bad_format == 0.01, f"Badly formatted answer score {score_bad_format} is not 0.01."
|
||||
|
||||
# 3. An answer that is validly formatted but unsolved.
|
||||
# For example, remove the last move from the correct answer.
|
||||
unfinished_answer = correct_answer[:-1]
|
||||
score_unsolved = dataset.score_answer(answer=unfinished_answer, metadata=metadata)
|
||||
score_unsolved = dataset.score_answer(answer=unfinished_answer, entry=item)
|
||||
assert score_unsolved == 0.05, f"Unsolved answer score {score_unsolved} is not 0.05."
|
||||
|
||||
# 4. An empty answer should yield 0.01.
|
||||
score_empty = dataset.score_answer(answer="", metadata=metadata)
|
||||
score_empty = dataset.score_answer(answer="", entry=item)
|
||||
assert score_empty == 0.01, f"Empty answer score {score_empty} is not 0.01."
|
||||
|
||||
# 5. A None answer should yield 0.0.
|
||||
score_none = dataset.score_answer(answer=None, metadata=metadata)
|
||||
score_none = dataset.score_answer(answer=None, entry=item)
|
||||
assert score_none == 0.0, f"None answer score {score_none} is not 0.0."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue