diff --git a/GALLERY.md b/GALLERY.md index 8a14d81a..2bd381fa 100644 --- a/GALLERY.md +++ b/GALLERY.md @@ -2,6 +2,7 @@ This gallery shows examples from all available datasets using their default configurations. ## Available Datasets +- [ab](#ab) - [advanced_geometry](#advanced_geometry) - [aiw](#aiw) - [arc_1d](#arc_1d) @@ -16,8 +17,10 @@ This gallery shows examples from all available datasets using their default conf - [color_cube_rotation](#color_cube_rotation) - [complex_arithmetic](#complex_arithmetic) - [count_bits](#count_bits) +- [count_primes](#count_primes) - [countdown](#countdown) - [course_schedule](#course_schedule) +- [dice](#dice) - [family_relationships](#family_relationships) - [figlet_font](#figlet_font) - [fraction_simplification](#fraction_simplification) @@ -49,6 +52,7 @@ This gallery shows examples from all available datasets using their default conf - [quantum_lock](#quantum_lock) - [ransom_note](#ransom_note) - [rearc](#rearc) +- [rectangle_count](#rectangle_count) - [rotate_matrix](#rotate_matrix) - [rubiks_cube](#rubiks_cube) - [self_reference](#self_reference) @@ -70,6 +74,131 @@ This gallery shows examples from all available datasets using their default conf - [zebra_puzzles](#zebra_puzzles) ## Dataset Examples +### ab +Generates A::B tasks, as described by @VictorTaelin [here](https://x.com/VictorTaelin/status/1776096481704804789) + +Default configuration: +```python +seed = 42 +size = 500 +length = 10 +``` + +Example tasks: +```` +Example 1: +Question: A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`. + +An A::B program is a sequence of tokens. Example: + + B# A# #B #A B# + +To *compute* a program, we must rewrite neighbor tokens, using the rules: + + A# #A ... becomes ... nothing + A# #B ... becomes ... #B A# + B# #A ... becomes ... #A B# + B# #B ... becomes ... nothing + +In other words, whenever two neighbor tokens have their '#' facing each-other, +they must be rewritten according to the corresponding rule. 
For example, the +first example shown here is computed as: + + B# A# #B #A B# = + B# #B A# #A B# = + A# #A B# = + B# + +The steps were: +1. We replaced `A# #B` by `#B A#`. +2. We replaced `B# #B` by nothing. +3. We replaced `A# #A` by nothing. +The final result was just `B#`. + +Now, consider the following program: + +A# A# #A B# B# B# A# A# #B A# + +Return the final state of the program. + +Answer: A# B# B# A# A# A# + +Example 2: +Question: A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`. + +An A::B program is a sequence of tokens. Example: + + B# A# #B #A B# + +To *compute* a program, we must rewrite neighbor tokens, using the rules: + + A# #A ... becomes ... nothing + A# #B ... becomes ... #B A# + B# #A ... becomes ... #A B# + B# #B ... becomes ... nothing + +In other words, whenever two neighbor tokens have their '#' facing each-other, +they must be rewritten according to the corresponding rule. For example, the +first example shown here is computed as: + + B# A# #B #A B# = + B# #B A# #A B# = + A# #A B# = + B# + +The steps were: +1. We replaced `A# #B` by `#B A#`. +2. We replaced `B# #B` by nothing. +3. We replaced `A# #A` by nothing. +The final result was just `B#`. + +Now, consider the following program: + +A# #A B# #B #A A# #B #B A# #B + +Return the final state of the program. + +Answer: #A #B #B #B A# A# + +Example 3: +Question: A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`. + +An A::B program is a sequence of tokens. Example: + + B# A# #B #A B# + +To *compute* a program, we must rewrite neighbor tokens, using the rules: + + A# #A ... becomes ... nothing + A# #B ... becomes ... #B A# + B# #A ... becomes ... #A B# + B# #B ... becomes ... nothing + +In other words, whenever two neighbor tokens have their '#' facing each-other, +they must be rewritten according to the corresponding rule. For example, the +first example shown here is computed as: + + B# A# #B #A B# = + B# #B A# #A B# = + A# #A B# = + B# + +The steps were: +1. 
We replaced `A# #B` by `#B A#`. +2. We replaced `B# #B` by nothing. +3. We replaced `A# #A` by nothing. +The final result was just `B#`. + +Now, consider the following program: + +#B A# B# #B B# #A A# B# A# A# + +Return the final state of the program. + +Answer: #B B# A# B# A# A# + +```` + ### advanced_geometry A dataset for advanced geometry tasks using coordinate geometry. @@ -1073,6 +1202,35 @@ Metadata: {'number': 877324117, 'solution': 16, 'binary': '110100010010101110011 ```` +### count_primes +Generates Count Primes exercises with configurable difficulty + +Default configuration: +```python +max_n = 10000 +size = 500 +seed = 42 +``` + +Example tasks: +```` +Example 1: +Question: Count how many prime numbers there are between 1825 and 2029 (inclusive) ? +Answer: 27 +Metadata: {'start': 1825, 'end': 2029, 'primes': [False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, 
False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True], 'solution': 27} + +Example 2: +Question: Count how many prime numbers there are between 632 and 5319 (inclusive) ? +Answer: 589 +Metadata: {'start': 632, 'end': 5319, 'primes': [False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, 
False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, 
True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, 
False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, 
False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, 
False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, 
False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, 
False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, 
True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, 
True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, 
False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, 
True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, 
False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, 
True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, 
True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, 
False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, 
False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False], 'solution': 589} + +Example 3: +Question: Count how many prime numbers there are between 6694 and 8824 (inclusive) ? +Answer: 236 +Metadata: {'start': 6694, 'end': 8824, 'primes': [False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 
False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 
False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, 
False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, 
False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, 
False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, 
False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, 
False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False], 'solution': 236} + +```` + ### countdown Generates Countdown Number Game tasks @@ -1163,6 +1321,33 @@ Metadata: {'courses': [2, 1, 4, 0, 3], 'prerequisites': [], 'solution': True, 's ```` +### dice +Generates Dice-based puzzles with configurable parameters + +Default configuration: +```python +num_dice = 4 +max_dice_size = 20 +seed = 42 +size = 500 +``` + +Example tasks: +```` +Example 1: +Question: I have 
these dice: 1d20, 1d10, 1d5, 1d2. What are the odds of rolling 18 or higher? (Assume that all dice are rolled at once, and that '1d6' represents one roll of a 6-sided dice.) Please respond with a reduced fraction representing the probability [ex., 1/60]. +Answer: 13/20 + +Example 2: +Question: I have these dice: 1d20, 1d11, 1d6, 1d3. What are the odds of rolling 23 or higher? (Assume that all dice are rolled at once, and that '1d6' represents one roll of a 6-sided dice.) Please respond with a reduced fraction representing the probability [ex., 1/60]. +Answer: 19/40 + +Example 3: +Question: I have these dice: 1d20, 1d19, 1d18, 1d15. What are the odds of rolling 48 or higher? (Assume that all dice are rolled at once, and that '1d6' represents one roll of a 6-sided dice.) Please respond with a reduced fraction representing the probability [ex., 1/60]. +Answer: 9677/51300 + +```` + ### family_relationships Generates family relationship reasoning tasks @@ -2541,12 +2726,12 @@ Answer: [0.0] Metadata: {'polynomial_expr': '-127*u', 'variable': 'u', 'degree': 1, 'real_solutions': [0.0]} Example 2: -Question: Determine the real value(s) of b tha satisfies: 86*b**2 - 2*b - 13 = 0 +Question: Determine the real value(s) of b that satisfies: 86*b**2 - 2*b - 13 = 0 Answer: [-0.3773425275273891, 0.4005983414808775] Metadata: {'polynomial_expr': '86*b**2 - 2*b - 13', 'variable': 'b', 'degree': 2, 'real_solutions': [-0.3773425275273891, 0.4005983414808775]} Example 3: -Question: Determine the real value(s) of n tha satisfies: 71*n**3 - 2*n - 29 = 0 +Question: Determine the real value(s) of n that satisfies: 71*n**3 - 2*n - 29 = 0 Answer: [0.7546129960163634] Metadata: {'polynomial_expr': '71*n**3 - 2*n - 29', 'variable': 'n', 'degree': 3, 'real_solutions': [0.7546129960163634]} @@ -3082,6 +3267,280 @@ Metadata: {'input': ((1, 1, 1, 1, 1), (1, 1, 1, 1, 1)), 'output': ((1, 1, 1, 1, ```` +### rectangle_count +Generates [RectangleCount 
Puzzles](https://en.wikipedia.org/wiki/RectangleCount_Puzzle) with configurable parameters + +Default configuration: +```python +max_rectangles = 10 +width = 80 +height = 80 +seed = 42 +size = 500 +``` + +Example tasks: +```` +Example 1: +Question: How many rectangles do you see? Single rectangles are outlined with a '#', overlapping rectangles (max 2) are shown with '█'. + + + + + + + + + + + + + + + ################################################## + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + ################################################## + + + + + ###################################### + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + ###################################### + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Answer: 2 + +Example 2: +Question: How many rectangles do you see? Single rectangles are outlined with a '#', overlapping rectangles (max 2) are shown with '█'. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ############ + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + ############ + + + + + + + + +Answer: 1 + +Example 3: +Question: How many rectangles do you see? Single rectangles are outlined with a '#', overlapping rectangles (max 2) are shown with '█'. 
+ + + + + + + + + + + ######################### + # # + # # + # # + # # + # ############ + # ## # + # ## # + # ## # + # ## # + # ## # + #####█#######################██#########█# + # # ## ## + # # ## ## + # # ## ## + # # ## ## + # # ## ## + # # ## ## + # # ## ## + # # ## ## + #####█#######################██#########█# + # ## # + # ## # + # ## # + # ## # + # ## # + # ## # + # ## # + # ########## ## # + # # # ############ + # # # # + # ########## # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + # # + ######################### + + + + + + + + + + + ####################### + # # + # # + # # + # # + # # + # # + # ######█### + # # # # + # ######█### + # # ########################### + # # # # + # # # # + ####################### ########################### + + +Answer: 7 + +```` + ### rotate_matrix Generates Rotate Matrix exercises with configurable difficulty @@ -3794,7 +4253,7 @@ Metadata: {'task_type': 'datetime_tz', 'start_time': datetime.datetime(2964, 6, Example 2: Question: A video call started at 09:44 and ended at 12:22. How long was the call? Answer in HH:MM. Answer: 02:38 -Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 10, 9, 44), 'end_time': datetime.datetime(2025, 2, 10, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} +Metadata: {'task_type': 'time', 'start_time': datetime.datetime(2025, 2, 11, 9, 44), 'end_time': datetime.datetime(2025, 2, 11, 12, 22), 'format': '%H:%M', 'expected_format': 'HH:MM'} Example 3: Question: Calculate the time difference between Sat Dec 22 2677 and Thu Mar 21 2678. Express the result in D days. 
diff --git a/eval/eval.py b/eval/eval.py index d2e24555..53571dd0 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -2,6 +2,7 @@ import argparse import asyncio import json import os +import re import time from datetime import datetime from typing import Any, Dict, List @@ -10,6 +11,7 @@ from openai import AsyncOpenAI from tqdm.asyncio import tqdm_asyncio from reasoning_gym.factory import create_dataset +from reasoning_gym.utils import SYSTEM_PROMPTS class AsyncOpenRouterEvaluator: @@ -25,22 +27,33 @@ class AsyncOpenRouterEvaluator: async with self.semaphore: try: completion = await self.client.chat.completions.create( - extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}] + extra_headers=self.extra_headers, + model=self.model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPTS["default"]}, + {"role": "user", "content": prompt}, + ], ) return completion.choices[0].message.content except Exception as e: print(f"Error calling OpenRouter API: {str(e)}") raise + def parse_model_response(self, response: str) -> str: + """Gather the final answer between the and tags.""" + match = re.search(r"(.*?)", response, re.DOTALL) + return match.group(1).strip() if match else response + async def process_single_question(self, entry: Dict, dataset) -> Dict: """Process a single question and return the result.""" response = await self.get_model_response(entry["question"]) - score = dataset.score_answer(answer=response, entry=entry) + answer = self.parse_model_response(response) + score = dataset.score_answer(answer=answer, entry=entry) return { "question": entry["question"], "expected_answer": entry["answer"], - "model_answer": response, + "model_answer": answer, "score": score, "metadata": entry["metadata"], } diff --git a/eval/results/summary_openai_o1_20250212_103017.json b/eval/results/summary_openai_o1_20250212_103017.json new file mode 100644 index 00000000..cf74a09e --- /dev/null +++ 
b/eval/results/summary_openai_o1_20250212_103017.json @@ -0,0 +1,61 @@ +[ + { + "dataset_name": "letter_counting", + "model": "openai/o1", + "average_score": 0.99, + "total_examples": 50, + "timestamp": "2025-02-12T10:26:39.897674", + "config": { + "min_words": 5, + "max_words": 15, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "openai/o1", + "average_score": 0.010000000000000004, + "total_examples": 50, + "timestamp": "2025-02-12T10:27:45.054740", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "openai/o1", + "average_score": 0.802, + "total_examples": 50, + "timestamp": "2025-02-12T10:28:06.199253", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "group_anagrams", + "model": "openai/o1", + "average_score": 0.94, + "total_examples": 50, + "timestamp": "2025-02-12T10:30:02.084562", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "spell_backward", + "model": "openai/o1", + "average_score": 0.9802000000000001, + "total_examples": 50, + "timestamp": "2025-02-12T10:30:17.839014", + "config": { + "size": 50, + "seed": 42 + } + } +] diff --git a/pyproject.toml b/pyproject.toml index eb0bbc4c..85432edf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "reasoning_gym" -version = "0.1.6" +version = "0.1.7" authors = [ { name = "Open-Thought community", email = "andreas.koepf@xamla.com" }, ] diff --git a/reasoning_gym/__init__.py b/reasoning_gym/__init__.py index 37783e4e..c267a7ff 100644 --- a/reasoning_gym/__init__.py +++ b/reasoning_gym/__init__.py @@ -5,7 +5,7 @@ Reasoning Gym - A library of procedural dataset generators for training reasonin from . 
from dataclasses import dataclass
from random import Random
from typing import Any, Dict, Optional

from ..factory import ProceduralDataset, register_dataset


def generate_program(length, rng):
    """Generate a random A::B program of `length` tokens."""
    elements = ["A#", "B#", "#A", "#B"]
    return [rng.choice(elements) for _ in range(length)]


def compute_steps(program, max_steps=100):
    """Compute the rewrite steps of an A::B program.

    Repeatedly applies the leftmost applicable rewrite rule, recording every
    intermediate state. Detects non-termination by repeated states or by
    exceeding `max_steps`.

    Args:
        program: list of tokens ("A#", "B#", "#A", "#B").
        max_steps: bound after which the program is assumed non-halting.

    Returns:
        (steps, non_halting): `steps` is the list of program states, the
        initial state first; `non_halting` is True when a state repeated or
        `max_steps` was reached.
    """
    steps = [program.copy()]
    seen_states = {tuple(program)}

    for _ in range(max_steps):
        current = steps[-1]
        new_program = None

        # Apply the leftmost applicable rewrite rule.
        for i in range(len(current) - 1):
            a, b = current[i], current[i + 1]
            if a == "A#" and b == "#A":
                new_program = current[:i] + current[i + 2 :]
            elif a == "A#" and b == "#B":
                new_program = current[:i] + ["#B", "A#"] + current[i + 2 :]
            elif a == "B#" and b == "#A":
                new_program = current[:i] + ["#A", "B#"] + current[i + 2 :]
            elif a == "B#" and b == "#B":
                new_program = current[:i] + current[i + 2 :]

            if new_program is not None:
                break

        if new_program is None:
            # No more transformations possible: the program halted.
            return steps, False

        if tuple(new_program) in seen_states:
            # Detected a loop, meaning non-halting behavior.
            return steps, True

        steps.append(new_program)
        seen_states.add(tuple(new_program))

    return steps, True  # Reached max steps, assume non-halting


@dataclass
class ABConfig:
    """Configuration for A::B task generation"""

    seed: Optional[int] = None
    size: int = 500
    length: int = 10

    def validate(self) -> None:
        """Validate configuration parameters"""
        assert self.length > 0, "length must be greater than 0"
        assert self.size > 0, "size must be greater than 0"


class ABDataset(ProceduralDataset):
    """Generates A::B tasks, as described by @VictorTaelin [here](https://x.com/VictorTaelin/status/1776096481704804789)"""

    def __init__(self, config: ABConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single AB task

        Returns:
            dict with keys:
                - question: str, the task description with AB program
                - answer: str, the final state after executing the A::B program
                - metadata: dict with generation parameters
        """
        rng = Random(self.seed + idx)

        # Re-sample until we get a program that provably halts, so that a
        # unique final state exists to use as the answer.
        while True:
            initial_program = generate_program(self.config.length, rng)
            steps, non_halting = compute_steps(initial_program)
            if not non_halting:
                break

        # Via:
        # https://x.com/VictorTaelin/status/1776248021858111542
        # https://gist.github.com/VictorTaelin/e514844f4df9e5f182b28e5a07e44b17
        prompt = f"""A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`.

An A::B program is a sequence of tokens. Example:

    B# A# #B #A B#

To *compute* a program, we must rewrite neighbor tokens, using the rules:

    A# #A ... becomes ... nothing
    A# #B ... becomes ... #B A#
    B# #A ... becomes ... #A B#
    B# #B ... becomes ... nothing

In other words, whenever two neighbor tokens have their '#' facing each-other,
they must be rewritten according to the corresponding rule. For example, the
first example shown here is computed as:

    B# A# #B #A B# =
    B# #B A# #A B# =
    A# #A B# =
    B#

The steps were:
1. We replaced `A# #B` by `#B A#`.
2. We replaced `B# #B` by nothing.
3. We replaced `A# #A` by nothing.
The final result was just `B#`.

Now, consider the following program:

{' '.join(initial_program)}

Return the final state of the program.
"""

        return {
            "question": prompt,
            "answer": " ".join(steps[-1]),
            "metadata": {},
        }

    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
        """Determine if the solution provided solves the AB task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
            entry (Dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
        """
        if answer is None:
            return 0.0
        # A small non-zero score keeps RL reward signals from collapsing to 0.
        if answer != entry["answer"]:
            return 0.01
        return 1.0  # Yay


# Register the dataset
register_dataset("ab", ABDataset, ABConfig)
"""Count prime numbers in a given interval.

Solution obtained with Sieve of Eratosthenes:
https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
"""

import math
from dataclasses import dataclass
from random import Random
from typing import Optional

from ..factory import ProceduralDataset, register_dataset

QUESTION_TEMPLATE = """Count how many prime numbers there are between {start} and {end} (inclusive) ?"""


@dataclass
class CountPrimesConfig:
    """Configuration for Count Primes dataset generation"""

    max_n: int = 10_000  # Upper bound for the interval

    size: int = 500  # Virtual dataset size
    seed: Optional[int] = None

    def validate(self):
        """Validate configuration parameters"""
        assert 1 <= self.max_n, "max_n must be at least 1"


class CountPrimesDataset(ProceduralDataset):
    """Generates Count Primes exercises with configurable difficulty"""

    def __init__(self, config: CountPrimesConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)
        # Sieve once up front; all items reuse this table.
        self.primes = self._get_primes(config.max_n + 1)

    def _get_primes(self, n: int) -> list[bool]:
        """Sieve of Eratosthenes: result[i] is True iff i is prime, for 0 <= i < n."""
        if n <= 1:
            return []
        primes = [True] * n
        primes[0] = primes[1] = False
        # math.isqrt avoids float rounding; loop bound is floor(sqrt(n)).
        for i in range(2, math.isqrt(n) + 1):
            if primes[i]:
                # Multiples below i*i were already crossed out by smaller primes.
                for j in range(i * i, n, i):
                    primes[j] = False
        return primes

    def __getitem__(self, idx: int) -> dict:
        """Generate a single Count Primes question"""
        rng = Random(self.seed + idx)
        start = rng.randint(1, self.config.max_n)
        end = rng.randint(start, self.config.max_n)
        primes = self.primes[start : end + 1]
        answer = sum(primes)  # True counts as 1
        return {
            "question": QUESTION_TEMPLATE.format(start=start, end=end),
            "answer": str(answer),
            "metadata": {"start": start, "end": end, "primes": primes, "solution": answer},
        }


register_dataset("count_primes", CountPrimesDataset, CountPrimesConfig)
b/reasoning_gym/algorithmic/rotate_matrix.py @@ -60,22 +60,16 @@ class RotateMatrixDataset(ProceduralDataset): matrix = [numbers[i * n : (i + 1) * n] for i in range(n)] return matrix + def _rot90(self, matrix: list[list[int]]) -> list[list[int]]: + """quarter clockwise rotation""" + return [list(row) for row in zip(*matrix[::-1])] + def _get_rotated(self, matrix: list[list[int]], num_rotations: int) -> list[list[int]]: """Rotate the matrix K times by 90 degrees clockwise""" num_rotations %= 4 - n = len(matrix) output = deepcopy(matrix) - for _ in range(num_rotations): - for l in range(n // 2): - for i in range(l, n - 1 - l): - (output[l][i], output[i][n - 1 - l], output[n - 1 - l][n - 1 - i], output[n - 1 - i][l]) = ( - output[n - 1 - i][l], - output[l][i], - output[i][n - 1 - l], - output[n - 1 - l][n - 1 - i], - ) - + output = self._rot90(output) return output def _matrix_to_str(self, matrix: list[list[int]]) -> str: diff --git a/reasoning_gym/arithmetic/__init__.py b/reasoning_gym/arithmetic/__init__.py index 05d321da..cc94bf56 100644 --- a/reasoning_gym/arithmetic/__init__.py +++ b/reasoning_gym/arithmetic/__init__.py @@ -6,6 +6,7 @@ from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConf from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset from .chain_sum import ChainSum, ChainSumConfig from .count_bits import CountBitsConfig, CountBitsDataset +from .dice import DiceConfig, DiceDataset from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset from .gcd import GCDConfig, GCDDataset from .gsm_symbolic.gsm_symbolic import GSMSymbolicDataset, GSMSymbolicDatasetConfig @@ -38,4 +39,6 @@ __all__ = [ "TimeIntervalsDataset", "CountBitsConfig", "CountBitsDataset", + "DiceConfig", + "DiceDataset", ] diff --git a/reasoning_gym/arithmetic/chain_sum.py b/reasoning_gym/arithmetic/chain_sum.py index 30dcb0c4..01d387a6 100644 --- a/reasoning_gym/arithmetic/chain_sum.py +++ 
class ChainSumCurriculum(BaseCurriculum):
    """Curriculum for chain_sum: difficulty scales via term count and digit width."""

    def __init__(self):
        super().__init__(ChainSumCurriculum.__name__, ChainSumConfig)

        # Each attribute maps one difficulty axis onto a (min, max) config pair.
        terms = RangeAttributeDefinition(
            name="num_terms",
            levels=[2, 3, 4, 5],
            default_level=0,  # Start with 2 terms
            description="Maximum number of terms in the expression",
            attr_type=AttributeType.APPEND,
            min_value=2,  # Ensure at least 2 terms
            lower_field_name="min_terms",
            upper_field_name="max_terms",
        )
        digits = RangeAttributeDefinition(
            name="num_digits",
            levels=[1, 2, 4, 10],
            default_level=0,  # Start with 1-digit numbers
            description="Number of digits in each operand",
            attr_type=AttributeType.APPEND,
            min_value=1,  # Ensure numbers are at least 1 digit
            lower_field_name="min_digits",
            upper_field_name="max_digits",
        )
        self._define_attributes((terms, digits))


# Register the dataset
register_dataset("chain_sum", ChainSum, ChainSumConfig)
from dataclasses import dataclass
from functools import reduce
from math import gcd
from random import Random
from typing import Any, Dict, Optional

from ..factory import ProceduralDataset, register_dataset


def compute_probability(dice, target):
    """
    Compute the probability of rolling a total of at least `target` when
    rolling the dice in `dice` (each element is that die's number of sides).
    Uses dynamic programming over partial sums.

    Returns:
        ((numerator, denominator), probability): the reduced fraction and
        its float value.
    """
    # dp[s] = number of ways to reach sum s with the dice processed so far.
    dp = {0: 1}
    for sides in dice:
        new_dp = {}
        for current_sum, count in dp.items():
            # Each die contributes a face value from 1 to `sides`.
            for face in range(1, sides + 1):
                new_sum = current_sum + face
                new_dp[new_sum] = new_dp.get(new_sum, 0) + count
        dp = new_dp

    total_outcomes = reduce(lambda a, b: a * b, dice, 1)
    ways = sum(count for s, count in dp.items() if s >= target)

    # Reduce the fraction ways / total_outcomes (gcd(0, d) == d, so 0 -> 0/1).
    common = gcd(ways, total_outcomes)
    frac = (ways // common, total_outcomes // common)
    return frac, ways / total_outcomes


def generate_puzzle(num_dice, max_dice_size, rng):
    """
    Generate a dice puzzle:
    - One die is forced to have `max_dice_size` sides.
    - The other (num_dice - 1) dice get random sizes between 2 and
      max_dice_size - 1 (all 2-sided when max_dice_size == 2).
    - The target total is chosen roughly in the middle third of the
      achievable range so the answer is neither certain nor impossible.

    Returns:
        dict with the dice description string, the target total, and the
        reduced probability fraction as numerator/denominator.
    """
    # Guarantee one die is the maximum.
    dice = [max_dice_size]
    for _ in range(num_dice - 1):
        # Choose a die size randomly from 2 up to max_dice_size - 1.
        # (If max_dice_size == 2 then all dice are 2-sided.)
        if max_dice_size > 2:
            die = rng.randint(2, max_dice_size - 1)
        else:
            die = 2
        dice.append(die)

    # Sort dice in descending order (as is common in puzzles).
    dice.sort(reverse=True)

    # Compute minimum and maximum possible totals.
    min_total = num_dice  # each die gives at least 1
    max_total = sum(dice)

    # Choose a target total in the middle third of the range.
    low_target = min_total + (max_total - min_total) // 3
    high_target = min_total + 2 * (max_total - min_total) // 3
    target = rng.randint(low_target, high_target)

    # Compute probability.
    (num, den), prob = compute_probability(dice, target)

    # Create a string representing the dice, e.g., "1d20, 1d17, 1d6".
    dice_str = ", ".join(f"1d{s}" for s in dice)

    return {"dice_str": dice_str, "target": target, "num": num, "den": den}


@dataclass
class DiceConfig:
    """Configuration for dice puzzle generation"""

    num_dice: int = 4
    max_dice_size: int = 20
    seed: Optional[int] = None
    size: int = 500

    def validate(self):
        """Validate configuration parameters"""
        assert self.num_dice >= 1, "num_dice must be gte 1"
        assert self.max_dice_size >= 2, "max_dice_size must be gte 2"


class DiceDataset(ProceduralDataset):
    """Generates Dice-based puzzles with configurable parameters"""

    def __init__(self, config: DiceConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single Dice task

        Returns:
            dict with keys:
                - question: str, the task description
                - answer: str, a solution string
                - metadata: dict with generation parameters
        """
        rng = Random(self.seed + idx)
        puzzle = generate_puzzle(self.config.num_dice, self.config.max_dice_size, rng)
        puzzle_str = f"I have these dice: {puzzle['dice_str']}. What are the odds of rolling {puzzle['target']} or higher? (Assume that all dice are rolled at once, and that '1d6' represents one roll of a 6-sided dice.) Please respond with a reduced fraction representing the probability [ex., 1/60]."
        answer_str = f"{puzzle['num']}/{puzzle['den']}"

        return {
            "question": puzzle_str,
            "answer": answer_str,
            "metadata": {},
        }

    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
        """Determine if the solution provided solves the Dice task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
            entry (Dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
        """
        if answer is None:
            return 0.0
        # Case/newline-insensitive comparison of the "num/den" fraction.
        if answer.lower().replace("\n", "") != entry["answer"].lower().replace("\n", ""):
            return 0.01
        return 1.0  # Yay


register_dataset("dice", DiceDataset, DiceConfig)
""" + Validate that a level is valid for an attribute. + Args: + level: Level to validate + curriculum: Name of the curriculum + Raises: + ValueError: If level is invalid + """ + # TODO: if > set as [-1], if <0 set as [0] + if not 0 <= level < len(self.levels): + raise ValueError( + f"Invalid level: {level} for attribute '{curriculum}.{self.name}'. " + f"Must be between 0 and {len(self.levels)-1}" + ) + + def get_level_value(self, level: int, curriculum: str) -> Any: + """ + Get the value for an attribute at a specific level based on its type. + Args: + attr: The attribute definition + level: Level to get value for + Returns: + Value for the attribute based on its level and type + """ + if self.attr_type == AttributeType.STATIC: + return self.levels[level] + elif self.attr_type == AttributeType.UBOUND: + return self.levels[level] + elif self.attr_type == AttributeType.APPEND: + return self.levels[: level + 1] + + raise ValueError(f"Unknown attribute type: {self.attr_type} for attribute '{curriculum}.{self.name}'") + + +@dataclass(kw_only=True) +class ScalarAttributeDefinition(AttributeDefinition): + field_name: str + + +@dataclass(kw_only=True) +class RangeAttributeDefinition(AttributeDefinition): + lower_field_name: str + upper_field_name: str + + def get_level_value(self, level: int, curriculum: str) -> Any: + v = super().get_level_value(level, curriculum) + if not isinstance(v, abc.Iterable): + return [v] + return v diff --git a/reasoning_gym/coaching/base_curriculum.py b/reasoning_gym/coaching/base_curriculum.py new file mode 100644 index 00000000..8d619869 --- /dev/null +++ b/reasoning_gym/coaching/base_curriculum.py @@ -0,0 +1,108 @@ +from typing import Any, Iterable, Optional + +from ..factory import ConfigT +from .attributes import AttributeDefinition, RangeAttributeDefinition, ScalarAttributeDefinition + + +class BaseCurriculum: + def __init__(self, name: str, config_cls: ConfigT): + self.name = name + self._config_cls = config_cls + self._attributes: 
class BaseCurriculum:
    """Tracks per-attribute difficulty levels and materializes dataset configs."""

    def __init__(self, name: str, config_cls: ConfigT):
        self.name = name
        self._config_cls = config_cls
        self._attributes: dict[str, AttributeDefinition] = {}
        self._current_levels: dict[str, int] = {}

    def generate_configuration(self, defaults: Optional[dict[str, Any]] = None) -> ConfigT:
        """
        Build a config instance from `defaults` plus the current attribute levels.
        Range attributes fill their lower/upper fields with min/max of the level
        value; scalar attributes fill their single field.
        """
        config_args = defaults.copy() if defaults is not None else {}
        for attr in self._attributes.values():
            if isinstance(attr, RangeAttributeDefinition):
                vals = self.get_attr_value(attr.name)
                config_args[attr.lower_field_name] = min(vals)
                config_args[attr.upper_field_name] = max(vals)
            elif isinstance(attr, ScalarAttributeDefinition):
                config_args[attr.field_name] = self.get_attr_value(attr.name)
        # NOTE: removed leftover debug `print(config_args)`.
        return self._config_cls(**config_args)

    @property
    def attributes(self) -> dict[str, AttributeDefinition]:
        """Get the curriculum's attributes"""
        return self._attributes

    def get_attribute(self, attr_name: str) -> AttributeDefinition:
        """Look up an attribute definition, raising KeyError if unknown."""
        if attr_name not in self._attributes:
            raise KeyError(f"Attribute '{self.name}.{attr_name}' does not exist")
        return self._attributes[attr_name]

    def _define_attributes(self, attrs: Iterable[AttributeDefinition]) -> None:
        """Register attribute definitions; duplicate names are an error."""
        for attr in attrs:
            if attr.name in self.attributes:
                raise RuntimeError(f"Attribute with name {attr.name} is already defined.")
            self.attributes[attr.name] = attr

    def get_attr_level(self, attr_name: str) -> int:
        """
        Get the current level for an attribute.
        Args:
            attr_name: Name of the attribute
        Returns:
            Current level index for the attribute
        """
        attr = self.get_attribute(attr_name)
        return self._current_levels.get(attr_name, attr.default_level)

    def get_attr_value(self, attr_name: str) -> Any:
        """
        Get the current value for an attribute based on its level.
        Args:
            attr_name: Name of the attribute
        Returns:
            Current value for the attribute based on its level and type
        """
        attr = self.get_attribute(attr_name)
        level = self.get_attr_level(attr_name)
        return attr.get_level_value(level, curriculum=self.name)

    def set_attr_level(self, attr_name: str, level: int) -> None:
        """
        Set the level for an attribute.
        Args:
            attr_name: Name of the attribute
            level: New level index
        """
        attr = self.get_attribute(attr_name)
        attr.validate_level(level, curriculum=self.name)
        self._current_levels[attr_name] = level

    def increment_attr_level(self, attr_name: str) -> bool:
        """
        Increment the level of an attribute if possible.
        Args:
            attr_name: Name of the attribute to increment
        Returns:
            bool: True if level was incremented, False if already at max level
        Raises:
            KeyError: If attribute doesn't exist
        """
        attr = self.get_attribute(attr_name)
        current_level = self.get_attr_level(attr_name)
        if current_level < len(attr.levels) - 1:
            self.set_attr_level(attr_name, current_level + 1)
            return True
        return False

    def decrement_attr_level(self, attr_name: str) -> bool:
        """
        Decrement the level of an attribute if possible.
        Args:
            attr_name: Name of the attribute to decrement
        Returns:
            bool: True if level was decremented, False if already at min level
        Raises:
            KeyError: If attribute doesn't exist
        """
        current_level = self.get_attr_level(attr_name)
        if current_level > 0:
            self.set_attr_level(attr_name, current_level - 1)
            return True
        return False
from dataclasses import dataclass
from random import Random
from typing import Any, Dict, Optional

from ..factory import ProceduralDataset, register_dataset


def draw_rectangles_with_overlap(n, width, height, rng):
    """
    Place up to `n` random axis-aligned rectangle outlines on a width x height grid.

    Each cell counts how many rectangle borders cross it; a candidate rectangle
    is rejected if it would push any cell above a count of 2, so at most two
    rectangles ever overlap on any cell.

    Returns:
        (lines, count): the rendered grid as a string (' ' empty, '#' one
        border, '█' two borders) and the number of rectangles actually placed.
    """
    # Create a grid that holds a count of how many times a cell is drawn.
    grid = [[0 for _ in range(width)] for _ in range(height)]
    rectangles = []

    max_attempts = 100000  # Prevent infinite loops in case of a crowded grid
    attempts = 0

    while len(rectangles) < n and attempts < max_attempts:
        attempts += 1
        # Ensure minimum width and height of 3:
        # right must be at least left + 2, bottom at least top + 2.
        left = rng.randint(0, width - 3)
        right = rng.randint(left + 2, width - 1)
        top = rng.randint(0, height - 3)
        bottom = rng.randint(top + 2, height - 1)

        # Prepare a list of all the cells that would be updated.
        cells_to_update = []
        # Top edge:
        for col in range(left, right + 1):
            cells_to_update.append((top, col))
        # Bottom edge:
        for col in range(left, right + 1):
            cells_to_update.append((bottom, col))
        # Left edge (excluding corners already drawn):
        for row in range(top + 1, bottom):
            cells_to_update.append((row, left))
        # Right edge (excluding corners already drawn):
        for row in range(top + 1, bottom):
            cells_to_update.append((row, right))

        # Check if drawing this rectangle would cause any cell to exceed a count of 2.
        conflict = False
        for r, c in cells_to_update:
            if grid[r][c] >= 2:
                conflict = True
                break
        if conflict:
            continue  # Skip this rectangle candidate

        # No conflict: update the grid counts.
        for r, c in cells_to_update:
            grid[r][c] += 1

        # Save the rectangle (stored as (left, right, top, bottom)).
        rectangles.append((left, right, top, bottom))

    if len(rectangles) < n:
        # Best-effort: the grid was too crowded to place all n rectangles.
        print(f"Only placed {len(rectangles)} rectangles after {attempts} attempts.")

    # Render the grid:
    # ' ' for an untouched cell, '#' for a single hit, '█' for exactly two hits.
    lines = ""
    for row in grid:
        line = "".join(" " if count == 0 else ("#" if count == 1 else "█") for count in row)
        lines = lines + line + "\n"
    return lines, len(rectangles)


@dataclass
class RectangleCountConfig:
    """Configuration for RectangleCount puzzle generation"""

    max_rectangles: int = 10
    width: int = 80
    height: int = 80
    seed: Optional[int] = None
    size: int = 500

    def validate(self):
        """Validate configuration parameters"""
        assert self.max_rectangles >= 1, "max_rectangles must be gte 1"
        assert self.width >= 10, "width must be gte 10"
        assert self.height >= 10, "height must be gte 10"


class RectangleCountDataset(ProceduralDataset):
    """Generates rectangle-counting puzzles with configurable parameters"""

    def __init__(self, config: RectangleCountConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single RectangleCount task

        Returns:
            dict with keys:
                - question: str, the task description
                - answer: str, a solution string
                - metadata: dict with generation parameters
        """
        rng = Random(self.seed + idx)

        target = rng.randint(1, self.config.max_rectangles)
        # `answer` is the number actually placed, which may be below `target`
        # if the grid was too crowded.
        puzzle, answer = draw_rectangles_with_overlap(target, self.config.width, self.config.height, rng)

        puzz = f"How many rectangles do you see? Single rectangles are outlined with a '#', overlapping rectangles (max 2) are shown with '█'. \n\n {puzzle}"

        return {
            "question": puzz,
            "answer": str(answer),
            "metadata": {},
        }

    def score_answer(self, answer: Optional[str], entry: Dict[str, Any]) -> float:
        """Determine if the solution provided solves the RectangleCount task.

        The function awards 1.0 for a correct answer.

        Args:
            answer (Optional[str]): The user's answer.
            entry (Dict[str, Any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
        """
        if answer is None:
            return 0.0
        if answer.lower().replace("\n", "") != entry["answer"].lower().replace("\n", ""):
            return 0.01
        return 1.0  # Yay


register_dataset("rectangle_count", RectangleCountDataset, RectangleCountConfig)
import random

import pytest

from reasoning_gym.algorithmic.ab import ABConfig, ABDataset, compute_steps, generate_program

VALID_TOKENS = {"A#", "#A", "B#", "#B"}


def test_ab_config_validation():
    """Invalid configs raise appropriate errors"""
    with pytest.raises(AssertionError):
        ABConfig(length=0).validate()

    with pytest.raises(AssertionError):
        ABConfig(size=0).validate()


def test_ab_deterministic():
    """Same seed produces identical items"""
    cfg = ABConfig(seed=42, size=10, length=5)
    first = ABDataset(cfg)
    second = ABDataset(cfg)

    assert all(first[i] == second[i] for i in range(len(first)))


def test_ab_program_generation():
    """Program generation and computation behave as expected"""
    rng = random.Random(42)
    program = generate_program(5, rng)

    # Program format: right length, only valid tokens.
    assert len(program) == 5
    assert set(program) <= VALID_TOKENS

    # Computation returns (states, halting-flag) with at least the initial state.
    steps, non_halting = compute_steps(program)
    assert isinstance(steps, list)
    assert isinstance(non_halting, bool)
    assert len(steps) > 0

    # Every intermediate state stays within the token alphabet.
    for state in steps:
        assert set(state) <= VALID_TOKENS


def test_ab_scoring():
    """Scoring rewards correct answers and penalizes wrong or missing ones"""
    dataset = ABDataset(ABConfig(seed=42, size=10, length=5))

    for item in dataset:
        # Correct answer scores full marks.
        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0

        # Any other answer gets the small consolation score.
        wrong_answer = "A# B#" if item["answer"] != "A# B#" else "B# A#"
        assert dataset.score_answer(answer=wrong_answer, entry=item) == 0.01

        # Missing answer scores zero.
        assert dataset.score_answer(answer=None, entry=item) == 0.0


def test_ab_iteration():
    """Dataset iteration yields `size` items, repeatably"""
    cfg = ABConfig(size=5, seed=42)
    dataset = ABDataset(cfg)

    assert len(dataset) == cfg.size

    items = list(dataset)
    assert len(items) == cfg.size

    # Iterating twice yields identical results.
    assert items == list(dataset)


def test_ab_item_structure():
    """Generated items have the expected keys, question text and answer alphabet"""
    dataset = ABDataset(ABConfig(seed=42, size=10, length=5))

    for item in dataset:
        assert isinstance(item, dict)
        for key in ("question", "answer", "metadata"):
            assert key in item

        # Question embeds the task description.
        assert "A::B is a system" in item["question"]
        assert "Return the final state" in item["question"]

        # Answer consists solely of valid tokens.
        assert set(item["answer"].split()) <= VALID_TOKENS
attributes + curriculum.increment_attr_level("num_terms") + curriculum.increment_attr_level("num_digits") + + increased_cfg = curriculum.generate_configuration(base_value) + assert increased_cfg.min_digits == 1 and increased_cfg.max_digits == 2 + assert increased_cfg.min_terms == 2 and increased_cfg.max_terms == 3 + + # test decrementing attribute level for num_digits again + curriculum.decrement_attr_level("num_digits") + + partially_decreased_cfg = curriculum.generate_configuration(base_value) + assert partially_decreased_cfg.min_digits == 1 and partially_decreased_cfg.max_digits == 1 + assert partially_decreased_cfg.min_terms == 2 and partially_decreased_cfg.max_terms == 3 diff --git a/tests/test_count_primes.py b/tests/test_count_primes.py new file mode 100644 index 00000000..f131647b --- /dev/null +++ b/tests/test_count_primes.py @@ -0,0 +1,88 @@ +"""Tests for Count Primes questions generation""" + +import pytest + +from reasoning_gym.algorithmic.count_primes import CountPrimesConfig, CountPrimesDataset + + +def test_count_primes_config_validation(): + """Test that invalid configs raise appropriate errors""" + with pytest.raises(AssertionError): + config = CountPrimesConfig(max_n=-1) # Negative not allowed + config.validate() + + with pytest.raises(AssertionError): + config = CountPrimesConfig(max_n=0) # Zero not allowed + config.validate() + + +def test_count_primes_dataset_deterministic(): + """Test that dataset generates same items with same seed""" + config = CountPrimesConfig(seed=42, size=10) + dataset1 = CountPrimesDataset(config) + dataset2 = CountPrimesDataset(config) + + for i in range(len(dataset1)): + assert dataset1[i] == dataset2[i] + + +def test_count_primes_dataset_items(): + """Test basic properties of generated items""" + config = CountPrimesConfig(max_n=10, size=10, seed=42) + dataset = CountPrimesDataset(config) + + for i in range(len(dataset)): + item = dataset[i] + # Check item structure + assert isinstance(item, dict) + assert "question" 
in item + assert "answer" in item + assert "metadata" in item + + # Check metadata + assert "start" in item["metadata"] + assert "end" in item["metadata"] + assert "primes" in item["metadata"] + assert "solution" in item["metadata"] + + start = item["metadata"]["start"] + end = item["metadata"]["end"] + primes = item["metadata"]["primes"] + + assert start <= end + assert len(primes) <= end - start + 1 + + +def test_count_primes_dataset_iteration(): + """Test that iteration respects dataset size""" + config = CountPrimesConfig(size=5, seed=42) + dataset = CountPrimesDataset(config) + + items = list(dataset) + assert len(items) == config.size + + # Test multiple iterations yield same items + assert items == list(dataset) + + +def test_count_primes_answer(): + """Test the _get_primes method""" + config = CountPrimesConfig(seed=42) + dataset = CountPrimesDataset(config) + + # Base cases + assert dataset._get_primes(n=0) == [] + assert dataset._get_primes(n=1) == [] + assert dataset._get_primes(n=2) == [False, False] + + # Test primes up to 10 + primes = dataset._get_primes(n=11) + assert primes[2] == True + assert primes[3] == True + assert primes[4] == False + assert primes[5] == True + assert primes[6] == False + assert primes[7] == True + assert primes[8] == False + assert primes[9] == False + assert primes[10] == False diff --git a/tests/test_dice.py b/tests/test_dice.py new file mode 100644 index 00000000..8a3bd991 --- /dev/null +++ b/tests/test_dice.py @@ -0,0 +1,35 @@ +import pytest + +from reasoning_gym.arithmetic.dice import DiceConfig, DiceDataset + + +def test_dice(): + """Test basic properties and solution of generated items""" + config = DiceConfig(seed=42, size=50, num_dice=8, max_dice_size=24) + dataset = DiceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0 
+        assert dataset.score_answer(answer=None, entry=item) == 0.0
+
+    # Easy
+    config = DiceConfig(seed=42, size=1, num_dice=1, max_dice_size=2)
+    dataset = DiceDataset(config)
+
+    for item in dataset:
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+        assert dataset.score_answer(answer=None, entry=item) == 0.0
+
+    # Hard
+    config = DiceConfig(seed=42, size=1, num_dice=40, max_dice_size=40)
+    dataset = DiceDataset(config)
+
+    for item in dataset:
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+        assert dataset.score_answer(answer=None, entry=item) == 0.0
diff --git a/tests/test_rectangle_count.py b/tests/test_rectangle_count.py
new file mode 100644
index 00000000..fcbb09f0
--- /dev/null
+++ b/tests/test_rectangle_count.py
@@ -0,0 +1,19 @@
+import pytest
+
+from reasoning_gym.cognition.rectangle_count import RectangleCountConfig, RectangleCountDataset
+
+
+def test_rectangle_count():
+    """Test basic properties and scoring of generated rectangle-count items"""
+    config = RectangleCountConfig(seed=42, size=50, max_rectangles=15, width=40, height=40)
+    dataset = RectangleCountDataset(config)
+
+    for item in dataset:
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+        # Test the scoring
+        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+        assert dataset.score_answer(answer=None, entry=item) == 0.0