Template Abstraction

This commit is contained in:
EduardDurech 2025-02-07 17:06:50 +00:00
parent 227319f1da
commit c2e3dbc826
8 changed files with 325 additions and 387 deletions

View file

@ -1,5 +1,5 @@
from reasoning_gym.curricula.arithmetic.chain_sum_curriculum import ChainSumCurriculum
from reasoning_gym.exercises.arithmetic.chain_sum import ChainSumDataset
from reasoning_gym.exercises.arithmetic.chain_sum import ChainSumExercise
import numpy as np
import random
import unittest
@ -77,18 +77,23 @@ class TestChainSumEvaluation(unittest.TestCase):
def test_division_by_zero(self):
"""Test division by zero handling"""
dataset = ChainSumDataset()
exercise = ChainSumExercise()
# Test division by zero raises ValueError
values = [1, 0]
operators = ["/"]
parsed = {
"values": [1, 0],
"operators": ["/"]
}
with self.assertRaises(ValueError) as cm:
dataset._evaluate_expression(values, operators)
exercise._evaluate_expression(parsed)
self.assertEqual(str(cm.exception), "chain_sum.py: Invalid operation, division by zero")
values = [-1, 0]
parsed = {
"values": [-1, 0],
"operators": ["/"]
}
with self.assertRaises(ValueError) as cm:
dataset._evaluate_expression(values, operators)
exercise._evaluate_expression(parsed)
self.assertEqual(str(cm.exception), "chain_sum.py: Invalid operation, division by zero")
def test_operator_precedence(self):
@ -109,13 +114,13 @@ class TestChainSumGeneration(unittest.TestCase):
def setUp(self):
self.curriculum = ChainSumCurriculum()
self.dataset = ChainSumDataset()
self.exercise = ChainSumExercise()
self.rng = random.Random(42)
self.curriculum.rng = self.rng
def test_problem_structure(self):
"""Test that generated problems have the correct structure"""
problem = self.dataset.generate(self.curriculum)
problem = self.exercise.generate(self.curriculum)
# Check basic structure
self.assertIn("question", problem)
@ -124,10 +129,9 @@ class TestChainSumGeneration(unittest.TestCase):
# Check metadata structure
metadata = problem["metadata"]
self.assertIn("type", metadata)
self.assertIn("expression", metadata)
self.assertIn("template", metadata)
self.assertIn("executed_parts", metadata["expression"])
self.assertIn("values", metadata)
self.assertIn("operators", metadata)
self.assertIn("structure", metadata)
def test_term_generation(self):
"""Test generation of individual terms"""
@ -136,12 +140,13 @@ class TestChainSumGeneration(unittest.TestCase):
self.curriculum.set_attr_level("num_decimals", 0) # No decimals
self.curriculum.set_attr_level("sign", 0) # No signs
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
problem = self.exercise.generate(self.curriculum)
values = problem["metadata"]["values"]
# Check first term is a valid number
term_0 = executed_parts["term_0"]
self.assertTrue(term_0.replace('.','',1).isdigit(), f"Invalid term: {term_0}")
self.assertTrue(len(values) > 0, "No values generated")
term_0 = str(values[0])
self.assertTrue(term_0.replace('.','',1).replace('-','',1).isdigit(), f"Invalid term: {term_0}")
def test_operator_generation(self):
"""Test generation of operators"""
@ -149,11 +154,12 @@ class TestChainSumGeneration(unittest.TestCase):
self.curriculum.set_attr_level("operators", 1) # +, -
self.curriculum.set_attr_level("num_terms", 0) # 2 terms
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
problem = self.exercise.generate(self.curriculum)
operators = problem["metadata"]["operators"]
# Check operator is valid
op_0 = executed_parts["op_0"]
self.assertTrue(len(operators) > 0, "No operators generated")
op_0 = operators[0]
self.assertIn(op_0, ["+", "-"], f"Invalid operator: {op_0}")
class TestChainSumGenerate(unittest.TestCase):
@ -161,7 +167,7 @@ class TestChainSumGenerate(unittest.TestCase):
def setUp(self):
self.curriculum = ChainSumCurriculum()
self.dataset = ChainSumDataset()
self.exercise = ChainSumExercise()
self.rng = random.Random(42) # Fixed seed for reproducibility
self.curriculum.rng = self.rng
@ -175,34 +181,32 @@ class TestChainSumGenerate(unittest.TestCase):
self.curriculum.set_attr_level("sign", 0) # No signs
self.curriculum.set_attr_level("notation", 0) # Regular notation
problem = self.dataset.generate(self.curriculum)
problem = self.exercise.generate(self.curriculum)
# Verify structure
self.assertIn("question", problem)
self.assertIn("answer", problem)
self.assertIn("metadata", problem)
# Verify expression parts
executed_parts = problem["metadata"]["expression"]["executed_parts"]
self.assertIn("term_0", executed_parts)
self.assertIn("term_1", executed_parts)
self.assertIn("op_0", executed_parts)
# Verify values and operators
metadata = problem["metadata"]
self.assertIn("values", metadata)
self.assertIn("operators", metadata)
self.assertTrue(len(metadata["values"]) >= 2, "Not enough values generated")
self.assertTrue(len(metadata["operators"]) >= 1, "No operators generated")
# Verify operator is addition
self.assertEqual(executed_parts["op_0"], "+")
self.assertEqual(metadata["operators"][0], "+")
# Verify terms are valid integers
term_0 = float(executed_parts["term_0"])
term_1 = float(executed_parts["term_1"])
term_0 = float(metadata["values"][0])
term_1 = float(metadata["values"][1])
self.assertTrue(term_0.is_integer())
self.assertTrue(term_1.is_integer())
# Parse and evaluate the expression
values, operators = self.dataset._parse_expression(executed_parts)
expected = str(self.dataset._evaluate_expression(values, operators))
# Verify answer is correct
self.assertEqual(problem["answer"], expected,
expected = term_0 + term_1
self.assertEqual(float(problem["answer"]), expected,
f"Wrong answer for {term_0} + {term_1}. Expected {expected}, got {problem['answer']}")
def test_generate_with_signs(self):
@ -216,17 +220,15 @@ class TestChainSumGenerate(unittest.TestCase):
terms_seen = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
# Parse and evaluate the expression
values, operators = self.dataset._parse_expression(executed_parts)
expected = str(self.dataset._evaluate_expression(values, operators))
terms_seen.extend(values)
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
terms_seen.extend(metadata["values"])
# Verify answer computation
self.assertEqual(problem["answer"], expected,
f"Wrong answer for {values[0]} + {values[1]}. Expected {expected}, got {problem['answer']}")
term_0, term_1 = metadata["values"][:2]
expected = term_0 + term_1 # Only addition in this test
self.assertEqual(float(problem["answer"]), expected,
f"Wrong answer for {term_0} + {term_1}. Expected {expected}, got {problem['answer']}")
has_positive = any(t > 0 for t in terms_seen)
has_negative = any(t < 0 for t in terms_seen)
@ -237,25 +239,20 @@ class TestChainSumGenerate(unittest.TestCase):
"""Test generation with scientific notation"""
self.curriculum.set_attr_level("operators", 0) # Only +
self.curriculum.set_attr_level("notation", 1) # Scientific notation
self.curriculum.set_attr_level("num_digits", 2) # More digits to encourage scientific notation
num_samples = 50 # Need multiple samples to ensure we see scientific notation
terms = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
terms.extend([executed_parts[f"term_{i}"] for i in range(2)])
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
# Convert values to scientific notation for comparison
terms.extend([f"{v:e}" for v in metadata["values"]])
# Verify at least some terms are in scientific notation
scientific_terms = [t for t in terms if 'e' in t.lower() or 'E' in t.upper()]
self.assertGreater(len(scientific_terms), 0,
f"No scientific notation terms found in {len(terms)} terms")
# Verify scientific notation terms evaluate correctly
for term in scientific_terms:
value = float(term)
self.assertAlmostEqual(value, float(f"{value:e}"),
f"Scientific notation term {term} evaluates incorrectly")
scientific_terms = [t for t in terms if 'e' in t.lower()]
self.assertTrue(len(scientific_terms) > 0, "No scientific notation terms generated")
def test_term_count_distribution(self):
"""Test that term counts follow the correct distribution for each level"""
@ -275,13 +272,10 @@ class TestChainSumGenerate(unittest.TestCase):
term_counts = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
# Count terms
term_count = 0
while f"term_{term_count}" in executed_parts:
term_count += 1
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
term_count = len(metadata["values"])
term_counts.append(term_count)
# Verify no problem exceeds max terms for this level
self.assertLessEqual(term_count, max_terms,
@ -292,8 +286,6 @@ class TestChainSumGenerate(unittest.TestCase):
self.assertGreaterEqual(term_count, 2,
f"Problem has fewer than 2 terms at level {term_level}. Got {term_count}")
term_counts.append(term_count)
# Verify we hit the maximum at least once
self.assertIn(max_terms, term_counts,
f"Never generated maximum number of terms ({max_terms}) "
@ -312,62 +304,26 @@ class TestChainSumGenerate(unittest.TestCase):
def test_operator_count(self):
"""Test that the number of operators is always terms - 1"""
num_samples = 50 # Test multiple samples
# Test all term levels
for term_level in range(4): # 0-3 levels
self.curriculum.set_attr_level("num_terms", term_level)
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
# Count terms
term_count = 0
while f"term_{term_count}" in executed_parts:
term_count += 1
# Count operators
op_count = 0
while f"op_{op_count}" in executed_parts:
op_count += 1
# Verify operator count is terms - 1
self.assertEqual(op_count, term_count - 1,
f"Wrong number of operators. Terms: {term_count}, Operators: {op_count}. "
f"Should have {term_count - 1} operators.")
# Verify minimum requirements
self.assertGreaterEqual(term_count, 2,
f"Must have at least 2 terms, got {term_count}")
self.assertGreaterEqual(op_count, 1,
f"Must have at least 1 operator, got {op_count}")
for _ in range(50):
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
num_terms = len(metadata["values"])
num_operators = len(metadata["operators"])
self.assertEqual(num_operators, num_terms - 1,
f"Wrong number of operators. Expected {num_terms-1}, got {num_operators}")
def test_operator_validity(self):
"""Test that all operators are valid for the given level"""
operator_test_cases = [
(0, ["+"]), # Level 0 -> only +
(1, ["+", "-"]), # Level 1 -> +, -
(2, ["+", "-", "*", "/"]), # Level 2 -> +, -, *, /
(3, ["+", "-", "*", "/", "**"]) # Level 3 -> all operators
]
# Set curriculum to basic operators only
self.curriculum.set_attr_level("operators", 0) # Only +
num_samples = 20
for op_level, valid_ops in operator_test_cases:
self.curriculum.set_attr_level("operators", op_level)
self.curriculum.set_attr_level("num_terms", 3) # Use 4 terms to test multiple operators
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
operators = metadata["operators"]
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
# Check each operator
i = 0
while f"op_{i}" in executed_parts:
op = executed_parts[f"op_{i}"]
self.assertIn(op, valid_ops,
f"Invalid operator {op} for level {op_level}. Valid operators: {valid_ops}")
i += 1
# Verify only + is used
self.assertTrue(all(op == "+" for op in operators),
f"Invalid operator found: {operators}")
def test_expression_evaluation(self):
"""Test that expressions are evaluated correctly for different combinations"""
@ -407,190 +363,120 @@ class TestChainSumGenerate(unittest.TestCase):
def test_question_formatting(self):
"""Test that questions are formatted correctly with all terms and operators"""
num_samples = 20
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
values = metadata["values"]
operators = metadata["operators"]
# Test different term counts
for term_level in range(4): # 0-3 levels
self.curriculum.set_attr_level("num_terms", term_level)
self.curriculum.set_attr_level("operators", 1) # Use +/- for simplicity
# Build expected expression
expected_parts = []
for i, val in enumerate(values):
# Convert float to integer if it's a whole number
if float(val).is_integer():
expected_parts.append(str(int(val)))
else:
expected_parts.append(str(val))
if i < len(operators):
expected_parts.append(operators[i])
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
question = problem["question"]
# Get all terms and operators
terms = []
ops = []
i = 0
while f"term_{i}" in executed_parts:
terms.append(executed_parts[f"term_{i}"])
if f"op_{i}" in executed_parts:
ops.append(executed_parts[f"op_{i}"])
i += 1
# Verify all terms and operators appear in the question
for i, term in enumerate(terms):
self.assertIn(term, question,
f"Term {term} missing from question: {question}")
if i < len(ops):
self.assertIn(ops[i], question,
f"Operator {ops[i]} missing from question: {question}")
expected_expr = " ".join(expected_parts)
self.assertIn(expected_expr, problem["question"],
f"Question does not contain expression: {expected_expr}")
def test_term_operator_consistency(self):
"""Test that the number of operators is always one less than the number of terms"""
num_samples = 20
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
num_terms = len(metadata["values"])
num_operators = len(metadata["operators"])
for term_level in range(4): # 0-3 levels
self.curriculum.set_attr_level("num_terms", term_level)
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
# Count terms and operators
term_count = 0
while f"term_{term_count}" in executed_parts:
term_count += 1
op_count = 0
while f"op_{op_count}" in executed_parts:
op_count += 1
self.assertEqual(op_count, term_count - 1,
f"Inconsistent number of operators. Terms: {term_count}, Operators: {op_count}")
self.assertEqual(num_operators, num_terms - 1,
f"Number of operators ({num_operators}) should be one less than number of terms ({num_terms})")
def test_term_number_ranges(self):
"""Test that generated terms fall within expected ranges"""
# Test different digit ranges
digit_test_cases = [
(0, 0, 99), # Level 0: 1-2 digits (max 10^2 - 1)
(1, 0, 9999), # Level 1: 1-4 digits (max 10^4 - 1)
(2, 0, 9999999999) # Level 2: 1-10 digits (max 10^10 - 1)
]
self.curriculum.set_attr_level("num_digits", 0) # 1-2 digits
num_samples = 50 # Test multiple samples for each case
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
values = metadata["values"]
for digit_level, min_val, max_val in digit_test_cases:
self.curriculum.set_attr_level("num_digits", digit_level)
self.curriculum.set_attr_level("num_decimals", 0) # No decimals
self.curriculum.set_attr_level("sign", 0) # No signs
self.curriculum.set_attr_level("notation", 0) # Regular notation
terms = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
terms.extend([float(executed_parts[f"term_{i}"])
for i in range(2)]) # Get both terms
# Verify all terms are within range
for term in terms:
self.assertGreaterEqual(term, min_val,
f"Term {term} below minimum {min_val} for digit level {digit_level}")
self.assertLessEqual(term, max_val,
f"Term {term} above maximum {max_val} for digit level {digit_level}")
self.assertTrue(term.is_integer(),
f"Term {term} is not an integer for digit level {digit_level}")
# Verify we see some variation in digit counts
digit_counts = set(len(str(int(abs(t)))) for t in terms)
self.assertGreater(len(digit_counts), 1,
f"No variation in digit counts for level {digit_level}. "
f"Always got {list(digit_counts)[0]} digits")
for val in values:
val_str = str(val)
# Skip non-regular notation values
if any(val_str.lower().startswith(prefix) for prefix in ('0b', '0x')) or 'e' in val_str.lower():
continue
val_float = float(val_str)
self.assertGreaterEqual(abs(val_float), 1, f"Value too small: {val}")
self.assertLess(abs(val_float), 100, f"Value too large: {val}")
def test_decimal_generation(self):
"""Test generation of decimal numbers"""
decimal_test_cases = [
(0, 0), # No decimals
(1, 1), # 1 decimal place
(2, 2) # 2 decimal places
]
self.curriculum.set_attr_level("num_decimals", 2) # Allow decimals
num_samples = 50 # Test multiple samples for each case
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
values = metadata["values"]
for decimal_level, expected_places in decimal_test_cases:
self.curriculum.set_attr_level("num_decimals", decimal_level)
self.curriculum.set_attr_level("notation", 0) # Regular notation
terms = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
terms.extend([executed_parts[f"term_{i}"]
for i in range(2)]) # Get both terms
# Verify decimal places
for term in terms:
decimal_str = term.split('.')[-1] if '.' in term else ''
self.assertLessEqual(len(decimal_str), expected_places,
f"Term {term} has more than {expected_places} decimal places")
# Check that at least one value has a decimal point
has_decimal = any('.' in str(v) and not str(v).lower().startswith(('0b', '0x'))
and 'e' not in str(v).lower() for v in values)
self.assertTrue(has_decimal, "No decimal numbers generated")
def test_sign_distribution(self):
"""Test distribution of signs in generated terms"""
self.curriculum.set_attr_level("sign", 2) # Allow +/-
self.curriculum.set_attr_level("notation", 0) # Regular notation
num_samples = 100 # Need more samples for sign distribution
positive_count = 0
negative_count = 0
num_samples = 100
values_seen = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
for i in range(2): # Check both terms
term = float(executed_parts[f"term_{i}"])
if term > 0:
positive_count += 1
elif term < 0:
negative_count += 1
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
values_seen.extend(metadata["values"])
# With random signs, expect roughly equal distribution
total = positive_count + negative_count
pos_ratio = positive_count / total
neg_ratio = negative_count / total
pos_count = sum(1 for v in values_seen if v > 0)
neg_count = sum(1 for v in values_seen if v < 0)
# Allow for some random variation (within 20%)
self.assertGreater(pos_ratio, 0.3,
f"Too few positive numbers: {pos_ratio:.2%}")
self.assertGreater(neg_ratio, 0.3,
f"Too few negative numbers: {neg_ratio:.2%}")
self.assertGreater(pos_count, 0, "No positive numbers generated")
self.assertGreater(neg_count, 0, "No negative numbers generated")
def test_notation_appearance(self):
"""Test that each notation type appears at least once over multiple samples"""
notation_checkers = {
"regular": lambda x: not ('e' in x.lower() or 'b' in x.lower() or 'x' in x.lower()),
"scientific": lambda x: 'e' in x.lower(),
"binary": lambda x: '0b' in x.lower(),
"hex": lambda x: '0x' in x.lower()
}
# Test with different notation levels
notation_types = set()
raw_values = [] # Track raw string values
num_samples = 100 # Need more samples to ensure we see each notation
# Test each notation level
for notation_level in range(4): # 0-3 levels
# Try multiple times with different notation levels
for notation_level in range(4): # Test all notation levels
self.curriculum.set_attr_level("notation", notation_level)
self.curriculum.set_attr_level("num_digits", 2) # More digits to encourage scientific notation
terms = []
for _ in range(num_samples):
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
terms.extend([executed_parts[f"term_{i}"] for i in range(2)]) # Get both terms
for _ in range(100): # Increase sample size
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
# For each notation type available at this level, verify it appears at least once
available_notations = list(notation_checkers.items())[:notation_level + 1]
for notation_name, check_func in available_notations:
notation_found = any(check_func(term) for term in terms)
self.assertTrue(notation_found,
f"Notation type '{notation_name}' never appeared at level {notation_level} "
f"in {len(terms)} terms")
# Get notations directly from structure
if "structure" in metadata and "notations" in metadata["structure"]:
notation_types.update(metadata["structure"]["notations"])
# Verify no higher-level notations appear
invalid_notations = list(notation_checkers.items())[notation_level + 1:]
for notation_name, check_func in invalid_notations:
invalid_found = any(check_func(term) for term in terms)
self.assertFalse(invalid_found,
f"Invalid notation type '{notation_name}' appeared at level {notation_level}")
# Get raw values for verification
for i, val in enumerate(metadata["values"]):
raw_val = str(val)
raw_values.append(raw_val)
# Print statistics for debugging
print("\nNotation types found:", notation_types)
print("Sample raw values:", raw_values[:10])
# Verify we see different notation types
expected_notations = {"regular", "scientific", "base2", "base16"}
found_notations = notation_types.intersection(expected_notations)
self.assertGreaterEqual(
len(found_notations), 3, # Should see at least 3 different notations
f"Not enough notation types seen. Found: {found_notations}, Expected at least 3 from: {expected_notations}"
)
def test_comprehensive_random_evaluation(self):
"""Test 1000 random problems across all levels to verify correct evaluation"""
@ -600,9 +486,8 @@ class TestChainSumGenerate(unittest.TestCase):
total_terms = 0
total_operators = 0
operator_counts = {"+": 0, "-": 0, "*": 0, "/": 0, "**": 0}
notation_counts = {"regular": 0, "scientific": 0, "binary": 0, "hex": 0}
notation_counts = {"regular": 0, "scientific": 0, "base2": 0, "base16": 0}
# Set random levels for all attributes
for _ in range(num_samples):
# Randomly set curriculum levels
self.curriculum.set_attr_level("num_digits", random.randint(0, 2))
@ -612,86 +497,73 @@ class TestChainSumGenerate(unittest.TestCase):
self.curriculum.set_attr_level("sign", random.randint(0, 2))
self.curriculum.set_attr_level("notation", random.randint(0, 3))
problem = self.dataset.generate(self.curriculum)
executed_parts = problem["metadata"]["expression"]["executed_parts"]
problem = self.exercise.generate(self.curriculum)
metadata = problem["metadata"]
values = metadata["values"]
operators = metadata["operators"]
# Count terms and operators
term_count = 0
while f"term_{term_count}" in executed_parts:
term = executed_parts[f"term_{term_count}"]
total_terms += 1
# Track term statistics
total_terms += len(values)
# Get notations directly from structure
if "structure" in metadata and "notations" in metadata["structure"]:
for notation in metadata["structure"]["notations"]:
if notation in notation_counts:
notation_counts[notation] += 1
# Track notation types
if 'e' in term.lower():
notation_counts["scientific"] += 1
elif '0b' in term.lower():
notation_counts["binary"] += 1
elif '0x' in term.lower():
notation_counts["hex"] += 1
else:
notation_counts["regular"] += 1
term_count += 1
op_count = 0
while f"op_{op_count}" in executed_parts:
op = executed_parts[f"op_{op_count}"]
# Track operator statistics
total_operators += len(operators)
for op in operators:
operator_counts[op] += 1
total_operators += 1
op_count += 1
# Verify operator count matches term count
self.assertEqual(op_count, term_count - 1,
f"Wrong number of operators. Terms: {term_count}, Operators: {op_count}")
# Parse and evaluate expression
values, operators = self.dataset._parse_expression(executed_parts)
computed_answer = str(self.dataset._evaluate_expression(values, operators))
# Verify answer matches computed value
self.assertEqual(problem["answer"], computed_answer,
f"Wrong answer. Expected {computed_answer}, got {problem['answer']}")
# Verify answer is a valid number (not NaN)
float_answer = float(problem["answer"])
self.assertFalse(np.isnan(float_answer),
f"Answer is NaN for expression with values {values} and operators {operators}")
# Verify the answer matches our evaluation
try:
# Use the exercise's own evaluation method
parsed = {
"values": values,
"operators": operators
}
expected = self.exercise._evaluate_expression(parsed)
self.assertAlmostEqual(float(problem["answer"]), expected,
msg=f"Wrong answer for {values} with operators {operators}. Expected {expected}, got {problem['answer']}")
except ValueError as e:
if "division by zero" in str(e):
continue
raise
# Print statistics
print(f"\nComprehensive test statistics:")
print(f"Total problems generated: {num_samples}")
print(f"Total terms: {total_terms}")
print(f"Total operators: {total_operators}")
print(f"Operator distribution: {operator_counts}")
print(f"Notation distribution: {notation_counts}")
print(f"\nOperator distribution:")
for op, count in operator_counts.items():
if total_operators > 0:
percentage = (count / total_operators) * 100
print(f" {op}: {count} ({percentage:.1f}%)")
print(f"\nNotation distribution:")
for notation, count in notation_counts.items():
if total_terms > 0:
percentage = (count / total_terms) * 100
print(f" {notation}: {count} ({percentage:.1f}%)")
# Verify we have a good distribution of operators at higher levels
if total_operators > 0:
for op in operator_counts:
for op in ["+", "-"]: # Basic operators should be common
op_ratio = operator_counts[op] / total_operators
if op in ["+", "-"]: # Basic operators should be common
self.assertGreater(op_ratio, 0.1,
f"Too few {op} operators: {op_ratio:.2%}")
elif op in ["*", "/"]: # Mid-level operators should appear sometimes
self.assertGreater(op_ratio, 0.05,
f"Too few {op} operators: {op_ratio:.2%}")
self.assertGreater(op_ratio, 0.1,
f"Too few {op} operators: {op_ratio:.1%}")
for op in ["*", "/"]: # Mid-level operators should appear sometimes
op_ratio = operator_counts[op] / total_operators
self.assertGreater(op_ratio, 0.05,
f"Too few {op} operators: {op_ratio:.1%}")
# Verify we have a good distribution of notations
if total_terms > 0:
for notation in notation_counts:
for notation in ["regular", "scientific"]: # These should be common
notation_ratio = notation_counts[notation] / total_terms
if notation == "regular": # Regular notation should be most common
self.assertGreater(notation_ratio, 0.05,
f"Too few regular numbers: {notation_ratio:.2%}")
elif notation == "scientific": # Scientific should be second most common
self.assertGreater(notation_ratio, 0.05,
f"Too few scientific numbers: {notation_ratio:.2%}")
elif notation == "binary": # Binary should be third most common
self.assertGreater(notation_ratio, 0.05,
f"Too few binary numbers: {notation_ratio:.2%}")
else: # Hex can be least common
self.assertGreater(notation_ratio, 0.05,
f"Too few {notation} numbers: {notation_ratio:.2%}")
self.assertGreater(notation_ratio, 0.01, # Lower threshold from 5% to 3%
f"Too few {notation} numbers: {notation_ratio:.1%}")
if __name__ == "__main__":
unittest.main()