# InternBootcamp/internbootcamp/libs/bbeh_multistep_arithmetic/bbeh_arithmetic_generator.py

import random
import math
import logging
from typing import Dict, Tuple


class BBEHArithmeticGenerator:
    def __init__(self):
        self.operators = ['+', '-', '*', '/', '><', ';', '@', '<>', '[]', '#', '!', '~', '&', ':', '][']
        self.precedence = {
            '+': 1, '-': 1,
            '*': 2, '/': 2,
            '><': 3, ';': 3,
            '@': 4, '<>': 4, '[]': 4,
            '#': 5, '!': 5,
            '~': 6, '&': 6,
            ':': 7, '][': 7
        }
        self.number_words = {
            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
            'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10
        }
        self.logger = logging.getLogger(__name__)
        self.max_value = 1.7976931348623157e+308
        self.min_value = -1.7976931348623157e+308
        self.epsilon = 1e-10

    def generate_case(self, min_depth: int = 3, max_depth: int = 6,
                      max_length: int = 50, difficulty: str = "medium") -> Dict:
        """Generate one arithmetic expression case.

        Args:
            min_depth: Lower bound for the randomly chosen nesting depth.
            max_depth: Upper bound for the randomly chosen nesting depth.
            max_length: Length budget handed to the expression builder.
            difficulty: ``"easy"`` and ``"hard"`` replace the three numeric
                arguments with fixed presets; any other value leaves the
                arguments as passed.

        Returns:
            A dict with the keys ``"expression"``, ``"answer"`` and
            ``"difficulty"`` (the latter echoing the argument unchanged).
        """
        # (label, (min_depth, max_depth, max_length)) overrides per difficulty.
        presets = (
            ("easy", (2, 4, 30)),
            ("hard", (4, 8, 70)),
        )
        for label, bounds in presets:
            if difficulty == label:
                min_depth, max_depth, max_length = bounds
                break

        expression, answer = self._generate_expression(min_depth, max_depth, max_length)
        return dict(expression=expression, answer=answer, difficulty=difficulty)

    def _generate_subexpression(self, depth: int, current_length: int,
                                max_length: int) -> Tuple[str, int]:
        """生成子表达式"""
        if depth == 0 or current_length >= max_length:
            if random.random() < 0.3:
                word = random.choice(list(self.number_words.keys()))
                return word, current_length + 1
            return str(random.randint(-100, 100)), current_length + 1

        choice = random.random()

        if choice < 0.4:
            op = random.choice(self.operators)
            left, left_length = self._generate_subexpression(depth - 1, current_length, max_length)
            right, right_length = self._generate_subexpression(depth - 1, left_length + 1, max_length)
            return f"({left} {op} {right})", right_length + 3
        else:
            op = random.choice(self.operators)
            left, left_length = self._generate_subexpression(depth - 1, current_length, max_length)
            right, right_length = self._generate_subexpression(depth - 1, left_length + 1, max_length)
            return f"({left} {op} {right})", right_length + 3

    def _generate_expression(self, min_depth: int, max_depth: int,
                             max_length: int) -> Tuple[str, float]:
        """Generate a complete expression together with its evaluated answer.

        Candidates are generated and solved repeatedly until the solver
        returns a finite value; candidates that raise or evaluate to
        infinity/NaN are discarded.

        Args:
            min_depth: Smallest nesting depth that may be drawn.
            max_depth: Largest nesting depth that may be drawn.
            max_length: Length budget passed to ``_generate_subexpression``.

        Returns:
            A tuple ``(expression_text, answer)``.

        Raises:
            ValueError: If ``min_depth`` is greater than ``max_depth``.
        """
        # Reject an empty depth range before doing anything else.  Otherwise
        # random.randint raises ValueError on every attempt and the retry
        # loop below would swallow it and spin forever.
        if min_depth > max_depth:
            raise ValueError(
                f"min_depth ({min_depth}) must not exceed max_depth ({max_depth})"
            )

        # Imported at call time, as before.
        # NOTE(review): presumably to avoid an import cycle with the solver
        # module -- confirm.
        from libs.bbeh_multistep_arithmetic.bbeh_arithmetic_solver import BBEHArithmeticSolver
        solver = BBEHArithmeticSolver()

        while True:
            # Drawn outside the try so that argument errors propagate to the
            # caller instead of being treated as a failed candidate.
            depth = random.randint(min_depth, max_depth)
            try:
                expression, _ = self._generate_subexpression(depth, 0, max_length)
                answer = solver.solve(expression)

                # Only finite answers are accepted; otherwise try again.
                if math.isfinite(answer):
                    return expression, answer
            except Exception as e:
                # A candidate that cannot be generated or solved is dropped.
                self.logger.warning("Expression generation failed: %s", e)

    def _validate_expression(self, tokens):
        """Return True when the parentheses in ``tokens`` are balanced.

        Only ``'('`` and ``')'`` tokens are inspected; every other token is
        ignored.  A ``')'`` without a preceding unmatched ``'('`` fails
        immediately.
        """
        unclosed = 0
        for tok in tokens:
            if tok == '(':
                unclosed += 1
            elif tok == ')':
                if unclosed == 0:
                    return False
                unclosed -= 1
        return unclosed == 0

    def _validate_expression_structure(self, tokens):
        """Check bracket balance and the operand/operator count relation.

        Returns True only when the parentheses in ``tokens`` are balanced and
        there is exactly one more operand than operators.  Every token that
        is neither a parenthesis nor listed in ``self.operators`` counts as
        an operand.
        """
        unclosed = 0
        n_operands = 0
        n_operators = 0

        for tok in tokens:
            if tok == '(':
                unclosed += 1
                continue
            if tok == ')':
                # A closing bracket with nothing open is an immediate failure.
                if unclosed == 0:
                    return False
                unclosed -= 1
                continue
            if tok in self.operators:
                n_operators += 1
            else:
                n_operands += 1

        # Balanced brackets first, then the count relation for binary operators.
        return unclosed == 0 and n_operands == n_operators + 1

    def _tokenize(self, expr):
        """将表达式转换为token列表"""
        try:
            tokens = []
            i = 0
            while i < len(expr):
                char = expr[i]

                # 处理空格
                if char.isspace():
                    i += 1
                    continue

                # 处理数字单词
                if char.isalpha() and (i == 0 or not expr[i - 1].isdigit()):
                    word = ''
                    while i < len(expr) and expr[i].isalpha():
                        word += expr[i]
                        i += 1
                    if word in self.number_words:
                        tokens.append(str(self.number_words[word]))
                    else:
                        raise ValueError(f"Unknown word: {word}")
                    continue

                # 处理数字（包括科学记数法）
                if char.isdigit() or (char == '-' and (not tokens or tokens[-1] in self.operators + ['('])):
                    num = char
                    i += 1
                    while i < len(expr) and (expr[i].isdigit() or expr[i] == '.'):
                        num += expr[i]
                        i += 1

                    if i < len(expr) and (expr[i] == 'e' or expr[i] == 'E'):
                        num += expr[i]
                        i += 1
                        if i < len(expr) and (expr[i] == '+' or expr[i] == '-'):
                            num += expr[i]
                            i += 1
                        while i < len(expr) and expr[i].isdigit():
                            num += expr[i]
                            i += 1

                    try:
                        float(num)
                        tokens.append(num)
                    except ValueError:
                        raise ValueError(f"Invalid number format: {num}")
                    continue

                # 处理括号
                if char in '()':
                    tokens.append(char)
                    i += 1
                    continue

                # 处理运算符
                if i < len(expr):
                    max_op_len = 3
                    matched = False
                    for length in range(max_op_len, 0, -1):
                        if i + length <= len(expr):
                            potential_op = expr[i:i + length]
                            if potential_op in self.operators:
                                tokens.append(potential_op)
                                i += length
                                matched = True
                                break
                    if not matched:
                        raise ValueError(f"Invalid character: {char}")

            if not self._validate_expression(tokens):
                raise ValueError("Invalid expression: Mismatched parentheses")

            return tokens
        except Exception as e:
            self.logger.error(f"Error in tokenization: {str(e)}")
            raise