# InternBootcamp/internbootcamp/bootcamp/bigcodebench/bigcodebench.py

import os
import re
import subprocess
import sys
import tempfile

from datasets import load_dataset

from internbootcamp.bootcamp.base import Basebootcamp

class BigCodeBenchbootcamp(Basebootcamp):
    """Bootcamp that scores generated code against per-case unittest suites.

    A case is read as a mapping: ``prompt_func`` uses its ``instruct_prompt``
    entry and ``_verify_correction`` uses its ``test`` entry (unittest source).
    """

    # Wall-clock budget, in seconds, for one unittest run of a candidate.
    VERIFY_TIMEOUT_SECONDS = 30

    def __init__(self, **params):
        """Forward all keyword parameters unchanged to ``Basebootcamp``."""
        super().__init__(**params)

    def case_generator(self):
        """Placeholder: no case is produced here; always returns ``None``."""
        return None

    @staticmethod
    def prompt_func(case) -> str:
        """Return ``case['instruct_prompt']`` with its first fence tagged as Python.

        The text ``"python "`` is inserted directly after the first triple
        backtick. A prompt that contains no triple backtick is returned as is.
        """
        prompt = case['instruct_prompt']
        fence = "```"
        head, found, tail = prompt.partition(fence)
        if not found:
            return prompt
        return head + fence + "python " + tail

    @staticmethod
    def extract_output(output):
        """Return the stripped body of the first python-tagged fenced block.

        Returns ``None`` when ``output`` contains no such block.
        """
        match = re.search(r'```python(.*?)```', output, re.DOTALL)
        return match.group(1).strip() if match else None

    @classmethod
    def _verify_correction(cls, solution, identity) -> float:
        """Run the case's unit tests against ``solution`` and return the pass rate.

        Args:
            solution: Candidate source code; it is written to ``solution.py``
                in a scratch directory. ``None`` (nothing extracted) scores 0.0.
            identity: Case mapping; ``identity['test']`` is the unittest source
                written to ``test_task.py``.

        Returns:
            Fraction of executed tests that neither failed nor errored, in
            ``[0.0, 1.0]``. 0.0 when the run times out or no test was run.
        """
        if solution is None:
            return 0.0

        test_code = identity['test']
        with tempfile.TemporaryDirectory() as temp_dir:
            # 1. Write the candidate solution.
            task_file = os.path.join(temp_dir, "solution.py")
            with open(task_file, "w", encoding="utf-8") as f:
                f.write(solution)

            # 2. Write the unittest file.
            # NOTE(review): the test source is written verbatim, so it has to
            # import what it needs from solution.py on its own; an earlier
            # auto-prepended "from task import task_func" line was disabled.
            # Confirm the dataset's test code performs that import.
            test_file = os.path.join(temp_dir, "test_task.py")
            with open(test_file, "w", encoding="utf-8") as f:
                f.write(test_code)

            # 3. Run unittest in a child interpreter.
            # NOTE(security): this executes untrusted generated code with no
            # isolation beyond the scratch directory and the timeout.
            try:
                completed = subprocess.run(
                    # Re-use the running interpreter instead of whatever
                    # "python" happens to resolve to on PATH.
                    [sys.executable or "python", "-m", "unittest", "test_task.py"],
                    cwd=temp_dir,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    # Candidate code may emit arbitrary bytes; never let a
                    # decode error abort the scoring.
                    errors="replace",
                    timeout=cls.VERIFY_TIMEOUT_SECONDS,
                )
            except subprocess.TimeoutExpired:
                # subprocess.run has already killed and reaped the child.
                return 0.0

        # 4. unittest writes its report to stderr.
        return cls._parse_pass_rate(completed.stderr)

    @staticmethod
    def _parse_pass_rate(report: str) -> float:
        """Compute the pass rate from unittest's textual report.

        Only the last ``Ran N test(s)`` line and the text following it are
        considered, so output printed by the code under test before the
        summary cannot be mistaken for the result counters.
        """
        ran_matches = list(re.finditer(r'Ran (\d+) tests?', report))
        if not ran_matches:
            return 0.0

        last_ran = ran_matches[-1]
        total_tests = int(last_ran.group(1))
        if total_tests <= 0:
            return 0.0

        summary = report[last_ran.end():]
        match_failed = re.search(r'failures=(\d+)', summary)
        match_error = re.search(r'errors=(\d+)', summary)
        failed_tests = int(match_failed.group(1)) if match_failed else 0
        error_tests = int(match_error.group(1)) if match_error else 0

        # A single test can be recorded more than once (e.g. an error in the
        # test body and another in tearDown), so clamp at zero.
        passed_tests = max(total_tests - failed_tests - error_tests, 0)
        return float(passed_tests) / float(total_tests)