update to tech report version (#10)

* feat(run_eval): add checkpoint resume functionality and update example documentation;
  - update new bootcamp benchmark dataset
* refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation
* docs: update bootcamp list and add new scripts
  - Update Fulllist_InternBootcamp.md with new bootcamps and categories
  - Add new scripts to .gitignore:
    - examples/pipelines/filter_autogen_configs.py
    - examples/pipelines/quickgen_data_configs_from_eval_meta.py
  - Update dependencies in setup.py:
    - Add scipy and scikit-learn
* refactor(internbootcamp): update bootcamp modules and improve error handling
  - Update import statements in __init__.py files
  - Add timestamp to target directory name in verl_data_preprocess.py
  - Improve error handling and scoring logic in bootcamp_judger.py
  - Remove unnecessary comments and update puzzle descriptions in multiple files
2026-04-28 17:29:37 +00:00 · 2025-08-28 12:39:47 +08:00 · 2025-08-28 12:39:47 +08:00 · a8249acc18
commit a8249acc18
parent 125a7818e0
2952 changed files with 105460 additions and 17649 deletions
--- a/internbootcamp/bootcamp/med_calculator/med_calculator.py
+++ b/internbootcamp/bootcamp/med_calculator/med_calculator.py
@@ -46,10 +46,12 @@ def last_boxed_only_string(string):
    return retval

 class Medcalculatorbootcamp(Basebootcamp): # 医学计算器类
-    def __init__(self, conf_file="./internbootcamp/libs/med_calculator/med_calculator.json", seed=None):
+    def __init__(self, conf_file="./internbootcamp/libs/med_calculator/med_calculator.json", seed=None, add_rule_ratio=0.25):
        random.seed(seed)
+        self.add_rule_ratio = add_rule_ratio
        with open(conf_file, "r", encoding="utf-8") as f:
            self.config = json.load(f)
+        self.rules = self._construct_rules()

    def _gen_a_case(self, category, name):
        details = self.config[category][name]
@@ -60,48 +62,59 @@ class Medcalculatorbootcamp(Basebootcamp): # 医学计算器类
            case 'equation':
                while 1:
                    formula = details["formula"]
-                    for i in details["inputs"]:
-                        match indicators[i]["type"]:
-                            case "int":
-                                v = random.randint(*indicators[i]["range"])
-                            case "float":
-                                v = random.uniform(*indicators[i]["range"])
-                                if 'precision' in indicators[i]:
-                                    v = round(v, indicators[i]["precision"])
-                            case "choice":
-                                v = random.choice(indicators[i]["range"])
-
-                        t = i + str(v) + indicators[i].get("unit", "")
-                        inputs.append(t)
-                        formula = formula.replace(i, str(v))
-
                    try:
+                        for i in details["inputs"]:
+                            match indicators[i]["type"]:
+                                case "int":
+                                    k = v = random.randint(*indicators[i]["range"])
+                                case "float":
+                                    v = random.uniform(*indicators[i]["range"])
+                                    if 'precision' in indicators[i]:
+                                        v = round(v, indicators[i]["precision"])
+                                    k = v
+                                case "choice":
+                                    rg = indicators[i]["range"]
+                                    if (t := type(rg)) is list:
+                                        k = v = random.choice(indicators[i]["range"])
+                                    elif t is dict:
+                                        k = random.choice(list(rg.keys()))
+                                        v = rg[k]
+
+                            inp = i.split('[')[0] + str(k) + indicators[i].get("unit", "")
+                            inputs.append(inp)
+                            formula = formula.replace(i, str(v))
+
                        target = eval(formula)
                        break
                    except (ZeroDivisionError, ValueError):
                        pass
                    except Exception as e:
+                        e.args=(*e.args, name, '公式：'+details["formula"], '带入数值：'+formula)
                        raise e
-                        # print(name, formula, details["formula"])
-                        # breakpoint()
                out_k = name.split('—')[-1]
                if 'precision' in indicators[out_k]:
                    target = round(target, indicators[out_k]["precision"])
            case 'scale':
                target = 0
-                for title, options in details["points"].items():
-                    if isinstance(options, dict):
-                        selected_option = random.choice(list(options.keys()))
-                        inputs.append(f'{title}: {selected_option}')
-                        target += options[selected_option]
-                    else:
-                        inputs.append(title)
-                        target += options
+                try:
+                    for title, options in details["points"].items():
+                        if options['type'] == '单选':
+                            selected_option = random.choice(list(options['items'].keys()))
+                            inputs.append(f'{title}: {selected_option}')
+                            target += options['items'][selected_option]
+                        elif options['type'] == '多选':
+                            selected_options = random.sample(list(options['items'].keys()), k=random.randint(1, len(options['items'])))
+                            inputs.append(f'{title}: {"、".join(selected_options)}')
+                            target += sum(options['items'][opt] for opt in selected_options)
+                except Exception as e:
+                    e.args=(*e.args, name)
+                    raise e

        ret = {
            "category": category,
            "name": name,
            "inputs": inputs,
+            "add_rule": random.random() < self.add_rule_ratio,
            "target": target,
        }
        return ret
@@ -111,8 +124,36 @@ class Medcalculatorbootcamp(Basebootcamp): # 医学计算器类
        name = random.choice(list(self.config[category].keys()))
        return self._gen_a_case(category, name)

+    def _construct_rules(self):
+        rules = {}
+        for name, details in self.config["equation"].items():
+            formula = details["formula"]
+            explanation_arr = []
+            for i in details["inputs"]:
+                indicator = self.config["indicator"][i]
+                if indicator["type"] == "choice":
+                    j = i.split('[')[0]
+                    t = f'{j}：'+ '，'.join([f'{k}：{v}' for k,v in indicator["range"].items()])
+                    formula = formula.replace(i, j)
+                    explanation_arr.append(t)
+            formula = formula.replace('math.', '')
+
+            explanation = ''
+            if explanation_arr:
+                explanation = "公式说明：" + '；'.join(explanation_arr) + "\n"
+
+            rules[name] = f"{name}的计算公式：{formula}\n{explanation}\n"
+        for name, details in self.config["scale"].items():
+            rules[name] = f"{name}量表的评分标准：\n"
+            for title, options in details["points"].items():
+                rules[name] += f"[{title}][{options['type']}]\n"
+                rules[name] += '\n'.join([f'{k}（{v}分）' for k, v in options['items'].items()]) + "\n"
+            rules[name] += "\n"
+        return rules
+
    def prompt_func(self, case):
        indicators = self.config["indicator"]
+        random.shuffle(case["inputs"])
        inp_items = '，'.join(case["inputs"])
        out_item = case["name"]

@@ -123,7 +164,8 @@ class Medcalculatorbootcamp(Basebootcamp): # 医学计算器类
                if 'precision' in indicators[out_name]:
                    other_item = f"，保留{indicators[out_name]['precision']}位小数"

-        instruction = f"患者信息：{inp_items}。请计算{out_item}{other_item}。"
+        rule = self.rules[case["name"]] if case["add_rule"] else ""
+        instruction = f"{rule}患者信息：{inp_items}。请计算{out_item}{other_item}。"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{xxx:xxx}. For example "\\boxed{BMI: 20.5}"."""
        prompt = instruction + '\n' + instruction_following
        return prompt