mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-30 17:40:42 +00:00
update to tech report version (#10)
* feat(run_eval): add checkpoint resume functionality and update example documentation; - update new bootcamp benchmark dataset * refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation * docs: update bootcamp list and add new scripts - Update Fulllist_InternBootcamp.md with new bootcamps and categories - Add new scripts to .gitignore: - examples/pipelines/filter_autogen_configs.py - examples/pipelines/quickgen_data_configs_from_eval_meta.py - Update dependencies in setup.py: - Add scipy and scikit-learn * refactor(internbootcamp): update bootcamp modules and improve error handling - Update import statements in __init__.py files - Add timestamp to target directory name in verl_data_preprocess.py - Improve error handling and scoring logic in bootcamp_judger.py - Remove unnecessary comments and update puzzle descriptions in multiple files
This commit is contained in:
parent
125a7818e0
commit
a8249acc18
2952 changed files with 105460 additions and 17649 deletions
|
|
@ -0,0 +1,335 @@
|
|||
import ast
|
||||
from internbootcamp.bootcamp.base import Basebootcamp
|
||||
import re
|
||||
import ast
|
||||
import random
|
||||
|
||||
|
||||
# Reference table for complete-blood-count (CBC) items, keyed by the Chinese
# item name.  The keys are runtime strings embedded in generated prompts — do
# NOT translate them.  Each entry holds:
#   "abbr"            - standard laboratory abbreviation (e.g. "WBC")
#   "unit_value"      - numeric scale of the unit (e.g. 1e9 for "1e+09/L")
#   "unit_name"       - unit suffix shown to the model ("/L", "g/L", "%", ...)
#   "reference_range" - (low, high) healthy interval in the above unit
# NOTE(review): ranges such as HGB 130-175 g/L and RBC 4.3-5.8e12/L look like
# adult-male reference intervals — confirm the intended population/source.
BLOOD_TEST_REFERENCE = {
    "白细胞计数": {  # white blood cell count
        "abbr": "WBC",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (3.5, 9.5)
    },
    "红细胞计数": {  # red blood cell count
        "abbr": "RBC",
        "unit_value": 1e12,
        "unit_name": "/L",
        "reference_range": (4.3, 5.8)
    },
    "血红蛋白": {  # hemoglobin
        "abbr": "HGB",
        "unit_value": 1,
        "unit_name": "g/L",
        "reference_range": (130, 175)
    },
    "红细胞比容": {  # hematocrit
        "abbr": "HCT",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (40, 50)
    },
    "平均红细胞容积": {  # mean corpuscular volume
        "abbr": "MCV",
        "unit_value": 1,
        "unit_name": "fL",
        "reference_range": (82, 100)
    },
    "平均红细胞血红蛋白量": {  # mean corpuscular hemoglobin
        "abbr": "MCH",
        "unit_value": 1,
        "unit_name": "pg",
        "reference_range": (27, 34)
    },
    "平均红细胞血红蛋白浓度": {  # mean corpuscular hemoglobin concentration
        "abbr": "MCHC",
        "unit_value": 1,
        "unit_name": "g/L",
        "reference_range": (316, 354)
    },
    "血小板": {  # platelet count
        "abbr": "PLT",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (125, 350)
    },
    "红细胞分布宽度标准差": {  # red cell distribution width, SD
        "abbr": "RDW-SD",
        "unit_value": 1,
        "unit_name": "fL",
        "reference_range": (30, 54)
    },
    "红细胞分布宽度变异系数": {  # red cell distribution width, CV
        "abbr": "RDW-c",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (0, 14.1)
    },
    "血小板体积分布宽度": {  # platelet distribution width
        "abbr": "PDW",
        "unit_value": 1,
        "unit_name": "fL",
        "reference_range": (9, 17)
    },
    "平均血小板体积": {  # mean platelet volume
        "abbr": "MPV",
        "unit_value": 1,
        "unit_name": "fL",
        "reference_range": (9, 13)
    },
    "大血小板比率": {  # platelet-large-cell ratio
        "abbr": "P-LCR",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (17.5, 30)
    },
    "血小板体积分数": {  # plateletcrit
        "abbr": "PCT",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (0.13, 0.35)
    },
    "中性粒细胞绝对值": {  # absolute neutrophil count
        "abbr": "NEUT#",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (1.8, 6.3)
    },
    "淋巴细胞绝对值": {  # absolute lymphocyte count
        "abbr": "LYMPH#",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (1.1, 3.2)
    },
    "单核细胞绝对值": {  # absolute monocyte count
        "abbr": "MONO#",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (0.1, 0.6)
    },
    "嗜酸细胞绝对值": {  # absolute eosinophil count
        "abbr": "EO#",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (0.02, 0.52)
    },
    "嗜碱细胞绝对值": {  # absolute basophil count
        "abbr": "BASO#",
        "unit_value": 1e9,
        "unit_name": "/L",
        "reference_range": (0, 0.06)
    },
    "中性粒细胞比率": {  # neutrophil percentage
        "abbr": "NEUT%",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (40, 75)
    },
    "淋巴细胞比率": {  # lymphocyte percentage
        "abbr": "LYMPH%",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (20, 50)
    },
    "单核细胞比率": {  # monocyte percentage
        "abbr": "MONO%",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (3, 10)
    },
    "嗜酸细胞比率": {  # eosinophil percentage
        "abbr": "EO%",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (0.4, 8)
    },
    "嗜碱细胞比率": {  # basophil percentage
        "abbr": "BASO%",
        "unit_value": 1,
        "unit_name": "%",
        "reference_range": (0, 1)
    },
}
class BloodTestJudgementbootcamp(Basebootcamp):
    """Bootcamp task: judge synthetic blood-test values as low/normal/high.

    ``case_generator`` draws a random subset of CBC items from
    ``BLOOD_TEST_REFERENCE`` with a plausible value each, ``prompt_func``
    renders them into a prompt for an LLM, ``extract_output`` parses the
    model's answer list, and ``_verify_correction`` compares it to the
    ground-truth labels (-1 low, 0 normal, 1 high).
    """

    def __init__(self, seed: int = 0, max_num_items: int = 24, augment_unit: bool = False):
        """
        Args:
            seed: seed for the module-global ``random`` RNG (global seeding is
                kept for backward compatibility with existing pipelines).
            max_num_items: upper bound on items per case; clamped to
                [1, number of reference items].
            augment_unit: if True, non-percentage values are randomly rescaled
                by a power of ten together with their unit.
        """
        random.seed(seed)
        self.references = BLOOD_TEST_REFERENCE
        self.max_num_items = min(max(max_num_items, 1), len(self.references))
        self.augment_unit = augment_unit

    def case_generator(self):
        """
        Randomly select >=1 blood-test items from ``self.references``, generate
        a plausible random value for each (within / slightly out / greatly out
        of the reference range), optionally rescale non-percentage values by a
        random power of ten, and label each FINAL displayed value against its
        reference range.

        Returns:
            dict: {
                "item_names": [<item_name>, ...],
                "values": [<float>, ...],
                "unit_values": [<unit_scale>, ...],
                "unit_names": [<unit_str>, ...],
                "ground_truths": [<-1 | 0 | 1>, ...]
            }
        """
        # 1. pick a random non-empty subset of items
        all_items = list(self.references.keys())
        k = random.randint(1, self.max_num_items)
        items = random.sample(all_items, k)

        # 2. randomly generate value and unit for each item
        out_item_names, out_values = [], []
        out_unit_names, out_unit_values = [], []
        out_results = []
        for item_name in items:
            ref = self.references[item_name]
            ref_min, ref_max = ref["reference_range"]
            uv = ref["unit_value"]
            un = ref["unit_name"]

            # draw from mean +/- 4*std, floored at 10% of the lower bound so
            # the value stays physically plausible (non-negative)
            mean = (ref_min + ref_max) / 2
            std = (ref_max - ref_min) / 2
            low = max(0.1 * ref_min, mean - 4 * std)
            high = mean + 4 * std
            val = random.uniform(low, high)

            # BUGFIX: percentage values used to be clamped to [0, 1] AFTER the
            # ground truth was computed, corrupting items such as NEUT% whose
            # reference range is (40, 75): a "normal" 57 was displayed as 1.
            # Percentages here are on a 0-100 scale, so clamp to that range
            # BEFORE deriving the label.
            if un == "%":
                val = min(max(val, 0.0), 100.0)

            # apply random power-of-10 rescaling for non-% units
            if self.augment_unit and un != "%":
                exp = random.choice([-2, -1, 0, 1, 2])
            else:
                exp = 0
            new_val = round(val / (10 ** exp), 2)
            new_uv = uv * (10 ** exp)

            # BUGFIX: label the value actually shown (after rounding and
            # rescaling) so rounding can no longer move a value across a
            # range boundary relative to its ground truth.
            effective = new_val * (10 ** exp)
            if ref_min < effective < ref_max:
                result = 0
            elif effective <= ref_min:
                result = -1
            else:
                result = 1

            out_item_names.append(item_name)
            out_values.append(new_val)
            out_unit_values.append(new_uv)
            out_unit_names.append(un)
            out_results.append(result)

        return {
            "item_names": out_item_names,
            "values": out_values,
            "unit_values": out_unit_values,
            "unit_names": out_unit_names,
            "ground_truths": out_results
        }

    def prompt_func(self, identity):
        """
        Build a human-readable prompt asking an LLM to judge each blood-test
        item as low/normal/high using its own knowledge of human physiology
        (the reference ranges are deliberately NOT included in the prompt).
        """
        lines = []
        for name, val, uv, un in zip(
            identity["item_names"],
            identity["values"],
            identity["unit_values"],
            identity["unit_names"]
        ):
            # e.g. "1e+09/L" for scaled units, plain "g/L" / "%" otherwise
            unit_str = un if uv == 1 else f"{uv:.0e}{un}"
            abbr = self.references[name]["abbr"]
            lines.append(f"- {name} ({abbr}): {val} ({unit_str})")

        prompt = (
            "You are a medical expert in hematology and clinical laboratory interpretation. Below are "
            "several blood test results:\n\n"
            + "\n".join(lines)
            + "\n\nFor each result, use your knowledge of healthy reference ranges of each item to determine "
            "whether the value is too low, normal, or too high. Use one of the following integers for each "
            # BUGFIX: "heathly" -> "healthy" (typo in the model-facing prompt)
            "judgement: -1 (too low), 0 (within healthy reference range), or 1 (too high). You MUST output "
            "**ONLY ONE** Python-style list for your judgements (e.g. `[-1, 0, 1]`) in the same order as "
            "the input blood test results."
        )

        return prompt

    @staticmethod
    def extract_output(output: str):
        """
        Parse the LLM's raw text and return the last valid Python list of
        integers as a list of ints. Ignores surrounding commentary and any
        earlier lists. Returns [] when no valid list can be extracted.
        """
        pattern = re.compile(r'\[\s*-?\d+\s*(?:,\s*-?\d+\s*)*\]')
        matches = pattern.findall(output)
        if not matches:
            return []

        try:
            # safely turn the string into a list (never eval raw model output)
            solution = ast.literal_eval(matches[-1])
        except (ValueError, SyntaxError):
            return []
        if isinstance(solution, (list, tuple)) and all(isinstance(x, (int, float)) for x in solution):
            return [int(t) for t in solution]
        # BUGFIX: previously fell through here and implicitly returned None,
        # breaking callers that expect a list.
        return []

    @classmethod
    def _verify_correction(cls, solution, identity):
        """Return True iff *solution* matches ``identity['ground_truths']`` exactly."""
        ground_truths = identity['ground_truths']

        # a wrong-length answer can never be correct
        if len(solution) != len(ground_truths):
            return False

        return all(x == y for x, y in zip(solution, ground_truths))
if __name__ == "__main__":
    # Smoke-test the bootcamp end to end: generate a case, render its prompt,
    # then score one simulated correct reply and one simulated wrong reply.
    camp = BloodTestJudgementbootcamp(seed=42)  # fixed seed so the run is reproducible

    puzzle = camp.case_generator()
    print("--------------------------------")
    print("Generated Case:", puzzle)

    question = camp.prompt_func(puzzle)
    print("Prompt:", question)
    print("--------------------------------")

    # Simulated model reply that embeds the exact ground-truth list.
    good_reply = f"some reasoning process...{puzzle['ground_truths']}"
    parsed_good = camp.extract_output(good_reply)
    print("Extracted Answer:", parsed_good)
    print("--------------------------------")

    # Exact-match verification of the parsed answer.
    print("Is Correct:", camp._verify_correction(parsed_good, puzzle))
    print("--------------------------------")

    # Score via the base-class scoring entry point.
    print("Score:", camp.verify_score(good_reply, puzzle))
    print("--------------------------------")

    # Simulated wrong reply: a tuple, which the extractor does not accept.
    bad_reply = "some reasoning process...(-1, 1, 1, 0, 0, -1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, -1)"
    parsed_bad = camp.extract_output(bad_reply)
    print("Extracted Answer Error:", parsed_bad)
    print("--------------------------------")

    print("Is Correct Error:", camp._verify_correction(parsed_bad, puzzle))
    print("--------------------------------")

    print("Score Error:", camp.verify_score(bad_reply, puzzle))
    print("--------------------------------")
Loading…
Add table
Add a link
Reference in a new issue