InternBootcamp/internbootcamp/bootcamp/blood_test_judgement/blood_test_judgement.py
Yongkang Chen a8249acc18
update to tech report version (#10)
* feat(run_eval): add checkpoint resume functionality and update example documentation;
- update new bootcamp benchmark dataset

* refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation

* docs: update bootcamp list and add new scripts

- Update Fulllist_InternBootcamp.md with new bootcamps and categories
- Add new scripts to .gitignore:
  - examples/pipelines/filter_autogen_configs.py
  - examples/pipelines/quickgen_data_configs_from_eval_meta.py
- Update dependencies in setup.py:
  - Add scipy and scikit-learn

* refactor(internbootcamp): update bootcamp modules and improve error handling

- Update import statements in __init__.py files
- Add timestamp to target directory name in verl_data_preprocess.py
- Improve error handling and scoring logic in bootcamp_judger.py
- Remove unnecessary comments and update puzzle descriptions in multiple files
2025-08-28 12:39:47 +08:00

335 lines
No EOL
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import ast
from internbootcamp.bootcamp.base import Basebootcamp
import re
import ast
import random
# Reference table for routine complete-blood-count (CBC) items.
# Each row: (Chinese item name, abbreviation, unit multiplier, unit symbol,
# (healthy lower bound, healthy upper bound)). Row order is preserved in the
# dict below, which matters for seeded random sampling downstream.
_CBC_ROWS = (
    ("白细胞计数", "WBC", 1e9, "/L", (3.5, 9.5)),
    ("红细胞计数", "RBC", 1e12, "/L", (4.3, 5.8)),
    ("血红蛋白", "HGB", 1, "g/L", (130, 175)),
    ("红细胞比容", "HCT", 1, "%", (40, 50)),
    ("平均红细胞容积", "MCV", 1, "fL", (82, 100)),
    ("平均红细胞血红蛋白量", "MCH", 1, "pg", (27, 34)),
    ("平均红细胞血红蛋白浓度", "MCHC", 1, "g/L", (316, 354)),
    ("血小板", "PLT", 1e9, "/L", (125, 350)),
    ("红细胞分布宽度标准差", "RDW-SD", 1, "fL", (30, 54)),
    ("红细胞分布宽度变异系数", "RDW-c", 1, "%", (0, 14.1)),
    ("血小板体积分布宽度", "PDW", 1, "fL", (9, 17)),
    ("平均血小板体积", "MPV", 1, "fL", (9, 13)),
    ("大血小板比率", "P-LCR", 1, "%", (17.5, 30)),
    ("血小板体积分数", "PCT", 1, "%", (0.13, 0.35)),
    ("中性粒细胞绝对值", "NEUT#", 1e9, "/L", (1.8, 6.3)),
    ("淋巴细胞绝对值", "LYMPH#", 1e9, "/L", (1.1, 3.2)),
    ("单核细胞绝对值", "MONO#", 1e9, "/L", (0.1, 0.6)),
    ("嗜酸细胞绝对值", "EO#", 1e9, "/L", (0.02, 0.52)),
    ("嗜碱细胞绝对值", "BASO#", 1e9, "/L", (0, 0.06)),
    ("中性粒细胞比率", "NEUT%", 1, "%", (40, 75)),
    ("淋巴细胞比率", "LYMPH%", 1, "%", (20, 50)),
    ("单核细胞比率", "MONO%", 1, "%", (3, 10)),
    ("嗜酸细胞比率", "EO%", 1, "%", (0.4, 8)),
    ("嗜碱细胞比率", "BASO%", 1, "%", (0, 1)),
)

# name -> {"abbr", "unit_value", "unit_name", "reference_range"}
BLOOD_TEST_REFERENCE = {
    name: {
        "abbr": abbr,
        "unit_value": unit_value,
        "unit_name": unit_name,
        "reference_range": reference_range,
    }
    for name, abbr, unit_value, unit_name, reference_range in _CBC_ROWS
}
class BloodTestJudgementbootcamp(Basebootcamp):
    """Bootcamp task: judge whether blood-test (CBC) values are low/normal/high.

    Cases are sampled from ``BLOOD_TEST_REFERENCE``; the model must answer with
    a Python-style list of integer judgements (-1 low, 0 normal, 1 high), one
    per presented item, in presentation order.
    """

    def __init__(self, seed: int = 0, max_num_items: int = 24, augment_unit: bool = False):
        """
        Args:
            seed: Seed for the random number generator. NOTE: this seeds the
                *global* ``random`` module, so it also affects other users of
                ``random`` in the same process.
            max_num_items: Upper bound on items per case; clamped to
                ``[1, len(BLOOD_TEST_REFERENCE)]``.
            augment_unit: If True, rescale non-percentage values by a random
                power of ten while adjusting the reported unit accordingly.
        """
        random.seed(seed)
        self.references = BLOOD_TEST_REFERENCE
        self.max_num_items = min(max(max_num_items, 1), len(self.references))
        self.augment_unit = augment_unit

    def case_generator(self) -> dict:
        """
        Randomly select >=1 blood-test items from self.references, generate a
        plausible random value for each (within / slightly out / greatly out of
        the reference range), then for non-percentage units optionally apply a
        random power-of-10 scaling to (value, unit_value).

        Returns:
            dict: {
                "item_names": [<item_name>, ...],
                "values": [<float>, ...],
                "unit_names": [<unit_str>, ...],
                "unit_values": [<unit_val>, ...],
                "ground_truths": [<int>, ...]   # -1 low, 0 normal, 1 high
            }
        """
        # 1. Pick a random non-empty subset of items.
        all_items = list(self.references.keys())
        k = random.randint(1, self.max_num_items)
        items = random.sample(all_items, k)

        # 2. Generate a value and unit for each item.
        out_item_names, out_values = [], []
        out_unit_names, out_unit_values = [], []
        out_results = []
        for item_name in items:
            ref_min, ref_max = self.references[item_name]["reference_range"]
            # Draw within +/- 4 half-widths of the range centre, never below
            # 10% of the lower bound (keeps values non-negative and plausible).
            mean = (ref_min + ref_max) / 2
            std = (ref_max - ref_min) / 2
            low = max(0.1 * ref_min, mean - 4 * std)
            high = mean + 4 * std
            val = random.uniform(low, high)

            # Fetch the item's canonical unit.
            uv = self.references[item_name]["unit_value"]
            un = self.references[item_name]["unit_name"]
            # Apply a random power-of-10 rescaling for non-% units.
            if self.augment_unit and (un != "%"):
                exp = random.choice([-2, -1, 0, 1, 2])
                new_val = val / (10 ** exp)
                new_uv = uv * (10 ** exp)
            else:
                new_val = val
                new_uv = uv
            new_val = round(new_val, 2)
            if un == '%':
                # BUGFIX: percentages were clamped to [0, 1], which corrupted
                # items such as NEUT% whose healthy range is 40-75. Percentages
                # live on a 0-100 scale.
                new_val = min(max(new_val, 0.0), 100.0)

            # Compute the ground truth from the value the model actually sees
            # (after rounding/clamping), mapped back to the canonical unit, so
            # the label is always consistent with the displayed number.
            effective = new_val * new_uv / uv
            if effective <= ref_min:
                result = -1
            elif effective >= ref_max:
                result = 1
            else:
                result = 0

            out_item_names.append(item_name)
            out_values.append(new_val)
            out_unit_values.append(new_uv)
            out_unit_names.append(un)
            out_results.append(result)

        return {
            "item_names": out_item_names,
            "values": out_values,
            "unit_values": out_unit_values,
            "unit_names": out_unit_names,
            "ground_truths": out_results
        }

    def prompt_func(self, identity: dict) -> str:
        """
        Build a human-readable prompt asking an LLM to judge each blood-test
        item as low/normal/high. The LLM should rely on its own internal
        knowledge of human physiology.
        """
        lines = []
        for name, val, uv, un in zip(
            identity["item_names"],
            identity["values"],
            identity["unit_values"],
            identity["unit_names"]
        ):
            # Show the unit multiplier in scientific notation unless it is 1.
            unit_str = un if uv == 1 else f"{uv:.0e}{un}"
            abbr = self.references[name]["abbr"]
            lines.append(f"- {name} ({abbr}): {val} ({unit_str})")
        prompt = (
            "You are a medical expert in hematology and clinical laboratory interpretation. Below are "
            "several blood test results:\n\n"
            + "\n".join(lines)
            # BUGFIX: corrected the "heathly" typo in the instruction text.
            + "\n\nFor each result, use your knowledge of healthy reference ranges of each item to determine "
            "whether the value is too low, normal, or too high. Use one of the following integers for each "
            "judgement: -1 (too low), 0 (within healthy reference range), or 1 (too high). You MUST output "
            "**ONLY ONE** Python-style list for your judgements (e.g. `[-1, 0, 1]`) in the same order as "
            "the input blood test results."
        )
        return prompt

    @staticmethod
    def extract_output(output: str) -> list:
        """
        Parse the LLM's raw text and return the last valid Python list of ints.
        Ignores surrounding commentary and any earlier lists; returns [] when
        no bracketed integer list is found or the last match is not a numeric
        list/tuple.
        """
        pattern = re.compile(r'\[\s*-?\d+\s*(?:,\s*-?\d+\s*)*\]')
        matches = pattern.findall(output)
        if not matches:
            return []
        try:
            # Safely turn the matched string into a Python object.
            solution = ast.literal_eval(matches[-1])
        except (ValueError, SyntaxError):
            return []
        if isinstance(solution, (list, tuple)) and all(isinstance(x, (int, float)) for x in solution):
            return [int(t) for t in solution]
        # BUGFIX: previously fell through and implicitly returned None here,
        # which made _verify_correction crash on len(None).
        return []

    @classmethod
    def _verify_correction(cls, solution, identity) -> bool:
        """Return True iff `solution` matches `identity['ground_truths']` exactly."""
        ground_truths = identity['ground_truths']
        if len(solution) != len(ground_truths):
            return False
        return all(x == y for x, y in zip(solution, ground_truths))
if __name__ == "__main__":
    SEP = "--------------------------------"

    # Initialize the bootcamp task with a fixed seed for reproducible output.
    bootcamp = BloodTestJudgementbootcamp(seed=42)

    # Generate a test case and show it together with its prompt.
    case = bootcamp.case_generator()
    print(SEP)
    print("Generated Case:", case)
    prompt = bootcamp.prompt_func(case)
    print("Prompt:", prompt)
    print(SEP)

    # Simulate a model response by echoing the ground truths.
    model_output = f"some reasoning process...{case['ground_truths']}"
    extracted_answer = bootcamp.extract_output(model_output)
    print("Extracted Answer:", extracted_answer)
    print(SEP)

    # Verify the simulated answer and its score.
    is_correct = bootcamp._verify_correction(extracted_answer, case)
    print("Is Correct:", is_correct)
    print(SEP)
    score = bootcamp.verify_score(model_output, case)
    print("Score:", score)
    print(SEP)

    # A deliberately malformed answer (tuple syntax, not a bracketed list).
    model_output_error = "some reasoning process...(-1, 1, 1, 0, 0, -1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, -1)"
    extracted_answer_error = bootcamp.extract_output(model_output_error)
    print("Extracted Answer Error:", extracted_answer_error)
    print(SEP)

    # Verify the malformed answer and its score.
    is_correct_error = bootcamp._verify_correction(extracted_answer_error, case)
    print("Is Correct Error:", is_correct_error)
    print(SEP)
    score_error = bootcamp.verify_score(model_output_error, case)
    print("Score Error:", score_error)
    print(SEP)