mirror of https://github.com/NousResearch/atropos.git (synced 2026-04-30 17:40:36 +00:00)
added readme

parent 8deeb3a339
commit dd70053c90

3 changed files with 50 additions and 2 deletions
@@ -11,7 +11,15 @@ from atroposlib.envs.base import (
    BaseEnvConfig,
    EvalHandlingEnum,
    ScoredDataItem,
    ScoredDataGroup,
)
# from atroposlib.envs.base import (
#     BaseEnv,
#     BaseEnvConfig,
#     EvalHandlingEnum,
#     Item,
#     APIServerConfig,
# )
from atroposlib.type_definitions import Item

from environments.hack0.doctor_agent.patient import patient_profiles
@@ -45,6 +53,7 @@ final_message_prompt = final_message + " headache"
doctor_system_prompt = """You are a doctor. You are interacting with a patient.
You need to diagnose the patient based on the symptoms.
You will need to ask the patient follow up questions to diagnose them.
Ask up to 10 follow up questions. After that make your diagnosis.
Once you are confident in your diagnosis, provide it in the format:

The diagnosis is: {possible_illness}
@@ -174,6 +183,9 @@ class DoctorEnv(BaseEnv):
        # Grab a dedicated llm server to take advantage of caching
        async with self.server.dedicated_server() as server:

            scores = ScoredDataGroup()
            scores["scores"] = list()

            patient_messages = []
            doctor_messages = [{"role": "system", "content": doctor_system_prompt}]
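The two message lists above presumably feed an alternating doctor/patient chat loop. As a hedged sketch of a single doctor turn (the chat_completion call and its OpenAI-style return shape are assumptions about atroposlib, not verified against the library):

    # Assumed API shape: an OpenAI-style chat_completion on the dedicated server.
    chat_completion = await server.chat_completion(
        messages=doctor_messages, n=1, max_tokens=512
    )
    doctor_msg = chat_completion.choices[0].message.content
    doctor_messages.append({"role": "assistant", "content": doctor_msg})
    # The doctor's question becomes the next user turn for the patient model.
    patient_messages.append({"role": "user", "content": doctor_msg})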
@@ -224,9 +236,9 @@ class DoctorEnv(BaseEnv):
            # check output
            if doctor_msg.startswith(final_message):
                diagnosis = doctor_msg.strip(final_message)
-               diagnosis = diagnosis.strip()
+               diagnosis = diagnosis.strip().lower()

-               if diagnosis.contains(item["answer"]):
+               if diagnosis.contains(item["answer"].lower()):
                    score = 1
                else:
                    score = 0
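Note that Python's str type has no .contains() method, and str.strip(final_message) removes a set of characters rather than a prefix, so the check above would fail at runtime. A minimal runnable equivalent, reusing the names from the hunk, might be:

    # Sketch: slice off the prefix instead of strip(), and use `in`
    # for substring matching instead of a nonexistent .contains().
    if doctor_msg.startswith(final_message):
        diagnosis = doctor_msg[len(final_message):].strip().lower()
        score = 1 if item["answer"].lower() in diagnosis else 0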
@@ -260,6 +272,8 @@ class DoctorEnv(BaseEnv):
            else:
                masks.extend(curr_tokens[len(masks) :])

            scores["scores"].append(1.0 if score else -1.0)

            scored_data_item = ScoredDataItem(
                messages=doctor_messages,
                finish_reason=score,
@@ -267,6 +281,10 @@
                masks=masks,
                scores=score,
            )

            for score in scores["scores"]:
                self.percent_correct_buffer.append(max(score, 0))

            return scored_data_item, []

    async def get_next_item(self):
@@ -5,6 +5,8 @@ patient_profiles = [
Here are your symptoms:
{symptoms}.

Do not give the symptoms directly to the doctor in a single answer.

You are trying to get a diagnosis for your symptoms.

The doctor will ask you follow up questions to diagnose you.
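For context, the {symptoms} placeholder suggests each patient_profiles entry is a str.format template; a hypothetical rendering step (the symptom string is illustrative) could look like:

    # Illustrative only: assumes profiles are format templates keyed on `symptoms`.
    patient_system_prompt = patient_profiles[0].format(
        symptoms="persistent headache, blurred vision"
    )
    patient_messages = [{"role": "system", "content": patient_system_prompt}]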
environments/hack0/doctor_agent/readme.md (28 lines, new file)
@@ -0,0 +1,28 @@
Persona-Aware MedQA Benchmarking

In this project, we reimagined medical QA evaluation by introducing a persona filter: a layer that simulates real-world variability in patient communication styles. Using the MedQA dataset as our foundation, we infused each scenario with one of three distinct personas generated via xAI's language models:
1. The Cooperative Patient – open, verbose, and highly informative.
2. The Reluctant Patient – terse, vague, and occasionally evasive.
3. The Neutral Patient – brief but factually consistent.
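A minimal sketch of how such personas could be encoded as system-prompt templates (the wording below is illustrative, not the project's actual prompts):

    # Hypothetical persona prompts; the repository's real wording may differ.
    personas = {
        "cooperative": "You are an open, verbose patient who volunteers details freely.",
        "reluctant": "You are a terse, vague patient who answers only when pressed.",
        "neutral": "You are a brief patient who states facts plainly and consistently.",
    }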

The clinical challenge we explored is simple but critical: can a medical reasoning system consistently arrive at the correct diagnosis or treatment recommendation regardless of how the patient presents information?

Our pipeline works as follows (a code sketch follows the list):
- Each original MedQA item (stem + multiple-choice answers) is enriched with a synthetic patient interaction that simulates one of the three personas.
- We maintain the original clinical question and choices.
- Only the narrative context (the patient's communication) changes, testing robustness against dialogue variability.
- This mirrors how real doctors must interpret patient symptoms, which are often incomplete or colored by personality, emotion, or context.
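As a sketch of this enrichment step (the field names and helper below are hypothetical, not taken from the repository):

    # Hypothetical enrichment: only the narrative framing changes.
    def enrich_medqa_item(item: dict, persona_prompt: str) -> dict:
        return {
            "question": item["question"],       # original clinical stem
            "options": item["options"],         # original multiple-choice answers
            "answer": item["answer"],           # gold label, unchanged
            "persona_context": persona_prompt,  # persona-flavored patient narrative
        }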

Why this matters:

Most QA benchmarks assume a perfect narrator, but in the real world, AI systems in healthcare will need to make decisions with varying degrees of input clarity. Our approach stress-tests reasoning models under more human-like variability, offering a path toward safer and more empathetic medical AI.

Future Potential:

- Extendable to reinforcement learning pipelines where the agent adapts its questioning strategy based on persona.
- Can be used to benchmark bedside AI assistants, triage bots, or LLMs deployed in low-resource clinics.
- Encourages development of models that ask better follow-up questions, not just give answers.

By combining structured medical QA with naturalistic persona variation, our project brings a crucial human dimension to the next generation of AI-health interfaces.