mirror of https://github.com/NousResearch/atropos.git (synced 2026-04-30 17:40:36 +00:00)
added readme

parent 8deeb3a339
commit dd70053c90

3 changed files with 50 additions and 2 deletions
@@ -11,7 +11,15 @@ from atroposlib.envs.base import (
    BaseEnvConfig,
    EvalHandlingEnum,
    ScoredDataItem,
    ScoredDataGroup,
)
# from atroposlib.envs.base import (
#     BaseEnv,
#     BaseEnvConfig,
#     EvalHandlingEnum,
#     Item,
#     APIServerConfig,
# )
from atroposlib.type_definitions import Item

from environments.hack0.doctor_agent.patient import patient_profiles
@@ -45,6 +53,7 @@ final_message_prompt = final_message + " headache"
doctor_system_prompt = """You are a doctor. You are interacting with a patient.
You need to diagnose the patient based on the symptoms.
You will need to ask the patient follow up questions to diagnose them.
Ask up to 10 follow up questions. After that make your diagnosis.
Once you are confident in your diagnosis, provide it in the format:

The diagnosis is: {possible_illness}
@@ -174,6 +183,9 @@ class DoctorEnv(BaseEnv):
        # Grab a dedicated llm server to take advantage of caching
        async with self.server.dedicated_server() as server:

            scores = ScoredDataGroup()
            scores["scores"] = list()

            patient_messages = []
            doctor_messages = [{"role": "system", "content": doctor_system_prompt}]
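The two message lists above presumably feed an alternating doctor/patient chat loop. As a hedged sketch of a single doctor turn (the chat_completion call and its OpenAI-style return shape are assumptions about atroposlib, not verified against the library):

    # Assumed API shape: an OpenAI-style chat_completion on the dedicated server.
    chat_completion = await server.chat_completion(
        messages=doctor_messages, n=1, max_tokens=512
    )
    doctor_msg = chat_completion.choices[0].message.content
    doctor_messages.append({"role": "assistant", "content": doctor_msg})
    # The doctor's question becomes the next user turn for the patient model.
    patient_messages.append({"role": "user", "content": doctor_msg})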
@@ -224,9 +236,9 @@ class DoctorEnv(BaseEnv):
            # check output
            if doctor_msg.startswith(final_message):
                diagnosis = doctor_msg.strip(final_message)
-               diagnosis = diagnosis.strip()
+               diagnosis = diagnosis.strip().lower()

-               if diagnosis.contains(item["answer"]):
+               if diagnosis.contains(item["answer"].lower()):
                    score = 1
                else:
                    score = 0
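Note that Python's str type has no .contains() method, and str.strip(final_message) removes a set of characters rather than a prefix, so the check above would fail at runtime. A minimal runnable equivalent, reusing the names from the hunk, might be:

    # Sketch: slice off the prefix instead of strip(), and use `in`
    # for substring matching instead of a nonexistent .contains().
    if doctor_msg.startswith(final_message):
        diagnosis = doctor_msg[len(final_message):].strip().lower()
        score = 1 if item["answer"].lower() in diagnosis else 0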
@@ -260,6 +272,8 @@ class DoctorEnv(BaseEnv):
            else:
                masks.extend(curr_tokens[len(masks) :])

            scores["scores"].append(1.0 if score else -1.0)

            scored_data_item = ScoredDataItem(
                messages=doctor_messages,
                finish_reason=score,
@@ -267,6 +281,10 @@
                masks=masks,
                scores=score,
            )

            for score in scores["scores"]:
                self.percent_correct_buffer.append(max(score, 0))

            return scored_data_item, []

    async def get_next_item(self):
@@ -5,6 +5,8 @@ patient_profiles = [
Here are your symptoms:
{symptoms}.

Do not give the symptoms directly to the doctor in a single answer.

You are trying to get a diagnosis for your symptoms.

The doctor will ask you follow up questions to diagnose you.
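For context, the {symptoms} placeholder suggests each patient_profiles entry is a str.format template; a hypothetical rendering step (the symptom string is illustrative) could look like:

    # Illustrative only: assumes profiles are format templates keyed on `symptoms`.
    patient_system_prompt = patient_profiles[0].format(
        symptoms="persistent headache, blurred vision"
    )
    patient_messages = [{"role": "system", "content": patient_system_prompt}]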
environments/hack0/doctor_agent/readme.md (28 lines, new file)
@@ -0,0 +1,28 @@
Persona-Aware MedQA Benchmarking

In this project, we reimagined medical QA evaluation by introducing a persona filter: a layer that simulates real-world variability in patient communication styles. Using the MedQA dataset as our foundation, we infused each scenario with one of three distinct personas generated via xAI's language models:
1. The Cooperative Patient – open, verbose, and highly informative.
2. The Reluctant Patient – terse, vague, and occasionally evasive.
3. The Neutral Patient – brief but factually consistent.
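A minimal sketch of how such personas could be encoded as system-prompt templates (the wording below is illustrative, not the project's actual prompts):

    # Hypothetical persona prompts; the repository's real wording may differ.
    personas = {
        "cooperative": "You are an open, verbose patient who volunteers details freely.",
        "reluctant": "You are a terse, vague patient who answers only when pressed.",
        "neutral": "You are a brief patient who states facts plainly and consistently.",
    }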

The clinical challenge we explored is simple but critical: can a medical reasoning system consistently arrive at the correct diagnosis or treatment recommendation regardless of how the patient presents information?

Our pipeline works as follows (a code sketch follows the list):
- Each original MedQA item (stem + multiple-choice answers) is enriched with a synthetic patient interaction that simulates one of the three personas.
- We maintain the original clinical question and choices.
- Only the narrative context (the patient's communication) changes, testing robustness against dialogue variability.
- This mirrors how real doctors must interpret patient symptoms, which are often incomplete or colored by personality, emotion, or context.
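As a sketch of this enrichment step (the field names and helper below are hypothetical, not taken from the repository):

    # Hypothetical enrichment: only the narrative framing changes.
    def enrich_medqa_item(item: dict, persona_prompt: str) -> dict:
        return {
            "question": item["question"],       # original clinical stem
            "options": item["options"],         # original multiple-choice answers
            "answer": item["answer"],           # gold label, unchanged
            "persona_context": persona_prompt,  # persona-flavored patient narrative
        }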

Why this matters:

Most QA benchmarks assume a perfect narrator, but in the real world, AI systems in healthcare will need to make decisions with varying degrees of input clarity. Our approach stress-tests reasoning models under more human-like variability, offering a path toward safer and more empathetic medical AI.

Future Potential:

- Extendable to reinforcement learning pipelines where the agent adapts its questioning strategy based on persona.
- Can be used to benchmark bedside AI assistants, triage bots, or LLMs deployed in low-resource clinics.
- Encourages development of models that ask better follow-up questions, not just give answers.

By combining structured medical QA with naturalistic persona variation, our project brings a crucial human dimension to the next generation of AI-health interfaces.