# atropos/environments/community/examcraft/examcraft_server.py
#!/usr/bin/env python3
"""
ExamCraft: Adaptive LLM Teacher Training Environment for Atropos
This environment trains language models to become better teachers by generating
adaptive questions, providing explanations, and creating personalized lesson plans.
"""
import argparse
import asyncio
import json
import os
import sys
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple
# Try different Atropos import patterns
# NOTE(review): atroposlib has shipped these names under more than one module
# path, so we probe the known layouts in order and fall back to lightweight
# stand-ins when none resolve, letting the file run in "standalone" mode.
try:
    from atroposlib.api import APIServerConfig, BaseEnvConfig
    from atroposlib.envs import BaseLanguageEnv
    from atroposlib.utils import parse_args_and_command

    ATROPOS_AVAILABLE = True
    print("✅ Successfully imported Atropos classes")
except ImportError as e1:
    try:
        from atroposlib.api.config import APIServerConfig, BaseEnvConfig
        from atroposlib.envs.base import BaseLanguageEnv
        from atroposlib.utils import parse_args_and_command

        ATROPOS_AVAILABLE = True
        print("✅ Successfully imported Atropos classes (alternative path)")
    except ImportError as e2:
        try:
            # Check what's actually available
            import atroposlib.api
            import atroposlib.envs

            print("Available in atroposlib.envs:", dir(atroposlib.envs))
            print("Available in atroposlib.api:", dir(atroposlib.api))
            raise ImportError("Could not find correct imports")
        except ImportError as e3:
            # e1/e2 are still bound here because we are lexically inside their
            # handlers (Python only unbinds the exception name when its own
            # handler block exits).
            print(
                f"Warning: Could not import Atropos classes. Errors: {e1}, {e2}, {e3}"
            )
            print("Running in standalone mode...")
            ATROPOS_AVAILABLE = False

            # Create mock classes for testing
            class BaseLanguageEnv:
                # Minimal stand-in: just records the config object.
                def __init__(self, config):
                    self.config = config

            class BaseEnvConfig:
                # Permissive config stub: stores arbitrary kwargs as attributes.
                def __init__(self, **kwargs):
                    for k, v in kwargs.items():
                        setattr(self, k, v)

            class APIServerConfig:
                # Same kwargs-to-attributes behavior as BaseEnvConfig above.
                def __init__(self, **kwargs):
                    for k, v in kwargs.items():
                        setattr(self, k, v)
@dataclass
class ExamCraftConfig(BaseEnvConfig):
    """Configuration for ExamCraft environment."""

    # Path to the JSON student profile; when the file is missing,
    # ExamCraftEnv.__init__ falls back to a built-in default profile.
    profile_path: str = "example_profile.json"
    # Hard cap on questions before an episode is marked done (_handle_question).
    max_questions_per_episode: int = 8
    # Base per-interaction proficiency gain used by _update_student_learning.
    student_learning_rate: float = 0.03
    # NOTE(review): not read anywhere in this file — presumably consumed by
    # the Atropos runner or reserved for future use; confirm before removing.
    enable_lesson_plans: bool = True
    # NOTE(review): also unused in this file — TODO confirm.
    # config_init() passes extra kwargs (tokenizer_name, group_size, ...) that
    # are not declared fields here; this relies on BaseEnvConfig accepting
    # them — under the mock fallback the dataclass __init__ would reject them.
    difficulty_adaptation_rate: float = 0.1
class ExamCraftEnv(BaseLanguageEnv):
    """
    ExamCraft: Trains LLMs to be adaptive teachers.

    The model plays the teacher; a simulated student (driven by the loaded
    profile plus simple probability rules) answers its questions.

    Features:
    - Adaptive question generation based on student proficiency
    - Multi-topic curriculum support
    - Real-time difficulty adjustment
    - Comprehensive reward system for teaching effectiveness
    - Lesson plan generation and evaluation
    """
def __init__(self, config: ExamCraftConfig):
super().__init__(config)
self.config = config
# Load student profile
if os.path.exists(config.profile_path):
with open(config.profile_path, "r") as file:
self.profile = json.load(file)
else:
# Default profile if file not found
self.profile = self._create_default_profile()
# Initialize metrics
self.reset_metrics()
# Track teaching session
self.session_history = []
self.current_episode = 0
def _create_default_profile(self) -> Dict[str, Any]:
"""Create a default student profile if none exists."""
return {
"student_id": "default_001",
"target_grade": "11th grade",
"learning_goal": "Master foundational mathematics",
"current_avg_score": 65,
"topics": [
{"name": "algebra", "proficiency": 0.4},
{"name": "geometry", "proficiency": 0.6},
{"name": "statistics", "proficiency": 0.3},
{"name": "calculus", "proficiency": 0.2},
],
"preferred_learning_style": "visual",
}
def reset_metrics(self):
"""Reset student metrics for a new episode."""
self.student_metrics = {
"overall_accuracy": self.profile.get("current_avg_score", 65) / 100,
"topic_accuracies": {},
"difficulty_preferences": {},
}
# Initialize from profile
for topic in self.profile.get("topics", []):
topic_name = topic.get("name")
proficiency = topic.get("proficiency", 0.5)
self.student_metrics["topic_accuracies"][topic_name] = proficiency
self.student_metrics["difficulty_preferences"][topic_name] = "medium"
# Reset session tracking
self.question_count = 0
self.correct_count = 0
self.session_history = []
def get_system_message(self) -> str:
"""System message defining the teacher's role and capabilities."""
topics_str = "\n".join(
[
f"- {topic['name']}: {topic['proficiency']:.1%} proficiency"
for topic in self.profile.get("topics", [])
]
)
target_grade = self.profile.get("target_grade", "high school")
return f"""You are ExamCraft, an adaptive AI teacher specializing in {target_grade} education.
STUDENT PROFILE:
{topics_str}
Learning Goal: {self.profile.get('learning_goal', 'Academic improvement')}
Preferred Style: {self.profile.get('preferred_learning_style', 'mixed')}
Current Average: {self.profile.get('current_avg_score', 65)}%
TEACHING CAPABILITIES:
1. QUESTION: Generate adaptive multiple-choice questions
2. EXPLANATION: Provide detailed explanations for concepts
3. LESSON_PLAN: Create personalized study plans
RESPONSE FORMAT - Return valid JSON only:
{{
"action_type": "QUESTION|EXPLANATION|LESSON_PLAN",
"topic": "topic_name",
"difficulty": "easy|medium|hard",
"content": {{
"question": "Clear, engaging question text",
"options": {{
"A": "First option",
"B": "Second option",
"C": "Third option",
"D": "Fourth option"
}},
"correct_answer": "A|B|C|D",
"explanation": "Detailed explanation for the correct answer and why others are wrong",
"learning_objective": "What this question teaches"
}}
}}
TEACHING STRATEGY:
- Prioritize topics with low proficiency scores
- Adapt difficulty based on recent performance
- Provide detailed explanations that build understanding
- Focus on the student's learning goal and preferred style"""
def get_user_message(self) -> str:
"""Generate context-aware prompt for the teacher."""
if not self.session_history:
learning_goal = self.profile.get("learning_goal", "general concepts")
challenge_text = (
"Please begin with an appropriate question for this student. "
"Target their weakest areas while maintaining appropriate challenge level."
)
prompt = f"""NEW TEACHING SESSION STARTED
Student needs help with: {learning_goal}
Current topic proficiencies:
{json.dumps(self.student_metrics['topic_accuracies'], indent=2)}
{challenge_text}"""
return prompt
# Analyze recent performance
recent_correct = sum(
1 for item in self.session_history[-3:] if item.get("was_correct", False)
)
recent_total = len(self.session_history[-3:])
last_interaction = self.session_history[-1]
performance_trend = (
"improving" if recent_correct > recent_total / 2 else "struggling"
)
# Identify struggling topics
weak_topics = [
topic
for topic, acc in self.student_metrics["topic_accuracies"].items()
if acc < 0.5
]
topic_info = last_interaction.get("topic", "unknown")
diff_info = last_interaction.get("difficulty", "unknown")
correct_mark = "" if last_interaction.get("was_correct") else ""
return f"""TEACHING SESSION UPDATE
Recent Performance: {recent_correct}/{recent_total} correct - student is {performance_trend}
Last Question: {topic_info} ({diff_info}) - {correct_mark}
Current Status:
- Questions asked: {self.question_count}
- Overall accuracy: {self.student_metrics['overall_accuracy']:.1%}
- Topics needing attention: {', '.join(weak_topics) if weak_topics else 'None'}
Updated proficiencies:
{json.dumps(self.student_metrics['topic_accuracies'], indent=2)}
Please select your next teaching action based on this student's progress."""
async def step(self, response: str) -> Tuple[str, Dict[str, Any]]:
"""Process teacher's response and simulate student interaction."""
try:
# Clean and parse JSON response
response = response.strip()
if response.startswith("```json"):
response = response[7:]
if response.endswith("```"):
response = response[:-3]
teacher_action = json.loads(response.strip())
# Validate required fields
if not all(key in teacher_action for key in ["action_type", "content"]):
return await self._handle_error("Missing required fields in response")
action_type = teacher_action["action_type"].upper()
if action_type == "QUESTION":
return await self._handle_question(teacher_action)
elif action_type == "EXPLANATION":
return await self._handle_explanation(teacher_action)
elif action_type == "LESSON_PLAN":
return await self._handle_lesson_plan(teacher_action)
else:
return await self._handle_error(f"Invalid action_type: {action_type}")
except json.JSONDecodeError as e:
return await self._handle_error(f"JSON parsing failed: {str(e)}")
except Exception as e:
return await self._handle_error(f"Unexpected error: {str(e)}")
async def _handle_question(
self, action: Dict[str, Any]
) -> Tuple[str, Dict[str, Any]]:
"""Process a question from the teacher."""
content = action["content"]
topic = action.get("topic", "general")
difficulty = action.get("difficulty", "medium")
# Validate question format
required_fields = ["question", "options", "correct_answer"]
if not all(field in content for field in required_fields):
return await self._handle_error("Question missing required fields")
if len(content["options"]) != 4:
return await self._handle_error("Question must have exactly 4 options")
# Simulate student response
student_answer, is_correct = self._simulate_student_response(
topic, difficulty, content["correct_answer"]
)
# Update metrics
self.question_count += 1
if is_correct:
self.correct_count += 1
# Apply learning effect
self._update_student_learning(topic, difficulty, is_correct)
# Calculate reward
score = self._calculate_question_reward(topic, difficulty, is_correct, content)
# Record interaction
interaction = {
"type": "question",
"topic": topic,
"difficulty": difficulty,
"question": content["question"],
"student_answer": student_answer,
"correct_answer": content["correct_answer"],
"was_correct": is_correct,
"score": score,
"timestamp": self.question_count,
}
self.session_history.append(interaction)
# Check episode completion
episode_done = self.question_count >= self.config.max_questions_per_episode
info = {
"score": score,
"episode_done": episode_done,
"info": interaction,
"student_metrics": dict(self.student_metrics),
}
return self.get_user_message(), info
async def _handle_explanation(
self, action: Dict[str, Any]
) -> Tuple[str, Dict[str, Any]]:
"""Process an explanation from the teacher."""
content = action["content"]
explanation = content.get("explanation", "")
topic = action.get("topic", "general")
# Score explanation quality
score = self._score_explanation(explanation, topic)
# Apply learning boost from good explanations
if len(self.session_history) > 0:
last_topic = self.session_history[-1].get("topic")
if last_topic and score > 0.7:
current_prof = self.student_metrics["topic_accuracies"].get(
last_topic, 0.5
)
boost = min(
0.1, score * 0.05
) # Max 0.1 boost from excellent explanation
self.student_metrics["topic_accuracies"][last_topic] = min(
1.0, current_prof + boost
)
# Record interaction
interaction = {
"type": "explanation",
"topic": topic,
"explanation_length": len(explanation.split()),
"score": score,
}
self.session_history.append(interaction)
info = {"score": score, "episode_done": False, "info": interaction}
return self.get_user_message(), info
async def _handle_lesson_plan(
self, action: Dict[str, Any]
) -> Tuple[str, Dict[str, Any]]:
"""Process a lesson plan from the teacher."""
content = action["content"]
# Score lesson plan quality
score = self._score_lesson_plan(content)
# Record final interaction
interaction = {
"type": "lesson_plan",
"score": score,
"final_performance": dict(self.student_metrics),
"total_questions": self.question_count,
"overall_accuracy": self.correct_count / max(1, self.question_count),
}
self.session_history.append(interaction)
info = {
"score": score,
"episode_done": True, # Always end after lesson plan
"info": interaction,
"session_summary": self._generate_session_summary(),
}
return "", info # Empty string indicates episode completion
async def _handle_error(self, error_msg: str) -> Tuple[str, Dict[str, Any]]:
"""Handle errors in teacher responses."""
score = -1.0
info = {
"score": score,
"episode_done": False,
"info": {"error": error_msg, "type": "error"},
}
return self.get_user_message(), info
def _simulate_student_response(
self, topic: str, difficulty: str, correct_answer: str
) -> Tuple[str, bool]:
"""Simulate student's answer based on proficiency and difficulty."""
import random
# Get base proficiency
base_proficiency = self.student_metrics["topic_accuracies"].get(topic, 0.5)
# Adjust for difficulty
difficulty_modifiers = {"easy": 0.25, "medium": 0.0, "hard": -0.25}
modifier = difficulty_modifiers.get(difficulty, 0.0)
# Calculate success probability
success_prob = max(0.1, min(0.9, base_proficiency + modifier))
# Add some session momentum (recent success increases confidence)
if len(self.session_history) >= 2:
recent_correct = sum(
1
for item in self.session_history[-2:]
if item.get("was_correct", False)
)
momentum = (recent_correct - 1) * 0.1 # ±0.1 based on recent performance
success_prob = max(0.1, min(0.9, success_prob + momentum))
# Determine correct/incorrect
is_correct = random.random() < success_prob
if is_correct:
return correct_answer, True
else:
# Choose random incorrect answer
options = ["A", "B", "C", "D"]
if correct_answer in options:
options.remove(correct_answer)
return random.choice(options), False
def _update_student_learning(self, topic: str, difficulty: str, is_correct: bool):
"""Update student metrics based on teaching interaction."""
# Update overall accuracy
self.student_metrics["overall_accuracy"] = (
self.correct_count / self.question_count
)
# Update topic-specific learning
current_prof = self.student_metrics["topic_accuracies"].get(topic, 0.5)
if is_correct:
# Small improvement from correct answers
improvement = self.config.student_learning_rate * 0.5
else:
# Larger improvement from learning from mistakes (assumes good explanations follow)
improvement = self.config.student_learning_rate
# Difficulty affects learning rate
difficulty_multipliers = {"easy": 0.5, "medium": 1.0, "hard": 1.5}
multiplier = difficulty_multipliers.get(difficulty, 1.0)
new_prof = current_prof + (improvement * multiplier)
self.student_metrics["topic_accuracies"][topic] = max(0.0, min(1.0, new_prof))
def _calculate_question_reward(
self, topic: str, difficulty: str, is_correct: bool, content: Dict[str, Any]
) -> float:
"""Calculate reward for a question based on multiple factors."""
# Base reward for correctness
base_reward = 1.5 if is_correct else -0.3
# Topic targeting bonus (higher reward for focusing on weak areas)
topic_prof = self.student_metrics["topic_accuracies"].get(topic, 0.5)
targeting_bonus = (1.0 - topic_prof) * 0.8 # Up to 0.8 bonus for weakest topics
# Difficulty appropriateness (reward for matching difficulty to ability)
difficulty_values = {"easy": 0.3, "medium": 0.6, "hard": 0.9}
target_difficulty = difficulty_values.get(difficulty, 0.6)
difficulty_appropriateness = 1.0 - abs(target_difficulty - topic_prof) * 2
difficulty_bonus = max(0.0, difficulty_appropriateness) * 0.5
# Question quality bonus
question_text = content.get("question", "")
explanation = content.get("explanation", "")
quality_bonus = min(
0.3, len(question_text.split()) / 50 + len(explanation.split()) / 100
)
# Learning objective bonus
if "learning_objective" in content and len(content["learning_objective"]) > 10:
objective_bonus = 0.2
else:
objective_bonus = 0.0
total_reward = (
base_reward
+ targeting_bonus
+ difficulty_bonus
+ quality_bonus
+ objective_bonus
)
return round(total_reward, 3)
def _score_explanation(self, explanation: str, topic: str) -> float:
"""Score the quality of an explanation."""
if not explanation:
return 0.0
words = explanation.split()
# Length scoring (optimal around 50-150 words)
word_count = len(words)
if word_count < 20:
length_score = word_count / 20 * 0.5
elif word_count <= 150:
length_score = 0.5 + (word_count - 20) / 130 * 0.5
else:
length_score = max(0.3, 1.0 - (word_count - 150) / 200)
# Content quality indicators
quality_indicators = [
"because",
"therefore",
"however",
"for example",
"this means",
"in other words",
"specifically",
"notice that",
"remember",
]
quality_score = min(
0.5,
sum(
0.1
for indicator in quality_indicators
if indicator in explanation.lower()
),
)
return min(1.0, length_score + quality_score)
def _score_lesson_plan(self, content: Dict[str, Any]) -> float:
"""Score the quality of a lesson plan."""
base_score = 1.0
# Check for key lesson plan elements
if "prioritized_topics" in content:
priorities = content["prioritized_topics"]
if isinstance(priorities, list) and len(priorities) > 0:
# Bonus for prioritizing weak topics
weak_topics = [
topic
for topic, acc in self.student_metrics["topic_accuracies"].items()
if acc < 0.5
]
weak_coverage = sum(1 for topic in priorities if topic in weak_topics)
base_score += weak_coverage * 0.3
if "study_activities" in content:
activities = content["study_activities"]
if isinstance(activities, (list, dict)) and len(activities) > 0:
base_score += 0.4
if "timeline" in content:
base_score += 0.3
if "learning_objectives" in content:
objectives = content["learning_objectives"]
if isinstance(objectives, list) and len(objectives) > 0:
base_score += 0.5
return min(2.5, base_score)
def _generate_session_summary(self) -> Dict[str, Any]:
"""Generate a summary of the teaching session."""
topics_covered = {}
for interaction in self.session_history:
if interaction.get("type") == "question":
topic = interaction.get("topic")
if topic:
if topic not in topics_covered:
topics_covered[topic] = {"total": 0, "correct": 0}
topics_covered[topic]["total"] += 1
if interaction.get("was_correct"):
topics_covered[topic]["correct"] += 1
# Calculate topic accuracies for this session
topic_accuracies = {}
for topic, stats in topics_covered.items():
topic_accuracies[topic] = (
stats["correct"] / stats["total"] if stats["total"] > 0 else 0
)
return {
"total_questions": self.question_count,
"overall_accuracy": self.correct_count / max(1, self.question_count),
"topics_covered": list(topics_covered.keys()),
"topic_accuracies": topic_accuracies,
"improvement": {
topic: self.student_metrics["topic_accuracies"][topic]
- next(
(
t["proficiency"]
for t in self.profile["topics"]
if t["name"] == topic
),
0.5,
)
for topic in topics_covered.keys()
},
"final_proficiencies": dict(self.student_metrics["topic_accuracies"]),
}
def run_standalone_demo(self, output_file: str = "demo_output.jsonl"):
"""Run a standalone demo without full Atropos integration."""
print("🎓 ExamCraft: Standalone Demo Mode")
print("Simulating teacher-student interactions...")
# Reset environment
self.reset_metrics()
# Simulate a teaching session
outputs = []
for episode in range(3): # Run 3 episodes
print(f"\n--- Episode {episode + 1} ---")
episode_data = {
"episode": episode + 1,
"initial_state": dict(self.student_metrics),
"interactions": [],
}
# Simulate questions
for question_num in range(self.config.max_questions_per_episode):
# Create a mock teacher response (since we don't have an LLM)
weakest_topic = min(
self.student_metrics["topic_accuracies"].items(), key=lambda x: x[1]
)[0]
mock_response = {
"action_type": "QUESTION",
"topic": weakest_topic,
"difficulty": "medium",
"content": {
"question": f"Sample question about {weakest_topic}",
"options": {
"A": "Option A",
"B": "Option B",
"C": "Option C",
"D": "Option D",
},
"correct_answer": "B",
"explanation": f"This is a detailed explanation about {weakest_topic}.",
"learning_objective": f"Understand key concepts in {weakest_topic}",
},
}
# Process the mock response
try:
result = asyncio.run(self.step(json.dumps(mock_response)))
next_prompt, info = result
episode_data["interactions"].append(
{
"question_num": question_num + 1,
"teacher_action": mock_response,
"result": info,
}
)
question_info = info["info"].get("topic")
is_correct = "" if info["info"].get("was_correct") else ""
score = info["score"]
print(
f" Q{question_num + 1}: {question_info} - {is_correct} "
f"(Score: {score:.2f})"
)
except Exception as e:
print(f" Error processing question {question_num + 1}: {e}")
break
if info.get("episode_done", False):
break
episode_data["final_state"] = dict(self.student_metrics)
outputs.append(episode_data)
# Save outputs
with open(output_file, "w") as f:
for output in outputs:
f.write(json.dumps(output) + "\n")
print(f"\n✅ Demo completed! Results saved to {output_file}")
print("Final student proficiencies:")
for topic, prof in self.student_metrics["topic_accuracies"].items():
print(f" {topic}: {prof:.1%}")
@classmethod
def config_init(cls) -> Tuple[ExamCraftConfig, List[APIServerConfig]]:
"""Initialize configuration for ExamCraft environment."""
env_config = ExamCraftConfig(
tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
group_size=6,
use_wandb=True,
rollout_server_url="http://localhost:8000",
total_steps=150,
batch_size=12,
steps_per_eval=30,
max_token_length=3072,
wandb_name="examcraft-adaptive-teacher",
profile_path="example_profile.json",
max_questions_per_episode=8,
student_learning_rate=0.03,
enable_lesson_plans=True,
)
# API server configuration
server_configs = [
APIServerConfig(
model_name="hermes-3-405b-instruct",
base_url="https://api.nousresearch.com/v1",
api_key=os.environ.get("NOUS_API_KEY"),
num_requests_for_eval=64,
),
]
return env_config, server_configs
def main():
    """CLI entry point: 'demo' runs standalone; other commands defer to Atropos."""
    parser = argparse.ArgumentParser(
        description="ExamCraft: Adaptive LLM Teacher Training Environment"
    )
    parser.add_argument(
        "command",
        choices=["process", "serve", "demo"],
        help="Command to run (demo for standalone mode)",
    )
    parser.add_argument(
        "--env.data_path_to_save_groups",
        type=str,
        default="demo_output.jsonl",
        help="Path to save demo output",
    )
    args = parser.parse_args()
    if args.command == "demo":
        # Standalone mode needs no Atropos install.
        demo_env = ExamCraftEnv(ExamCraftConfig())
        # argparse stores the dotted flag under its literal option name.
        output_file = getattr(args, "env.data_path_to_save_groups", "demo_output.jsonl")
        demo_env.run_standalone_demo(output_file)
    elif ATROPOS_AVAILABLE:
        # Hand control to the Atropos CLI runner.
        parse_args_and_command(ExamCraftEnv)
    else:
        print(
            "Atropos not available. Use 'python examcraft_server.py demo' for standalone mode."
        )
        sys.exit(1)


if __name__ == "__main__":
    main()