From eb3de019565087474c5fea886b42feeb0f4e6592 Mon Sep 17 00:00:00 2001 From: AlxAI Date: Mon, 24 Feb 2025 15:49:37 -0800 Subject: [PATCH] dramatically improving logging thanks to new 3.7sonnet cursor agent mode --- CLAUDE.md | 21 + ai_diplomacy/clients.py | 234 ++++++++--- ai_diplomacy/long_story_short.py | 378 ++++++++++++++---- ai_diplomacy/model_loader.py | 9 +- ai_diplomacy/negotiations.py | 32 +- .../prompts/message_summary_prompt.txt | 17 +- .../prompts/recursive_summary_prompt.txt | 16 + ai_diplomacy/test_recursive_summarization.py | 273 +++++++++++++ ai_diplomacy/utils.py | 141 +++++-- lm_game.py | 123 ++++-- 10 files changed, 1025 insertions(+), 219 deletions(-) create mode 100644 CLAUDE.md create mode 100644 ai_diplomacy/prompts/recursive_summary_prompt.txt create mode 100644 ai_diplomacy/test_recursive_summarization.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..3215a33 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,21 @@ +# AI Diplomacy Development Guide + +## Commands +- Run game: `python lm_game.py --max_year 1910 --summary_model "gpt-4o-mini" --num_negotiation_rounds 3` +- Run tests: `pytest -v diplomacy/tests/` or `pytest -v -k test_name` +- Run specific test: `pytest -v diplomacy/tests/path_to_test.py::test_function` +- Lint: `pylint diplomacy/path/to/file.py` +- Full test suite: `./diplomacy/run_tests.sh` + +## Code Style +- Use Python type hints for function parameters and return values +- Follow PEP 8 naming: snake_case for functions/variables, UPPER_CASE for constants +- Organize imports: standard library, third-party, local modules +- Error handling: Use specific exceptions with informative messages +- Docstrings: Use multi-line docstrings with parameter descriptions +- Keep functions focused on a single responsibility +- Models/LLM clients inherit from BaseModelClient and implement required methods +- When possible, use concurrent operations (see concurrent.futures in lm_game.py) + +## Environment +Python 3.5+ required. Use virtual environment with requirements.txt. \ No newline at end of file diff --git a/ai_diplomacy/clients.py b/ai_diplomacy/clients.py index 563c727..f709508 100644 --- a/ai_diplomacy/clients.py +++ b/ai_diplomacy/clients.py @@ -22,11 +22,89 @@ from .game_history import GameHistory from .long_story_short import get_optimized_context from .model_loader import load_model_client -# set logger back to just info +# Configure logger with a more useful format logger = logging.getLogger("client") logger.setLevel(logging.DEBUG) logging.basicConfig(level=logging.DEBUG) +# Function to configure logging options +def configure_logging( + log_full_prompts=True, + log_full_responses=True, + suppress_connection_logs=True, + log_level=logging.INFO +): + """ + Configure the logging system for AI Diplomacy + + Parameters: + - log_full_prompts: Whether to log the full prompts sent to models + - log_full_responses: Whether to log the full responses from models + - suppress_connection_logs: Whether to suppress HTTP connection logs + - log_level: The overall logging level for the application + """ + # Configure root logger + logging.getLogger().setLevel(log_level) + + # Set client logger level + logger.setLevel(log_level) + + # Configure specific loggers based on parameters + if suppress_connection_logs: + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("anthropic").setLevel(logging.WARNING) + logging.getLogger("openai").setLevel(logging.WARNING) + + # Set module-level configuration + global SHOULD_LOG_FULL_PROMPTS, SHOULD_LOG_FULL_RESPONSES + SHOULD_LOG_FULL_PROMPTS = log_full_prompts + SHOULD_LOG_FULL_RESPONSES = log_full_responses + + logger.info(f"Logging configured: full_prompts={log_full_prompts}, full_responses={log_full_responses}, level={logging.getLevelName(log_level)}") + +# Initialize defaults +SHOULD_LOG_FULL_PROMPTS = True +SHOULD_LOG_FULL_RESPONSES = True + +# Helper function for truncating long outputs in logs +def _truncate_text(text, max_length=500): + """Truncate text for logging purposes with indicator of original length""" + if not text or len(text) <= max_length: + return text + return f"{text[:max_length]}... [truncated, total length: {len(text)} chars]" + +# Helper function to log full model responses +def _log_full_response(model_type, model_name, power_name, response): + """Logs the full model response at INFO level""" + if not response or not SHOULD_LOG_FULL_RESPONSES: + return + + border = "=" * 80 + logger.info(f"\nMODEL RESPONSE | {model_type} | {model_name} | {power_name or 'Unknown'}\n{border}") + logger.info(f"{response}") + logger.info(f"{border}\n") + +# Helper function to log prompt details +def _log_prompt_details(model_type, model_name, power_name, prompt, system_prompt=None): + """Logs the prompt details at INFO level""" + if not prompt or not SHOULD_LOG_FULL_PROMPTS: + return + + border = "=" * 80 + total_tokens = len(prompt.split()) + + if system_prompt: + system_tokens = len(system_prompt.split()) + logger.info(f"\nPROMPT | {model_type} | {model_name} | {power_name or 'Unknown'} | ~{total_tokens} tokens (user) + ~{system_tokens} tokens (system)\n{border}") + logger.debug(f"System prompt: {_truncate_text(system_prompt)}") + else: + logger.info(f"\nPROMPT | {model_type} | {model_name} | {power_name or 'Unknown'} | ~{total_tokens} tokens\n{border}") + + logger.debug(f"User prompt: {_truncate_text(prompt)}") + logger.info(f"{border}\n") + load_dotenv() @@ -53,7 +131,7 @@ class BaseModelClient: try: self.system_prompt = load_prompt(f"{self.power_name.lower()}_system_prompt.txt") except FileNotFoundError: - logger.warning(f"No specific system prompt found for {self.power_name}; using default.") + logger.warning(f"CONFIG | {self.model_name} | No specific system prompt for {self.power_name}, using default") self.system_prompt = load_prompt("system_prompt.txt") else: self.system_prompt = load_prompt("system_prompt.txt") @@ -137,7 +215,7 @@ class BaseModelClient: possible_orders_text = format_possible_orders(game, possible_orders) # 8) Convoy Paths - logger.debug(f"convoy_paths_possible is: {game.convoy_paths_possible}") + logger.debug(f"CONTEXT | {self.model_name} | {power_name} | Convoy paths: {len(game.convoy_paths_possible) if game.convoy_paths_possible else 0} available") convoy_paths_text = format_convoy_paths(game, game.convoy_paths_possible, power_name) # 9) Threat Assessment @@ -233,8 +311,8 @@ class BaseModelClient: try: raw_response = self.generate_response(prompt) - logger.info( - f"[{self.model_name}] Raw LLM response for {power_name}:\n{raw_response}" + logger.debug( + f"ORDERS | {self.model_name} | {power_name} | Raw response: {_truncate_text(raw_response)}" ) # Attempt to parse the final "orders" from the LLM @@ -242,7 +320,7 @@ class BaseModelClient: if not move_list: logger.warning( - f"[{self.model_name}] Could not extract moves for {power_name}. Using fallback." + f"PARSE_ERROR | {self.model_name} | {power_name} | Failed to extract moves, using fallback" ) if model_error_stats is not None: # forcibly convert sets to string @@ -255,7 +333,7 @@ class BaseModelClient: return validated_moves except Exception as e: - logger.error(f"[{self.model_name}] LLM error for {power_name}: {e}") + logger.error(f"LLM_ERROR | {self.model_name} | {power_name} | {str(e)}") if model_error_stats is not None: # forcibly convert sets to string model_name_for_stats = str(self.model_name) @@ -280,7 +358,7 @@ class BaseModelClient: if not matches: # Some LLMs might not put the colon or might have triple backtick fences. logger.debug( - f"[{self.model_name}] Regex parse #1 failed for {power_name}. Trying alternative patterns." + f"PARSE | {self.model_name} | {power_name} | Regex #1 failed, trying alternative patterns" ) # 1b) Check for inline JSON after "PARSABLE OUTPUT" @@ -289,16 +367,17 @@ class BaseModelClient: if not matches: logger.debug( - f"[{self.model_name}] Regex parse #2 failed for {power_name}. Trying triple-backtick code fences." + f"PARSE | {self.model_name} | {power_name} | Regex #2 failed, trying triple-backtick code fences" ) # 2) If still no match, check for triple-backtick code fences containing JSON if not matches: code_fence_pattern = r"```json\s*(\{.*?\})\s*```" matches = re.search(code_fence_pattern, raw_response, re.DOTALL) + if matches: logger.debug( - f"[{self.model_name}] Found triple-backtick JSON block for {power_name}." + f"PARSE | {self.model_name} | {power_name} | Found triple-backtick JSON block" ) # 3) Attempt to parse JSON if we found anything @@ -310,7 +389,7 @@ class BaseModelClient: if captured.startswith("{{") and captured.endswith("}}"): # remove ONE leading '{' and ONE trailing '}' # so {{ "orders": [...] }} becomes { "orders": [...] } - logger.debug(f"[{self.model_name}] Detected double braces for {power_name}, trimming extra braces.") + logger.debug(f"PARSE | {self.model_name} | {power_name} | Detected double braces, trimming extra braces") # strip exactly one brace pair trimmed = captured[1:-1].strip() json_text = trimmed @@ -323,7 +402,7 @@ class BaseModelClient: if not json_text: logger.debug( - f"[{self.model_name}] No JSON text found in LLM response for {power_name}." + f"PARSE | {self.model_name} | {power_name} | No JSON text found in response" ) return None @@ -333,7 +412,7 @@ class BaseModelClient: return data.get("orders", None) except json.JSONDecodeError as e: logger.warning( - f"[{self.model_name}] JSON decode failed for {power_name}: {e}. Trying bracket fallback." + f"PARSE | {self.model_name} | {power_name} | JSON decode failed: {str(e)[:100]}. Trying bracket fallback" ) # 3b) Attempt bracket fallback: we look for the substring after "orders" @@ -346,10 +425,11 @@ class BaseModelClient: raw_list_str = "[" + bracket_match.group(1).strip() + "]" moves = ast.literal_eval(raw_list_str) if isinstance(moves, list): + logger.debug(f"PARSE | {self.model_name} | {power_name} | Bracket fallback successful") return moves except Exception as e2: logger.warning( - f"[{self.model_name}] Bracket fallback parse also failed for {power_name}: {e2}" + f"PARSE | {self.model_name} | {power_name} | Bracket fallback failed: {str(e2)[:100]}" ) # If all attempts failed @@ -361,12 +441,12 @@ class BaseModelClient: """ Filter out invalid moves, fill missing with HOLD, else fallback. """ - logger.debug(f"[{self.model_name}] Proposed LLM moves: {moves}") + logger.debug(f"VALIDATE | {self.model_name} | Validating {len(moves)} proposed moves") validated = [] used_locs = set() if not isinstance(moves, list): - logger.debug(f"[{self.model_name}] Moves not a list, fallback.") + logger.debug(f"VALIDATE | {self.model_name} | Moves not a list type, using fallback") return self.fallback_orders(possible_orders) for move in moves: @@ -378,7 +458,7 @@ class BaseModelClient: if len(parts) >= 2: used_locs.add(parts[1][:3]) else: - logger.debug(f"[{self.model_name}] Invalid move from LLM: {move_str}") + logger.debug(f"VALIDATE | {self.model_name} | Invalid move: {move_str}") # Fill missing with hold for loc, orders_list in possible_orders.items(): @@ -389,10 +469,10 @@ class BaseModelClient: ) if not validated: - logger.warning(f"[{self.model_name}] All moves invalid, fallback.") + logger.warning(f"VALIDATE | {self.model_name} | All moves invalid, using fallback") return self.fallback_orders(possible_orders) - logger.debug(f"[{self.model_name}] Validated moves: {validated}") + logger.debug(f"VALIDATE | {self.model_name} | Final valid moves: {len(validated)}") return validated def fallback_orders(self, possible_orders: Dict[str, List[str]]) -> List[str]: @@ -461,7 +541,7 @@ class BaseModelClient: if not json_matches: # try normal logger.debug( - f"[{self.model_name}] No JSON block found in LLM response for {power_name}. Trying double braces." + f"CHAT | {self.model_name} | {power_name} | No JSON block, trying double braces" ) json_matches = re.findall( r"PARSABLE OUTPUT:\s*\{(.*?)\}", raw_response, re.DOTALL @@ -470,7 +550,7 @@ class BaseModelClient: if not json_matches: # try backtick fences logger.debug( - f"[{self.model_name}] No JSON block found in LLM response for {power_name}. Trying backtick fences." + f"CHAT | {self.model_name} | {power_name} | Trying backtick fences" ) json_matches = re.findall( r"```json\n(.*?)\n```", raw_response, re.DOTALL @@ -487,23 +567,24 @@ class BaseModelClient: message_type = message_data.get("message_type", "global") content = message_data.get("content", "").strip() recipient = message_data.get("recipient", GLOBAL) + recipient = recipient.upper() # Validate recipient if private message if message_type == "private" and recipient not in active_powers: logger.warning( - f"Invalid recipient {recipient} for private message, defaulting to GLOBAL" + f"CHAT | {self.model_name} | {power_name} | Invalid recipient '{recipient}', defaulting to GLOBAL" ) recipient = GLOBAL # For private messages, ensure recipient is specified if message_type == "private" and recipient == GLOBAL: logger.warning( - "Private message without recipient specified, defaulting to GLOBAL" + f"CHAT | {self.model_name} | {power_name} | Private message without recipient, defaulting to GLOBAL" ) # Log for debugging logger.info( - f"Power {power_name} sends {message_type} message to {recipient}" + f"CHAT | {self.model_name} | {power_name} | Sending {message_type} message to {recipient}" ) # Keep local record for building future conversation context @@ -519,7 +600,7 @@ class BaseModelClient: message = None except AttributeError: - logger.error("Error parsing raw response") + logger.error(f"CHAT | {self.model_name} | {power_name} | Error parsing raw response") # Deduplicate messages messages = list(set([json.dumps(m) for m in messages])) @@ -543,29 +624,34 @@ class OpenAIClient(BaseModelClient): self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) def generate_response(self, prompt: str, empty_system: bool = False) -> str: - # Updated to new API format try: + system_content = self.system_prompt if not empty_system else "" + logger.debug(f"API | OpenAI | {self.model_name} | Sending request") + + _log_prompt_details("OpenAI", self.model_name, self.power_name, prompt, system_content) + response = self.client.chat.completions.create( model=self.model_name, messages=[ - {"role": "system", "content": self.system_prompt if not empty_system else ""}, + {"role": "system", "content": system_content}, {"role": "user", "content": prompt}, ], ) - if not response or not hasattr(response, "choices") or not response.choices: - logger.warning( - f"[{self.model_name}] Empty or invalid result in generate_response. Returning empty." - ) + if not response or not response.choices: + logger.warning(f"API | OpenAI | {self.model_name} | Empty or invalid response") return "" - return response.choices[0].message.content.strip() + logger.debug(f"API | OpenAI | {self.model_name} | Received response of length {len(response.choices[0].message.content)}") + content = response.choices[0].message.content.strip() + _log_full_response("OpenAI", self.model_name, self.power_name, content) + return content except json.JSONDecodeError as json_err: logger.error( - f"[{self.model_name}] JSON decoding failed in generate_response: {json_err}" + f"API | OpenAI | {self.model_name} | JSON decode error: {str(json_err)[:100]}" ) return "" except Exception as e: logger.error( - f"[{self.model_name}] Unexpected error in generate_response: {e}" + f"API | OpenAI | {self.model_name} | Error: {str(e)[:150]}" ) return "" @@ -582,27 +668,38 @@ class ClaudeClient(BaseModelClient): def generate_response(self, prompt: str, empty_system: bool = False) -> str: # Updated Claude messages format try: + system_content = self.system_prompt if not empty_system else "" + + _log_prompt_details("Claude", self.model_name, self.power_name, prompt, system_content) + response = self.client.messages.create( model=self.model_name, max_tokens=2000, - system=self.system_prompt if not empty_system else "", + system=system_content, messages=[{"role": "user", "content": prompt}], ) - if not response.content: - logger.warning( - f"[{self.model_name}] Empty content in Claude generate_response. Returning empty." - ) + if not response or not response.content: + logger.warning(f"API | Claude | {self.model_name} | Empty or invalid response") return "" - return response.content[0].text.strip() if response.content else "" - except json.JSONDecodeError as json_err: - logger.error( - f"[{self.model_name}] JSON decoding failed in generate_response: {json_err}" - ) - return "" + logger.debug(f"API | Claude | {self.model_name} | Received response of length {len(response.content)}") + + # Handle the new response format which might be a list of TextBlock objects + if isinstance(response.content, list): + # Extract text from each TextBlock + content = "" + for block in response.content: + if hasattr(block, 'text'): + content += block.text + elif isinstance(block, dict) and 'text' in block: + content += block['text'] + logger.debug(f"API | Claude | {self.model_name} | Extracted text from {len(response.content)} TextBlocks") + else: + content = response.content + + _log_full_response("Claude", self.model_name, self.power_name, content) + return content except Exception as e: - logger.error( - f"[{self.model_name}] Unexpected error in generate_response: {e}" - ) + logger.error(f"API | Claude | {self.model_name} | Error: {str(e)[:150]}") return "" @@ -616,24 +713,24 @@ class GeminiClient(BaseModelClient): self.client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY")) def generate_response(self, prompt: str, empty_system: bool = False) -> str: - if empty_system: - full_prompt = prompt - else: - full_prompt = self.system_prompt + prompt - try: + system_content = self.system_prompt if not empty_system else "" + logger.debug(f"API | Gemini | {self.model_name} | Sending request") response = self.client.models.generate_content( model=self.model_name, - contents=full_prompt, + contents=system_content + prompt, ) if not response or not response.text: logger.warning( - f"[{self.model_name}] Empty Gemini generate_response. Returning empty." + f"API | Gemini | {self.model_name} | Empty response" ) return "" - return response.text.strip() + logger.debug(f"API | Gemini | {self.model_name} | Received response of length {len(response.text)}") + content = response.text.strip() + _log_full_response("Gemini", self.model_name, self.power_name, content) + return content except Exception as e: - logger.error(f"[{self.model_name}] Error in Gemini generate_response: {e}") + logger.error(f"API | Gemini | {self.model_name} | Error: {str(e)[:150]}") return "" @@ -651,27 +748,34 @@ class DeepSeekClient(BaseModelClient): def generate_response(self, prompt: str, empty_system: bool = False) -> str: try: + system_content = self.system_prompt if not empty_system else "" + logger.debug(f"API | DeepSeek | {self.model_name} | Sending request") + + _log_prompt_details("DeepSeek", self.model_name, self.power_name, prompt, system_content) + response = self.client.chat.completions.create( model=self.model_name, messages=[ - {"role": "system", "content": self.system_prompt if not empty_system else ""}, + {"role": "system", "content": system_content}, {"role": "user", "content": prompt}, ], stream=False, ) - logger.debug(f"[{self.model_name}] Raw DeepSeek response:\n{response}") + logger.debug(f"API | DeepSeek | {self.model_name} | Received response") if not response or not response.choices: logger.warning( - f"[{self.model_name}] No valid response in generate_response." + f"API | DeepSeek | {self.model_name} | No valid response" ) return "" content = response.choices[0].message.content.strip() if not content: - logger.warning(f"[{self.model_name}] DeepSeek returned empty content.") + logger.warning(f"API | DeepSeek | {self.model_name} | Empty content") return "" - + + _log_full_response("DeepSeek", self.model_name, self.power_name, content) + try: json_response = json.loads(content) required_fields = ["message_type", "content"] @@ -679,13 +783,13 @@ class DeepSeekClient(BaseModelClient): required_fields.append("recipient") if not all(field in json_response for field in required_fields): logger.error( - f"[{self.model_name}] Missing required fields in response: {content}" + f"API | DeepSeek | {self.model_name} | Missing fields: {_truncate_text(content, 100)}" ) return "" return content except JSONDecodeError: logger.error( - f"[{self.model_name}] Response is not valid JSON: {content}" + f"API | DeepSeek | {self.model_name} | Invalid JSON: {_truncate_text(content, 100)}" ) content = content.replace("'", '"') try: @@ -696,7 +800,7 @@ class DeepSeekClient(BaseModelClient): except Exception as e: logger.error( - f"[{self.model_name}] Unexpected error in generate_response: {e}" + f"API | DeepSeek | {self.model_name} | Error: {str(e)[:150]}" ) return "" diff --git a/ai_diplomacy/long_story_short.py b/ai_diplomacy/long_story_short.py index 26f448c..0577624 100644 --- a/ai_diplomacy/long_story_short.py +++ b/ai_diplomacy/long_story_short.py @@ -22,7 +22,8 @@ def count_tokens(text: str) -> int: class ContextManager: """ Manages context size for Diplomacy game history and messages. - Provides summarization functionality when context exceeds thresholds. + Provides power-specific recursive summarization functionality + when context exceeds thresholds. """ def __init__( self, @@ -34,21 +35,37 @@ class ContextManager: self.message_token_threshold = message_token_threshold self.summary_model = summary_model - # Cache for summaries - prevents regenerating summaries unnecessarily - self.phase_summary_cache = {} - self.message_summary_cache = {} + # Per-power tracking of summary states + self.power_summary_states = {} # Indexed by power_name - # Track when we last generated summaries - self.last_phase_summary_time = 0 - self.last_message_summary_time = 0 + # Shared phase summary state (phases are globally visible) + self.phase_summary_state = { + 'last_summary': None, # The most recent summary of older phases + 'summarized_phases': [], # List of phase names that have been summarized + 'last_summary_time': 0, # When we last summarized + } # Cooldown period (seconds) - don't summarize more frequently than this self.summary_cooldown = 300 # 5 minutes + + logger.debug(f"CONTEXT | Initialized manager with thresholds: phase={phase_token_threshold}, message={message_token_threshold}") - def load_summarization_prompts(self) -> Tuple[str, str]: + def get_power_state(self, power_name): """ - Load prompts for phase and message summarization. - Returns tuple of (phase_prompt, message_prompt) + Gets or initializes the summary state for a specific power. + """ + if power_name not in self.power_summary_states: + self.power_summary_states[power_name] = { + 'last_message_summary': None, # The most recent message summary + 'summarized_messages': set(), # Set of message IDs that have been summarized + 'last_summary_time': 0, # When we last summarized messages for this power + } + return self.power_summary_states[power_name] + + def load_summarization_prompts(self) -> Tuple[str, str, str]: + """ + Load prompts for phase, message, and recursive summarization. + Returns tuple of (phase_prompt, message_prompt, recursive_prompt) """ try: # Try to load from files @@ -58,10 +75,14 @@ class ContextManager: with open("./ai_diplomacy/prompts/message_summary_prompt.txt", "r") as f: message_prompt = f.read().strip() - return phase_prompt, message_prompt + with open("./ai_diplomacy/prompts/recursive_summary_prompt.txt", "r") as f: + recursive_prompt = f.read().strip() + + logger.debug("CONTEXT | Loaded summarization prompts from files") + return phase_prompt, message_prompt, recursive_prompt except FileNotFoundError: # Return default prompts if files not found - logger.warning("Summarization prompt files not found. Using defaults.") + logger.warning("CONTEXT | Summarization prompt files not found, using default templates") phase_prompt = """You are summarizing the history of a Diplomacy game. Create a concise summary that preserves all strategically relevant information about: @@ -96,42 +117,136 @@ ORIGINAL MESSAGE HISTORY: {message_history} SUMMARY:""" + + recursive_prompt = """You are creating a recursive summary of a Diplomacy game's history. +You have a previous summary of earlier events/messages and new content to incorporate. + +Your task is to create a unified, seamless summary that: +1. Preserves key strategic information from both sources +2. Maintains chronological flow and logical structure +3. Presents the most relevant information for decision-making +4. Emphasizes developments in alliances, betrayals, and territorial control + +PREVIOUS SUMMARY: +{previous_summary} + +NEW CONTENT: +{new_content} + +Create a unified summary that reads as a single coherent narrative while preserving critical strategic information:""" - return phase_prompt, message_prompt + return phase_prompt, message_prompt, recursive_prompt + + def generate_recursive_summary(self, previous_summary, new_content, prompt_type="recursive", power_name=None): + """ + Generates a recursive summary by combining previous summary with new content. + + Args: + previous_summary: Previous summary text + new_content: New content to be incorporated + prompt_type: Type of prompt to use (recursive, phase, or message) + power_name: Name of the power for context + + Returns: + A new summary incorporating both previous and new content + """ + # Load appropriate prompt + phase_prompt, message_prompt, recursive_prompt = self.load_summarization_prompts() + + if prompt_type == "phase" and not previous_summary: + # Initial phase summary + prompt = phase_prompt.replace("{phase_history}", new_content) + logger.debug(f"CONTEXT | SUMMARY | Creating initial phase summary with {len(new_content)} characters") + elif prompt_type == "message" and not previous_summary: + # Initial message summary + prompt = message_prompt.replace("{message_history}", new_content) + logger.debug(f"CONTEXT | SUMMARY | Creating initial message summary for {power_name} with {len(new_content)} characters") + else: + # Recursive summary (or phase/message with previous summary) + prompt = recursive_prompt + prompt = prompt.replace("{previous_summary}", previous_summary or "(No previous summary)") + prompt = prompt.replace("{new_content}", new_content) + logger.debug(f"CONTEXT | SUMMARY | Creating recursive {prompt_type} summary for {power_name or 'game'}") + logger.debug(f"CONTEXT | SUMMARY | Previous summary: {len(previous_summary or '')} chars, New content: {len(new_content)} chars") + + # Get the summary using the LLM + summarization_client = load_model_client(self.summary_model, power_name=power_name, emptysystem=True) + summary = summarization_client.generate_response(prompt) + + logger.debug(f"CONTEXT | Generated {prompt_type} recursive summary ({len(summary)} chars)") + return summary def should_summarize_phases(self, phase_summaries: Dict[str, str]) -> bool: """ - Determine if phase summaries need to be condensed based on token count - and cooldown period. + Determine if phase summaries need to be condensed based on token count, + cooldown period, and new content since last summarization. """ # Check if we're in cooldown period current_time = time.time() - if current_time - self.last_phase_summary_time < self.summary_cooldown: + if current_time - self.phase_summary_state['last_summary_time'] < self.summary_cooldown: + logger.debug("CONTEXT | Phase summarization skipped (in cooldown period)") return False - # Join all summaries to count total tokens - all_text = "\n\n".join(phase_summaries.values()) - token_count = count_tokens(all_text) + # Get unsummarized phase content + unsummarized_phase_names = [p for p in phase_summaries.keys() + if p not in self.phase_summary_state['summarized_phases'] + and not p.startswith("SUMMARY_UNTIL_")] - return token_count > self.phase_token_threshold + # If we have a previous summary, count its tokens + base_token_count = 0 + if self.phase_summary_state['last_summary']: + base_token_count = count_tokens(self.phase_summary_state['last_summary']) + + # Count tokens in unsummarized phases + unsummarized_text = "\n\n".join([phase_summaries[phase] for phase in unsummarized_phase_names]) + unsummarized_token_count = count_tokens(unsummarized_text) + + # Check if total exceeds threshold + total_token_count = base_token_count + unsummarized_token_count + should_summarize = total_token_count > self.phase_token_threshold + + if should_summarize: + logger.debug(f"CONTEXT | Phase token count ({total_token_count}) exceeds threshold ({self.phase_token_threshold}), will summarize") + logger.debug(f"CONTEXT | Phase breakdown: {base_token_count} tokens from previous summary + {unsummarized_token_count} tokens from {len(unsummarized_phase_names)} new phases") + + return should_summarize - def should_summarize_messages(self, message_history: str) -> bool: + def should_summarize_messages(self, message_history: str, power_name: str) -> bool: """ - Determine if message history needs to be condensed based on token count - and cooldown period. + Determine if message history for a specific power needs to be condensed + based on token count and cooldown period. """ + # Get power-specific state + power_state = self.get_power_state(power_name) + # Check if we're in cooldown period current_time = time.time() - if current_time - self.last_message_summary_time < self.summary_cooldown: + if current_time - power_state['last_summary_time'] < self.summary_cooldown: + logger.debug(f"CONTEXT | Message summarization for {power_name} skipped (in cooldown period)") return False - token_count = count_tokens(message_history) - return token_count > self.message_token_threshold + # If we have a previous summary, count its tokens + base_token_count = 0 + if power_state['last_message_summary']: + base_token_count = count_tokens(power_state['last_message_summary']) + + # Count tokens in the new content + new_token_count = count_tokens(message_history) + + # Check if total exceeds threshold + total_token_count = base_token_count + new_token_count + should_summarize = total_token_count > self.message_token_threshold + + if should_summarize: + logger.debug(f"CONTEXT | Message token count for {power_name} ({total_token_count}) exceeds threshold ({self.message_token_threshold}), will summarize") + logger.debug(f"CONTEXT | Message breakdown for {power_name}: {base_token_count} tokens from previous summary + {new_token_count} tokens from new messages") + + return should_summarize def summarize_phase_history(self, phase_summaries: Dict[str, str], power_name: Optional[str] = None) -> Dict[str, str]: """ - Create a condensed version of phase summaries. - Keeps the most recent phases intact and summarizes older ones. + Create a recursively updated summary of phase history. + Keeps recent phases intact and summarizes older ones. Returns a new dictionary with condensed history. """ @@ -139,84 +254,142 @@ SUMMARY:""" return phase_summaries # Mark summarization time - self.last_phase_summary_time = time.time() + self.phase_summary_state['last_summary_time'] = time.time() - # Sort phases chronologically + # Sort phases chronologically sorted_phases = sorted(phase_summaries.keys()) - # Keep the 3 most recent phases intact - recent_phases = sorted_phases[-3:] if len(sorted_phases) > 3 else sorted_phases - older_phases = sorted_phases[:-3] if len(sorted_phases) > 3 else [] + # Get unsummarized phase names + unsummarized_phase_names = [p for p in sorted_phases + if p not in self.phase_summary_state['summarized_phases'] + and not p.startswith("SUMMARY_UNTIL_")] - if not older_phases: + # Keep the 3 most recent phases intact + recent_phases = unsummarized_phase_names[-3:] if len(unsummarized_phase_names) > 3 else unsummarized_phase_names + phases_to_summarize = [p for p in unsummarized_phase_names if p not in recent_phases] + + if not phases_to_summarize: + logger.debug("CONTEXT | No new phases to summarize") return phase_summaries # Nothing to summarize - # Get summarization prompt - phase_prompt, _ = self.load_summarization_prompts() + # Text to summarize: previous summary + new phases to summarize + previous_summary = self.phase_summary_state['last_summary'] or "" - # Generate a summary of the older phases - older_text = "" - for phase in older_phases: - older_text += f"PHASE {phase}:\n{phase_summaries[phase]}\n\n" + new_content = "" + for phase in phases_to_summarize: + new_content += f"PHASE {phase}:\n{phase_summaries[phase]}\n\n" - # Check if we already have a cached summary for this exact text - if older_text in self.phase_summary_cache: - summary = self.phase_summary_cache[older_text] + # Log before summarization + logger.info(f"CONTEXT | PHASE SUMMARIZATION | Starting recursive summarization for {len(phases_to_summarize)} phases") + logger.info(f"CONTEXT | PHASE SUMMARIZATION | Phases being summarized: {', '.join(phases_to_summarize)}") + + # Generate recursive summary + if previous_summary: + # We have a previous summary, do recursive summarization + logger.debug(f"CONTEXT | PHASE SUMMARIZATION | Performing recursive summarization with previous summary ({len(previous_summary)} chars)") + summary = self.generate_recursive_summary( + previous_summary, + new_content, + prompt_type="recursive", + power_name=power_name + ) else: - # Generate new summary - summarization_client = load_model_client(self.summary_model, power_name=power_name, emptysystem=True) - formatted_prompt = phase_prompt.replace("{phase_history}", older_text) - summary = summarization_client.generate_response(formatted_prompt) - - # Cache the result - self.phase_summary_cache[older_text] = summary + # No previous summary, do initial summarization + logger.debug(f"CONTEXT | PHASE SUMMARIZATION | Performing initial phase summarization ({len(new_content)} chars)") + summary = self.generate_recursive_summary( + None, + new_content, + prompt_type="phase", + power_name=power_name + ) + + # Update phase summary state + self.phase_summary_state['last_summary'] = summary + self.phase_summary_state['summarized_phases'].extend(phases_to_summarize) # Create new dictionary with summarized older phases and intact recent phases result = {} # Add the summary as a special entry - summary_key = f"SUMMARY_UNTIL_{older_phases[-1]}" - result[summary_key] = summary + if phases_to_summarize: + last_summarized = max(phases_to_summarize) + summary_key = f"SUMMARY_UNTIL_{last_summarized}" + result[summary_key] = summary + logger.info(f"CONTEXT | PHASE SUMMARIZATION | Created summary key '{summary_key}' ({len(summary)} chars)") # Add the recent phases as-is for phase in recent_phases: result[phase] = phase_summaries[phase] - + + logger.info(f"CONTEXT | PHASE SUMMARIZATION | Recursively condensed {len(phase_summaries)} phase entries to {len(result)}") + logger.debug(f"CONTEXT | PHASE SUMMARIZATION | Result contains summary + {len(recent_phases)} intact recent phases") return result def summarize_message_history( self, message_history: str, - power_name: Optional[str] = None, + power_name: str, organized_by_relationship: bool = True ) -> str: """ - Create a condensed version of message history. - If organized_by_relationship is True, assumes the history is already - organized by power relationships. + Create a recursively updated summary of message history for a specific power. - Returns a condensed message history. + Args: + message_history: Current unsummarized message history + power_name: The power whose history is being summarized + organized_by_relationship: If True, assumes messages are organized by relationship + + Returns: + Updated message history with recursive summarization applied """ - if not self.should_summarize_messages(message_history): + if not self.should_summarize_messages(message_history, power_name): return message_history + # Get power-specific state + power_state = self.get_power_state(power_name) + # Mark summarization time - self.last_message_summary_time = time.time() + power_state['last_summary_time'] = time.time() - # Get summarization prompt - _, message_prompt = self.load_summarization_prompts() + # Log before summarization + logger.info(f"CONTEXT | MESSAGE SUMMARIZATION | Starting message summarization for {power_name}") + logger.info(f"CONTEXT | MESSAGE SUMMARIZATION | Current message history size: {len(message_history)} chars") - # Check if we already have a cached summary for this exact text - if message_history in self.message_summary_cache: - return self.message_summary_cache[message_history] + # Generate recursive summary + previous_summary = power_state['last_message_summary'] + + if previous_summary: + # We have a previous summary, do recursive summarization + logger.debug(f"CONTEXT | MESSAGE SUMMARIZATION | Performing recursive message summarization for {power_name}") + logger.debug(f"CONTEXT | MESSAGE SUMMARIZATION | Previous summary: {len(previous_summary)} chars, New messages: {len(message_history)} chars") - # Generate new summary - summarization_client = load_model_client(self.summary_model, power_name=power_name, emptysystem=True) - formatted_prompt = message_prompt.replace("{message_history}", message_history) - summary = summarization_client.generate_response(formatted_prompt) + summary = self.generate_recursive_summary( + previous_summary, + message_history, + prompt_type="recursive", + power_name=power_name + ) + else: + # No previous summary, do initial summarization + logger.debug(f"CONTEXT | MESSAGE SUMMARIZATION | Performing initial message summarization for {power_name} ({len(message_history)} chars)") + + summary = self.generate_recursive_summary( + None, + message_history, + prompt_type="message", + power_name=power_name + ) - # Cache the result - self.message_summary_cache[message_history] = summary + # Update power state + power_state['last_message_summary'] = summary + + # Track metrics for logging + message_tokens = count_tokens(message_history) + summary_tokens = count_tokens(summary) + reduction = 100 - (summary_tokens * 100 / message_tokens) if message_tokens > 0 else 0 + + logger.info(f"CONTEXT | MESSAGE SUMMARIZATION | Completed for {power_name}: {message_tokens} → {summary_tokens} tokens ({reduction:.1f}% reduction)") + logger.debug(f"CONTEXT | MESSAGE SUMMARIZATION | Original size: {len(message_history)} chars, Summary size: {len(summary)} chars") return summary @@ -228,16 +401,34 @@ SUMMARY:""" """ Main access point for getting optimized phase summaries. If summaries are below threshold, returns original. - Otherwise, returns condensed version. + Otherwise, returns recursively condensed version. """ if not hasattr(game, "phase_summaries") or not game.phase_summaries: + logger.debug("CONTEXT | No phase summaries available") return {} + logger.debug(f"CONTEXT | Checking phase optimization for {power_name or 'game'} with {len(game.phase_summaries)} phases") + if self.should_summarize_phases(game.phase_summaries): - # Create condensed version - return self.summarize_phase_history(game.phase_summaries, power_name) + # Create condensed version using recursive summarization + logger.debug(f"CONTEXT | Creating optimized phase summaries for {power_name or 'game'}") + result = self.summarize_phase_history(game.phase_summaries, power_name) + + # Add a log showing which phases are included in the optimized version + phase_keys = list(result.keys()) + summary_keys = [k for k in phase_keys if k.startswith("SUMMARY_UNTIL_")] + regular_phases = [k for k in phase_keys if not k.startswith("SUMMARY_UNTIL_")] + + logger.info(f"CONTEXT | PHASE OPTIMIZATION | Returning {len(summary_keys)} summary entries and {len(regular_phases)} regular phases") + if summary_keys: + logger.debug(f"CONTEXT | PHASE OPTIMIZATION | Summary entries: {', '.join(summary_keys)}") + if regular_phases: + logger.debug(f"CONTEXT | PHASE OPTIMIZATION | Regular phases: {', '.join(regular_phases)}") + + return result else: # Return original + logger.debug("CONTEXT | Using original phase summaries (below threshold)") return game.phase_summaries def get_optimized_message_history( @@ -257,7 +448,11 @@ SUMMARY:""" Returns: Optimized message history as string """ - # Get the raw message history + if not power_name: + logger.warning("CONTEXT | No power_name provided for message history optimization, using raw history") + return organized_history or (game_history.get_game_history() if hasattr(game_history, "get_game_history") else str(game_history)) + + # Get the raw message history for this power if organized_history is not None: message_history = organized_history elif hasattr(game_history, "get_game_history"): @@ -265,11 +460,28 @@ SUMMARY:""" else: message_history = str(game_history) if game_history else "(No history yet)" - if self.should_summarize_messages(message_history): - # Create condensed version - return self.summarize_message_history(message_history, power_name) + if message_history == "(No history yet)": + return message_history + + logger.debug(f"CONTEXT | Checking message optimization for {power_name} with {len(message_history)} chars") + + # Check if we need to create a recursive summary + if self.should_summarize_messages(message_history, power_name): + # Create recursively condensed version + logger.debug(f"CONTEXT | Creating optimized message history for {power_name}") + result = self.summarize_message_history(message_history, power_name) + + # Log the optimization stats + power_state = self.get_power_state(power_name) + has_previous_summary = power_state['last_message_summary'] is not None + + logger.info(f"CONTEXT | MESSAGE OPTIMIZATION | {power_name} | Original size: {len(message_history)} chars, Optimized size: {len(result)} chars") + logger.info(f"CONTEXT | MESSAGE OPTIMIZATION | {power_name} | Using {'recursive' if has_previous_summary else 'initial'} message summary") + + return result else: # Return original + logger.debug(f"CONTEXT | Using original message history for {power_name} (below threshold)") return message_history @@ -287,6 +499,7 @@ def configure_context_manager( Should be called early in the application lifecycle. """ global context_manager + logger.info(f"CONTEXT | Configuring manager with thresholds: phase={phase_threshold}, message={message_threshold}, model={summary_model}") context_manager = ContextManager( phase_token_threshold=phase_threshold, message_token_threshold=message_threshold, @@ -305,9 +518,18 @@ def get_optimized_context( Returns: Tuple of (optimized_phase_summaries, optimized_message_history) """ + logger.debug(f"CONTEXT | Getting optimized context for {power_name or 'game'}") + optimized_phases = context_manager.get_optimized_phase_summaries(game, power_name) optimized_messages = context_manager.get_optimized_message_history( game_history, power_name, organized_history ) + # Add a log entry showing what we're returning for tracking + phase_count = len(optimized_phases) if optimized_phases else 0 + message_length = len(optimized_messages) if optimized_messages else 0 + summary_count = len([k for k in optimized_phases.keys() if k.startswith("SUMMARY_UNTIL_")]) if optimized_phases else 0 + + logger.info(f"CONTEXT | OPTIMIZATION RESULT | {power_name or 'game'} | Returning {phase_count} phases ({summary_count} summaries) and {message_length} chars of messages") + return optimized_phases, optimized_messages \ No newline at end of file diff --git a/ai_diplomacy/model_loader.py b/ai_diplomacy/model_loader.py index 372aef6..74e912d 100644 --- a/ai_diplomacy/model_loader.py +++ b/ai_diplomacy/model_loader.py @@ -21,12 +21,19 @@ def load_model_client(model_id: str, power_name: Optional[str] = None, emptysyst from .clients import ClaudeClient, GeminiClient, DeepSeekClient, OpenAIClient lower_id = model_id.lower() + + logger.debug(f"MODEL | Loading client for {model_id}{' for ' + power_name if power_name else ''}{' with empty system' if emptysystem else ''}") + if "claude" in lower_id: + logger.debug(f"MODEL | Selected Claude client for {model_id}") return ClaudeClient(model_id, power_name, emptysystem=emptysystem) elif "gemini" in lower_id: + logger.debug(f"MODEL | Selected Gemini client for {model_id}") return GeminiClient(model_id, power_name, emptysystem=emptysystem) elif "deepseek" in lower_id: + logger.debug(f"MODEL | Selected DeepSeek client for {model_id}") return DeepSeekClient(model_id, power_name, emptysystem=emptysystem) else: # Default to OpenAI - return OpenAIClient(model_id, power_name) \ No newline at end of file + logger.debug(f"MODEL | Selected OpenAI client for {model_id}") + return OpenAIClient(model_id, power_name, emptysystem=emptysystem) \ No newline at end of file diff --git a/ai_diplomacy/negotiations.py b/ai_diplomacy/negotiations.py index f818a10..9cf64be 100644 --- a/ai_diplomacy/negotiations.py +++ b/ai_diplomacy/negotiations.py @@ -20,7 +20,7 @@ def conduct_negotiations(game, game_history, model_error_stats, max_rounds=3): Each power can send up to 'max_rounds' messages, choosing between private and global messages each turn. """ - logger.info("Starting negotiation phase.") + logger.info(f"DIPLOMACY | Starting negotiation phase with {max_rounds} rounds") # Conversation messages are kept in a local list ONLY to build conversation_so_far text. conversation_messages = [] @@ -28,9 +28,12 @@ def conduct_negotiations(game, game_history, model_error_stats, max_rounds=3): active_powers = [ p_name for p_name, p_obj in game.powers.items() if not p_obj.is_eliminated() ] + + logger.debug(f"DIPLOMACY | Found {len(active_powers)} active powers for negotiations") # We do up to 'max_rounds' single-message turns for each power for round_index in range(max_rounds): + logger.debug(f"DIPLOMACY | Starting round {round_index+1}/{max_rounds}") with concurrent.futures.ThreadPoolExecutor( max_workers=len(active_powers) ) as executor: @@ -40,10 +43,11 @@ def conduct_negotiations(game, game_history, model_error_stats, max_rounds=3): client = load_model_client(model_id) possible_orders = gather_possible_orders(game, power_name) if not possible_orders: - logger.info(f"No orderable locations for {power_name}; skipping.") + logger.info(f"DIPLOMACY | {power_name} | No orderable locations, skipping negotiation") continue board_state = game.get_state() + logger.debug(f"DIPLOMACY | {power_name} | Requesting conversation response from {model_id}") future = executor.submit( client.get_conversation_reply, game, @@ -57,14 +61,21 @@ def conduct_negotiations(game, game_history, model_error_stats, max_rounds=3): ) futures[future] = power_name - logger.debug(f"Submitted get_conversation_reply task for {power_name}.") + message_count = 0 for future in concurrent.futures.as_completed(futures): power_name = futures[future] try: messages = future.result() if messages: + logger.debug(f"DIPLOMACY | {power_name} | Generated {len(messages)} messages") for message in messages: + recipient = message["recipient"] + msg_type = "global" if recipient == GLOBAL else "private" + truncated_content = message["content"][:50] + ("..." if len(message["content"]) > 50 else "") + + logger.debug(f"DIPLOMACY | {power_name} → {recipient} | {msg_type.upper()} | {truncated_content}") + # Create an official message in the Diplomacy engine diplo_message = Message( phase=game.current_short_phase, @@ -80,11 +91,20 @@ def conduct_negotiations(game, game_history, model_error_stats, max_rounds=3): message["content"], ) conversation_messages.append(message) + message_count += 1 else: - logger.debug(f"No valid messages returned for {power_name}.") + logger.warning(f"DIPLOMACY | {power_name} | No valid messages generated") model_error_stats[power_name]["conversation_errors"] += 1 except Exception as exc: - logger.error(f"LLM request failed for {power_name}: {exc}") + error_msg = str(exc)[:150] + logger.error(f"DIPLOMACY | {power_name} | Request failed: {error_msg}") + model_error_stats[power_name]["conversation_errors"] += 1 + + logger.debug(f"DIPLOMACY | Round {round_index+1} completed with {message_count} new messages") - logger.info("Negotiation phase complete.") + total_messages = len(conversation_messages) + global_msgs = sum(1 for m in conversation_messages if m["recipient"] == GLOBAL) + private_msgs = total_messages - global_msgs + + logger.info(f"DIPLOMACY | Negotiation complete: {total_messages} messages ({global_msgs} global, {private_msgs} private)") return conversation_messages diff --git a/ai_diplomacy/prompts/message_summary_prompt.txt b/ai_diplomacy/prompts/message_summary_prompt.txt index 9dc34e1..3b73304 100644 --- a/ai_diplomacy/prompts/message_summary_prompt.txt +++ b/ai_diplomacy/prompts/message_summary_prompt.txt @@ -9,15 +9,16 @@ Create a concise summary of the conversations between powers that preserves: Organize by relationships (e.g., FRANCE-GERMANY, ENGLAND-RUSSIA), prioritizing the most significant interactions. Include specific territory names mentioned. -In your summary, maintain all of the following critical diplomatic information: -- Specific agreements about attacking or supporting certain territories -- Promises of non-aggression and their scope/duration -- Discussions about supply center control and transfers -- Stated preferences about other powers (who they want to attack/support) -- Explicit lies or deceptions that were revealed -- Coordination of moves between powers +The summary must reflect the actual diplomatic landscape accurately so players can make informed decisions. +When summarizing, preserve: +- Any explicit commitments to support or attack specific territories +- Agreements about dividing Supply Centers or territories +- Offers of non-aggression or alliance +- Signals of intent or warnings about future actions +- Discussions of shared enemies or mutual threats +- Any deceptions or misleading statements that could indicate future betrayal -The summary must reflect the actual diplomatic landscape accurately so players can make informed decisions and remember past interactions that might influence current negotiations. +Be precise about which powers communicated what information to whom, as this impacts trust and future decision-making. ORIGINAL MESSAGE HISTORY: {message_history} diff --git a/ai_diplomacy/prompts/recursive_summary_prompt.txt b/ai_diplomacy/prompts/recursive_summary_prompt.txt new file mode 100644 index 0000000..d52ae45 --- /dev/null +++ b/ai_diplomacy/prompts/recursive_summary_prompt.txt @@ -0,0 +1,16 @@ +You are creating a recursive summary of a Diplomacy game's history. +You have a previous summary of earlier events/messages and new content to incorporate. + +Your task is to create a unified, seamless summary that: +1. Preserves key strategic information from both sources +2. Maintains chronological flow and logical structure +3. Presents the most relevant information for decision-making +4. Emphasizes developments in alliances, betrayals, and territorial control + +PREVIOUS SUMMARY: +{previous_summary} + +NEW CONTENT: +{new_content} + +Create a unified summary that reads as a single coherent narrative while preserving critical strategic information: \ No newline at end of file diff --git a/ai_diplomacy/test_recursive_summarization.py b/ai_diplomacy/test_recursive_summarization.py new file mode 100644 index 0000000..c0655ba --- /dev/null +++ b/ai_diplomacy/test_recursive_summarization.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +""" +Test script to validate the recursive summarization functionality in long_story_short.py +""" + +import os +import sys +import logging +import time +from typing import Dict + +# Configure logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", + datefmt="%H:%M:%S", +) + +logger = logging.getLogger("test_summarization") + +# Add the parent directory to the path so we can import the module +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Import the module we want to test +from ai_diplomacy.long_story_short import ( + ContextManager, + configure_context_manager, + get_optimized_context +) + +def test_phase_summarization(): + """ + Test the recursive phase summarization functionality + """ + logger.info("Testing phase summarization...") + + # Create a context manager with a very low threshold to force summarization + cm = ContextManager( + phase_token_threshold=200, # Very low to trigger summarization + message_token_threshold=1000, + summary_model="o3-mini" # Use a simple model for testing + ) + + # Create a mock game object with phase summaries + mock_game = type('MockGame', (), {})() + mock_game.phase_summaries = { + "S1901M": "Spring 1901 Movement: France moved to Burgundy. Germany attacked Paris but was repelled. Russia and Turkey formed an alliance against Austria.", + "F1901M": "Fall 1901 Movement: England took Norway. Russia captured Sweden. Italy moved into Tyrolia threatening Vienna.", + "W1901A": "Winter 1901 Adjustments: France built F Brest. England built F London. Russia built A Moscow.", + "S1902M": "Spring 1902 Movement: Germany attacked Burgundy with support from Munich. Italy took Vienna from Austria. Turkey moved into Armenia threatening Russia." + } + + # Get optimized summaries + optimized_phases = cm.get_optimized_phase_summaries(mock_game, power_name="FRANCE") + + # Print the result + logger.info(f"Original phases: {len(mock_game.phase_summaries)}") + logger.info(f"Optimized phases: {len(optimized_phases)}") + + # Now add more phases to trigger another round of summarization + mock_game.phase_summaries.update({ + "F1902M": "Fall 1902 Movement: France retook Burgundy. England invaded St. Petersburg. Austria was eliminated by combined Italian and Turkish forces.", + "W1902A": "Winter 1902 Adjustments: Russia disbanded A Warsaw. Italy built A Rome and F Naples. Turkey built F Smyrna.", + "S1903M": "Spring 1903 Movement: Germany and France formed an alliance against England. Russia's position in the north collapsed." + }) + + # Get optimized summaries again - should see recursive summarization + new_optimized_phases = cm.get_optimized_phase_summaries(mock_game, power_name="FRANCE") + + logger.info(f"Updated original phases: {len(mock_game.phase_summaries)}") + logger.info(f"New optimized phases: {len(new_optimized_phases)}") + + # Print summary content + for key, summary in new_optimized_phases.items(): + if key.startswith("SUMMARY_UNTIL_"): + logger.info(f"=== {key} ===") + logger.info(summary[:200] + "..." if len(summary) > 200 else summary) + + # Add a third batch to trigger recursive summarization of the previous summary + mock_game.phase_summaries.update({ + "F1903M": "Fall 1903 Movement: Italy captured Tunis. France took Belgium with German support. England lost Edinburgh to a combined Russian and German attack.", + "W1903A": "Winter 1903 Adjustments: England disbanded F London. France built A Paris. Germany built F Kiel.", + "S1904M": "Spring 1904 Movement: Turkey attacked Italy in the Ionian Sea. Russia and Germany continued their assault on England. France moved towards Spain." + }) + + # Get optimized summaries a third time - should see further recursive summarization + final_optimized_phases = cm.get_optimized_phase_summaries(mock_game, power_name="FRANCE") + + logger.info(f"Final original phases: {len(mock_game.phase_summaries)}") + logger.info(f"Final optimized phases: {len(final_optimized_phases)}") + + # Print final summary content + for key, summary in final_optimized_phases.items(): + if key.startswith("SUMMARY_UNTIL_"): + logger.info(f"=== {key} ===") + logger.info(summary[:200] + "..." if len(summary) > 200 else summary) + + return optimized_phases, new_optimized_phases, final_optimized_phases + +def test_message_summarization(): + """ + Test the power-specific message summarization functionality + """ + logger.info("Testing message summarization...") + + # Create a context manager with a very low threshold to force summarization + cm = ContextManager( + phase_token_threshold=1000, + message_token_threshold=200, # Very low to trigger summarization + summary_model="o3-mini" # Use a simple model for testing + ) + + # Test with multiple powers + powers = ["FRANCE", "GERMANY", "ENGLAND"] + + # Create mock message histories for each power + messages = { + "FRANCE": "FROM: FRANCE, TO: GERMANY\nI propose we ally against England. I'll support your move to Belgium if you don't move to Burgundy.\n\nFROM: GERMANY, TO: FRANCE\nAgreed. I won't move to Burgundy. Let's coordinate against England.\n\nFROM: ENGLAND, TO: GLOBAL\nI'm looking for allies against France. Any takers?", + "GERMANY": "FROM: FRANCE, TO: GERMANY\nI propose we ally against England. I'll support your move to Belgium if you don't move to Burgundy.\n\nFROM: GERMANY, TO: FRANCE\nAgreed. I won't move to Burgundy. Let's coordinate against England.\n\nFROM: GERMANY, TO: RUSSIA\nI suggest we avoid conflict in Sweden and focus on other directions.", + "ENGLAND": "FROM: ENGLAND, TO: GLOBAL\nI'm looking for allies against France. Any takers?\n\nFROM: RUSSIA, TO: ENGLAND\nI could support you against France if you help me with Germany.\n\nFROM: ENGLAND, TO: RUSSIA\nThat works for me. I'll help you take Denmark if you support me into the English Channel." + } + + # Test for each power + results = {} + for power in powers: + logger.info(f"Testing message summarization for {power}...") + + # Get optimized message history for this power + optimized_messages = cm.get_optimized_message_history(messages[power], power) + + logger.info(f"Original message length: {len(messages[power])}") + logger.info(f"Optimized message length: {len(optimized_messages)}") + + # Now add more messages to trigger recursive summarization + additional_messages = f"\n\nFROM: {power}, TO: GLOBAL\nI declare that I am focusing on defense this turn.\n\nFROM: ITALY, TO: {power}\nI propose a mutual non-aggression pact." + combined_messages = additional_messages * 3 # Multiply to ensure we exceed threshold + + # Get optimized messages again with the combined content + new_optimized_messages = cm.get_optimized_message_history( + optimized_messages + combined_messages, + power + ) + + logger.info(f"Updated original + new message length: {len(optimized_messages + combined_messages)}") + logger.info(f"Recursive optimized message length: {len(new_optimized_messages)}") + + # Add a third batch to trigger recursive summarization of the previous summary + more_messages = f"\n\nFROM: TURKEY, TO: {power}\nI suggest we coordinate our moves in the Mediterranean.\n\nFROM: {power}, TO: TURKEY\nI agree to non-aggression in the Mediterranean. Let's focus on other targets." + third_combined = more_messages * 4 # Multiply to ensure we exceed threshold again + + final_optimized_messages = cm.get_optimized_message_history( + new_optimized_messages + third_combined, + power + ) + + logger.info(f"Final combined message length: {len(new_optimized_messages + third_combined)}") + logger.info(f"Final optimized message length: {len(final_optimized_messages)}") + + results[power] = (optimized_messages, new_optimized_messages, final_optimized_messages) + + return results + +def test_with_game_integration(): + """ + Test using the get_optimized_context function which is what the game actually uses + """ + logger.info("Testing integration with game context...") + + # Configure the global context manager with very low thresholds + configure_context_manager( + phase_threshold=200, + message_threshold=200, + summary_model="o3-mini" + ) + + # Create a mock game object with phase summaries + mock_game = type('MockGame', (), {})() + mock_game.phase_summaries = { + "S1901M": "Spring 1901 Movement: France moved to Burgundy. Germany attacked Paris but was repelled.", + "F1901M": "Fall 1901 Movement: England took Norway. Russia captured Sweden.", + "W1901A": "Winter 1901 Adjustments: France built F Brest. England built F London.", + "S1902M": "Spring 1902 Movement: Germany attacked Burgundy with support from Munich." + } + + # Create mock message histories + mock_messages = { + "FRANCE": "FROM: FRANCE, TO: GERMANY\nI propose we ally against England.\n\nFROM: GERMANY, TO: FRANCE\nAgreed. Let's coordinate against England.", + "GERMANY": "FROM: FRANCE, TO: GERMANY\nI propose we ally against England.\n\nFROM: GERMANY, TO: FRANCE\nAgreed. Let's coordinate against England.", + } + + # Create a mock game history object + class MockGameHistory: + def get_game_history(self, power_name=None): + if power_name: + return mock_messages.get(power_name, "") + return "" + + mock_history = MockGameHistory() + + # Test for multiple powers + for power in ["FRANCE", "GERMANY"]: + logger.info(f"Testing integration for {power}...") + + # First call - should be under threshold + optimized_phases, optimized_messages = get_optimized_context( + mock_game, mock_history, power_name=power + ) + + logger.info(f"{power} initial optimized_phases count: {len(optimized_phases)}") + logger.info(f"{power} initial optimized_messages length: {len(optimized_messages)}") + + # Add more content to exceed thresholds + mock_game.phase_summaries.update({ + "F1902M": "Fall 1902 Movement: France retook Burgundy. England invaded St. Petersburg.", + "W1902A": "Winter 1902 Adjustments: Russia disbanded A Warsaw. Italy built A Rome.", + "S1903M": "Spring 1903 Movement: Germany and France formed an alliance against England." + }) + + # Update message history + additional_msg = f"\n\nFROM: {power}, TO: GLOBAL\nI declare that I am focusing on defense this turn." * 3 + mock_messages[power] += additional_msg + + # Second call - should trigger summarization + optimized_phases2, optimized_messages2 = get_optimized_context( + mock_game, mock_history, power_name=power + ) + + logger.info(f"{power} second optimized_phases count: {len(optimized_phases2)}") + if len(optimized_phases2) < len(mock_game.phase_summaries): + logger.info(f"✅ {power} phase summarization successful!") + else: + logger.warning(f"❌ {power} phase summarization did not occur as expected") + + logger.info(f"{power} second optimized_messages length: {len(optimized_messages2)}") + if len(optimized_messages2) < len(mock_messages[power]): + logger.info(f"✅ {power} message summarization successful!") + else: + logger.warning(f"❌ {power} message summarization did not occur as expected") + + return True + +def main(): + """ + Main test function + """ + logger.info("Starting recursive summarization tests...") + + # Test phase summarization + phase_test_results = test_phase_summarization() + + # Test message summarization + message_test_results = test_message_summarization() + + # Test integration with game context + integration_result = test_with_game_integration() + + logger.info("All tests completed!") + + # Final validation checks + # Check if we're getting different summaries for different powers (should be!) + message_summaries = {power: result[1] for power, result in message_test_results.items()} + unique_summaries = set(message_summaries.values()) + + logger.info(f"Number of powers tested: {len(message_summaries)}") + logger.info(f"Number of unique message summaries: {len(unique_summaries)}") + + if len(unique_summaries) == len(message_summaries): + logger.info("✅ SUCCESS: Each power has a unique message summary!") + else: + logger.warning("❌ FAILURE: Some powers have identical message summaries!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ai_diplomacy/utils.py b/ai_diplomacy/utils.py index ab6b76e..3974203 100644 --- a/ai_diplomacy/utils.py +++ b/ai_diplomacy/utils.py @@ -28,7 +28,7 @@ def assign_models_to_powers(randomize=True): "o3-mini", "gemini-1.5-flash", "gemini-2.0-flash", - "gemini-2.0-flash-lite-preview-02-05", + "claude-3-7-sonnet-20250219", "gemini-1.5-pro", "gpt-4o-mini", "claude-3-5-haiku-20241022", @@ -46,8 +46,10 @@ def assign_models_to_powers(randomize=True): model = random.choice(available_models) available_models.remove(model) result[power] = model + logger.debug(f"CONFIG | Generated randomized power-model mapping for {len(POWERS)} powers") return result else: + logger.debug(f"CONFIG | Using fixed power-model mapping with {len(model_list)} models") return { power: model_list[i] for i, power in enumerate(POWERS) } @@ -63,6 +65,9 @@ def gather_possible_orders(game, power_name): result = {} for loc in orderable_locs: result[loc] = all_possible.get(loc, []) + + order_count = sum(len(orders) for orders in result.values()) + logger.debug(f"ORDERS | {power_name} | Found {len(result)} orderable locations with {order_count} total possible orders") return result @@ -81,8 +86,11 @@ def get_valid_orders( If invalid, we append the error feedback to the conversation context for the next retry. If still invalid, return fallback. """ + # Track invalid orders for feedback + invalid_info = [] # Ask the LLM for orders + logger.debug(f"ORDERS | {power_name} | Requesting orders from {client.model_name}") orders = client.get_orders( game=game, board_state=board_state, @@ -115,12 +123,14 @@ def get_valid_orders( if validity == 1: # All orders are fully valid + logger.debug(f"ORDERS | {power_name} | Validated {len(orders)} orders successfully") return orders else: logger.warning( - f"[{power_name}] failed to produce a valid order, using fallback." + f"ORDERS | {power_name} | Failed validation: '{move}' is invalid" ) model_error_stats[power_name]["order_decoding_errors"] += 1 + logger.debug(f"ORDERS | {power_name} | Using fallback orders") fallback = client.fallback_orders(possible_orders) return fallback @@ -355,6 +365,13 @@ def format_possible_orders(game, possible_orders): for order, desc in strategic_orders["SUPPORT"]: output += f" {order} {desc}\n" + # Log order counts for debugging + logger.debug(f"ORDERS | Strategic classification: " + + f"Offensive: {len(strategic_orders['OFFENSIVE'])}, " + + f"Defensive: {len(strategic_orders['DEFENSIVE'])}, " + + f"Tactical: {len(strategic_orders['TACTICAL'])}, " + + f"Support: {len(strategic_orders['SUPPORT'])}") + return output @@ -433,28 +450,42 @@ def format_convoy_paths(game, convoy_paths_possible, power_name): # check if convoy_paths_possible is empty dictionary or list or none output = "" if not convoy_paths_possible: - output = "CONVOY POSSIBILITIES: None currently available.\n" - return output + return "CONVOY POSSIBILITIES: None currently available.\n" - - # Get unit ownership for identifying our convoys vs others + # Get our units and all other powers' units our_units = set(game.get_units(power_name)) our_unit_locs = {unit[2:5] for unit in our_units} + + # Get all powers' units and centers for context + power_units = {} + power_centers = {} + for pwr in game.powers: + power_units[pwr] = {unit[2:5] for unit in game.get_units(pwr)} + power_centers[pwr] = set(game.get_centers(pwr)) - # Group convoys by region and relevance + # Organize convoys by strategic relationship convoys = { - "YOUR ARMY CONVOYS": [], # Convoys using your armies - "YOUR FLEET CONVOYS": [], # Convoys using your fleets - "ENEMY CONVOYS": [] # Convoys you should watch for + "YOUR CONVOYS": [], # Convoys using your armies + "CONVOYS YOU CAN ENABLE": [], # Using your fleets to help others + "ALLIED OPPORTUNITIES": [], # Convoys that could help contain common enemies + "THREATS TO WATCH": [] # Convoys that could threaten your positions } - # Define major sea regions for better organization + # Make sea regions more readable sea_regions = { 'NTH': "North Sea", 'MAO': "Mid-Atlantic", 'TYS': "Tyrrhenian Sea", 'BLA': "Black Sea", 'SKA': "Skagerrak", + 'ION': "Ionian Sea", + 'EAS': "Eastern Mediterranean", + 'WES': "Western Mediterranean", + 'BAL': "Baltic Sea", + 'BOT': "Gulf of Bothnia", + 'ADR': "Adriatic Sea", + 'AEG': "Aegean Sea", + 'ENG': "English Channel" } for start, fleets, destinations in convoy_paths_possible: @@ -462,38 +493,77 @@ def format_convoy_paths(game, convoy_paths_possible, power_name): if not destinations or not fleets: continue - # Determine if this is our army that could be convoyed - is_our_army = start in our_unit_locs - - # Determine if these are our fleets that could convoy - is_our_fleet = any(fleet_loc in our_unit_locs for fleet_loc in fleets) + # Identify the power that owns the army at start (if any) + army_owner = None + for pwr, locs in power_units.items(): + if start in locs: + army_owner = pwr + break + # Determine if we own any of the required fleets + our_fleet_count = sum(1 for fleet_loc in fleets if fleet_loc in our_unit_locs) + # Format the fleet path nicely - fleet_path = " + ".join(f"{sea_regions.get(f, f)}" for f in fleets) + fleet_path = " + ".join(sea_regions.get(f, f) for f in fleets) - # Create a list of destinations with context for dest in destinations: + # Get destination owner if any + dest_owner = None + for pwr, centers in power_centers.items(): + if dest in centers: + dest_owner = pwr + break + # Determine if destination is a supply center is_sc = dest in game.map.scs sc_note = " (SC)" if is_sc else "" - # Create the basic convoy description + # Create base convoy description convoy_desc = f"A {start} -> {dest}{sc_note} via {fleet_path}" - - # Add strategic notes - if is_our_army: - category = "YOUR ARMY CONVOYS" - convoys[category].append(f"{convoy_desc}") - elif is_our_fleet: - category = "YOUR FLEET CONVOYS" - convoys[category].append(f"{convoy_desc} (you provide the convoy)") + + # Add strategic context based on relationships + if army_owner == power_name: + category = "YOUR CONVOYS" + if dest_owner: + note = f"attack {dest_owner}'s position" + else: + note = "gain strategic position" if not is_sc else "capture neutral SC" + convoys[category].append(f"{convoy_desc} ({note})") + + elif our_fleet_count > 0: + category = "CONVOYS YOU CAN ENABLE" + # Add diplomatic context + if army_owner: + if dest_owner == power_name: + note = f"WARNING: {army_owner} could attack your SC" + else: + note = f"help {army_owner} attack {dest_owner or 'neutral'} position" + else: + note = "potential diplomatic bargaining chip" + convoys[category].append(f"{convoy_desc} ({note})") + else: - category = "ENEMY CONVOYS" - convoys[category].append(f"{convoy_desc} (possible enemy convoy)") + # Analyze if this convoy represents opportunity or threat + if dest_owner == power_name: + category = "THREATS TO WATCH" + note = f"{army_owner or 'potential'} attack on your position" + elif army_owner and dest_owner: + category = "ALLIED OPPORTUNITIES" + note = f"{army_owner} could attack {dest_owner} - potential alliance" + else: + category = "ALLIED OPPORTUNITIES" + note = "potential diplomatic leverage" + + convoys[category].append(f"{convoy_desc} ({note})") # Format output output = "CONVOY POSSIBILITIES:\n\n" + # Log convoy counts for debugging + convoy_counts = {category: len(convoys[category]) for category in convoys} + logger.debug(f"CONVOYS | {power_name} | Counts: " + + ", ".join(f"{category}: {count}" for category, count in convoy_counts.items())) + for category, convoy_list in convoys.items(): if convoy_list: output += f"{category}:\n" @@ -538,8 +608,12 @@ def generate_threat_assessment(game, board_state, power_name): output = "THREAT ASSESSMENT:\n" if not threats: output += " No immediate threats detected.\n\n" + logger.debug(f"THREATS | {power_name} | No immediate threats detected") return output + # Log threat counts for debugging + logger.debug(f"THREATS | {power_name} | Detected {len(threats)} threats from {len(set(t[0] for t in threats))} powers") + for (enemy_pwr, code, targets) in threats: output += f" {enemy_pwr}'s {code} threatens {', '.join(targets)}\n" output += "\n" @@ -606,4 +680,13 @@ def generate_sc_projection(game, board_state, power_name): best_case = len(our_centers) + len(neutral_gains) + len(contestable) worst_case = len(our_centers) - len(at_risk) output += f" Next-phase range: {worst_case} to {best_case} centers\n\n" + + # Log SC projection for debugging + logger.debug(f"SC_PROJ | {power_name} | " + + f"Current: {len(our_centers)}, " + + f"Neutral gains: {len(neutral_gains)}, " + + f"Contestable: {len(contestable)}, " + + f"At risk: {len(at_risk)}, " + + f"Range: {worst_case}-{best_case}") + return output diff --git a/lm_game.py b/lm_game.py index efc9a9f..e28b4e4 100644 --- a/lm_game.py +++ b/lm_game.py @@ -22,9 +22,11 @@ from ai_diplomacy.utils import ( from ai_diplomacy.negotiations import conduct_negotiations from ai_diplomacy.game_history import GameHistory from ai_diplomacy.long_story_short import configure_context_manager +from ai_diplomacy.clients import configure_logging dotenv.load_dotenv() +# Configure logger with a consistent format logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, @@ -32,12 +34,23 @@ logging.basicConfig( datefmt="%H:%M:%S", ) +# Configure specific loggers to reduce noise +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) +logging.getLogger("anthropic").setLevel(logging.WARNING) +logging.getLogger("openai").setLevel(logging.WARNING) + +# Ensure our application loggers are at appropriate levels +logging.getLogger("client").setLevel(logging.INFO) +logging.getLogger("ai_diplomacy").setLevel(logging.INFO) + def my_summary_callback(system_prompt, user_prompt, model_name): # Route to the desired model specified by the command-line argument client = load_model_client(model_name, emptysystem=True) combined_prompt = f"{system_prompt}\n\n{user_prompt}" - # Pseudo-code for generating a response: + logger.debug(f"SUMMARY | Requesting phase summary from {model_name}") return client.generate_response(combined_prompt, empty_system=True) @@ -78,6 +91,29 @@ def parse_arguments(): "The order is: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY." ), ) + # Logging configuration options + parser.add_argument( + "--log_full_prompts", + action="store_true", + help="Log the full prompts sent to models", + ) + parser.add_argument( + "--log_full_responses", + action="store_true", + help="Log the full responses from models", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging including HTTP connection details", + ) + parser.add_argument( + "--log_level", + type=str, + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Set the logging level", + ) return parser.parse_args() @@ -101,7 +137,7 @@ def save_game_state(game, result_folder, game_file_path, model_error_stats, args output_path = game_file_path # If final file exists, append timestamp if os.path.exists(output_path): - logger.info("Game file already exists, saving with unique filename.") + logger.info("STORAGE | Final game file already exists, saving with unique timestamp") output_path = f"{output_path}_{timestamp}.json" # Save game state @@ -114,11 +150,21 @@ def save_game_state(game, result_folder, game_file_path, model_error_stats, args overview_file.write(json.dumps(game.power_model_map) + "\n") overview_file.write(json.dumps(vars(args)) + "\n") - logger.info(f"Saved game checkpoint to: {output_path}") + logger.info(f"STORAGE | Game checkpoint saved to: {output_path}") def main(): args = parse_arguments() + + # Configure logging + log_level = getattr(logging, args.log_level) + configure_logging( + log_full_prompts=args.log_full_prompts, + log_full_responses=args.log_full_responses, + suppress_connection_logs=not args.verbose, + log_level=log_level + ) + # Configure the context manager with the same summary model configure_context_manager( phase_threshold=10000, @@ -128,7 +174,7 @@ def main(): max_year = args.max_year summary_model = args.summary_model - logger.info("Starting a new Diplomacy game for testing with multiple LLMs, now concurrent!") + logger.info("GAME_START | Initializing Diplomacy game with multiple LLM agents") start_whole = time.time() model_error_stats = defaultdict( @@ -153,12 +199,20 @@ def main(): # --------------------------- log_file_path = os.path.join(result_folder, "game.log") file_handler = logging.FileHandler(log_file_path) - file_handler.setLevel(logging.DEBUG) + file_handler.setLevel(logging.DEBUG) # Ensure we capture all levels in the file file_handler.setFormatter( logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s", datefmt="%H:%M:%S") ) - logger.addHandler(file_handler) - logger.info(f"File handler added. Writing logs to {log_file_path}.") + + # Add the handler to root logger to capture all modules' logs + logging.getLogger().addHandler(file_handler) + + # Also add to specific loggers we care about most for summarization + logging.getLogger("ai_diplomacy.long_story_short").addHandler(file_handler) + logging.getLogger("ai_diplomacy.long_story_short").setLevel(logging.DEBUG) + + logger.info(f"LOGGING | File handler configured to write logs to {log_file_path}") + logger.info(f"LOGGING | Capturing detailed context management logs at DEBUG level") # File paths manifesto_path = f"{result_folder}/game_manifesto.txt" @@ -181,16 +235,16 @@ def main(): provided_models = [name.strip() for name in args.models.split(",")] if len(provided_models) != len(powers_order): logger.error( - f"Expected {len(powers_order)} models for --power-models but got {len(provided_models)}. Exiting." + f"CONFIG_ERROR | Expected {len(powers_order)} models in --models argument but got {len(provided_models)}. Exiting." ) return game.power_model_map = dict(zip(powers_order, provided_models)) else: game.power_model_map = assign_models_to_powers(randomize=True) - logger.debug("Power model assignments:") + logger.debug("POWERS | Model assignments:") for power, model_id in game.power_model_map.items(): - logger.debug(f"{power} => type={type(model_id)}, value={model_id}") + logger.debug(f"POWERS | {power} assigned to {model_id}") # Also, if you prefer to fix the negotiation function: # We could do a one-liner ensuring all model_id are strings: @@ -198,7 +252,7 @@ def main(): if not isinstance(game.power_model_map[p], str): game.power_model_map[p] = str(game.power_model_map[p]) - logger.info("Post-cleanup: Verified all power model IDs are strings.") + logger.debug("POWERS | Verified all power model IDs are strings") round_counter = 0 # Track number of rounds @@ -206,28 +260,29 @@ def main(): phase_start = time.time() current_phase = game.get_current_phase() logger.info( - f"PHASE: {current_phase} (time so far: {phase_start - start_whole:.2f}s)" + f"PHASE | {current_phase} | Starting (elapsed game time: {phase_start - start_whole:.2f}s)" ) - # DEBUG: Print the short phase to confirm - logger.info(f"INFO: The current short phase is '{game.current_short_phase}'") + # Get the current short phase + logger.debug(f"PHASE | Current short phase: '{game.current_short_phase}'") # Prevent unbounded simulation based on year year_str = current_phase[1:5] year_int = int(year_str) if year_int > max_year: - logger.info(f"Reached year {year_int}, stopping the test game early.") + logger.info(f"GAME_END | Reached year limit ({year_int} > {max_year}), terminating game") break # If it's a movement phase (e.g. ends with "M"), conduct negotiations if game.current_short_phase.endswith("M"): - logger.info("Starting negotiation phase block...") + logger.info(f"NEGOTIATIONS | {current_phase} | Starting diplomacy round") conversation_messages = conduct_negotiations( game, game_history, model_error_stats, max_rounds=args.num_negotiation_rounds, ) + logger.debug(f"NEGOTIATIONS | {current_phase} | Completed with {len(conversation_messages)} messages") else: conversation_messages = [] @@ -237,6 +292,8 @@ def main(): for p_name, p_obj in game.powers.items() if not p_obj.is_eliminated() ] + + logger.info(f"ORDERS | {current_phase} | Requesting orders from {len(active_powers)} active powers") with concurrent.futures.ThreadPoolExecutor( max_workers=len(active_powers) @@ -247,7 +304,7 @@ def main(): client = load_model_client(model_id, power_name=power_name) possible_orders = gather_possible_orders(game, power_name) if not possible_orders: - logger.info(f"No orderable locations for {power_name}; skipping.") + logger.info(f"ORDERS | {power_name} | No orderable locations, skipping") continue board_state = game.get_state() @@ -263,24 +320,22 @@ def main(): model_error_stats, ) futures[future] = power_name - logger.debug(f"Submitted get_valid_orders task for {power_name}.") + logger.debug(f"ORDERS | {power_name} | Requested orders from {model_id}") for future in concurrent.futures.as_completed(futures): p_name = futures[future] try: orders = future.result() - logger.debug(f"Validated orders for {p_name}: {orders}") if orders: + logger.debug(f"ORDERS | {p_name} | Received {len(orders)} valid orders") game.set_orders(p_name, orders) - logger.debug( - f"Set orders for {p_name} in {game.current_short_phase}: {orders}" - ) + logger.debug(f"ORDERS | {p_name} | Orders set for {game.current_short_phase}") else: - logger.debug(f"No valid orders returned for {p_name}.") + logger.warning(f"ORDERS | {p_name} | No valid orders returned") except Exception as exc: - logger.error(f"LLM request failed for {p_name}: {exc}") + logger.error(f"ORDERS | {p_name} | Request failed: {str(exc)[:150]}") - logger.info("Processing orders...\n") + logger.info(f"PROCESSING | {current_phase} | Processing orders") # Pass the summary model to the callback via a lambda function phase_data = game.process( phase_summary_callback=lambda sys, usr: my_summary_callback( @@ -307,42 +362,46 @@ def main(): game.order_history[current_phase][power_name], results, ) - logger.info("Phase complete.\n") + logger.info(f"PROCESSING | {current_phase} | Phase completed") # Retrieve and log the summary of the phase summary_text = phase_data.summary or "(No summary found.)" border = "=" * 80 logger.info( - f"{border}\nPHASE SUMMARY for {phase_data.name}:\n{summary_text}\n{border}" + f"SUMMARY | {phase_data.name} | Phase summary: {len(summary_text)} chars" ) + logger.debug(f"SUMMARY | {phase_data.name} | Full text:\n{border}\n{summary_text}\n{border}") # Append the summary to the manifesto file with open(manifesto_path, "a") as f: f.write(f"=== {phase_data.name} ===\n{summary_text}\n\n") + + phase_duration = time.time() - phase_start + logger.debug(f"PHASE | {current_phase} | Completed in {phase_duration:.2f}s") # Increment round counter after processing each phase round_counter += 1 # Save every 5 rounds if round_counter % 5 == 0: - logger.info(f"Saving checkpoint after round {round_counter}...") + logger.info(f"CHECKPOINT | Saving after round {round_counter}") save_game_state(game, result_folder, game_file_path, model_error_stats, args, is_final=False) # Check if we've exceeded the max year year_str = current_phase[1:5] year_int = int(year_str) if year_int > max_year: - logger.info(f"Reached year {year_int}, stopping the test game early.") + logger.info(f"GAME_END | Reached year limit ({year_int} > {max_year}), terminating game") break # Save final result duration = time.time() - start_whole - logger.info(f"Game ended after {duration:.2f}s. Saving final state...") + logger.info(f"GAME_END | Duration: {duration:.2f}s | Saving final state") save_game_state(game, result_folder, game_file_path, model_error_stats, args, is_final=True) - logger.info(f"Saved game data, manifesto, and error stats in: {result_folder}") - logger.info("Done.") + logger.info(f"STORAGE | Game data saved in: {result_folder}") + logger.info("GAME_END | Simulation complete") if __name__ == "__main__":