From 1b042cf6c6d0a26fbda96b578f37d18b65b39b76 Mon Sep 17 00:00:00 2001
From: Oam Patel <oampatel114@gmail.com>
Date: Sun, 16 Feb 2025 01:27:16 +0000
Subject: [PATCH] script + error logging for order decoding

---
 ai_diplomacy/clients.py         |  27 ------
 ai_diplomacy/negotiations.py    |   4 +-
 ai_diplomacy/utils.py           |  28 +++++++
 lm_game.py                      | 141 ++++++++++++++++++++++----------
 randomgame.py => random_game.py |   0
 run.sh                          |   6 ++
 6 files changed, 136 insertions(+), 70 deletions(-)
 rename randomgame.py => random_game.py (100%)
 create mode 100644 run.sh

diff --git a/ai_diplomacy/clients.py b/ai_diplomacy/clients.py
index 4a5b723..803a36a 100644
--- a/ai_diplomacy/clients.py
+++ b/ai_diplomacy/clients.py
@@ -699,33 +699,6 @@ def load_model_client(model_id: str) -> BaseModelClient:
 ##############################################################################
 
 
-def assign_models_to_powers():
-    """
-    Example usage: define which model each power uses.
-    Return a dict: { power_name: model_id, ... }
-    POWERS = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
-    """
-    return {
-        "FRANCE": "gemini-2.0-flash",
-        "GERMANY": "gemini-2.0-flash",
-        "ENGLAND": "gemini-2.0-flash",
-        "RUSSIA": "gemini-2.0-flash",
-        "ITALY": "gemini-2.0-flash",
-        "AUSTRIA": "gemini-2.0-flash",
-        "TURKEY": "gemini-2.0-flash",
-    }
-
-    # return {
-    #     "FRANCE": "o3-mini",
-    #     "GERMANY": "claude-3-5-sonnet-20241022",
-    #     "ENGLAND": "gemini-2.0-flash",
-    #     "RUSSIA": "gemini-2.0-flash-lite-preview-02-05",
-    #     "ITALY": "gpt-4o",
-    #     "AUSTRIA": "gpt-4o-mini",
-    #     "TURKEY": "claude-3-5-haiku-20241022",
-    # }
-
-
 def example_game_loop(game):
     """
     Pseudocode: Integrate with the Diplomacy loop.
diff --git a/ai_diplomacy/negotiations.py b/ai_diplomacy/negotiations.py
index ee24aff..a8e893f 100644
--- a/ai_diplomacy/negotiations.py
+++ b/ai_diplomacy/negotiations.py
@@ -44,6 +44,7 @@ def conduct_negotiations(game, model_error_stats, max_rounds=10):
 
     # Conversation messages are kept in a local list ONLY to build conversation_so_far text.
     conversation_messages = []
+    return conversation_messages
 
     active_powers = [
         p_name for p_name, p_obj in game.powers.items() if not p_obj.is_eliminated()
@@ -128,8 +129,7 @@ def conduct_negotiations(game, model_error_stats, max_rounds=10):
                 except (json.JSONDecodeError, AttributeError) as e:
                     logger.error(f"Failed to parse message from {power_name}: {e}")
                     # Increment conversation parse error
-                    model_id = game.power_model_map.get(power_name, "unknown")
-                    model_error_stats[model_id]["conversation_errors"] += 1
+                    model_error_stats[power_name]["conversation_errors"] += 1
                     continue
 
     logger.info("Negotiation phase complete.")
diff --git a/ai_diplomacy/utils.py b/ai_diplomacy/utils.py
index 38a2371..4b59a55 100644
--- a/ai_diplomacy/utils.py
+++ b/ai_diplomacy/utils.py
@@ -8,6 +8,33 @@ logging.basicConfig(level=logging.INFO)
 load_dotenv()
 
 
+def assign_models_to_powers():
+    """
+    Example usage: define which model each power uses.
+    Return a dict: { power_name: model_id, ... }
+    POWERS = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
+    """
+    return {
+        "FRANCE": "gemini-2.0-flash",
+        "GERMANY": "gemini-2.0-flash",
+        "ENGLAND": "gemini-2.0-flash",
+        "RUSSIA": "gemini-2.0-flash",
+        "ITALY": "gemini-2.0-flash",
+        "AUSTRIA": "gemini-2.0-flash",
+        "TURKEY": "gemini-2.0-flash",
+    }
+
+    # return {
+    #     "FRANCE": "o3-mini",
+    #     "GERMANY": "claude-3-5-sonnet-20241022",
+    #     "ENGLAND": "gemini-2.0-flash",
+    #     "RUSSIA": "gemini-2.0-flash-lite-preview-02-05",
+    #     "ITALY": "gpt-4o",
+    #     "AUSTRIA": "gpt-4o-mini",
+    #     "TURKEY": "claude-3-5-haiku-20241022",
+    # }
+
+
 def gather_possible_orders(game, power_name):
     """
     Returns a dictionary mapping each orderable location to the list of valid orders.
@@ -97,5 +124,6 @@ def get_valid_orders_with_retry(
     logger.warning(
         f"[{power_name}] Exhausted {max_retries} attempts for valid orders, using fallback."
     )
+    model_error_stats[power_name]['order_decoding_errors'] += 1
     fallback = client.fallback_orders(possible_orders)
     return fallback
diff --git a/lm_game.py b/lm_game.py
index 813227c..4830c20 100644
--- a/lm_game.py
+++ b/lm_game.py
@@ -1,11 +1,11 @@
+import argparse
 import logging
 import time
 import dotenv
 import os
 import json
-
-# Additional import for error stats
 from collections import defaultdict
+import concurrent.futures
 
 # Suppress Gemini/PaLM gRPC warnings
 os.environ["GRPC_PYTHON_LOG_LEVEL"] = "40"  # ERROR level only
@@ -13,12 +13,12 @@ os.environ["GRPC_PYTHON_LOG_LEVEL"] = "40"  # ERROR level only
 from diplomacy import Game
 from diplomacy.utils.export import to_saved_game_format
 
-# Added import: we'll create and add standard Diplomacy messages
-
-import concurrent.futures
-
-from ai_diplomacy.clients import load_model_client, assign_models_to_powers
-from ai_diplomacy.utils import get_valid_orders_with_retry, gather_possible_orders
+from ai_diplomacy.clients import load_model_client
+from ai_diplomacy.utils import (
+    get_valid_orders_with_retry,
+    gather_possible_orders,
+    assign_models_to_powers,
+)
 from ai_diplomacy.negotiations import conduct_negotiations
 
 dotenv.load_dotenv()
@@ -31,15 +31,53 @@ logging.basicConfig(
 )
 
 
-def my_summary_callback(system_prompt, user_prompt):
-    # e.g., route to your desired model:
-    client = load_model_client("o3-mini")
+def my_summary_callback(system_prompt, user_prompt, model_name):
+    # Route to the desired model specified by the command-line argument
+    client = load_model_client(model_name)
     combined_prompt = f"{system_prompt}\n\n{user_prompt}"
     # Pseudo-code for generating a response:
     return client.generate_response(combined_prompt)
 
 
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Run a Diplomacy game simulation with configurable parameters."
+    )
+    parser.add_argument(
+        "--max-year",
+        type=int,
+        default=1901,
+        help="Maximum year to simulate. The game will stop once this year is reached.",
+    )
+    parser.add_argument(
+        "--summary-model",
+        type=str,
+        default="o3-mini",
+        help="Model name to use for generating phase summaries.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="",
+        help="Output filename for the final JSON result. If not provided, a timestamped name will be generated.",
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated list of model names to assign to powers in order. "
+            "The order is: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY."
+        ),
+    )
+    return parser.parse_args()
+
+
 def main():
+    args = parse_arguments()
+    max_year = args.max_year
+    summary_model = args.summary_model
+
     logger.info(
         "Starting a new Diplomacy game for testing with multiple LLMs, now concurrent!"
     )
@@ -51,23 +89,44 @@ def main():
 
     # Create a fresh Diplomacy game
     game = Game()
-    # Ensure game has phase_summaries = {}
+    # Ensure game has phase_summaries attribute
     if not hasattr(game, "phase_summaries"):
         game.phase_summaries = {}
 
-    # For storing results in a unique subfolder
+    # Determine the result folder based on a timestamp
     timestamp_str = time.strftime("%Y%m%d_%H%M%S")
     result_folder = f"./results/{timestamp_str}"
-    if not os.path.exists(result_folder):
-        os.makedirs(result_folder)
+    os.makedirs(result_folder, exist_ok=True)
 
-    # Manifesto and game file paths
+    # File paths
     manifesto_path = f"{result_folder}/game_manifesto.txt"
-    game_file_path = f"{result_folder}/lmvsgame.json"
-    stats_file_path = f"{result_folder}/error_stats.json"
+    # Use provided output filename or generate one based on the timestamp
+    game_file_path = (
+        args.output if args.output else f"{result_folder}/lmvsgame.json"
+    )
+    overview_file_path = f"{result_folder}/overview.jsonl"
 
-    game.power_model_map = assign_models_to_powers()
-    max_year = 1910
+    # Handle power model mapping
+    if args.models:
+        # Expected order: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY
+        powers_order = [
+            "AUSTRIA",
+            "ENGLAND",
+            "FRANCE",
+            "GERMANY",
+            "ITALY",
+            "RUSSIA",
+            "TURKEY",
+        ]
+        provided_models = [name.strip() for name in args.models.split(",")]
+        if len(provided_models) != len(powers_order):
+            logger.error(
+                f"Expected {len(powers_order)} models for --power-models but got {len(provided_models)}. Exiting."
+            )
+            return
+        game.power_model_map = dict(zip(powers_order, provided_models))
+    else:
+        game.power_model_map = assign_models_to_powers()
 
     while not game.is_game_done:
         phase_start = time.time()
@@ -79,21 +138,20 @@ def main():
         # DEBUG: Print the short phase to confirm
         logger.info(f"DEBUG: current_short_phase is '{game.current_short_phase}'")
 
-        # Prevent unbounded sim
+        # Prevent unbounded simulation based on year
         year_str = current_phase[1:5]
         year_int = int(year_str)
         if year_int > max_year:
             logger.info(f"Reached year {year_int}, stopping the test game early.")
             break
 
-        # Use endswith("M") for movement phases (like F1901M, S1902M)
+        # If it's a movement phase (e.g. ends with "M"), conduct negotiations
         if game.current_short_phase.endswith("M"):
             logger.info("Starting negotiation phase block...")
             conversation_messages = conduct_negotiations(
                 game, model_error_stats, max_rounds=10
             )
         else:
-            # If we have no conversation_messages in phases that are not Movement (e.g. Retreat/Build)
             conversation_messages = []
 
         conversation_text_for_orders = "\n".join(
@@ -110,7 +168,6 @@ def main():
             if not p_obj.is_eliminated()
         ]
 
-        # Then proceed with concurrent order generation
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=len(active_powers)
         ) as executor:
@@ -124,7 +181,7 @@ def main():
                     continue
                 board_state = game.get_state()
 
-                # Submit a task that includes up to 3 attempts at valid orders
+                # Submit task with up to 3 retries for valid orders
                 future = executor.submit(
                     get_valid_orders_with_retry,
                     game,
@@ -132,7 +189,7 @@ def main():
                     board_state,
                     power_name,
                     possible_orders,
-                    conversation_text_for_orders,  # existing conversation text
+                    conversation_text_for_orders,  # conversation text
                     game.phase_summaries,
                     model_error_stats,
                     3,  # max_retries
@@ -158,23 +215,26 @@ def main():
                     logger.error(f"LLM request failed for {p_name}: {exc}")
 
         logger.info("Processing orders...\n")
-        phase_data = game.process(phase_summary_callback=my_summary_callback)
+        # Pass the summary model to the callback via a lambda function
+        phase_data = game.process(
+            phase_summary_callback=lambda sys, usr: my_summary_callback(
+                sys, usr, summary_model
+            )
+        )
         logger.info("Phase complete.\n")
 
-        # Retrieve the last-processed phase data from the game's history
+        # Retrieve and log the summary of the phase
         summary_text = phase_data.summary or "(No summary found.)"
-
-        # Print in pretty ASCII format
         border = "=" * 80
         logger.info(
             f"{border}\nPHASE SUMMARY for {phase_data.name}:\n{summary_text}\n{border}"
         )
 
-        # Write to unique game_manifesto in the timestamped folder
+        # Append the summary to the manifesto file
         with open(manifesto_path, "a") as f:
             f.write(f"=== {phase_data.name} ===\n{summary_text}\n\n")
 
-        # End-of-loop checks
+        # Check if we've exceeded the max year
         year_str = current_phase[1:5]
         year_int = int(year_str)
         if year_int > max_year:
@@ -185,19 +245,18 @@ def main():
     duration = time.time() - start_whole
     logger.info(f"Game ended after {duration:.2f}s. Saving to final JSON...")
 
-    # Save final result to the unique subfolder
     output_path = game_file_path
-    if not os.path.exists(output_path):
-        to_saved_game_format(game, output_path=output_path)
-    else:
+    # If the file already exists, append a timestamp to the filename
+    if os.path.exists(output_path):
         logger.info("Game file already exists, saving with unique filename.")
         output_path = f"{output_path}_{time.strftime('%Y%m%d_%H%M%S')}.json"
-        to_saved_game_format(game, output_path=output_path)
+    to_saved_game_format(game, output_path=output_path)
 
-    # Dump our error stats to JSON
-
-    with open(stats_file_path, "w") as stats_f:
-        json.dump(model_error_stats, stats_f, indent=2)
+    # Dump error stats and power model mapping to the overview file
+    with open(overview_file_path, "w") as overview_file:
+        overview_file.write(json.dumps(model_error_stats) + "\n")
+        overview_file.write(json.dumps(game.power_model_map) + "\n")
+        overview_file.write(json.dumps(vars(args)) + "\n")
 
     logger.info(f"Saved game data, manifesto, and error stats in: {result_folder}")
     logger.info("Done.")
diff --git a/randomgame.py b/random_game.py
similarity index 100%
rename from randomgame.py
rename to random_game.py
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..0aed3f2
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python lm_game.py \
+    --max-year 1900 \
+    --summary-model "o3-mini" \
+    --models "gemini-2.0-flash, gemini-2.0-flash, gemini-2.0-flash, gemini-2.0-flash, gemini-2.0-flash, gemini-2.0-flash, gemini-2.0-flash"
\ No newline at end of file