Merge pull request #61 from peregrinates/ordersdata

patched for compatibility with new data logging structure
This commit is contained in:
Tyler Marques 2025-09-05 14:24:48 -07:00 committed by GitHub
commit 287d845d4c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 942 additions and 201 deletions

0
analysis/__init__.py Normal file
View file

View file

@ -1,63 +1,94 @@
# analysis_constants.py """Utility functions and constants for loading Diplomacy analysis data.
import os
import json This module provides helpers to read game data stored either as a folder on disk
or inside a zip archive, plus a few constant lists and regex patterns that are
used across the analysis scripts.
"""
from pathlib import Path from pathlib import Path
import pandas as pd from typing import Dict, Union
import json
import zipfile import zipfile
def process_standard_game_inputs(game_data_folder : Path, selected_game : str) -> dict[str, pd.DataFrame]: import pandas as pd
path_to_folder = game_data_folder / selected_game from analysis.schemas import COUNTRIES
from analysis.validation import LMVSGame
assert os.path.exists(path_to_folder / "overview.jsonl"), f"Overview file not found in {path_to_folder}" __all__: list[str] = [
overview = pd.read_json(path_to_folder / "overview.jsonl", lines=True) "process_standard_game_inputs",
"process_game_inputs_in_zip",
"get_country_to_model_mapping",
]
# get all turn actions from lmvs def process_standard_game_inputs(path_to_folder: Path) -> Dict[str, Union[pd.DataFrame, dict]]:
assert os.path.exists(path_to_folder / "lmvsgame.json"), f"LMVS file not found in {path_to_folder}" """
path_to_file = path_to_folder / "lmvsgame.json" Read in a game folder and return the overview, lmvs_data, and all_responses
# Use the standard `json` library to load the file into a Python object Args:
with open(path_to_file, 'r') as f: path_to_folder: Path to the game folder. Must contain overview.jsonl, lmvsgame.json, and llm_responses.csv files.
Returns:
Dictionary containing overview, lmvs_data, and all_responses
"""
# ----- check files exist -----
overview_path = path_to_folder / "overview.jsonl"
lmvsgame_path = path_to_folder / "lmvsgame.json"
llm_resp_path = path_to_folder / "llm_responses.csv"
if not overview_path.exists():
raise FileNotFoundError(str(overview_path))
if overview_path.stat().st_size == 0:
raise FileNotFoundError(f"{overview_path} is empty")
if not lmvsgame_path.exists():
raise FileNotFoundError(str(lmvsgame_path))
if lmvsgame_path.stat().st_size == 0:
raise FileNotFoundError(f"{lmvsgame_path} is empty")
if not llm_resp_path.exists():
raise FileNotFoundError(str(llm_resp_path))
if llm_resp_path.stat().st_size == 0:
raise FileNotFoundError(f"{llm_resp_path} is empty")
# ----- load data -----
overview = pd.read_json(overview_path, lines=True)
with open(lmvsgame_path, "r") as f:
lmvs_data = json.load(f) lmvs_data = json.load(f)
# validate the LMVS data format
LMVSGame.model_validate(
lmvs_data,
)
assert os.path.exists(path_to_folder / "llm_responses.csv"), f"LLM responses file not found in {path_to_folder}" all_responses = pd.read_csv(llm_resp_path)
all_responses = pd.read_csv(path_to_folder / "llm_responses.csv") expected_columns = ['model', 'power', 'phase', 'response_type', 'raw_input', 'raw_response',
'success']
missing_columns = [col for col in expected_columns if col not in all_responses.columns]
assert len(missing_columns) == 0, f"Missing required columns in CSV: {missing_columns}"
return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses} return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses}
def process_game_in_zip(zip_path: Path, selected_game: str) -> dict[str, pd.DataFrame]: def get_country_to_model_mapping(overview_df : pd.DataFrame, llm_responses_df : pd.DataFrame) -> pd.Series:
""" Get a country:model map of which country was played by which model, different in different versions of data"""
country_to_model = overview_df.loc[1].reindex(COUNTRIES)
if pd.isnull(country_to_model).any():
if llm_responses_df is not None:
country_to_model = llm_responses_df.set_index("power")["model"].reindex(COUNTRIES)
return country_to_model
def process_game_inputs_in_zip(zip_path: Path, selected_game: str) -> Dict[str, Union[pd.DataFrame, dict]]:
"""
Read in a game folder and return the overview, lmvs_data, and all_responses
Args:
zip_path: Path to the zip file
selected_game: Name of the game to extract
Returns:
Dictionary containing overview, lmvs_data, and all_responses
"""
zip_name = zip_path.stem # Gets filename without extension zip_name = zip_path.stem # Gets filename without extension
with zipfile.ZipFile(zip_path, 'r') as zip_ref: with zipfile.ZipFile(zip_path, 'r') as zip_ref:
overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True) overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True)
lmvs_data = json.load(zip_ref.open(f"{zip_name}/{selected_game}/lmvsgame.json")) lmvs_data = json.load(zip_ref.open(f"{zip_name}/{selected_game}/lmvsgame.json"))
all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv")) all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv"))
return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses} return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses}
supply_centers = [
"ANK", "ARM", "BEL", "BER", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
"HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "PAR", "POR",
"ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
"VEN", "VIE", "WAR",
"SPA", "STP", "BUL" # coastal provinces
]
coastal_scs = ["SPA/SC", "SPA/NC",
"STP/SC", "STP/NC", 'BUL/EC',
'BUL/SC',]
COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
place_identifier = "[A-Z]{3}(?:/[A-Z]{2})?"
place_capturing_regex = r"([A-Z]{3})"
unit_identifier = rf"[AF] {place_identifier}"
unit_move = rf"{unit_identifier} . {place_identifier}"
possible_commands = {
"Move": f"^"+unit_move, # distinguishing this from support
"Support Move": f"{unit_identifier} S {unit_move}",
"Support Hold": fr"{unit_identifier} S {unit_identifier}(?!\s+[.\-]\s+{place_identifier})",
"Convoy": f"F {place_identifier} C {unit_move}", # No convoys in here?
"Hold": f"{unit_identifier} H",
"Build": f"{unit_identifier} B",
"Disband": f"{unit_identifier} D",
"Retreat": f"{unit_identifier} R",
}

View file

@ -19,7 +19,7 @@ python analysis/make_all_analysis_data.py --selected_game game1 --game_data_fold
python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis" python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis"
""" """
import argparse import argparse
import os
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
@ -27,49 +27,68 @@ from tqdm import tqdm
from analysis.p1_make_longform_orders_data import make_longform_order_data from analysis.p1_make_longform_orders_data import make_longform_order_data
from analysis.p2_make_convo_data import make_conversation_data from analysis.p2_make_convo_data import make_conversation_data
from analysis.p3_make_phase_data import make_phase_data from analysis.p3_make_phase_data import make_phase_data
from analysis.analysis_helpers import process_standard_game_inputs, process_game_in_zip from analysis.analysis_helpers import get_country_to_model_mapping, process_standard_game_inputs, process_game_inputs_in_zip
from analysis.schemas import COUNTRIES
from typing import Dict from typing import Dict
def process_game_data_from_folders(game_name : str, game_path : Path) -> Dict[str, pd.DataFrame]:
def process_game_data_from_folders(game_path : Path) -> Dict[str, pd.DataFrame]:
"""Reads log data from folder and makes analytic data sets""" """Reads log data from folder and makes analytic data sets"""
game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name) game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(path_to_folder=game_path)
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"], country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
orders_data : pd.DataFrame = make_longform_order_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"], lmvs_data=game_data["lmvs_data"],
all_responses=game_data["all_responses"]) all_responses=game_data["all_responses"])
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"]) conversations_data : pd.DataFrame = make_conversation_data(country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"])
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"], phase_data : pd.DataFrame = make_phase_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"], lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data, conversations_data=conversations_data,
orders_data=orders_data) orders_data=orders_data)
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data} return {"orders_data": orders_data,
"conversations_data": conversations_data,
"phase_data": phase_data}
def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]: def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]:
"""Reads log data from zip and makes analytic data sets""" """Reads log data from zip and makes analytic data sets"""
game_data : dict[str, pd.DataFrame] = process_game_in_zip(zip_path=zip_path, selected_game=game_name) game_data: dict[str, pd.DataFrame] = process_game_inputs_in_zip(zip_path=zip_path, selected_game=game_name)
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"], country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
orders_data: pd.DataFrame = make_longform_order_data(
country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"], lmvs_data=game_data["lmvs_data"],
all_responses=game_data["all_responses"]) all_responses=game_data["all_responses"],
)
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"]) conversations_data: pd.DataFrame = make_conversation_data(
country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"]
)
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"], phase_data: pd.DataFrame = make_phase_data(
country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"], lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data, conversations_data=conversations_data,
orders_data=orders_data) orders_data=orders_data,
)
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data} return {
"orders_data": orders_data,
"conversations_data": conversations_data,
"phase_data": phase_data,
}
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.") parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.")
parser.add_argument( parser.add_argument(
"--selected_game", "--selected_game",
type=str, type=str,
@ -98,16 +117,18 @@ if __name__ == "__main__":
games_to_process = args.selected_game games_to_process = args.selected_game
if not games_to_process: if not games_to_process:
games_to_process = os.listdir(args_dict["game_data_folder"]) games_to_process = [p.name for p in args_dict["game_data_folder"].iterdir() if p.is_dir()]
for game in tqdm(games_to_process, desc="Processing games"): for game in tqdm(games_to_process, desc="Processing games"):
game_path = args_dict["game_data_folder"] / game game_path = args_dict["game_data_folder"] / game
if not game_path.is_dir(): if not game_path.is_dir():
continue continue
try: try:
results = process_game_data_from_folders(game_name=game, game_path=args_dict["game_data_folder"]) results = process_game_data_from_folders(game_path=game_path)
for data_set, df in results.items(): for data_set, df in results.items():
output_path = args_dict["analysis_folder"] / data_set / f"{game}_{data_set}.csv" output_dir = args_dict["analysis_folder"] / data_set
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{game}_{data_set}.csv"
df.to_csv(output_path, index=False) df.to_csv(output_path, index=False)
except Exception as e: except Exception as e:
print(f"Error processing game {game}: {e}") print(f"Error processing game {game}: {e}")

View file

@ -53,24 +53,33 @@ Return / save
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os
import copy import copy
import re import re
import argparse import argparse
import warnings import warnings
from pathlib import Path from pathlib import Path
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES, supply_centers, coastal_scs, place_identifier, unit_identifier, unit_move, possible_commands from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
from analysis.schemas import COUNTRIES, ALL_SUPPLY_CENTERS, COASTAL_SCs, PLACE_IDENTIFIER, UNIT_IDENTIFIER, UNIT_MOVE, POSSIBLE_COMMANDS
from tqdm import tqdm from tqdm import tqdm
import traceback
from typing import List, Optional, Dict
# Suppress pandas warnings # Suppress pandas warnings
warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings') warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings')
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning) warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame: def make_longform_order_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
try: """
country_to_model = overview.loc[1, COUNTRIES] # map countries to models Makes a dataframe with a row for each order given by every power, in every phase (see module docstring for more details).
except:
country_to_model = {country: "not specified in overview.jsonl" for country in COUNTRIES} Args:
country_to_model: A Series mapping country names to model names
lmvs_data: A DataFrame containing the game data
all_responses: A DataFrame containing the responses from the LLM responses csv
Returns:
A DataFrame with a row for each order given by every power, in every phase
"""
################## PART 1 ################## ################## PART 1 ##################
# build `turn_actions` dataframe # build `turn_actions` dataframe
@ -120,22 +129,22 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
# categorize each order based on regex # categorize each order based on regex
# note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands # note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands
for possible_command, regex in possible_commands.items(): for possible_command, regex in POSSIBLE_COMMANDS.items():
all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command
all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({place_identifier})") all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({PLACE_IDENTIFIER})")
all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(supply_centers) | all_orders_ever["unit_location"].isin(coastal_scs) all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["unit_location"].isin(COASTAL_SCs)
# only MOVE has a destination # only MOVE has a destination
all_orders_ever["destination"] = np.where( all_orders_ever["destination"] = np.where(
all_orders_ever["command"]=="Move", all_orders_ever["command"]=="Move",
all_orders_ever["order"].str.extract(rf"{unit_identifier} . ({place_identifier})", expand=False), all_orders_ever["order"].str.extract(rf"{UNIT_IDENTIFIER} . ({PLACE_IDENTIFIER})", expand=False),
np.nan np.nan
) )
all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(supply_centers) | all_orders_ever["destination"].isin(coastal_scs) all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["destination"].isin(COASTAL_SCs)
# Retreat also has a destination # Retreat also has a destination
all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{unit_identifier} R ({place_identifier})", expand=False) all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{UNIT_IDENTIFIER} R ({PLACE_IDENTIFIER})", expand=False)
all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)") all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)")
all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS") all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS")
@ -146,7 +155,17 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["model_short_name"] = all_orders_ever["model"].str.split("/").str[-1] all_orders_ever["model_short_name"] = all_orders_ever["model"].str.split("/").str[-1]
all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")" all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")"
def check_location_influence(phase_id, location): def check_location_influence(phase_id : str, location : str) -> str:
"""
Helper - checks who owns a location at a given phase. Uses the `turn_actions` dataframe from overall context.
Args:
phase_id: The phase to check
location: The location to check
Returns:
The country that owns the location, or "Unowned" if no country owns it
"""
# checking who owns a location at `phase_id` # checking who owns a location at `phase_id`
if pd.isnull(location): if pd.isnull(location):
return np.nan return np.nan
@ -162,22 +181,45 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"], all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"],
row["destination"]), axis=1) row["destination"]), axis=1)
def find_supporting_country(unit_command, command_type, phase): def find_supporting_country(unit_command, command_type, phase) -> Optional[str]:
"""
Helper - finds which orders support a given unit and records the supporting powers. Operating on the `all_orders_ever` dataframe.
Args:
unit_command: The unit command to find supporting orders for
command_type: The type of command ("Move" or "Hold")
phase: The phase to check
Returns:
A string containing a comma-separated list of countries that issued an order to support that unit, or None if no such orders exist
"""
if command_type == "Move" or command_type == "Hold": # commands that can be supported if command_type == "Move" or command_type == "Hold": # commands that can be supported
potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) & potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) &
(all_orders_ever["command"].isin(["Support Move", "Support Hold"]))] (all_orders_ever["command"].isin(["Support Move", "Support Hold"]))]
potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)] potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)]
if potential_supports.empty: if potential_supports.empty:
return np.nan return None
else: else:
return ",".join(potential_supports["country"].tolist()) return ",".join(potential_supports["country"].tolist())
return np.nan return None
all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1) all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1)
all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned") all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned")
all_orders_ever["moving_into_anothers_territory"] = (all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) & (all_orders_ever["destination_affiliation"].notnull()) & (all_orders_ever["destination_affiliation"] != "Unowned") all_orders_ever["moving_into_anothers_territory"] = ((all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) &
(all_orders_ever["destination_affiliation"].notnull()) &
(all_orders_ever["destination_affiliation"] != "Unowned"))
def find_owner_of_unit(unit_location, phase): def find_owner_of_unit(unit_location : str, phase : str) -> Optional[str]:
"""
Helper - finds the owner of a unit at a given phase. Operating on the `turn_actions` dataframe from overall context.
Args:
unit_location: The location of the unit to find the owner of
phase: The phase to check
Returns:
The country that owns the unit, or None if no country owns it
"""
if pd.notnull(unit_location): if pd.notnull(unit_location):
unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase] unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase]
unit_status.index = unit_status.index.str.replace("_units", "") unit_status.index = unit_status.index.str.replace("_units", "")
@ -186,18 +228,30 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
if re.match(f"[AF] {unit_location}", unit): if re.match(f"[AF] {unit_location}", unit):
return country return country
# where were they going? what was their destination like? def find_destination_info(destination, phase) -> Optional[Dict[str, Optional[str]]]:
def find_destination_info(destination, phase): """
Helper - finds information about the destination of a unit at a given phase.
Operating on the `all_orders_ever` dataframe from overall context.
Args:
destination: The location of the unit to find the owner of
phase: The phase to check
Returns:
A dictionary containing information about the destination unit, or None if no such unit exists
"""
if pd.notnull(destination): if pd.notnull(destination):
country = find_owner_of_unit(destination, phase) country = find_owner_of_unit(destination, phase)
# there should only ever be one unit at a given location during a phase
destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) & destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) &
(all_orders_ever["phase"] == phase) & (all_orders_ever["phase"] == phase) &
(all_orders_ever["unit_location"] == destination)] (all_orders_ever["unit_location"] == destination)]
if not destination_unit_orders.empty: if not destination_unit_orders.empty:
destination_unit_orders = destination_unit_orders.iloc[0] # safe conversion to a series
return {"destination_unit_owner": country, return {"destination_unit_owner": country,
"destination_unit_order": destination_unit_orders["command"].squeeze(), "destination_unit_order": destination_unit_orders["command"],
"destination_unit_outcome":destination_unit_orders["immediate_result"].squeeze(), "destination_unit_outcome":destination_unit_orders["immediate_result"],
"destination_unit_supported_by": destination_unit_orders["supported_by"].squeeze()} "destination_unit_supported_by": destination_unit_orders["supported_by"]}
destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series) destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series)
destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull() destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull()
@ -205,30 +259,56 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1) all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1)
# if a Support action: who were they supporting? what was their support doing? # if a Support action: who were they supporting? what was their support doing?
def find_support_recipient_info(unit_order, command, phase): def find_support_recipient_info(unit_order, command, phase) -> Optional[Dict[str, Optional[str]]]:
"""
Helper - finds information about the recipient of a support action at a given phase.
Operating on the `all_orders_ever` dataframe from overall context.
Args:
unit_order: The order of the unit to find the recipient of support for
command: The type of command ("Support Move" or "Support Hold")
phase: The phase to check
Returns:
A dictionary containing information about the recipient of support, or None if no such recipient exists
"""
if "Support" in command: if "Support" in command:
recipient_location = re.match(rf"{unit_identifier} S [AF] ({place_identifier})", unit_order).group(1) recipient_location = re.match(rf"{UNIT_IDENTIFIER} S [AF] ({PLACE_IDENTIFIER})", unit_order).group(1)
recipient_country = find_owner_of_unit(recipient_location, phase) recipient_country = find_owner_of_unit(recipient_location, phase)
# there should only ever be one unit at a given location during a phase
recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) & recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) &
(all_orders_ever["phase"] == phase) & (all_orders_ever["phase"] == phase) &
(all_orders_ever["unit_location"] == recipient_location)] (all_orders_ever["unit_location"] == recipient_location)].iloc[0]
return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"].squeeze(), return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"],
"recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"].squeeze(), "recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"],
"recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"].squeeze(), "recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"],
"recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"].squeeze()} "recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"]}
support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"], row["command"], row["phase"]), axis=1).apply(pd.Series) support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"],
row["command"],
row["phase"]), axis=1).apply(pd.Series)
# add support recipient info to all_orders_ever as additional columns
all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1) all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1)
# add relationships with other countries # add relationships with other countries
# if original v1
agent_relationship_matrix_over_time = {} agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]: for phase in lmvs_data["phases"]:
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {})) agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"]) longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
if longform_relationships.empty: if longform_relationships.empty:
print("Warning: no relationship data found in phase data") # Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
else: agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
country_relationships = {}
for c in COUNTRIES:
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
longform_relationships.columns = longform_relationships.columns.str.lower() longform_relationships.columns = longform_relationships.columns.str.lower()
longform_relationships[['austria', 'england', 'france', 'germany', 'italy', longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy', 'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
@ -245,14 +325,24 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view, all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view,
left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"]) left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
# if action was supporting # if action was supporting, add flags
all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"] all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"]
all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull()) all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull())
def countries_aside_from(a_country): def countries_aside_from(a_country : str) -> List[str]:
return [country for country in all_orders_ever["country"].unique() if country != a_country] return [country for country in all_orders_ever["country"].unique() if country != a_country]
def check_country(supporters, country): def check_country(supporters : List[str], country : str) -> bool:
"""
Helper - checks if a given country is in a list of supporters
Args:
supporters: The list of supporters to check
country: The country to check
Returns:
True if the country is in the list of supporters, False otherwise
"""
if pd.isnull(supporters): if pd.isnull(supporters):
return False return False
for other_countries in countries_aside_from(country): for other_countries in countries_aside_from(country):
@ -267,7 +357,7 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull() all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull()
# add number of unit orders ever made # add number of unit orders ever made during this game
unit_order_weight = 1 / all_orders_ever.groupby("country").size() unit_order_weight = 1 / all_orders_ever.groupby("country").size()
all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight) all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight)
@ -317,25 +407,24 @@ if __name__ == "__main__":
current_game_data_folder = Path(args.game_data_folder) current_game_data_folder = Path(args.game_data_folder)
analysis_folder = Path(args.analysis_folder) / "orders_data" analysis_folder = Path(args.analysis_folder) / "orders_data"
if not os.path.exists(analysis_folder): if not analysis_folder.exists():
print(f"Output folder {analysis_folder} not found, creating it.") print(f"Output folder {analysis_folder} not found, creating it.")
os.makedirs(analysis_folder) analysis_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game games_to_process = args.selected_game
if not games_to_process: if not games_to_process:
games_to_process = os.listdir(current_game_data_folder) games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process, desc="Processing games"): for game_name in tqdm(games_to_process, desc="Processing games"):
if game_name == ".DS_Store":
continue
game_path = current_game_data_folder / game_name game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path): if not game_path.is_dir():
continue continue
try: try:
game_source_data = process_standard_game_inputs(game_path, game_name) game_source_data = process_standard_game_inputs(game_path)
data = make_longform_order_data(overview=game_source_data["overview"], overview_df = game_source_data["overview"]
country_to_model = get_country_to_model_mapping(overview_df, game_source_data["all_responses"])
data = make_longform_order_data(country_to_model=country_to_model,
lmvs_data=game_source_data["lmvs_data"], lmvs_data=game_source_data["lmvs_data"],
all_responses=game_source_data["all_responses"]) all_responses=game_source_data["all_responses"])
output_path = analysis_folder / f"{game_name}_orders_data.csv" output_path = analysis_folder / f"{game_name}_orders_data.csv"
@ -344,3 +433,4 @@ if __name__ == "__main__":
print(f"Could not process {game_name}. Missing file: {e.filename}") print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e: except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}") print(f"An unexpected error occurred while processing {game_name}: {e}")
traceback.print_exc()

View file

@ -1,5 +1,5 @@
""" """
Make conversation data from diplomacy game logs. Make conversation data from diplomacy game logs, for convenience in analyzing conversationd data alone.
Resulting columns: Resulting columns:
['phase', ['phase',
@ -21,14 +21,23 @@ Resulting columns:
import pandas as pd import pandas as pd
import itertools import itertools
import argparse import argparse
import os
from tqdm import tqdm from tqdm import tqdm
from pathlib import Path from pathlib import Path
from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs, get_country_to_model_mapping
import traceback
def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) -> pd.DataFrame: def make_conversation_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame) -> pd.DataFrame:
country_to_model = overview.loc[1, COUNTRIES] """
Make conversation data from diplomacy game logs.
Args:
country_to_model: A Series mapping country names to model names
lmvs_data: A DataFrame containing the game data
Returns:
A DataFrame containing the conversation data (a row for every conversation between 2 powers, at every phase)
"""
COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2)) COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2))
# relationship data # relationship data
@ -67,20 +76,12 @@ def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) ->
messages_from_sender = (messages_exchanged['sender']==sender).sum() messages_from_sender = (messages_exchanged['sender']==sender).sum()
sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0 sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0
messages_from_recipient = (messages_exchanged['recipient']==sender).sum() messages_from_recipient = (messages_exchanged['sender'] == recipient).sum()
recipient_streak = max_consecutive[recipient] if recipient in max_consecutive.index else 0 recipient_streak = max_consecutive.get(recipient, 0)
party_1_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) & party_1_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == sender)].reindex(columns=[recipient])
(longform_relationships["agent"]==sender)][recipient] party_1_opinion = party_1_opinion_series.iloc[0,0] if not party_1_opinion_series.empty else ""
if party_1_opinion.empty: party_2_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == recipient)].reindex(columns=[sender])
party_1_opinion = "" party_2_opinion = party_2_opinion_series.iloc[0,0] if not party_2_opinion_series.empty else ""
else:
party_1_opinion = party_1_opinion.squeeze()
party_2_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) &
(longform_relationships["agent"]==recipient)][sender]
if party_2_opinion.empty:
party_2_opinion = ""
else:
party_2_opinion = party_2_opinion.squeeze()
conversation_data = { conversation_data = {
"party_1": sender, "party_1": sender,
@ -132,29 +133,30 @@ if __name__ == "__main__":
current_game_data_folder = Path(args.game_data_folder) current_game_data_folder = Path(args.game_data_folder)
analysis_folder = Path(args.analysis_folder) / "conversations_data" analysis_folder = Path(args.analysis_folder) / "conversations_data"
if not os.path.exists(analysis_folder): if not analysis_folder.exists():
print(f"Output folder {analysis_folder} not found, creating it.") print(f"Output folder {analysis_folder} not found, creating it.")
os.makedirs(analysis_folder) analysis_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game games_to_process = args.selected_game
if not games_to_process: if not games_to_process:
games_to_process = os.listdir(current_game_data_folder) games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process):
if game_name == ".DS_Store":
continue
for game_name in tqdm(games_to_process, desc="Processing games"):
game_path = current_game_data_folder / game_name game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path): if not game_path.is_dir():
continue continue
try: try:
game_data = process_standard_game_inputs(game_data_folder=game_path, game_data = process_standard_game_inputs(game_path)
selected_game=game_name) overview_df = game_data["overview"]
data = make_conversation_data(overview=game_data["overview"], country_to_model = get_country_to_model_mapping(overview_df, game_data["all_responses"])
data = make_conversation_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"]) lmvs_data=game_data["lmvs_data"])
output_path = analysis_folder / f"{game_name}_conversations_data.csv" output_path = analysis_folder / f"{game_name}_conversations_data.csv"
data.to_csv(output_path, index=False) data.to_csv(output_path, index=False)
except FileNotFoundError as e:
print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e: except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}") print(f"An unexpected error occurred while processing {game_name}: {e}")
print(f"Skipping {game_name}.") print(f"Skipping {game_name}.")
traceback.print_exc()

View file

@ -74,24 +74,39 @@ etc (a lot of possible combinations here)
'invalid_order_count', (number of invalid orders given) 'invalid_order_count', (number of invalid orders given)
'no_moves_extracted_flag', (flag for if no moves were extracted) 'no_moves_extracted_flag', (flag for if no moves were extracted)
'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted ) 'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted )
'goals', (list of goals for the phase separated by \n\n)
'diary', (diary entry for the phase)
""" """
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os
import json import json
import copy import copy
import re import re
import argparse import argparse
from pathlib import Path from pathlib import Path
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
from analysis.schemas import COUNTRIES
from tqdm import tqdm from tqdm import tqdm
import traceback
def make_phase_data(overview : pd.DataFrame, def make_phase_data(country_to_model : pd.Series,
lmvs_data : pd.DataFrame, lmvs_data : pd.DataFrame,
conversations_data : pd.DataFrame, conversations_data : pd.DataFrame,
orders_data : pd.DataFrame) -> pd.DataFrame: orders_data : pd.DataFrame) -> pd.DataFrame:
country_to_model = overview.loc[1, COUNTRIES] """
takes country-to-model mapping, game state (lmvs_data), conversations, and orders, and returns a dataframe with one row per (power, phase).
Args:
country_to_model: mapping of country to model
lmvs_data: raw lmvs_data dataframe
conversations_data: dataframe of conversations
orders_data: dataframe of orders
Returns:
dataframe with one row per (power, phase) containing phase-level features, convos, relationships, and orders info.
"""
longform_conversations_complete = [] longform_conversations_complete = []
for c in COUNTRIES: for c in COUNTRIES:
@ -109,12 +124,27 @@ def make_phase_data(overview : pd.DataFrame,
############ Relationships ############# ############ Relationships #############
agent_relationship_matrix_over_time = {} agent_relationship_matrix_over_time = {}
state_list = {}
for phase in lmvs_data["phases"]: for phase in lmvs_data["phases"]:
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {})) agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"]) longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
if longform_relationships.empty:
# Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
country_relationships = {}
for c in COUNTRIES:
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
longform_relationships.columns = longform_relationships.columns.str.lower()
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']].fillna("Self")
longform_relationships = longform_relationships.add_prefix("relationship_")
########### ORDERS DATA ########### ########### ORDERS DATA ###########
# adding results to lmvs # adding results to lmvs
@ -200,7 +230,7 @@ def make_phase_data(overview : pd.DataFrame,
# lost a supply center # lost a supply center
# territories held, territories moved to? # territories held, territories moved to
orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"), orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"),
immediate_outcomes.unstack().add_prefix("count_got_"), immediate_outcomes.unstack().add_prefix("count_got_"),
@ -240,10 +270,33 @@ def make_phase_data(overview : pd.DataFrame,
state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders")) state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders"))
state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1) state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1)
# goals and diaries
goals_over_time = {}
diary_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
if agent_state: # Not all versions have this
country_goals = {}
country_diary = {}
for c in COUNTRIES:
country_goals[c] = "\n\n".join(agent_state.get(c, {}).get("goals", {}))
country_diary[c] = "\n\n".join(agent_state.get(c, {}).get("full_private_diary", []))
goals_over_time[phase["name"]] = pd.Series(country_goals)
diary_over_time[phase["name"]] = pd.Series(country_diary)
state_list = pd.concat(state_list, axis=0) state_list = pd.concat(state_list, axis=0)
state_list.index.names = ["phase", "agent"] state_list.index.names = ["phase", "agent"]
if goals_over_time:
goals_over_time = pd.DataFrame(goals_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"goal"}).set_index(["phase", "agent"])
state_list = pd.concat([state_list, goals_over_time], axis=1)
if diary_over_time:
diary_over_time = pd.DataFrame(diary_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"diary"}).set_index(["phase", "agent"])
state_list = pd.concat([state_list, diary_over_time], axis=1)
longform_relationships = longform_relationships.set_index(["relationship_phase", "relationship_agent"])
longform_relationships.index.names = ["phase", "agent"]
full_phase_data = pd.merge(state_list, full_phase_data = pd.merge(state_list,
longform_relationships.set_index(["phase", "agent"]).add_prefix("relationship_to_").fillna("Self"), longform_relationships,
left_index=True, right_index=True).reset_index() left_index=True, right_index=True).reset_index()
full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x)) full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x))
full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x)) full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x))
@ -262,7 +315,7 @@ def make_phase_data(overview : pd.DataFrame,
"influence_count"]].diff() "influence_count"]].diff()
full_phase_data = pd.merge(full_phase_data, longform_conversations_complete, full_phase_data = pd.merge(full_phase_data, longform_conversations_complete,
left_on=["phase", "agent"], right_on=["phase", "power"]).drop(columns=["agent"]) left_on=["phase", "agent"], right_on=["phase", "power"])
full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"], full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"],
right_index=True) right_index=True)
full_phase_data["model"] = full_phase_data["power"].map(country_to_model) full_phase_data["model"] = full_phase_data["power"].map(country_to_model)
@ -301,32 +354,39 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
current_game_data_folder = Path(args.game_data_folder) current_game_data_folder = Path(args.game_data_folder)
analysis_folder = args.analysis_folder analysis_folder = Path(args.analysis_folder)
output_folder = Path(analysis_folder) / "phase_data" output_folder = analysis_folder / "phase_data"
if not os.path.exists(output_folder): if not output_folder.exists():
print(f"Output folder {output_folder} not found, creating it.") print(f"Output folder {output_folder} not found, creating it.")
os.makedirs(output_folder) output_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game games_to_process = args.selected_game
if not games_to_process: if not games_to_process:
games_to_process = os.listdir(current_game_data_folder) games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process): for game_name in tqdm(games_to_process):
if game_name == ".DS_Store": if game_name == ".DS_Store":
continue continue
game_path = current_game_data_folder / game_name game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path): if not game_path.is_dir():
continue continue
#try: try:
game_data = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name) game_data = process_standard_game_inputs(game_path)
orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv") orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv") conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
data = make_phase_data(overview=game_data["overview"], country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
data = make_phase_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"], lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data, conversations_data=conversations_data,
orders_data=orders_data) orders_data=orders_data)
output_path = output_folder / f"{game_name}_phase_data.csv" output_path = output_folder / f"{game_name}_phase_data.csv"
data.to_csv(output_path, index=False) data.to_csv(output_path, index=False)
except FileNotFoundError as e:
print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}")
print(f"Skipping {game_name}.")
traceback.print_exc()

143
analysis/readme.md Normal file
View file

@ -0,0 +1,143 @@
# Analysis Pipeline
This folder contains the data processing pipeline for converting raw diplomacy game logs into structured analysis datasets.
## Overview
This module contains pipelines that transform raw game log data (stored as json/csv files) into four analytical datasets:
1. **Orders Data** - one row per order given by each power in each phase
2. **Conversations Data** - one row per conversation between two powers in each phase
3. **Phase Data** - one row per power per phase with aggregated state and action summaries
4. **Game Data** - Summary of overall game features
## Main entry point
### `make_all_analysis_data.py` - Primary orchestrator
**main use case**: process all games in a data folder, create corresponding orders, conversations, and phase datasets. Supports batch and individual processing.
```bash
# process all games in a folder
python analysis/make_all_analysis_data.py \
--game_data_folder "/path/to/Game Data" \
--output_folder "/path/to/Game Data - Analysis"
# process specific games
python analysis/make_all_analysis_data.py \
--selected_game game1 game2 \
--game_data_folder "/path/to/Game Data" \
--output_folder "/path/to/Game Data - Analysis"
```
This script runs the p1, p2, and p3 analysis scripts in sequence and saves their outputs to organized subfolders.
### Individual analysis scripts
#### `p1_make_longform_orders_data.py`
**what it does**: creates detailed order-level data with one row per order given
**key outputs**:
- order classification (move, support, hold, etc.)
- unit locations and destinations
- support relationships and outcomes
- relationship matrices between powers
- llm reasoning for order generation
```bash
python analysis/p1_make_longform_orders_data.py \
--game_data_folder "/path/to/Game Data" \
--analysis_folder "/path/to/output"
```
#### `p2_make_convo_data.py`
**what it does**: extracts conversation data between all pairs of powers
**key outputs**:
- message counts and streaks per party
- conversation transcripts
- relationship context for each conversation
```bash
python analysis/p2_make_convo_data.py \
--game_data_folder "/path/to/Game Data" \
--analysis_folder "/path/to/output"
```
#### `p3_make_phase_data.py`
**what it does**: creates power-phase level summaries combining state, actions, and conversations
**key outputs**:
- current state (units, centers, influence counts)
- action summaries (command counts, outcomes)
- conversation transcripts with each power
- change metrics between phases
- llm reasoning and diary entries
```bash
python analysis/p3_make_phase_data.py \
--game_data_folder "/path/to/Game Data" \
--analysis_folder "/path/to/output"
```
#### `statistical_game_analysis.py`
**what it does**: comprehensive statistical analysis of game results and llm performance
**key outputs**:
- game-level aggregated metrics and features
- response success/failure rates by type
- relationship dynamics and negotiation patterns
- phase-level analysis with response-type granularity
- comprehensive failure tracking and validation
```bash
# analyze single game folder
python analysis/statistical_game_analysis.py /path/to/game_folder
# batch analyze multiple games
python analysis/statistical_game_analysis.py /path/to/parent_folder --multiple
# specify output directory
python analysis/statistical_game_analysis.py /path/to/game_folder --output /path/to/output
```
**note**: this is a separate analysis tool that operates independently of the main pipeline
## supporting modules
### `analysis_helpers.py`
utility functions for:
- loading game data from folders or zip files
- mapping countries to their llm models
- standardizing data loading across scripts
### `schemas.py`
constants and regex patterns:
- supply center lists and coastal variants
- country names
- order parsing regexes
- phase naming patterns
## expected input data structure
each game folder should contain:
- `overview.jsonl` - maps countries to llm models
- `lmvsgame.json` - full turn-by-turn game state and actions
- `llm_responses.csv` - all llm prompts and responses
## output structure
the pipeline creates organized subfolders:
output_folder/
├── orders_data/
│   └── {game_name}_orders_data.csv
├── conversations_data/
│   └── {game_name}_conversations_data.csv
└── phase_data/
    └── {game_name}_phase_data.csv
## Use cases
- **game analysis**: examine specific games in detail
- **model comparison**: compare llm performance across games
- **relationship analysis**: study diplomatic dynamics
- **order validation**: check llm order generation success rates
- **conversation analysis**: study negotiation patterns
- **phase progression**: track game state evolution

View file

@ -0,0 +1,19 @@
# Analysis Pipeline Requirements
# External packages needed for the analysis scripts
pandas>=1.5.0
numpy>=1.21.0
tqdm>=4.64.0
pydantic>=2.0.0
# Standard library modules (included with Python):
# - pathlib
# - typing
# - json
# - zipfile
# - copy
# - re
# - argparse
# - warnings
# - traceback
# - itertools

57
analysis/schemas.py Normal file
View file

@ -0,0 +1,57 @@
""" separate module for constants"""
import re
__all__ = ["ALL_PROVINCES", "ALL_SUPPLY_CENTERS", "COASTAL_SCs", "COUNTRIES", "PHASE_REGEX", "PLACE_IDENTIFIER", "UNIT_IDENTIFIER", "UNIT_MOVE", "POSSIBLE_COMMANDS", "POSSIBLE_COMMAND_RESULTS", "COUNTRY_RE", "PHASE_RE", "UNIT_RE", "PLACE_RE", "COMMAND_PATTERNS", "ALLOWED_RESULTS", "ALLOWED_COUNTRIES"]
ALL_PROVINCES = ['BRE', 'PAR', 'MAR', 'PIC', 'BUR', 'GAS', 'SPA', 'POR', 'NAF',
'TUN', 'LON', 'WAL', 'LVP', 'YOR', 'EDI', 'CLY', 'NWY', 'SWE',
'DEN', 'FIN', 'STP', 'STP/NC', 'STP/SC', 'MOS', 'SEV', 'UKR',
'WAR', 'LVN', 'BER', 'PRU', 'SIL', 'MUN', 'RUH', 'KIE', 'HOL',
'BEL', 'VIE', 'BOH', 'GAL', 'TYR', 'TRI', 'BUD', 'SER', 'RUM',
'BUL', 'BUL/EC', 'BUL/SC', 'GRE', 'ALB', 'CON', 'ANK', 'SMY',
'ARM', 'SYR', 'VEN', 'PIE', 'TUS', 'ROM', 'NAP', 'APU', 'NTH',
'ENG', 'IRI', 'MAO', 'WES', 'LYO', 'TYS', 'ION', 'ADR', 'AEG',
'EAS', 'BLA', 'BAL', 'BOT', 'SKA', 'BAR', 'NWG', 'NAO']
ALL_SUPPLY_CENTERS = [
"ANK", "ARM", "BEL", "BER", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
"HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "PAR", "POR",
"ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
"VEN", "VIE", "WAR",
"SPA", "STP", # coastal provinces
]
COASTAL_SCs = ["SPA/SC", "SPA/NC",
"STP/SC", "STP/NC", 'BUL/EC',
'BUL/SC',]
COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
PHASE_REGEX = r"^[A-Z]\d{4}[MRA]$"
PLACE_IDENTIFIER = r"[A-Z]{3}(?:/[A-Z]{2})?"
PLACE_CAPTURING_REGEX = r"([A-Z]{3})"
UNIT_IDENTIFIER = rf"[AF] {PLACE_IDENTIFIER}"
UNIT_MOVE = rf"{UNIT_IDENTIFIER} . {PLACE_IDENTIFIER}"
POSSIBLE_COMMANDS = {
"Move": f"^"+UNIT_MOVE, # distinguishing this from support
"Support Move": f"{UNIT_IDENTIFIER} S {UNIT_MOVE}",
"Support Hold": fr"{UNIT_IDENTIFIER} S {UNIT_IDENTIFIER}(?!\s+[.\-]\s+{PLACE_IDENTIFIER})",
"Convoy": f"F {PLACE_IDENTIFIER} C {UNIT_MOVE}", # No convoys in here?
"Hold": f"{UNIT_IDENTIFIER} H",
"Build": f"{UNIT_IDENTIFIER} B",
"Disband": f"{UNIT_IDENTIFIER} D",
"Retreat": f"{UNIT_IDENTIFIER} R",
}
POSSIBLE_COMMAND_RESULTS = [
"void", "bounce", "cut", "dislodged", "disband", "no convoy"]
COUNTRY_RE = re.compile(r"^[A-Z][A-Z]+$")
PHASE_RE = re.compile(PHASE_REGEX)
UNIT_RE = re.compile(rf"^(?P<ut>A|F) (?P<ter>[A-Z]{{3}}(?:/(?:NC|SC|EC|WC))?)$") # allow coasts
PLACE_RE = re.compile(rf"^{PLACE_IDENTIFIER}$")
COMMAND_PATTERNS = [(name, re.compile(p)) for name, p in POSSIBLE_COMMANDS.items()]
ALLOWED_RESULTS = set(POSSIBLE_COMMAND_RESULTS)
ALLOWED_COUNTRIES = set(COUNTRIES)

318
analysis/validation.py Normal file
View file

@ -0,0 +1,318 @@
"""
LMVS JSON Schema Validator
==========================
This module defines a Pydantic v2 schema for validating Diplomacy game logs
exported in the LMVS (Language Model vs. State) format.
Friendly overview
-----------------
An LMVS file should contain a top-level object with metadata (`id`, `map`,
`rules`) and a list of **PhaseData** objects under the key `"phases"`.
Each PhaseData object has the following important parts:
* `"name"`: a string like "S1901M" or "F1903A" that encodes season, year, and
sub-phase. The format is '^[A-Z]\d{4}[MRA]$' or the word 'COMPLETED'.
* `"state"`: a dictionary describing the game board at this phase, with keys:
- `"units"`: {country [list of unit identifiers]}.
Each unit identifier looks like `"A BUD"` or `"F STP/SC"`.
This says which units each power currently controls and where they are.
- `"centers"`: {country [list of supply center locations]}.
Shows which supply centers each power owns.
- `"influence"`: {country [list of provinces]}.
Records which provinces each power currently controls.
- `"homes"`: {country [list of home supply centers]}.
A country's build home centers.
- `"retreats"`: {country {unit [list of possible retreat provinces]}}.
Units that must retreat and their allowed destinations.
- `"civil_disorder"`: {country integer flag}.
Records whether a country is in civil disorder.
- `"builds"`: {country {count: int, homes: [list of places]}}.
Tracks build counts and home sites during adjustment phases.
* `"orders"`: {country [list of orders]}.
Each order string must follow one of the canonical Diplomacy order formats,
such as Move, Hold, Support, Convoy, Build, Disband, or Retreat.
For example: `"A BUD - SER"`, `"F LON S F EDI - NTH"`.
* `"results"`: {unit identifier [list of result codes]}.
Each result code is one of: `"void"`, `"bounce"`, `"cut"`, `"dislodged"`,
`"disband"`, or `"no convoy"`.
These describe how the order for that unit resolved.
* `"messages"` (optional): a list of dictionaries, each with:
- `"sender"`: a valid country
- `"recipient"`: a valid country
- `"phase"`: phase code like "S1901M"
- `"message"`: the text of the press message
Validation rules
----------------
The schema enforces:
- Country names must be one of: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY.
- Phase codes must match the required regex format.
- Units must be of the form "A XXX" or "F XXX[/COAST]".
- Orders must match one of the defined command regexes.
- Result codes must be from the allowed list.
- Orders must reference units that exist in the corresponding `state.units`.
Usage
-----
To use, call:
    from analysis.validation import LMVSGame
import json
data = json.load(open("lmvs.json"))
game = LMVSGame.model_validate(data)
Any structural or semantic mismatches will raise a pydantic `ValidationError`.
This module can be extended with stricter checks by toggling options such as
strict territory membership, strict supply center membership, or coast handling.
"""
from __future__ import annotations
import re
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator, ConfigDict
from pydantic_core.core_schema import ValidationInfo
from analysis.schemas import ALL_PROVINCES, COASTAL_SCs, COUNTRY_RE, PHASE_RE, UNIT_RE, PLACE_RE, COMMAND_PATTERNS, ALLOWED_RESULTS, ALLOWED_COUNTRIES
# set of known territory codes (base and coast-qualified spellings, exactly as
# listed in ALL_PROVINCES); consulted only when strict territory checks are enabled
STRICT_TERRITORY = set(ALL_PROVINCES)
@dataclass(frozen=True)
class ValidationConfig:
    """Strictness switches for validation.

    Pass as ``{"cfg": ValidationConfig(...)}`` in the pydantic validation
    context; validators fall back to the defaults below when absent.
    """
    strict_countries: bool = True
    strict_territories: bool = False  # set True to enforce membership in STRICT_TERRITORY for places
    strict_sc: bool = False  # optionally enforce that centers ⊆ ALL_SUPPLY_CENTERS(+coasts)
# -------------------- validators --------------------
def _validate_country(c: str) -> str:
    """Check that *c* is a well-formed name and one of the seven powers.

    Returns the name unchanged; raises ValueError otherwise.
    """
    if COUNTRY_RE.match(c) is None:
        raise ValueError(f"bad country name: {c!r}")
    if c in ALLOWED_COUNTRIES:
        return c
    raise ValueError(f"unknown country: {c!r}")
def _validate_place(code: str, info: ValidationInfo, allow_coast: bool = True) -> str:
    """Validate a province code like "BUD" or "STP/SC" ("WAIVE" always passes the
    syntax check). Strict territory membership is enforced only when the
    validation context carries a ValidationConfig with strict_territories=True.
    """
    if code != "WAIVE" and not PLACE_RE.match(code):
        raise ValueError(f"bad place code: {code!r}")
    context = info.context or {}
    cfg: ValidationConfig = context.get("cfg", ValidationConfig())  # type: ignore
    if not cfg.strict_territories:
        return code
    if allow_coast:
        # coast-qualified spellings are themselves members of STRICT_TERRITORY
        if code not in STRICT_TERRITORY:
            raise ValueError(f"unknown territory: {code!r}")
    elif code.split("/")[0] not in STRICT_TERRITORY:
        raise ValueError(f"unknown base territory: {code!r}")
    return code
def _validate_unit(u: str, info: ValidationInfo) -> str:
    """Validate a unit string such as "A BUD" or "F STP/SC".

    A leading "*" (dislodged-unit marker) is stripped before validation and the
    stripped form is returned; the territory part is validated as a place.
    """
    if u[:1] == "*":
        u = u[1:]
    parsed = UNIT_RE.match(u)
    if parsed is None:
        raise ValueError(f"bad unit: {u!r} (expected 'A XXX' or 'F XXX[/COAST]')")
    _validate_place(parsed.group("ter"), info, allow_coast=True)
    return u
def _order_kind(order: str) -> Optional[str]:
    """Classify an order string, returning the command name or None.

    Build orders are recognized by their " B" suffix before the generic
    command regexes are tried.
    """
    if order.endswith(" B"):
        return "Build"
    return next(
        (name for name, pattern in COMMAND_PATTERNS if pattern.match(order)),
        None,
    )
def _unit_head(order: str) -> Optional[str]:
    """Return the leading unit spec ('A XXX' or 'F XXX/COAST') of an order, or None.

    First tries to match UNIT_RE against the first two space-delimited tokens;
    if that fails, falls back to a looser match against the whole order string.

    Fix: the original recomputed ``order.split(" ", 2)`` twice per call; the
    split is now done once. The guard deliberately still counts tokens with a
    whitespace ``split()`` (no separator), exactly as before.
    """
    tokens = order.split(" ", 2)  # hoisted: previously recomputed for each token
    m = None
    if len(order.split()) >= 2:
        m = UNIT_RE.match(tokens[0] + " " + tokens[1])
    if m:
        return m.group(0)
    # fallback: looser search at the start of the raw order string
    m = UNIT_RE.match(order)
    return m.group(0) if m else None
def _base_ter(u: str) -> str:
# "A STP/NC" -> "STP"; "F BUD" -> "BUD"
ter = u.split(" ", 1)[1] if " " in u else u
return ter.split("/")[0]
def _unit_type(u: str) -> str:
return u.split(" ", 1)[0]
# -------------------- models --------------------
class PhaseState(BaseModel):
    """Board snapshot for one phase: per-country units, centers, and influence."""
    name: str
    phase: str
    game_id: str
    units: Dict[str, List[str]]
    centers: Dict[str, List[str]]
    influence: Dict[str, List[str]]
    model_config = ConfigDict(extra="ignore")

    @field_validator("name","phase")
    @classmethod
    def _phase_format(cls, v: str, info: ValidationInfo) -> str:
        """Phase labels must match PHASE_RE, except the terminal 'COMPLETED' marker."""
        if v == "COMPLETED" or PHASE_RE.match(v):
            return v
        raise ValueError(f"bad phase: {v!r}")

    @field_validator("units","centers","influence", mode="after")
    @classmethod
    def _country_keys_ok(cls, mapping: Dict[str, Any], info: ValidationInfo) -> Dict[str, Any]:
        """Every top-level key of these mappings must be a known country name."""
        for country in mapping:
            _validate_country(country)
        return mapping

    @field_validator("units", mode="after")
    @classmethod
    def _units_ok(cls, u: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Each country's value must be a list of well-formed unit strings."""
        for country, unit_list in u.items():
            if not isinstance(unit_list, list):
                raise ValueError(f"units[{country}] must be a list")
            for unit in unit_list:
                _validate_unit(unit, info)
        return u

    @field_validator("centers","influence", mode="after")
    @classmethod
    def _place_lists_ok(cls, d: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Each country's value must be a list of territory codes (no coasts)."""
        for country, places in d.items():
            if not isinstance(places, list):
                raise ValueError(f"{country} values must be a list")
            for place in places:
                _validate_place(place, info, allow_coast=False)
        return d
class Phase(BaseModel):
    """One game phase: the board state plus submitted orders and their results."""
    name: str
    state: PhaseState
    orders: Dict[str, Optional[List[str]]]  # a country's orders may be null in the raw log
    results: Dict[str, List[str]]
    model_config = ConfigDict(extra="ignore")

    @field_validator("name")
    @classmethod
    def _phase_format(cls, v: str, info: ValidationInfo) -> str:
        """Phase name must match PHASE_RE or be the terminal 'COMPLETED' marker."""
        if not PHASE_RE.match(v) and v != "COMPLETED":
            raise ValueError(f"bad phase: {v!r}")
        return v

    @field_validator("orders", mode="after")
    @classmethod
    def _orders_ok(cls, orders: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Validate country keys and order syntax; normalize null order lists to [].

        Fix: the per-order validation loop previously ran outside the
        None-handling branch, so a country whose orders were null raised
        TypeError ('NoneType' is not iterable) — and the string "null" was
        iterated character by character — instead of being normalized to an
        empty list. Orders are now only validated for genuine lists.
        """
        cleaned_orders: Dict[str, List[str]] = {}
        for c, lst in orders.items():
            _validate_country(c)
            if lst is None or lst == "null":
                # some logs emit null instead of an empty order list
                cleaned_orders[c] = []
                continue
            if not isinstance(lst, list):
                raise ValueError(f"orders[{c}] must be a list")
            cleaned_orders[c] = lst
            for o in lst:
                kind = _order_kind(o)
                if not kind:
                    raise ValueError(f"order doesn't match any known command: {o!r}")
        return cleaned_orders

    @field_validator("results", mode="after")
    @classmethod
    def _results_ok(cls, res: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Result keys are unit strings (or the pseudo-unit 'WAIVE'); values are result codes."""
        for unit, lst in res.items():
            # 'WAIVE' appears as a key but is not a parseable unit string
            if unit != "WAIVE":
                _validate_unit(unit, info)
            if not isinstance(lst, list):
                raise ValueError(f"results[{unit}] must be a list")
            for r in lst:
                # "" (no result) and "WAIVE" are accepted alongside ALLOWED_RESULTS
                if r not in ("", "WAIVE") and r not in ALLOWED_RESULTS:
                    raise ValueError(f"illegal result code {r!r} for {unit}")
        return res

    @model_validator(mode="after")
    def _orders_correspond_to_units(self) -> "Phase":
        """Every movement order's subject unit must exist in state.units for that country."""
        # derive {country -> set(units)} from the phase's board state
        country_units = {c: set(v) for c, v in self.state.units.items()}
        for c, lst in self.orders.items():
            known = country_units.get(c, set())
            for o in lst:
                # builds create new units; retreats/disbands act on dislodged
                # units that are not present in state.units
                if o.endswith(' B'):
                    continue
                if ' R ' in o or o.endswith(' R') or o.endswith(' D'):
                    continue
                head = _unit_head(o)
                if not head:
                    # malformed heads were already rejected by _orders_ok; skip
                    continue
                if head in known:
                    continue
                # coast-tolerant match: compare unit type + base territory only
                base = _base_ter(head)
                ut = _unit_type(head)
                if not any(_unit_type(u) == ut and _base_ter(u) == base for u in known):
                    raise ValueError(f"order {o!r} for {c} does not match any unit in state.units[{c}]")
        return self
class LMVSGame(BaseModel):
    """Top-level model for an lmvsgame.json log: game metadata plus all phases."""
    id: str
    map: str
    rules: List[str]
    phases: List[Phase]
    # unknown extra top-level keys in the JSON are ignored rather than rejected
    model_config = ConfigDict(extra="ignore")
# -------------------- example usage --------------------
if __name__ == "__main__":
    import sys, json, pathlib

    # usage: python validation.py <path_to_lmvsgame.json>
    if len(sys.argv) != 2:
        print("usage: python validation.py <path_to_lmvsgame.json>")
        sys.exit(1)

    game_path = pathlib.Path(sys.argv[1])
    payload = json.loads(game_path.read_text())
    # strict territory / supply-center checks stay off for the CLI smoke test
    validated = LMVSGame.model_validate(
        payload,
        context={"cfg": ValidationConfig(strict_territories=False, strict_sc=False)},
    )
    print(f"{game_path} is valid: game_id={validated.id} phases={len(validated.phases)}")