diff --git a/analysis/__init__.py b/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/analysis/analysis_helpers.py b/analysis/analysis_helpers.py
index 792b218..54141f5 100644
--- a/analysis/analysis_helpers.py
+++ b/analysis/analysis_helpers.py
@@ -1,63 +1,94 @@
-# analysis_constants.py 
-import os 
-import json 
+"""Utility functions and constants for loading Diplomacy analysis data.
+
+This module provides helpers to read game data stored either as a folder on disk
+or inside a zip archive, and to map each country to the model that played it.
+Shared constant lists and regex patterns are imported from ``analysis.schemas``.
+
+"""
+
 from pathlib import Path
-import pandas as pd
+from typing import Dict, Union
+import json
 import zipfile
 
-def process_standard_game_inputs(game_data_folder : Path, selected_game : str) -> dict[str, pd.DataFrame]:
-    path_to_folder = game_data_folder / selected_game
+import pandas as pd
+from analysis.schemas import COUNTRIES
+from analysis.validation import LMVSGame
 
-    assert os.path.exists(path_to_folder / "overview.jsonl"), f"Overview file not found in {path_to_folder}"
-    overview = pd.read_json(path_to_folder / "overview.jsonl", lines=True)
+__all__: list[str] = [
+    "process_standard_game_inputs",
+    "process_game_inputs_in_zip",
+    "get_country_to_model_mapping",
+]
     
-    # get all turn actions from lmvs
-    assert os.path.exists(path_to_folder / "lmvsgame.json"), f"LMVS file not found in {path_to_folder}"
-    path_to_file = path_to_folder / "lmvsgame.json"
+def process_standard_game_inputs(path_to_folder: Path) -> Dict[str, Union[pd.DataFrame, dict]]:
+    """
+    Read in a game folder and return the overview, lmvs_data, and all_responses
+
+    Args:
+        path_to_folder: Path to the game folder. Must contain overview.jsonl, lmvsgame.json, and llm_responses.csv files.
+
+    Returns:
+        Dictionary containing overview, lmvs_data, and all_responses
+
+    Raises:
+        FileNotFoundError: If any of the three files is missing or empty.
+        ValueError: If llm_responses.csv lacks one of the required columns.
+    """
+    # ----- check files exist -----
+    overview_path = path_to_folder / "overview.jsonl"
+    lmvsgame_path = path_to_folder / "lmvsgame.json"
+    llm_resp_path = path_to_folder / "llm_responses.csv"
 
-    # Use the standard `json` library to load the file into a Python object
-    with open(path_to_file, 'r') as f:
+    # same check for all three inputs, in the order they are loaded below
+    for required_path in (overview_path, lmvsgame_path, llm_resp_path):
+        if not required_path.exists():
+            raise FileNotFoundError(str(required_path))
+        if required_path.stat().st_size == 0:
+            raise FileNotFoundError(f"{required_path} is empty")
+
+    # ----- load data -----
+    overview = pd.read_json(overview_path, lines=True)
+
+    # JSON is UTF-8 by spec; do not depend on the platform default encoding
+    with open(lmvsgame_path, "r", encoding="utf-8") as f:
         lmvs_data = json.load(f)
-        
-    assert os.path.exists(path_to_folder / "llm_responses.csv"), f"LLM responses file not found in {path_to_folder}"
-    all_responses = pd.read_csv(path_to_folder / "llm_responses.csv")
-    
+    # validate the LMVS data format
+    LMVSGame.model_validate(
+        lmvs_data,
+    )
+
+    all_responses = pd.read_csv(llm_resp_path)
+    expected_columns = ['model', 'power', 'phase', 'response_type', 'raw_input', 'raw_response',
+       'success']
+    missing_columns = [col for col in expected_columns if col not in all_responses.columns]
+    # raise rather than assert: asserts are stripped when Python runs with -O
+    if missing_columns:
+        raise ValueError(f"Missing required columns in CSV: {missing_columns}")
     return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses}
 
-def process_game_in_zip(zip_path: Path, selected_game: str) -> dict[str, pd.DataFrame]:
+def get_country_to_model_mapping(overview_df : pd.DataFrame, llm_responses_df : pd.DataFrame) -> pd.Series:
+    """Map each country in COUNTRIES to its model: overview row 1, else the first model listed per power in the responses."""
+    country_to_model = overview_df.loc[1].reindex(COUNTRIES)
+    if pd.isnull(country_to_model).any() and llm_responses_df is not None:
+        # a responses log with several rows per power would make reindex raise on duplicate labels; keep one row each
+        country_to_model = llm_responses_df.drop_duplicates(subset="power").set_index("power")["model"].reindex(COUNTRIES)
+    return country_to_model
+
+def process_game_inputs_in_zip(zip_path: Path, selected_game: str) -> Dict[str, Union[pd.DataFrame, dict]]:
+    """
+    Read one game from inside a zip archive and return the overview, lmvs_data, and all_responses
+    
+    Args:
+        zip_path: Path to the zip file
+        selected_game: Name of the game to extract
+    
+    Returns:
+        Dictionary containing overview, lmvs_data, and all_responses
+    """
     zip_name = zip_path.stem  # Gets filename without extension
     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
         overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True)
         lmvs_data = json.load(zip_ref.open(f"{zip_name}/{selected_game}/lmvsgame.json"))
         all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv"))
     return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses}
-
-supply_centers = [
-    "ANK", "ARM", "BEL", "BER", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
-    "HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "PAR", "POR",
-    "ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
-    "VEN", "VIE", "WAR", 
-    "SPA", "STP", "BUL" # coastal provinces
-]
-
-coastal_scs = ["SPA/SC", "SPA/NC",
-    "STP/SC", "STP/NC", 'BUL/EC',
-       'BUL/SC',]
-
-COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
-
-place_identifier = "[A-Z]{3}(?:/[A-Z]{2})?"
-place_capturing_regex = r"([A-Z]{3})"
-unit_identifier = rf"[AF] {place_identifier}"
-unit_move = rf"{unit_identifier} . {place_identifier}"
-
-possible_commands = {
-    "Move": f"^"+unit_move, # distinguishing this from support
-    "Support Move": f"{unit_identifier} S {unit_move}",
-    "Support Hold": fr"{unit_identifier} S {unit_identifier}(?!\s+[.\-]\s+{place_identifier})",
-    "Convoy": f"F {place_identifier} C {unit_move}", # No convoys in here? 
-    "Hold": f"{unit_identifier} H",
-    "Build": f"{unit_identifier} B",
-    "Disband": f"{unit_identifier} D",
-    "Retreat": f"{unit_identifier} R",
-}
diff --git a/analysis/make_all_analysis_data.py b/analysis/make_all_analysis_data.py
index 51ffe53..c9da357 100644
--- a/analysis/make_all_analysis_data.py
+++ b/analysis/make_all_analysis_data.py
@@ -19,7 +19,7 @@ python analysis/make_all_analysis_data.py --selected_game game1 --game_data_fold
 python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis"
 """
 import argparse
-import os
+
 from pathlib import Path
 import pandas as pd
 from tqdm import tqdm
@@ -27,49 +27,68 @@ from tqdm import tqdm
 from analysis.p1_make_longform_orders_data import make_longform_order_data
 from analysis.p2_make_convo_data import make_conversation_data   
 from analysis.p3_make_phase_data import make_phase_data
-from analysis.analysis_helpers import process_standard_game_inputs, process_game_in_zip
+from analysis.analysis_helpers import get_country_to_model_mapping, process_standard_game_inputs, process_game_inputs_in_zip
+from analysis.schemas import COUNTRIES
 
 from typing import Dict
-def process_game_data_from_folders(game_name : str, game_path : Path) -> Dict[str, pd.DataFrame]:
+
+
+
+def process_game_data_from_folders(game_path : Path) -> Dict[str, pd.DataFrame]:
     """Reads log data from folder and makes analytic data sets"""
     
-    game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
+    game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(path_to_folder=game_path)
     
-    orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"], 
+    country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
+    
+    orders_data : pd.DataFrame = make_longform_order_data(country_to_model=country_to_model, 
                                    lmvs_data=game_data["lmvs_data"],
                                    all_responses=game_data["all_responses"])
     
-    conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
-    
-    phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"], 
-                           lmvs_data=game_data["lmvs_data"], 
-                           conversations_data=conversations_data, 
-                           orders_data=orders_data)
-    
-    return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
+    conversations_data : pd.DataFrame = make_conversation_data(country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"])
 
-def process_game_data_from_zip(zip_path : Path, game_name : str) -> Dict[str, pd.DataFrame]:
-    """Reads log data from zip and makes analytic data sets"""
-    
-    game_data : dict[str, pd.DataFrame] = process_game_in_zip(zip_path=zip_path, selected_game=game_name)
-    
-    orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"], 
-                                   lmvs_data=game_data["lmvs_data"],
-                                   all_responses=game_data["all_responses"])
-    
-    conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
-    
-    phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"], 
+    phase_data : pd.DataFrame = make_phase_data(country_to_model=country_to_model, 
                            lmvs_data=game_data["lmvs_data"], 
                            conversations_data=conversations_data, 
                            orders_data=orders_data)
     
-    return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
+    return {"orders_data": orders_data, 
+            "conversations_data": conversations_data, 
+            "phase_data": phase_data}
+
+def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]:
+    """Build the orders, conversations and phase tables for one game stored in a zip.
+
+    Args:
+        zip_path: Path to the zip archive that holds the game folders.
+        game_name: Name of the game folder inside the archive.
+
+    Returns:
+        Dict with keys "orders_data", "conversations_data" and "phase_data".
+    """
+    raw = process_game_inputs_in_zip(zip_path=zip_path, selected_game=game_name)
+    overview, lmvs, responses = raw["overview"], raw["lmvs_data"], raw["all_responses"]
+
+    # the country -> model mapping is passed to all three builders below
+    models = get_country_to_model_mapping(overview, responses)
+
+    tables: Dict[str, pd.DataFrame] = {}
+    tables["orders_data"] = make_longform_order_data(
+        country_to_model=models, lmvs_data=lmvs, all_responses=responses
+    )
+    tables["conversations_data"] = make_conversation_data(country_to_model=models, lmvs_data=lmvs)
+    # the phase builder also takes the two tables built above
+    tables["phase_data"] = make_phase_data(
+        country_to_model=models,
+        lmvs_data=lmvs,
+        conversations_data=tables["conversations_data"],
+        orders_data=tables["orders_data"],
+    )
+    return tables
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.")
-    
     parser.add_argument(
         "--selected_game", 
         type=str, 
@@ -98,16 +117,18 @@ if __name__ == "__main__":
     
     games_to_process = args.selected_game
     if not games_to_process:
-        games_to_process = os.listdir(args_dict["game_data_folder"])
+        games_to_process = [p.name for p in args_dict["game_data_folder"].iterdir() if p.is_dir()]
     for game in tqdm(games_to_process, desc="Processing games"):
         game_path = args_dict["game_data_folder"] / game
         if not game_path.is_dir():
             continue
         
         try:
-            results = process_game_data_from_folders(game_name=game, game_path=args_dict["game_data_folder"])
+            results = process_game_data_from_folders(game_path=game_path)
             for data_set, df in results.items():
-                output_path = args_dict["analysis_folder"] / data_set / f"{game}_{data_set}.csv"
+                output_dir = args_dict["analysis_folder"] / data_set
+                output_dir.mkdir(parents=True, exist_ok=True)
+                output_path = output_dir / f"{game}_{data_set}.csv"
                 df.to_csv(output_path, index=False)
         except Exception as e:
             print(f"Error processing game {game}: {e}")
\ No newline at end of file
diff --git a/analysis/p1_make_longform_orders_data.py b/analysis/p1_make_longform_orders_data.py
index 2068214..700e1a5 100644
--- a/analysis/p1_make_longform_orders_data.py
+++ b/analysis/p1_make_longform_orders_data.py
@@ -53,24 +53,33 @@ Return / save
 
 import pandas as pd
 import numpy as np
-import os 
+
 import copy
 import re 
 import argparse
 import warnings
 from pathlib import Path
-from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES, supply_centers, coastal_scs, place_identifier, unit_identifier, unit_move, possible_commands
+from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
+from analysis.schemas import COUNTRIES, ALL_SUPPLY_CENTERS, COASTAL_SCs, PLACE_IDENTIFIER, UNIT_IDENTIFIER, UNIT_MOVE, POSSIBLE_COMMANDS
 from tqdm import tqdm
-
+import traceback
+from typing import List, Optional, Dict 
 # Suppress pandas warnings
 warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings')
 warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
 
-def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
-    try:
-        country_to_model = overview.loc[1, COUNTRIES] # map countries to models
-    except:
-        country_to_model = {country: "not specified in overview.jsonl" for country in COUNTRIES}    
+def make_longform_order_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
+    """
+    Makes a dataframe with a row for each order given by every power, in every phase (see module docstring for more details).
+    
+    Args:
+        country_to_model: A Series mapping country names to model names
+        lmvs_data: The parsed lmvsgame.json contents (a dict with a "phases" list, despite the DataFrame annotation)
+        all_responses: A DataFrame containing the responses from the LLM responses csv
+    
+    Returns:
+        A DataFrame with a row for each order given by every power, in every phase
+    """
     ################## PART 1 ##################
     # build `turn_actions` dataframe
 
@@ -120,22 +129,22 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
 
     # categorize each order based on regex
     # note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands
-    for possible_command, regex in possible_commands.items():
+    for possible_command, regex in POSSIBLE_COMMANDS.items():
         all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command
         
-    all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({place_identifier})")
-    all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(supply_centers) | all_orders_ever["unit_location"].isin(coastal_scs)
+    all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({PLACE_IDENTIFIER})")
+    all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["unit_location"].isin(COASTAL_SCs)
 
     # only MOVE has a destination
     all_orders_ever["destination"] = np.where(
         all_orders_ever["command"]=="Move",
-        all_orders_ever["order"].str.extract(rf"{unit_identifier} . ({place_identifier})", expand=False),
+        all_orders_ever["order"].str.extract(rf"{UNIT_IDENTIFIER} . ({PLACE_IDENTIFIER})", expand=False),
         np.nan
     )
-    all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(supply_centers) | all_orders_ever["destination"].isin(coastal_scs)
+    all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["destination"].isin(COASTAL_SCs)
 
     # Retreat also has a destination
-    all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{unit_identifier} R ({place_identifier})", expand=False)
+    all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{UNIT_IDENTIFIER} R ({PLACE_IDENTIFIER})", expand=False)
 
     all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)")
     all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS")
@@ -146,7 +155,17 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
     all_orders_ever["model_short_name"] = all_orders_ever["model"].str.split("/").str[-1]
     all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")"
 
-    def check_location_influence(phase_id, location):
+    def check_location_influence(phase_id : str, location : str) -> str:
+        """
+        Helper - checks who owns a location at a given phase. Uses the `turn_actions` dataframe from overall context.
+        
+        Args:
+            phase_id: The phase to check
+            location: The location to check
+        
+        Returns:
+            The country that owns the location, or "Unowned" if no country owns it
+        """
         # checking who owns a location at `phase_id`
         if pd.isnull(location):
             return np.nan
@@ -162,22 +181,45 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
     all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"],
                                                                                                             row["destination"]), axis=1)
 
-    def find_supporting_country(unit_command, command_type, phase):
+    def find_supporting_country(unit_command, command_type, phase) -> Optional[str]:
+        """
+        Helper - finds which orders support a given unit and records the supporting powers. Operating on the `all_orders_ever` dataframe.
+        
+        Args:
+            unit_command: The unit command to find supporting orders for
+            command_type: The type of command ("Move" or "Hold")
+            phase: The phase to check
+        
+        Returns:
+            A string containing a comma-separated list of countries that issued an order to support that unit, or None if no such orders exist
+        """
         if command_type == "Move" or command_type == "Hold":  # commands that can be supported
             potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) & 
                                                 (all_orders_ever["command"].isin(["Support Move", "Support Hold"]))]
             potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)]
             if potential_supports.empty:
-                return np.nan
+                return None
             else:
                 return ",".join(potential_supports["country"].tolist())
-        return np.nan
+        return None
 
     all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1)
     all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned")
-    all_orders_ever["moving_into_anothers_territory"] = (all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) & (all_orders_ever["destination_affiliation"].notnull()) & (all_orders_ever["destination_affiliation"] != "Unowned")
+    all_orders_ever["moving_into_anothers_territory"] = ((all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) & 
+                                                         (all_orders_ever["destination_affiliation"].notnull()) & 
+                                                         (all_orders_ever["destination_affiliation"] != "Unowned"))
 
-    def find_owner_of_unit(unit_location, phase):
+    def find_owner_of_unit(unit_location : str, phase : str) -> Optional[str]:
+        """
+        Helper - finds the owner of a unit at a given phase. Operating on the `turn_actions` dataframe from overall context.
+        
+        Args:
+            unit_location: The location of the unit to find the owner of
+            phase: The phase to check
+        
+        Returns:
+            The country that owns the unit, or None if no country owns it
+        """
         if pd.notnull(unit_location):
             unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase]
             unit_status.index = unit_status.index.str.replace("_units", "")
@@ -186,18 +228,30 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
                     if re.match(f"[AF] {unit_location}", unit):
                         return country
 
-    # where were they going? what was their destination like?
-    def find_destination_info(destination, phase):
+    def find_destination_info(destination, phase) -> Optional[Dict[str, Optional[str]]]:
+        """
+        Helper - finds information about the destination of a unit at a given phase. 
+        Operating on the `all_orders_ever` dataframe from overall context.
+        
+        Args:
+            destination: The location being moved into, used to look up any unit already there
+            phase: The phase to check
+        
+        Returns:
+            A dictionary containing information about the destination unit, or None if no such unit exists
+        """
         if pd.notnull(destination):
             country = find_owner_of_unit(destination, phase)
+            # there should only ever be one unit at a given location during a phase
             destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) & 
                                                                 (all_orders_ever["phase"] == phase) & 
                                                                 (all_orders_ever["unit_location"] == destination)]
             if not destination_unit_orders.empty:
+                destination_unit_orders = destination_unit_orders.iloc[0] # safe conversion to a series
                 return {"destination_unit_owner": country, 
-                                "destination_unit_order": destination_unit_orders["command"].squeeze(),
-                                "destination_unit_outcome":destination_unit_orders["immediate_result"].squeeze(),
-                                "destination_unit_supported_by": destination_unit_orders["supported_by"].squeeze()}    
+                                "destination_unit_order": destination_unit_orders["command"],
+                                "destination_unit_outcome":destination_unit_orders["immediate_result"],
+                                "destination_unit_supported_by": destination_unit_orders["supported_by"]}
 
     destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series)
     destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull()
@@ -205,54 +259,90 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
     all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1)
 
     # if a Support action: who were they supporting? what was their support doing?
-    def find_support_recipient_info(unit_order, command, phase):
+    def find_support_recipient_info(unit_order, command, phase) -> Optional[Dict[str, Optional[str]]]:
+        """Helper - describes the unit that a support order is backing, looked up in `all_orders_ever`.
+
+        Args:
+            unit_order: The full text of the supporting unit's order
+            command: The command label of that order; only labels containing "Support" are processed
+            phase: The phase to check
+        Returns:
+            None for non-support orders; otherwise a dict of recipient_* fields (owner only if the recipient has no order row)
+        """
         if "Support" in command:
-            recipient_location = re.match(rf"{unit_identifier} S [AF] ({place_identifier})", unit_order).group(1)
+            recipient_location = re.match(rf"{UNIT_IDENTIFIER} S [AF] ({PLACE_IDENTIFIER})", unit_order).group(1)
             recipient_country = find_owner_of_unit(recipient_location, phase)
+            # there should only ever be one unit at a given location during a phase
             recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) & 
                                                 (all_orders_ever["phase"] == phase) & 
-                                                (all_orders_ever["unit_location"] == recipient_location)]
-            return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"].squeeze(),
-                    "recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"].squeeze(),
-                    "recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"].squeeze(),
-                    "recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"].squeeze()}
+                                                (all_orders_ever["unit_location"] == recipient_location)]
+            if recipient_order_info.empty:  # no order row for that unit; .iloc[0] would raise IndexError
+                return {"recipient_unit_owner": recipient_country}
+            recipient_order_info = recipient_order_info.iloc[0] # safe conversion to a series
+            return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"],
+                    "recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"],
+                    "recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"],
+                    "recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"]}
 
-    support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"], row["command"], row["phase"]), axis=1).apply(pd.Series)
+    support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"], 
+                                                                                           row["command"], 
+                                                                                           row["phase"]), axis=1).apply(pd.Series)
+    # add support recipient info to all_orders_ever as additional columns
     all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1)
 
     # add relationships with other countries
+    # v1 logs: each phase stores its relationships under "agent_relationships"
     agent_relationship_matrix_over_time = {}
     for phase in lmvs_data["phases"]:
         agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
     longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
     
     if longform_relationships.empty:
-        print("Warning: no relationship data found in phase data")
-    else: 
-        longform_relationships.columns = longform_relationships.columns.str.lower()
-        longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
-            'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
-            'russia', 'turkey']].fillna("Self") 
-        longform_relationships = longform_relationships.add_prefix("relationship_")
-        all_orders_ever = pd.merge(all_orders_ever, longform_relationships, 
-                left_on=["phase", "country"], right_on=["relationship_phase", "relationship_agent"]).drop(columns=["relationship_phase", "relationship_agent"])
-        
-        alternate_relationship_view = pd.concat(agent_relationship_matrix_over_time)
-        alternate_relationship_view.index.names = ["phase", "agent"]
-        alternate_relationship_view = alternate_relationship_view.stack().reset_index().rename(columns={"level_2":"recipient",
-                0:"status"}).set_index(["phase", "recipient", 
-                "agent"])["status"].unstack("agent").fillna("Self").add_suffix("s_relationship_rating").reset_index()
-        all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view, 
-                left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
+        # Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
+        agent_relationship_matrix_over_time = {}
+        for phase in lmvs_data["phases"]:
+            agent_state = phase.get("state_agents", {}) 
+            country_relationships = {}
+            for c in COUNTRIES:
+                country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
+            agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
+        longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
+   
+   
+    longform_relationships.columns = longform_relationships.columns.str.lower()
+    longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
+        'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
+        'russia', 'turkey']].fillna("Self") 
+    longform_relationships = longform_relationships.add_prefix("relationship_")
+    all_orders_ever = pd.merge(all_orders_ever, longform_relationships, 
+            left_on=["phase", "country"], right_on=["relationship_phase", "relationship_agent"]).drop(columns=["relationship_phase", "relationship_agent"])
+    
+    alternate_relationship_view = pd.concat(agent_relationship_matrix_over_time)
+    alternate_relationship_view.index.names = ["phase", "agent"]
+    alternate_relationship_view = alternate_relationship_view.stack().reset_index().rename(columns={"level_2":"recipient",
+            0:"status"}).set_index(["phase", "recipient", 
+            "agent"])["status"].unstack("agent").fillna("Self").add_suffix("s_relationship_rating").reset_index()
+    all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view, 
+            left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
 
-    # if action was supporting
+    # if action was supporting, add flags
     all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"]
     all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull())
 
-    def countries_aside_from(a_country):
+    def countries_aside_from(a_country : str) -> List[str]:
         return [country for country in all_orders_ever["country"].unique() if country != a_country]
 
-    def check_country(supporters, country):
+    def check_country(supporters : List[str], country : str) -> bool:
+        """
+        Helper - checks if a given country is in a list of supporters
+        
+        Args:
+            supporters: The list of supporters to check
+            country: The country to check
+        
+        Returns:
+            True if the country is in the list of supporters, False otherwise
+        """
         if pd.isnull(supporters):
             return False
         for other_countries in countries_aside_from(country):
@@ -267,7 +357,7 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
 
     all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull()
 
-    # add number of unit orders ever made
+    # add number of unit orders ever made during this game
     unit_order_weight = 1 / all_orders_ever.groupby("country").size()
     all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight)
 
@@ -317,30 +407,30 @@ if __name__ == "__main__":
     current_game_data_folder = Path(args.game_data_folder)
     analysis_folder = Path(args.analysis_folder) / "orders_data" 
 
-    if not os.path.exists(analysis_folder):
+    if not analysis_folder.exists():
         print(f"Output folder {analysis_folder} not found, creating it.")
-        os.makedirs(analysis_folder)
+        analysis_folder.mkdir(parents=True, exist_ok=True)
 
     games_to_process = args.selected_game
     if not games_to_process:
-        games_to_process = os.listdir(current_game_data_folder)
+        games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
 
     for game_name in tqdm(games_to_process, desc="Processing games"):
-        if game_name == ".DS_Store":
-            continue
-        
         game_path = current_game_data_folder / game_name
-        if not os.path.isdir(game_path):
+        if not game_path.is_dir():
             continue
 
         try:
-            game_source_data = process_standard_game_inputs(game_path, game_name)
-            data = make_longform_order_data(overview=game_source_data["overview"], 
-                                            lmvs_data=game_source_data["lmvs_data"], 
+            game_source_data = process_standard_game_inputs(game_path)
+            overview_df = game_source_data["overview"]
+            country_to_model = get_country_to_model_mapping(overview_df, game_source_data["all_responses"])
+            data = make_longform_order_data(country_to_model=country_to_model,
+                                            lmvs_data=game_source_data["lmvs_data"],
                                             all_responses=game_source_data["all_responses"])
             output_path = analysis_folder / f"{game_name}_orders_data.csv"
             data.to_csv(output_path, index=False)
         except FileNotFoundError as e:
             print(f"Could not process {game_name}. Missing file: {e.filename}")
         except Exception as e:
-            print(f"An unexpected error occurred while processing {game_name}: {e}")
\ No newline at end of file
+            print(f"An unexpected error occurred while processing {game_name}: {e}")
+            traceback.print_exc()
\ No newline at end of file
diff --git a/analysis/p2_make_convo_data.py b/analysis/p2_make_convo_data.py
index 72d86e9..72a8867 100644
--- a/analysis/p2_make_convo_data.py
+++ b/analysis/p2_make_convo_data.py
@@ -1,5 +1,5 @@
 """
-Make conversation data from diplomacy game logs.
+Make conversation data from diplomacy game logs, for convenience in analyzing conversation data alone.
 
 Resulting columns: 
 ['phase',
@@ -21,14 +21,23 @@ Resulting columns:
 import pandas as pd
 import itertools 
 import argparse
-import os
+
 from tqdm import tqdm
 from pathlib import Path
-from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs
-
-def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) -> pd.DataFrame:
-    country_to_model = overview.loc[1, COUNTRIES]
+from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs, get_country_to_model_mapping
+import traceback
 
+def make_conversation_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame) -> pd.DataFrame:
+    """
+    Make conversation data from diplomacy game logs.
+    
+    Args:
+        country_to_model: A Series mapping country names to model names
+        lmvs_data: A DataFrame containing the game data
+    
+    Returns:
+        A DataFrame containing the conversation data (a row for every conversation between 2 powers, at every phase)
+    """
     COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2))
     
     # relationship data
@@ -67,20 +76,12 @@ def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) ->
                 
             messages_from_sender = (messages_exchanged['sender']==sender).sum()
             sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0
-            messages_from_recipient = (messages_exchanged['recipient']==sender).sum()
-            recipient_streak = max_consecutive[recipient] if recipient in max_consecutive.index else 0
-            party_1_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) & 
-                                (longform_relationships["agent"]==sender)][recipient]
-            if party_1_opinion.empty:
-                party_1_opinion = ""
-            else: 
-                party_1_opinion = party_1_opinion.squeeze()
-            party_2_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) & 
-                                (longform_relationships["agent"]==recipient)][sender]
-            if party_2_opinion.empty:
-                party_2_opinion = ""
-            else: 
-                party_2_opinion = party_2_opinion.squeeze()
+            messages_from_recipient = (messages_exchanged['sender'] == recipient).sum()
+            recipient_streak = max_consecutive.get(recipient, 0)
+            party_1_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == sender)].reindex(columns=[recipient])
+            party_1_opinion = party_1_opinion_series.iloc[0,0] if not party_1_opinion_series.empty else ""
+            party_2_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == recipient)].reindex(columns=[sender])
+            party_2_opinion = party_2_opinion_series.iloc[0,0] if not party_2_opinion_series.empty else ""
 
             conversation_data = {
                 "party_1": sender,
@@ -132,29 +133,30 @@ if __name__ == "__main__":
     current_game_data_folder = Path(args.game_data_folder)
     analysis_folder = Path(args.analysis_folder) / "conversations_data"
 
-    if not os.path.exists(analysis_folder):
+    if not analysis_folder.exists():
         print(f"Output folder {analysis_folder} not found, creating it.")
-        os.makedirs(analysis_folder)
+        analysis_folder.mkdir(parents=True, exist_ok=True)
 
     games_to_process = args.selected_game
     if not games_to_process:
-        games_to_process = os.listdir(current_game_data_folder)
+        games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
 
-    for game_name in tqdm(games_to_process):
-        if game_name == ".DS_Store":
-            continue
-        
+    for game_name in tqdm(games_to_process, desc="Processing games"):
         game_path = current_game_data_folder / game_name
-        if not os.path.isdir(game_path):
+        if not game_path.is_dir():
             continue
 
         try:
-            game_data = process_standard_game_inputs(game_data_folder=game_path, 
-                                                     selected_game=game_name)
-            data = make_conversation_data(overview=game_data["overview"], 
+            game_data = process_standard_game_inputs(game_path)
+            overview_df = game_data["overview"]
+            country_to_model = get_country_to_model_mapping(overview_df, game_data["all_responses"])
+            data = make_conversation_data(country_to_model=country_to_model,
                                           lmvs_data=game_data["lmvs_data"])
             output_path = analysis_folder / f"{game_name}_conversations_data.csv"
             data.to_csv(output_path, index=False)
+        except FileNotFoundError as e:
+            print(f"Could not process {game_name}. Missing file: {e.filename}")
         except Exception as e:
             print(f"An unexpected error occurred while processing {game_name}: {e}")
-            print(f"Skipping {game_name}.")
\ No newline at end of file
+            print(f"Skipping {game_name}.")
+            traceback.print_exc()
\ No newline at end of file
diff --git a/analysis/p3_make_phase_data.py b/analysis/p3_make_phase_data.py
index 6eeb301..a978c96 100644
--- a/analysis/p3_make_phase_data.py
+++ b/analysis/p3_make_phase_data.py
@@ -74,24 +74,39 @@ etc (a lot of possible combinations here)
 'invalid_order_count', (number of invalid orders given)
 'no_moves_extracted_flag', (flag for if no moves were extracted)
 'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted )
+'goal', (the power's goals for the phase, joined into one string separated by blank lines)
+'diary', (diary entry for the phase)
 """
 
 import pandas as pd
 import numpy as np
-import os 
+
 import json 
 import copy
 import re 
 import argparse
 from pathlib import Path
-from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES
+from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
+from analysis.schemas import COUNTRIES
 from tqdm import tqdm
+import traceback 
 
-def make_phase_data(overview : pd.DataFrame, 
+def make_phase_data(country_to_model : pd.Series, 
                     lmvs_data : pd.DataFrame, 
                     conversations_data : pd.DataFrame, 
                     orders_data : pd.DataFrame) -> pd.DataFrame:
-    country_to_model = overview.loc[1, COUNTRIES]
+    """
+    takes country-to-model mapping, game state (lmvs_data), conversations, and orders, and returns a dataframe with one row per (power, phase). 
+    
+    Args:
+        country_to_model: mapping of country to model
+        lmvs_data: parsed game log (from lmvsgame.json); accessed as lmvs_data["phases"], each phase a dict with a "name" key
+        conversations_data: dataframe of conversations
+        orders_data: dataframe of orders
+        
+    Returns:
+        dataframe with one row per (power, phase) containing phase-level features, convos, relationships, and orders info. 
+    """
 
     longform_conversations_complete = []
     for c in COUNTRIES: 
@@ -109,12 +124,27 @@ def make_phase_data(overview : pd.DataFrame,
 
     ############ Relationships #############
     agent_relationship_matrix_over_time = {}
-    state_list = {}
     for phase in lmvs_data["phases"]:
         agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
-
     longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
-
+    
+    if longform_relationships.empty:
+        # Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
+        agent_relationship_matrix_over_time = {}
+        for phase in lmvs_data["phases"]:
+            agent_state = phase.get("state_agents", {}) 
+            country_relationships = {}
+            for c in COUNTRIES:
+                country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
+            agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
+        longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
+   
+   
+    longform_relationships.columns = longform_relationships.columns.str.lower()
+    longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
+        'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
+        'russia', 'turkey']].fillna("Self") 
+    longform_relationships = longform_relationships.add_prefix("relationship_")
 
     ########### ORDERS DATA ###########
     # adding results to lmvs
@@ -200,7 +230,7 @@ def make_phase_data(overview : pd.DataFrame,
     # lost a supply center
 
 
-    # territories held, territories moved to? 
+    # territories held, territories moved to
 
     orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"), 
                                 immediate_outcomes.unstack().add_prefix("count_got_"),
@@ -240,10 +270,33 @@ def make_phase_data(overview : pd.DataFrame,
         state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders"))
         state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1)
             
+    # goals and diaries        
+    goals_over_time = {}
+    diary_over_time = {}
+    for phase in lmvs_data["phases"]:
+        agent_state = phase.get("state_agents", {}) 
+        if agent_state: # Not all versions have this
+            country_goals = {}
+            country_diary = {}
+            for c in COUNTRIES:
+                country_goals[c] = "\n\n".join(agent_state.get(c, {}).get("goals", {}))
+                country_diary[c] = "\n\n".join(agent_state.get(c, {}).get("full_private_diary", []))
+            goals_over_time[phase["name"]] = pd.Series(country_goals)
+            diary_over_time[phase["name"]] = pd.Series(country_diary)
+
     state_list = pd.concat(state_list, axis=0)
     state_list.index.names = ["phase", "agent"]
+    if goals_over_time:
+        goals_over_time = pd.DataFrame(goals_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"goal"}).set_index(["phase", "agent"])
+        state_list = pd.concat([state_list, goals_over_time], axis=1)
+    if diary_over_time:
+        diary_over_time = pd.DataFrame(diary_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"diary"}).set_index(["phase", "agent"])
+        state_list = pd.concat([state_list, diary_over_time], axis=1)
+        
+    longform_relationships = longform_relationships.set_index(["relationship_phase", "relationship_agent"])
+    longform_relationships.index.names = ["phase", "agent"]
     full_phase_data = pd.merge(state_list, 
-                            longform_relationships.set_index(["phase", "agent"]).add_prefix("relationship_to_").fillna("Self"),
+                            longform_relationships,
                             left_index=True, right_index=True).reset_index()
     full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x))
     full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x))
@@ -262,7 +315,7 @@ def make_phase_data(overview : pd.DataFrame,
                                                                             "influence_count"]].diff()
 
     full_phase_data = pd.merge(full_phase_data, longform_conversations_complete, 
-                               left_on=["phase", "agent"], right_on=["phase", "power"]).drop(columns=["agent"])
+                               left_on=["phase", "agent"], right_on=["phase", "power"])
     full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"],
                                right_index=True)
     full_phase_data["model"] = full_phase_data["power"].map(country_to_model)
@@ -301,32 +354,39 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     current_game_data_folder = Path(args.game_data_folder)
-    analysis_folder = args.analysis_folder
-    output_folder = Path(analysis_folder) / "phase_data"
+    analysis_folder = Path(args.analysis_folder)
+    output_folder = analysis_folder / "phase_data"
 
-    if not os.path.exists(output_folder):
+    if not output_folder.exists():
         print(f"Output folder {output_folder} not found, creating it.")
-        os.makedirs(output_folder)
+        output_folder.mkdir(parents=True, exist_ok=True)
 
     games_to_process = args.selected_game
     if not games_to_process:
-        games_to_process = os.listdir(current_game_data_folder)
+        games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
 
     for game_name in tqdm(games_to_process):
         if game_name == ".DS_Store":
             continue
         
         game_path = current_game_data_folder / game_name
-        if not os.path.isdir(game_path):
+        if not game_path.is_dir():
             continue
         
-        #try:
-        game_data = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
-        orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
-        conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
-        data = make_phase_data(overview=game_data["overview"], 
-                               lmvs_data=game_data["lmvs_data"], 
-                               conversations_data=conversations_data, 
-                               orders_data=orders_data)
-        output_path = output_folder / f"{game_name}_phase_data.csv"
-        data.to_csv(output_path, index=False)
\ No newline at end of file
+        try:
+            game_data = process_standard_game_inputs(game_path)
+            orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
+            conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
+            country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
+            data = make_phase_data(country_to_model=country_to_model, 
+                                   lmvs_data=game_data["lmvs_data"], 
+                                   conversations_data=conversations_data, 
+                                   orders_data=orders_data)
+            output_path = output_folder / f"{game_name}_phase_data.csv"
+            data.to_csv(output_path, index=False)
+        except FileNotFoundError as e:
+            print(f"Could not process {game_name}. Missing file: {e.filename}")
+        except Exception as e:
+            print(f"An unexpected error occurred while processing {game_name}: {e}")
+            print(f"Skipping {game_name}.")
+            traceback.print_exc()
\ No newline at end of file
diff --git a/analysis/readme.md b/analysis/readme.md
new file mode 100644
index 0000000..1b3bbb3
--- /dev/null
+++ b/analysis/readme.md
@@ -0,0 +1,143 @@
+# Analysis Pipeline
+
+This folder contains the data processing pipeline for converting raw diplomacy game logs into structured analysis datasets.
+
+## Overview
+
+The module contains pipelines that transform raw game log data (stored as JSON/CSV files) into four analytical datasets:
+
+1. **Orders Data** - one row per order given by each power in each phase
+2. **Conversations Data** - one row per conversation between two powers in each phase  
+3. **Phase Data** - one row per power per phase with aggregated state and action summaries
+4. **Game Data** - Summary of overall game features
+
+## Main entry point
+
+### `make_all_analysis_data.py` - Primary orchestrator
+**main use case**: process all games in a data folder, create corresponding orders, conversations, and phase datasets. Supports batch and individual processing.
+
+```bash
+# process all games in a folder
+python analysis/make_all_analysis_data.py \
+  --game_data_folder "/path/to/Game Data" \
+  --output_folder "/path/to/Game Data - Analysis"
+
+# process specific games
+python analysis/make_all_analysis_data.py \
+  --selected_game game1 game2 \
+  --game_data_folder "/path/to/Game Data" \
+  --output_folder "/path/to/Game Data - Analysis"
+```
+
+This script runs the three p1, p2 and p3 analysis scripts in sequence and saves outputs to organized subfolders. 
+
+### Individual analysis scripts
+
+#### `p1_make_longform_orders_data.py`
+**what it does**: creates detailed order-level data with one row per order given
+**key outputs**: 
+- order classification (move, support, hold, etc.)
+- unit locations and destinations
+- support relationships and outcomes
+- relationship matrices between powers
+- llm reasoning for order generation
+
+```bash
+python analysis/p1_make_longform_orders_data.py \
+  --game_data_folder "/path/to/Game Data" \
+  --analysis_folder "/path/to/output"
+```
+
+#### `p2_make_convo_data.py` 
+**what it does**: extracts conversation data between all pairs of powers
+**key outputs**:
+- message counts and streaks per party
+- conversation transcripts
+- relationship context for each conversation
+
+```bash
+python analysis/p2_make_convo_data.py \
+  --game_data_folder "/path/to/Game Data" \
+  --analysis_folder "/path/to/output"
+```
+
+#### `p3_make_phase_data.py`
+**what it does**: creates power-phase level summaries combining state, actions, and conversations
+**key outputs**:
+- current state (units, centers, influence counts)
+- action summaries (command counts, outcomes)
+- conversation transcripts with each power
+- change metrics between phases
+- llm reasoning and diary entries
+
+```bash
+python analysis/p3_make_phase_data.py \
+  --game_data_folder "/path/to/Game Data" \
+  --analysis_folder "/path/to/output"
+```
+
+#### `statistical_game_analysis.py`
+**what it does**: comprehensive statistical analysis of game results and llm performance
+**key outputs**:
+- game-level aggregated metrics and features
+- response success/failure rates by type
+- relationship dynamics and negotiation patterns
+- phase-level analysis with response-type granularity
+- comprehensive failure tracking and validation
+
+```bash
+# analyze single game folder
+python analysis/statistical_game_analysis.py /path/to/game_folder
+
+# batch analyze multiple games
+python analysis/statistical_game_analysis.py /path/to/parent_folder --multiple
+
+# specify output directory
+python analysis/statistical_game_analysis.py /path/to/game_folder --output /path/to/output
+```
+
+**note**: this is a separate analysis tool that operates independently of the main pipeline
+
+
+## supporting modules
+
+### `analysis_helpers.py`
+utility functions for:
+- loading game data from folders or zip files
+- mapping countries to their llm models
+- standardizing data loading across scripts
+
+### `schemas.py` 
+constants and regex patterns:
+- supply center lists and coastal variants
+- country names
+- order parsing regexes
+- phase naming patterns
+
+## expected input data structure
+
+each game folder should contain:
+- `overview.jsonl` - maps countries to llm models
+- `lmvsgame.json` - full turn-by-turn game state and actions
+- `llm_responses.csv` - all llm prompts and responses
+
+## output structure
+
+the pipeline creates organized subfolders:
+```
+output_folder/
+├── orders_data/
+│   └── {game_name}_orders_data.csv
+├── conversations_data/
+│   └── {game_name}_conversations_data.csv
+└── phase_data/
+    └── {game_name}_phase_data.csv
+```
+## Use cases
+
+- **game analysis**: examine specific games in detail
+- **model comparison**: compare llm performance across games
+- **relationship analysis**: study diplomatic dynamics
+- **order validation**: check llm order generation success rates
+- **conversation analysis**: study negotiation patterns
+- **phase progression**: track game state evolution
\ No newline at end of file
diff --git a/analysis/requirements-analysis.txt b/analysis/requirements-analysis.txt
new file mode 100644
index 0000000..bfbd58e
--- /dev/null
+++ b/analysis/requirements-analysis.txt
@@ -0,0 +1,19 @@
+# Analysis Pipeline Requirements
+# External packages needed for the analysis scripts
+
+pandas>=1.5.0
+numpy>=1.21.0
+tqdm>=4.64.0
+pydantic>=2.0.0
+
+# Standard library modules (included with Python):
+# - pathlib
+# - typing
+# - json
+# - zipfile
+# - copy
+# - re
+# - argparse
+# - warnings
+# - traceback
+# - itertools
diff --git a/analysis/schemas.py b/analysis/schemas.py
new file mode 100644
index 0000000..440f851
--- /dev/null
+++ b/analysis/schemas.py
@@ -0,0 +1,57 @@
+""" separate module for constants"""
+import re
+__all__ = ["ALL_PROVINCES", "ALL_SUPPLY_CENTERS", "COASTAL_SCs", "COUNTRIES", "PHASE_REGEX", "PLACE_IDENTIFIER", "UNIT_IDENTIFIER", "UNIT_MOVE", "POSSIBLE_COMMANDS", "POSSIBLE_COMMAND_RESULTS", "COUNTRY_RE", "PHASE_RE", "UNIT_RE", "PLACE_RE", "COMMAND_PATTERNS", "ALLOWED_RESULTS", "ALLOWED_COUNTRIES"]
+
+ALL_PROVINCES = ['BRE', 'PAR', 'MAR', 'PIC', 'BUR', 'GAS', 'SPA', 'POR', 'NAF',
+       'TUN', 'LON', 'WAL', 'LVP', 'YOR', 'EDI', 'CLY', 'NWY', 'SWE',
+       'DEN', 'FIN', 'STP', 'STP/NC', 'STP/SC', 'MOS', 'SEV', 'UKR',
+       'WAR', 'LVN', 'BER', 'PRU', 'SIL', 'MUN', 'RUH', 'KIE', 'HOL',
+       'BEL', 'VIE', 'BOH', 'GAL', 'TYR', 'TRI', 'BUD', 'SER', 'RUM',
+       'BUL', 'BUL/EC', 'BUL/SC', 'GRE', 'ALB', 'CON', 'ANK', 'SMY',
+       'ARM', 'SYR', 'VEN', 'PIE', 'TUS', 'ROM', 'NAP', 'APU', 'NTH',
+       'ENG', 'IRI', 'MAO', 'WES', 'LYO', 'TYS', 'ION', 'ADR', 'AEG',
+       'EAS', 'BLA', 'BAL', 'BOT', 'SKA', 'BAR', 'NWG', 'NAO']
+
+ALL_SUPPLY_CENTERS = [  # the 34 supply centers of the standard Diplomacy map (base province codes, no coasts)
+    "ANK", "BEL", "BER", "BRE", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
+    "HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "NWY", "PAR", "POR",
+    "ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
+    "VEN", "VIE", "WAR",  # NOTE: ARM (Armenia) is a province but not a supply center, so it is not listed
+    "SPA", "STP",  # multi-coast provinces; their coast-qualified variants live in COASTAL_SCs
+]
+
+COASTAL_SCs = ["SPA/SC", "SPA/NC",
+    "STP/SC", "STP/NC", 'BUL/EC',
+       'BUL/SC',]
+
+COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
+
+PHASE_REGEX = r"^[A-Z]\d{4}[MRA]$"
+
+PLACE_IDENTIFIER = r"[A-Z]{3}(?:/[A-Z]{2})?"
+PLACE_CAPTURING_REGEX = r"([A-Z]{3})"
+UNIT_IDENTIFIER = rf"[AF] {PLACE_IDENTIFIER}"
+UNIT_MOVE = rf"{UNIT_IDENTIFIER} . {PLACE_IDENTIFIER}"
+
+POSSIBLE_COMMANDS = {
+    "Move": f"^"+UNIT_MOVE, # distinguishing this from support
+    "Support Move": f"{UNIT_IDENTIFIER} S {UNIT_MOVE}",
+    "Support Hold": fr"{UNIT_IDENTIFIER} S {UNIT_IDENTIFIER}(?!\s+[.\-]\s+{PLACE_IDENTIFIER})",
+    "Convoy": f"F {PLACE_IDENTIFIER} C {UNIT_MOVE}", # No convoys in here? 
+    "Hold": f"{UNIT_IDENTIFIER} H",
+    "Build": f"{UNIT_IDENTIFIER} B",
+    "Disband": f"{UNIT_IDENTIFIER} D",
+    "Retreat": f"{UNIT_IDENTIFIER} R",
+}
+
+POSSIBLE_COMMAND_RESULTS = [
+"void", "bounce", "cut", "dislodged", "disband", "no convoy"]
+
+COUNTRY_RE = re.compile(r"^[A-Z][A-Z]+$")
+PHASE_RE   = re.compile(PHASE_REGEX)
+UNIT_RE    = re.compile(rf"^(?P<ut>A|F) (?P<ter>[A-Z]{{3}}(?:/(?:NC|SC|EC|WC))?)$")  # allow coasts
+PLACE_RE   = re.compile(rf"^{PLACE_IDENTIFIER}$")
+
+COMMAND_PATTERNS = [(name, re.compile(p)) for name, p in POSSIBLE_COMMANDS.items()]
+ALLOWED_RESULTS  = set(POSSIBLE_COMMAND_RESULTS)
+ALLOWED_COUNTRIES = set(COUNTRIES)
\ No newline at end of file
diff --git a/analysis/validation.py b/analysis/validation.py
new file mode 100644
index 0000000..8c3edfb
--- /dev/null
+++ b/analysis/validation.py
@@ -0,0 +1,318 @@
+"""
+LMVS JSON Schema Validator
+==========================
+
+This module defines a Pydantic v2 schema for validating Diplomacy game logs
+exported in the LMVS (Language Model vs. State) format.
+
+Friendly overview
+-----------------
+An LMVS file should contain a top-level object with metadata (`id`, `map`,
+`rules`) and a list of **PhaseData** objects under the key `"phases"`.
+
+Each PhaseData object has the following important parts:
+
+* `"name"`: a string like "S1901M" or "F1903A" that encodes season, year, and
+  sub-phase. The format is '^[A-Z]\\d{4}[MRA]$' or the word 'COMPLETED'.
+
+* `"state"`: a dictionary describing the game board at this phase, with keys:
+  
+  - `"units"`: {country → [list of unit identifiers]}.
+    Each unit identifier looks like `"A BUD"` or `"F STP/SC"`.  
+    This says which units each power currently controls and where they are.
+
+  - `"centers"`: {country → [list of supply center locations]}.
+    Shows which supply centers each power owns.
+
+  - `"influence"`: {country → [list of provinces]}.
+    Records which provinces each power currently controls.
+
+  - `"homes"`: {country → [list of home supply centers]}.
+    A country's build home centers.
+
+  - `"retreats"`: {country → {unit → [list of possible retreat provinces]}}.
+    Units that must retreat and their allowed destinations.
+
+  - `"civil_disorder"`: {country → integer flag}.
+    Records whether a country is in civil disorder.
+
+  - `"builds"`: {country → {count: int, homes: [list of places]}}.
+    Tracks build counts and home sites during adjustment phases.
+
+* `"orders"`: {country → [list of orders]}.
+  Each order string must follow one of the canonical Diplomacy order formats,
+  such as Move, Hold, Support, Convoy, Build, Disband, or Retreat.
+  For example: `"A BUD - SER"`, `"F LON S F EDI - NTH"`.
+
+* `"results"`: {unit identifier → [list of result codes]}.
+  Each result code is one of: `"void"`, `"bounce"`, `"cut"`, `"dislodged"`,
+  `"disband"`, or `"no convoy"`.  
+  These describe how the order for that unit resolved.
+
+* `"messages"` (optional): a list of dictionaries, each with:
+  - `"sender"`: a valid country
+  - `"recipient"`: a valid country
+  - `"phase"`: phase code like "S1901M"
+  - `"message"`: the text of the press message
+
+Validation rules
+----------------
+The schema enforces:
+- Country names must be one of: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY.
+- Phase codes must match the required regex format.
+- Units must be of the form "A XXX" or "F XXX[/COAST]".
+- Orders must match one of the defined command regexes.
+- Result codes must be from the allowed list.
+- Orders must reference units that exist in the corresponding `state.units`.
+
+Usage
+-----
+To use, call:
+
+    from analysis.validation import LMVSGame
+    import json
+
+    data = json.load(open("lmvs.json"))
+    game = LMVSGame.model_validate(data)
+
+Any structural or semantic mismatches will raise a pydantic `ValidationError`.
+
+This module can be extended with stricter checks by toggling options such as
+strict territory membership, strict supply center membership, or coast handling.
+"""
+
+from __future__ import annotations
+import re
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator, ConfigDict
+from pydantic_core.core_schema import ValidationInfo
+from analysis.schemas import ALL_PROVINCES, COASTAL_SCs, COUNTRY_RE, PHASE_RE, UNIT_RE, PLACE_RE, COMMAND_PATTERNS, ALLOWED_RESULTS, ALLOWED_COUNTRIES
+
+
+# set form of ALL_PROVINCES, consulted by _validate_place when strict_territories is on. NOTE(review): which coast spellings it holds depends on ALL_PROVINCES -- confirm
+STRICT_TERRITORY = set(ALL_PROVINCES)
+
+@dataclass(frozen=True)
+class ValidationConfig:  # immutable option bundle; validators look it up as context["cfg"]
+    strict_countries: bool = True     # NOTE(review): not read anywhere in this module
+    strict_territories: bool = False  # set True to enforce membership in STRICT_TERRITORY for places
+    strict_sc: bool = False           # meant to enforce centers ⊆ ALL_SUPPLY_CENTERS(+coasts); NOTE(review): not read in this module yet
+
+# -------------------- validators --------------------
+
+def _validate_country(c: str) -> str:  # returns c unchanged, or raises ValueError
+    if not COUNTRY_RE.match(c):  # shape check first, so malformed names get a distinct message
+        raise ValueError(f"bad country name: {c!r}")
+    if c not in ALLOWED_COUNTRIES:  # well-formed but not one of the known powers
+        raise ValueError(f"unknown country: {c!r}")
+    return c
+
+def _validate_place(code: str, info: ValidationInfo, allow_coast: bool = True) -> str:
+    if not PLACE_RE.match(code) and code != "WAIVE":  # "WAIVE" is exempt from the format check only
+        raise ValueError(f"bad place code: {code!r}")
+    cfg: ValidationConfig = (info.context or {}).get("cfg", ValidationConfig())  # type: ignore
+    if cfg.strict_territories:  # membership is checked only in strict mode; otherwise the format check is all
+        if allow_coast:  # compare the code exactly as written, coast suffix included
+            if code not in STRICT_TERRITORY:
+                raise ValueError(f"unknown territory: {code!r}")
+        else:  # compare only the part before "/", so any coast suffix is tolerated
+            base = code.split("/")[0]
+            if base not in STRICT_TERRITORY:
+                raise ValueError(f"unknown base territory: {code!r}")
+    return code  # returned unchanged; every failure above is a ValueError
+
+def _validate_unit(u: str, info: ValidationInfo) -> str:  # raises ValueError on a malformed unit string
+    # handle dislodged units (prefixed with *)
+    if u.startswith("*"):
+        u = u[1:]  # remove the * prefix for validation
+    
+    m = UNIT_RE.match(u)
+    if not m:
+        raise ValueError(f"bad unit: {u!r} (expected 'A XXX' or 'F XXX[/COAST]')")
+    _validate_place(m.group("ter"), info, allow_coast=True)  # territory is checked with its coast suffix intact
+    return u  # note: the returned string no longer carries a leading "*"
+
+def _order_kind(order: str) -> Optional[str]:  # name of the first matching command kind, or None
+    # handle build orders (B suffix); this shortcut accepts anything ending in " B" without consulting the regexes
+    if order.endswith(" B"):
+        return "Build"
+    
+    for name, pat in COMMAND_PATTERNS:  # first match wins, in COMMAND_PATTERNS order
+        if pat.match(order):  # .match anchors at the start only; trailing text is not rejected
+            return name
+    return None
+
+def _unit_head(order: str) -> Optional[str]:
+    # returns the leading "A XXX" or "F XXX/COAST" if present
+    m = UNIT_RE.match(order.split(" ", 2)[0] + " " + order.split(" ", 2)[1]) if len(order.split()) >= 2 else None
+    if m:
+        return m.group(0)
+    # fallback: looser search at start
+    m = UNIT_RE.match(order)
+    return m.group(0) if m else None
+
+def _base_ter(u: str) -> str:
+    # "A STP/NC" -> "STP"; "F BUD" -> "BUD"; a string without a space is treated as a bare place ("STP/NC" -> "STP")
+    ter = u.split(" ", 1)[1] if " " in u else u  # drop everything up to the first space (the unit letter)
+    return ter.split("/")[0]  # drop the coast suffix, if any
+
+def _unit_type(u: str) -> str:  # text before the first space: "A BUD" -> "A"; a "*"-prefixed unit yields "*A"
+    return u.split(" ", 1)[0]
+
+# -------------------- models --------------------
+
+class PhaseState(BaseModel):  # the "state" object of a phase; only the fields below are modelled
+    name: str
+    phase: str
+    game_id: str
+
+    units: Dict[str, List[str]]  # country -> unit strings such as "A BUD" or "F STP/SC"
+    centers: Dict[str, List[str]]  # country -> place codes
+    influence: Dict[str, List[str]]  # country -> place codes
+    
+    model_config = ConfigDict(extra="ignore")  # undeclared keys in the input are ignored rather than rejected
+
+    @field_validator("name","phase")
+    @classmethod
+    def _phase_format(cls, v: str, info: ValidationInfo) -> str:  # both fields must be a phase code or "COMPLETED"
+        if not PHASE_RE.match(v) and v != "COMPLETED":
+            raise ValueError(f"bad phase: {v!r}")
+        return v
+
+    # these should be dictionaries with country names as keys
+    @field_validator("units","centers","influence", mode="after")
+    @classmethod
+    def _country_keys_ok(cls, mapping: Dict[str, Any], info: ValidationInfo) -> Dict[str, Any]:
+        for c in mapping.keys():
+            _validate_country(c)  # raises ValueError on a malformed or unknown country
+        return mapping
+
+    # these should be lists of unit strings
+    @field_validator("units", mode="after")
+    @classmethod
+    def _units_ok(cls, u: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
+        for c, lst in u.items():
+            if not isinstance(lst, list):
+                raise ValueError(f"units[{c}] must be a list")
+            for unit in lst:
+                _validate_unit(unit, info)  # return value unused, so a "*" prefix stays in the stored data
+        return u
+
+    # these should be lists of place strings
+    @field_validator("centers","influence", mode="after")
+    @classmethod
+    def _place_lists_ok(cls, d: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
+        for c, lst in d.items():
+            if not isinstance(lst, list):
+                raise ValueError(f"{c} values must be a list")
+            for t in lst:
+                _validate_place(t, info, allow_coast=False)  # in strict mode only the part before "/" is looked up
+        return d
+
+class Phase(BaseModel):  # one entry of the top-level "phases" list
+    name: str
+    state: PhaseState
+    orders: Dict[str, Optional[List[str]]]  # allow None values; _orders_ok replaces them with []
+    results: Dict[str, List[str]]  # unit string (or "WAIVE") -> result codes
+
+    model_config = ConfigDict(extra="ignore")  # undeclared keys in the input are ignored rather than rejected
+
+    @field_validator("name")
+    @classmethod
+    def _phase_format(cls, v: str, info: ValidationInfo) -> str:  # same rule as PhaseState._phase_format
+        if not PHASE_RE.match(v) and v != "COMPLETED":
+            raise ValueError(f"bad phase: {v!r}")
+        return v
+
+    @field_validator("orders", mode="after")
+    @classmethod
+    def _orders_ok(cls, orders: Dict[str, Optional[List[str]]], info: ValidationInfo) -> Dict[str, List[str]]:
+        # handle null orders by converting to empty lists; the returned dict replaces the field value
+        cleaned_orders = {}
+        for c, lst in orders.items():
+            _validate_country(c)
+            if lst is None or lst == "null":
+                cleaned_orders[c] = []
+            elif not isinstance(lst, list):
+                raise ValueError(f"orders[{c}] must be a list")
+            else:
+                cleaned_orders[c] = lst
+                for o in lst:
+                    kind = _order_kind(o)  # only checked for truthiness; the kind itself is not stored
+                    if not kind:
+                        raise ValueError(f"order doesn't match any known command: {o!r}")
+        return cleaned_orders
+
+    @field_validator("results", mode="after")
+    @classmethod
+    def _results_ok(cls, res: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
+        for unit, lst in res.items():
+            # skip unit validation for special unit names
+            if unit != "WAIVE":
+                _validate_unit(unit, info)
+            if not isinstance(lst, list):
+                raise ValueError(f"results[{unit}] must be a list")
+            for r in lst:
+                if r == "":
+                    pass  # allow empty result codes
+                elif r == "WAIVE":
+                    pass  # allow WAIVE as a special result code
+                elif r not in ALLOWED_RESULTS:
+                    raise ValueError(f"illegal result code {r!r} for {unit}")
+        return res
+
+    @model_validator(mode="after")
+    def _orders_correspond_to_units(self) -> "Phase":
+        # derive {country -> set(units)} from state
+        country_units = {c: set(v) for c, v in self.state.units.items()}
+
+        # for each order, subject unit must exist for that country (coast-tolerant)
+        for c, lst in self.orders.items():
+            known = country_units.get(c, set())  # a country with orders but no units entry has no known units
+            for o in lst:
+                # skip validation for build orders (they create new units)
+                if o.endswith(' B'):
+                    continue
+                # skip validation for retreat and disband orders (retreats operate on dislodged units)
+                if ' R ' in o or o.endswith(' R') or o.endswith(' D'):
+                    continue
+                    
+                head = _unit_head(o)
+                if not head:
+                    # already caught by regex; skip
+                    continue
+                if head in known:  # exact match, coast included
+                    continue
+                # coast/base tolerant match: same unit letter and same base province, coast ignored
+                base = _base_ter(head)
+                ut   = _unit_type(head)
+                if not any((_unit_type(u) == ut and _base_ter(u) == base) for u in known):
+                    raise ValueError(f"order {o!r} for {c} does not match any unit in state.units[{c}]")
+        return self
+
+class LMVSGame(BaseModel):  # top-level object of an LMVS file; entry point is LMVSGame.model_validate(data)
+    id: str
+    map: str
+    rules: List[str]
+    phases: List[Phase]  # each element is validated by the Phase model
+
+    model_config = ConfigDict(extra="ignore")  # undeclared top-level keys are ignored rather than rejected
+
+
+
+# -------------------- example usage --------------------
+if __name__ == "__main__":  # CLI: validate the single LMVS JSON file named on the command line
+    import sys, json, pathlib
+    if len(sys.argv) != 2:
+        print("usage: python validation.py <path_to_lmvsgame.json>")
+        sys.exit(1)
+    cfg = ValidationConfig(strict_territories=False, strict_sc=False)  # same values as the dataclass defaults
+    p = pathlib.Path(sys.argv[1])
+    data = json.loads(p.read_text())
+    game = LMVSGame.model_validate(  # a pydantic ValidationError propagates uncaught on invalid input
+        data,
+        context={"cfg": cfg},  # read back in _validate_place via info.context
+    )
+
+    print(f"{p} is valid: game_id={game.id} phases={len(game.phases)}")
\ No newline at end of file