mirror of
https://github.com/GoodStartLabs/AI_Diplomacy.git
synced 2026-04-19 12:58:09 +00:00
Merge pull request #61 from peregrinates/ordersdata
patched for compatibility with new data logging structure
This commit is contained in:
commit
287d845d4c
10 changed files with 942 additions and 201 deletions
0
analysis/__init__.py
Normal file
0
analysis/__init__.py
Normal file
|
|
@ -1,63 +1,94 @@
|
||||||
# analysis_constants.py
|
"""Utility functions and constants for loading Diplomacy analysis data.
|
||||||
import os
|
|
||||||
import json
|
This module provides helpers to read game data stored either as a folder on disk
|
||||||
|
or inside a zip archive, plus a few constant lists and regex patterns that are
|
||||||
|
used across the analysis scripts.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
from typing import Dict, Union
|
||||||
|
import json
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
def process_standard_game_inputs(game_data_folder : Path, selected_game : str) -> dict[str, pd.DataFrame]:
|
import pandas as pd
|
||||||
path_to_folder = game_data_folder / selected_game
|
from analysis.schemas import COUNTRIES
|
||||||
|
from analysis.validation import LMVSGame
|
||||||
|
|
||||||
assert os.path.exists(path_to_folder / "overview.jsonl"), f"Overview file not found in {path_to_folder}"
|
__all__: list[str] = [
|
||||||
overview = pd.read_json(path_to_folder / "overview.jsonl", lines=True)
|
"process_standard_game_inputs",
|
||||||
|
"process_game_inputs_in_zip",
|
||||||
|
"get_country_to_model_mapping",
|
||||||
|
]
|
||||||
|
|
||||||
# get all turn actions from lmvs
|
def process_standard_game_inputs(path_to_folder: Path) -> Dict[str, Union[pd.DataFrame, dict]]:
|
||||||
assert os.path.exists(path_to_folder / "lmvsgame.json"), f"LMVS file not found in {path_to_folder}"
|
"""
|
||||||
path_to_file = path_to_folder / "lmvsgame.json"
|
Read in a game folder and return the overview, lmvs_data, and all_responses
|
||||||
|
|
||||||
# Use the standard `json` library to load the file into a Python object
|
Args:
|
||||||
with open(path_to_file, 'r') as f:
|
path_to_folder: Path to the game folder. Must contain overview.jsonl, lmvsgame.json, and llm_responses.csv files.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing overview, lmvs_data, and all_responses
|
||||||
|
"""
|
||||||
|
# ----- check files exist -----
|
||||||
|
overview_path = path_to_folder / "overview.jsonl"
|
||||||
|
lmvsgame_path = path_to_folder / "lmvsgame.json"
|
||||||
|
llm_resp_path = path_to_folder / "llm_responses.csv"
|
||||||
|
|
||||||
|
if not overview_path.exists():
|
||||||
|
raise FileNotFoundError(str(overview_path))
|
||||||
|
if overview_path.stat().st_size == 0:
|
||||||
|
raise FileNotFoundError(f"{overview_path} is empty")
|
||||||
|
|
||||||
|
if not lmvsgame_path.exists():
|
||||||
|
raise FileNotFoundError(str(lmvsgame_path))
|
||||||
|
if lmvsgame_path.stat().st_size == 0:
|
||||||
|
raise FileNotFoundError(f"{lmvsgame_path} is empty")
|
||||||
|
if not llm_resp_path.exists():
|
||||||
|
raise FileNotFoundError(str(llm_resp_path))
|
||||||
|
if llm_resp_path.stat().st_size == 0:
|
||||||
|
raise FileNotFoundError(f"{llm_resp_path} is empty")
|
||||||
|
|
||||||
|
# ----- load data -----
|
||||||
|
overview = pd.read_json(overview_path, lines=True)
|
||||||
|
|
||||||
|
with open(lmvsgame_path, "r") as f:
|
||||||
lmvs_data = json.load(f)
|
lmvs_data = json.load(f)
|
||||||
|
# validate the LMVS data format
|
||||||
|
LMVSGame.model_validate(
|
||||||
|
lmvs_data,
|
||||||
|
)
|
||||||
|
|
||||||
assert os.path.exists(path_to_folder / "llm_responses.csv"), f"LLM responses file not found in {path_to_folder}"
|
all_responses = pd.read_csv(llm_resp_path)
|
||||||
all_responses = pd.read_csv(path_to_folder / "llm_responses.csv")
|
expected_columns = ['model', 'power', 'phase', 'response_type', 'raw_input', 'raw_response',
|
||||||
|
'success']
|
||||||
|
missing_columns = [col for col in expected_columns if col not in all_responses.columns]
|
||||||
|
assert len(missing_columns) == 0, f"Missing required columns in CSV: {missing_columns}"
|
||||||
return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses}
|
return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses}
|
||||||
|
|
||||||
def process_game_in_zip(zip_path: Path, selected_game: str) -> dict[str, pd.DataFrame]:
|
def get_country_to_model_mapping(overview_df : pd.DataFrame, llm_responses_df : pd.DataFrame) -> pd.Series:
|
||||||
|
""" Get a country:model map of which country was played by which model, different in different versions of data"""
|
||||||
|
country_to_model = overview_df.loc[1].reindex(COUNTRIES)
|
||||||
|
if pd.isnull(country_to_model).any():
|
||||||
|
if llm_responses_df is not None:
|
||||||
|
country_to_model = llm_responses_df.set_index("power")["model"].reindex(COUNTRIES)
|
||||||
|
return country_to_model
|
||||||
|
|
||||||
|
def process_game_inputs_in_zip(zip_path: Path, selected_game: str) -> Dict[str, Union[pd.DataFrame, dict]]:
|
||||||
|
"""
|
||||||
|
Read in a game stored inside a zip archive and return the overview, lmvs_data, and all_responses
|
||||||
|
|
||||||
|
Args:
|
||||||
|
zip_path: Path to the zip file
|
||||||
|
selected_game: Name of the game to extract
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing overview, lmvs_data, and all_responses
|
||||||
|
"""
|
||||||
zip_name = zip_path.stem # Gets filename without extension
|
zip_name = zip_path.stem # Gets filename without extension
|
||||||
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
||||||
overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True)
|
overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True)
|
||||||
lmvs_data = json.load(zip_ref.open(f"{zip_name}/{selected_game}/lmvsgame.json"))
|
lmvs_data = json.load(zip_ref.open(f"{zip_name}/{selected_game}/lmvsgame.json"))
|
||||||
all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv"))
|
all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv"))
|
||||||
return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses}
|
return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses}
|
||||||
|
|
||||||
supply_centers = [
|
|
||||||
"ANK", "ARM", "BEL", "BER", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
|
|
||||||
"HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "PAR", "POR",
|
|
||||||
"ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
|
|
||||||
"VEN", "VIE", "WAR",
|
|
||||||
"SPA", "STP", "BUL" # coastal provinces
|
|
||||||
]
|
|
||||||
|
|
||||||
coastal_scs = ["SPA/SC", "SPA/NC",
|
|
||||||
"STP/SC", "STP/NC", 'BUL/EC',
|
|
||||||
'BUL/SC',]
|
|
||||||
|
|
||||||
COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
|
|
||||||
|
|
||||||
place_identifier = "[A-Z]{3}(?:/[A-Z]{2})?"
|
|
||||||
place_capturing_regex = r"([A-Z]{3})"
|
|
||||||
unit_identifier = rf"[AF] {place_identifier}"
|
|
||||||
unit_move = rf"{unit_identifier} . {place_identifier}"
|
|
||||||
|
|
||||||
possible_commands = {
|
|
||||||
"Move": f"^"+unit_move, # distinguishing this from support
|
|
||||||
"Support Move": f"{unit_identifier} S {unit_move}",
|
|
||||||
"Support Hold": fr"{unit_identifier} S {unit_identifier}(?!\s+[.\-]\s+{place_identifier})",
|
|
||||||
"Convoy": f"F {place_identifier} C {unit_move}", # No convoys in here?
|
|
||||||
"Hold": f"{unit_identifier} H",
|
|
||||||
"Build": f"{unit_identifier} B",
|
|
||||||
"Disband": f"{unit_identifier} D",
|
|
||||||
"Retreat": f"{unit_identifier} R",
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ python analysis/make_all_analysis_data.py --selected_game game1 --game_data_fold
|
||||||
python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis"
|
python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis"
|
||||||
"""
|
"""
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
@ -27,49 +27,68 @@ from tqdm import tqdm
|
||||||
from analysis.p1_make_longform_orders_data import make_longform_order_data
|
from analysis.p1_make_longform_orders_data import make_longform_order_data
|
||||||
from analysis.p2_make_convo_data import make_conversation_data
|
from analysis.p2_make_convo_data import make_conversation_data
|
||||||
from analysis.p3_make_phase_data import make_phase_data
|
from analysis.p3_make_phase_data import make_phase_data
|
||||||
from analysis.analysis_helpers import process_standard_game_inputs, process_game_in_zip
|
from analysis.analysis_helpers import get_country_to_model_mapping, process_standard_game_inputs, process_game_inputs_in_zip
|
||||||
|
from analysis.schemas import COUNTRIES
|
||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
def process_game_data_from_folders(game_name : str, game_path : Path) -> Dict[str, pd.DataFrame]:
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_game_data_from_folders(game_path : Path) -> Dict[str, pd.DataFrame]:
|
||||||
"""Reads log data from folder and makes analytic data sets"""
|
"""Reads log data from folder and makes analytic data sets"""
|
||||||
|
|
||||||
game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
|
game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(path_to_folder=game_path)
|
||||||
|
|
||||||
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"],
|
country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
|
||||||
|
|
||||||
|
orders_data : pd.DataFrame = make_longform_order_data(country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"],
|
lmvs_data=game_data["lmvs_data"],
|
||||||
all_responses=game_data["all_responses"])
|
all_responses=game_data["all_responses"])
|
||||||
|
|
||||||
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
|
conversations_data : pd.DataFrame = make_conversation_data(country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"])
|
||||||
|
|
||||||
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"],
|
phase_data : pd.DataFrame = make_phase_data(country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"],
|
lmvs_data=game_data["lmvs_data"],
|
||||||
conversations_data=conversations_data,
|
conversations_data=conversations_data,
|
||||||
orders_data=orders_data)
|
orders_data=orders_data)
|
||||||
|
|
||||||
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
|
return {"orders_data": orders_data,
|
||||||
|
"conversations_data": conversations_data,
|
||||||
|
"phase_data": phase_data}
|
||||||
|
|
||||||
def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]:
|
def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]:
|
||||||
"""Reads log data from zip and makes analytic data sets"""
|
"""Reads log data from zip and makes analytic data sets"""
|
||||||
|
|
||||||
game_data : dict[str, pd.DataFrame] = process_game_in_zip(zip_path=zip_path, selected_game=game_name)
|
game_data: dict[str, pd.DataFrame] = process_game_inputs_in_zip(zip_path=zip_path, selected_game=game_name)
|
||||||
|
|
||||||
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"],
|
country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
|
||||||
|
|
||||||
|
orders_data: pd.DataFrame = make_longform_order_data(
|
||||||
|
country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"],
|
lmvs_data=game_data["lmvs_data"],
|
||||||
all_responses=game_data["all_responses"])
|
all_responses=game_data["all_responses"],
|
||||||
|
)
|
||||||
|
|
||||||
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
|
conversations_data: pd.DataFrame = make_conversation_data(
|
||||||
|
country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"]
|
||||||
|
)
|
||||||
|
|
||||||
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"],
|
phase_data: pd.DataFrame = make_phase_data(
|
||||||
|
country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"],
|
lmvs_data=game_data["lmvs_data"],
|
||||||
conversations_data=conversations_data,
|
conversations_data=conversations_data,
|
||||||
orders_data=orders_data)
|
orders_data=orders_data,
|
||||||
|
)
|
||||||
|
|
||||||
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
|
return {
|
||||||
|
"orders_data": orders_data,
|
||||||
|
"conversations_data": conversations_data,
|
||||||
|
"phase_data": phase_data,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.")
|
parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--selected_game",
|
"--selected_game",
|
||||||
type=str,
|
type=str,
|
||||||
|
|
@ -98,16 +117,18 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
games_to_process = args.selected_game
|
games_to_process = args.selected_game
|
||||||
if not games_to_process:
|
if not games_to_process:
|
||||||
games_to_process = os.listdir(args_dict["game_data_folder"])
|
games_to_process = [p.name for p in args_dict["game_data_folder"].iterdir() if p.is_dir()]
|
||||||
for game in tqdm(games_to_process, desc="Processing games"):
|
for game in tqdm(games_to_process, desc="Processing games"):
|
||||||
game_path = args_dict["game_data_folder"] / game
|
game_path = args_dict["game_data_folder"] / game
|
||||||
if not game_path.is_dir():
|
if not game_path.is_dir():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = process_game_data_from_folders(game_name=game, game_path=args_dict["game_data_folder"])
|
results = process_game_data_from_folders(game_path=game_path)
|
||||||
for data_set, df in results.items():
|
for data_set, df in results.items():
|
||||||
output_path = args_dict["analysis_folder"] / data_set / f"{game}_{data_set}.csv"
|
output_dir = args_dict["analysis_folder"] / data_set
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
output_path = output_dir / f"{game}_{data_set}.csv"
|
||||||
df.to_csv(output_path, index=False)
|
df.to_csv(output_path, index=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing game {game}: {e}")
|
print(f"Error processing game {game}: {e}")
|
||||||
|
|
@ -53,24 +53,33 @@ Return / save
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
|
||||||
import copy
|
import copy
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES, supply_centers, coastal_scs, place_identifier, unit_identifier, unit_move, possible_commands
|
from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
|
||||||
|
from analysis.schemas import COUNTRIES, ALL_SUPPLY_CENTERS, COASTAL_SCs, PLACE_IDENTIFIER, UNIT_IDENTIFIER, UNIT_MOVE, POSSIBLE_COMMANDS
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
import traceback
|
||||||
|
from typing import List, Optional, Dict
|
||||||
# Suppress pandas warnings
|
# Suppress pandas warnings
|
||||||
warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings')
|
warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings')
|
||||||
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
|
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
|
||||||
|
|
||||||
def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
|
def make_longform_order_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
|
||||||
try:
|
"""
|
||||||
country_to_model = overview.loc[1, COUNTRIES] # map countries to models
|
Makes a dataframe with a row for each order given by every power, in every phase (see module docstring for more details).
|
||||||
except:
|
|
||||||
country_to_model = {country: "not specified in overview.jsonl" for country in COUNTRIES}
|
Args:
|
||||||
|
country_to_model: A Series mapping country names to model names
|
||||||
|
lmvs_data: A DataFrame containing the game data
|
||||||
|
all_responses: A DataFrame containing the responses from the LLM responses csv
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A DataFrame with a row for each order given by every power, in every phase
|
||||||
|
"""
|
||||||
################## PART 1 ##################
|
################## PART 1 ##################
|
||||||
# build `turn_actions` dataframe
|
# build `turn_actions` dataframe
|
||||||
|
|
||||||
|
|
@ -120,22 +129,22 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
|
|
||||||
# categorize each order based on regex
|
# categorize each order based on regex
|
||||||
# note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands
|
# note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands
|
||||||
for possible_command, regex in possible_commands.items():
|
for possible_command, regex in POSSIBLE_COMMANDS.items():
|
||||||
all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command
|
all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command
|
||||||
|
|
||||||
all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({place_identifier})")
|
all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({PLACE_IDENTIFIER})")
|
||||||
all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(supply_centers) | all_orders_ever["unit_location"].isin(coastal_scs)
|
all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["unit_location"].isin(COASTAL_SCs)
|
||||||
|
|
||||||
# only MOVE has a destination
|
# only MOVE has a destination
|
||||||
all_orders_ever["destination"] = np.where(
|
all_orders_ever["destination"] = np.where(
|
||||||
all_orders_ever["command"]=="Move",
|
all_orders_ever["command"]=="Move",
|
||||||
all_orders_ever["order"].str.extract(rf"{unit_identifier} . ({place_identifier})", expand=False),
|
all_orders_ever["order"].str.extract(rf"{UNIT_IDENTIFIER} . ({PLACE_IDENTIFIER})", expand=False),
|
||||||
np.nan
|
np.nan
|
||||||
)
|
)
|
||||||
all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(supply_centers) | all_orders_ever["destination"].isin(coastal_scs)
|
all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["destination"].isin(COASTAL_SCs)
|
||||||
|
|
||||||
# Retreat also has a destination
|
# Retreat also has a destination
|
||||||
all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{unit_identifier} R ({place_identifier})", expand=False)
|
all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{UNIT_IDENTIFIER} R ({PLACE_IDENTIFIER})", expand=False)
|
||||||
|
|
||||||
all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)")
|
all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)")
|
||||||
all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS")
|
all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS")
|
||||||
|
|
@ -146,7 +155,17 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
all_orders_ever["model_short_name"] = all_orders_ever["model"].str.split("/").str[-1]
|
all_orders_ever["model_short_name"] = all_orders_ever["model"].str.split("/").str[-1]
|
||||||
all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")"
|
all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")"
|
||||||
|
|
||||||
def check_location_influence(phase_id, location):
|
def check_location_influence(phase_id : str, location : str) -> str:
|
||||||
|
"""
|
||||||
|
Helper - checks who owns a location at a given phase. Uses the `turn_actions` dataframe from overall context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
phase_id: The phase to check
|
||||||
|
location: The location to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The country that owns the location, or "Unowned" if no country owns it
|
||||||
|
"""
|
||||||
# checking who owns a location at `phase_id`
|
# checking who owns a location at `phase_id`
|
||||||
if pd.isnull(location):
|
if pd.isnull(location):
|
||||||
return np.nan
|
return np.nan
|
||||||
|
|
@ -162,22 +181,45 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"],
|
all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"],
|
||||||
row["destination"]), axis=1)
|
row["destination"]), axis=1)
|
||||||
|
|
||||||
def find_supporting_country(unit_command, command_type, phase):
|
def find_supporting_country(unit_command, command_type, phase) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Helper - finds which orders support a given unit and records the supporting powers. Operating on the `all_orders_ever` dataframe.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
unit_command: The unit command to find supporting orders for
|
||||||
|
command_type: The type of command ("Move" or "Hold")
|
||||||
|
phase: The phase to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A string containing a comma-separated list of countries that issued an order to support that unit, or None if no such orders exist
|
||||||
|
"""
|
||||||
if command_type == "Move" or command_type == "Hold": # commands that can be supported
|
if command_type == "Move" or command_type == "Hold": # commands that can be supported
|
||||||
potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) &
|
potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) &
|
||||||
(all_orders_ever["command"].isin(["Support Move", "Support Hold"]))]
|
(all_orders_ever["command"].isin(["Support Move", "Support Hold"]))]
|
||||||
potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)]
|
potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)]
|
||||||
if potential_supports.empty:
|
if potential_supports.empty:
|
||||||
return np.nan
|
return None
|
||||||
else:
|
else:
|
||||||
return ",".join(potential_supports["country"].tolist())
|
return ",".join(potential_supports["country"].tolist())
|
||||||
return np.nan
|
return None
|
||||||
|
|
||||||
all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1)
|
all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1)
|
||||||
all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned")
|
all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned")
|
||||||
all_orders_ever["moving_into_anothers_territory"] = (all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) & (all_orders_ever["destination_affiliation"].notnull()) & (all_orders_ever["destination_affiliation"] != "Unowned")
|
all_orders_ever["moving_into_anothers_territory"] = ((all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) &
|
||||||
|
(all_orders_ever["destination_affiliation"].notnull()) &
|
||||||
|
(all_orders_ever["destination_affiliation"] != "Unowned"))
|
||||||
|
|
||||||
def find_owner_of_unit(unit_location, phase):
|
def find_owner_of_unit(unit_location : str, phase : str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Helper - finds the owner of a unit at a given phase. Operating on the `turn_actions` dataframe from overall context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
unit_location: The location of the unit to find the owner of
|
||||||
|
phase: The phase to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The country that owns the unit, or None if no country owns it
|
||||||
|
"""
|
||||||
if pd.notnull(unit_location):
|
if pd.notnull(unit_location):
|
||||||
unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase]
|
unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase]
|
||||||
unit_status.index = unit_status.index.str.replace("_units", "")
|
unit_status.index = unit_status.index.str.replace("_units", "")
|
||||||
|
|
@ -186,18 +228,30 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
if re.match(f"[AF] {unit_location}", unit):
|
if re.match(f"[AF] {unit_location}", unit):
|
||||||
return country
|
return country
|
||||||
|
|
||||||
# where were they going? what was their destination like?
|
def find_destination_info(destination, phase) -> Optional[Dict[str, Optional[str]]]:
|
||||||
def find_destination_info(destination, phase):
|
"""
|
||||||
|
Helper - finds information about the destination of a unit at a given phase.
|
||||||
|
Operating on the `all_orders_ever` dataframe from overall context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
destination: The location of the unit to find the owner of
|
||||||
|
phase: The phase to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing information about the destination unit, or None if no such unit exists
|
||||||
|
"""
|
||||||
if pd.notnull(destination):
|
if pd.notnull(destination):
|
||||||
country = find_owner_of_unit(destination, phase)
|
country = find_owner_of_unit(destination, phase)
|
||||||
|
# there should only ever be one unit at a given location during a phase
|
||||||
destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) &
|
destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) &
|
||||||
(all_orders_ever["phase"] == phase) &
|
(all_orders_ever["phase"] == phase) &
|
||||||
(all_orders_ever["unit_location"] == destination)]
|
(all_orders_ever["unit_location"] == destination)]
|
||||||
if not destination_unit_orders.empty:
|
if not destination_unit_orders.empty:
|
||||||
|
destination_unit_orders = destination_unit_orders.iloc[0] # safe conversion to a series
|
||||||
return {"destination_unit_owner": country,
|
return {"destination_unit_owner": country,
|
||||||
"destination_unit_order": destination_unit_orders["command"].squeeze(),
|
"destination_unit_order": destination_unit_orders["command"],
|
||||||
"destination_unit_outcome":destination_unit_orders["immediate_result"].squeeze(),
|
"destination_unit_outcome":destination_unit_orders["immediate_result"],
|
||||||
"destination_unit_supported_by": destination_unit_orders["supported_by"].squeeze()}
|
"destination_unit_supported_by": destination_unit_orders["supported_by"]}
|
||||||
|
|
||||||
destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series)
|
destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series)
|
||||||
destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull()
|
destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull()
|
||||||
|
|
@ -205,30 +259,56 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1)
|
all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1)
|
||||||
|
|
||||||
# if a Support action: who were they supporting? what was their support doing?
|
# if a Support action: who were they supporting? what was their support doing?
|
||||||
def find_support_recipient_info(unit_order, command, phase):
|
def find_support_recipient_info(unit_order, command, phase) -> Optional[Dict[str, Optional[str]]]:
|
||||||
|
"""
|
||||||
|
Helper - finds information about the recipient of a support action at a given phase.
|
||||||
|
Operating on the `all_orders_ever` dataframe from overall context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
unit_order: The order of the unit to find the recipient of support for
|
||||||
|
command: The type of command ("Support Move" or "Support Hold")
|
||||||
|
phase: The phase to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing information about the recipient of support, or None if no such recipient exists
|
||||||
|
"""
|
||||||
if "Support" in command:
|
if "Support" in command:
|
||||||
recipient_location = re.match(rf"{unit_identifier} S [AF] ({place_identifier})", unit_order).group(1)
|
recipient_location = re.match(rf"{UNIT_IDENTIFIER} S [AF] ({PLACE_IDENTIFIER})", unit_order).group(1)
|
||||||
recipient_country = find_owner_of_unit(recipient_location, phase)
|
recipient_country = find_owner_of_unit(recipient_location, phase)
|
||||||
|
# there should only ever be one unit at a given location during a phase
|
||||||
recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) &
|
recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) &
|
||||||
(all_orders_ever["phase"] == phase) &
|
(all_orders_ever["phase"] == phase) &
|
||||||
(all_orders_ever["unit_location"] == recipient_location)]
|
(all_orders_ever["unit_location"] == recipient_location)].iloc[0]
|
||||||
return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"].squeeze(),
|
return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"],
|
||||||
"recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"].squeeze(),
|
"recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"],
|
||||||
"recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"].squeeze(),
|
"recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"],
|
||||||
"recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"].squeeze()}
|
"recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"]}
|
||||||
|
|
||||||
support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"], row["command"], row["phase"]), axis=1).apply(pd.Series)
|
support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"],
|
||||||
|
row["command"],
|
||||||
|
row["phase"]), axis=1).apply(pd.Series)
|
||||||
|
# add support recipient info to all_orders_ever as additional columns
|
||||||
all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1)
|
all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1)
|
||||||
|
|
||||||
# add relationships with other countries
|
# add relationships with other countries
|
||||||
|
# if original v1
|
||||||
agent_relationship_matrix_over_time = {}
|
agent_relationship_matrix_over_time = {}
|
||||||
for phase in lmvs_data["phases"]:
|
for phase in lmvs_data["phases"]:
|
||||||
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
|
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
|
||||||
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
||||||
|
|
||||||
if longform_relationships.empty:
|
if longform_relationships.empty:
|
||||||
print("Warning: no relationship data found in phase data")
|
# Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
|
||||||
else:
|
agent_relationship_matrix_over_time = {}
|
||||||
|
for phase in lmvs_data["phases"]:
|
||||||
|
agent_state = phase.get("state_agents", {})
|
||||||
|
country_relationships = {}
|
||||||
|
for c in COUNTRIES:
|
||||||
|
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
|
||||||
|
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
|
||||||
|
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
||||||
|
|
||||||
|
|
||||||
longform_relationships.columns = longform_relationships.columns.str.lower()
|
longform_relationships.columns = longform_relationships.columns.str.lower()
|
||||||
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
||||||
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
||||||
|
|
@ -245,14 +325,24 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view,
|
all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view,
|
||||||
left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
|
left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
|
||||||
|
|
||||||
# if action was supporting
|
# if action was supporting, add flags
|
||||||
all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"]
|
all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"]
|
||||||
all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull())
|
all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull())
|
||||||
|
|
||||||
def countries_aside_from(a_country):
|
def countries_aside_from(a_country : str) -> List[str]:
|
||||||
return [country for country in all_orders_ever["country"].unique() if country != a_country]
|
return [country for country in all_orders_ever["country"].unique() if country != a_country]
|
||||||
|
|
||||||
def check_country(supporters, country):
|
def check_country(supporters : List[str], country : str) -> bool:
|
||||||
|
"""
|
||||||
|
Helper - checks if a given country is in a list of supporters
|
||||||
|
|
||||||
|
Args:
|
||||||
|
supporters: The list of supporters to check
|
||||||
|
country: The country to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if the country is in the list of supporters, False otherwise
|
||||||
|
"""
|
||||||
if pd.isnull(supporters):
|
if pd.isnull(supporters):
|
||||||
return False
|
return False
|
||||||
for other_countries in countries_aside_from(country):
|
for other_countries in countries_aside_from(country):
|
||||||
|
|
@ -267,7 +357,7 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
|
||||||
|
|
||||||
all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull()
|
all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull()
|
||||||
|
|
||||||
# add number of unit orders ever made
|
# add number of unit orders ever made during this game
|
||||||
unit_order_weight = 1 / all_orders_ever.groupby("country").size()
|
unit_order_weight = 1 / all_orders_ever.groupby("country").size()
|
||||||
all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight)
|
all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight)
|
||||||
|
|
||||||
|
|
@ -317,25 +407,24 @@ if __name__ == "__main__":
|
||||||
current_game_data_folder = Path(args.game_data_folder)
|
current_game_data_folder = Path(args.game_data_folder)
|
||||||
analysis_folder = Path(args.analysis_folder) / "orders_data"
|
analysis_folder = Path(args.analysis_folder) / "orders_data"
|
||||||
|
|
||||||
if not os.path.exists(analysis_folder):
|
if not analysis_folder.exists():
|
||||||
print(f"Output folder {analysis_folder} not found, creating it.")
|
print(f"Output folder {analysis_folder} not found, creating it.")
|
||||||
os.makedirs(analysis_folder)
|
analysis_folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
games_to_process = args.selected_game
|
games_to_process = args.selected_game
|
||||||
if not games_to_process:
|
if not games_to_process:
|
||||||
games_to_process = os.listdir(current_game_data_folder)
|
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
|
||||||
|
|
||||||
for game_name in tqdm(games_to_process, desc="Processing games"):
|
for game_name in tqdm(games_to_process, desc="Processing games"):
|
||||||
if game_name == ".DS_Store":
|
|
||||||
continue
|
|
||||||
|
|
||||||
game_path = current_game_data_folder / game_name
|
game_path = current_game_data_folder / game_name
|
||||||
if not os.path.isdir(game_path):
|
if not game_path.is_dir():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
game_source_data = process_standard_game_inputs(game_path, game_name)
|
game_source_data = process_standard_game_inputs(game_path)
|
||||||
data = make_longform_order_data(overview=game_source_data["overview"],
|
overview_df = game_source_data["overview"]
|
||||||
|
country_to_model = get_country_to_model_mapping(overview_df, game_source_data["all_responses"])
|
||||||
|
data = make_longform_order_data(country_to_model=country_to_model,
|
||||||
lmvs_data=game_source_data["lmvs_data"],
|
lmvs_data=game_source_data["lmvs_data"],
|
||||||
all_responses=game_source_data["all_responses"])
|
all_responses=game_source_data["all_responses"])
|
||||||
output_path = analysis_folder / f"{game_name}_orders_data.csv"
|
output_path = analysis_folder / f"{game_name}_orders_data.csv"
|
||||||
|
|
@ -344,3 +433,4 @@ if __name__ == "__main__":
|
||||||
print(f"Could not process {game_name}. Missing file: {e.filename}")
|
print(f"Could not process {game_name}. Missing file: {e.filename}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An unexpected error occurred while processing {game_name}: {e}")
|
print(f"An unexpected error occurred while processing {game_name}: {e}")
|
||||||
|
traceback.print_exc()
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
Make conversation data from diplomacy game logs.
|
Make conversation data from diplomacy game logs, for convenience in analyzing conversation data alone.
|
||||||
|
|
||||||
Resulting columns:
|
Resulting columns:
|
||||||
['phase',
|
['phase',
|
||||||
|
|
@ -21,14 +21,23 @@ Resulting columns:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import itertools
|
import itertools
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs
|
from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs, get_country_to_model_mapping
|
||||||
|
import traceback
|
||||||
|
|
||||||
def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) -> pd.DataFrame:
|
def make_conversation_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame) -> pd.DataFrame:
|
||||||
country_to_model = overview.loc[1, COUNTRIES]
|
"""
|
||||||
|
Make conversation data from diplomacy game logs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
country_to_model: A Series mapping country names to model names
|
||||||
|
lmvs_data: A DataFrame containing the game data
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A DataFrame containing the conversation data (a row for every conversation between 2 powers, at every phase)
|
||||||
|
"""
|
||||||
COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2))
|
COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2))
|
||||||
|
|
||||||
# relationship data
|
# relationship data
|
||||||
|
|
@ -67,20 +76,12 @@ def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) ->
|
||||||
|
|
||||||
messages_from_sender = (messages_exchanged['sender']==sender).sum()
|
messages_from_sender = (messages_exchanged['sender']==sender).sum()
|
||||||
sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0
|
sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0
|
||||||
messages_from_recipient = (messages_exchanged['recipient']==sender).sum()
|
messages_from_recipient = (messages_exchanged['sender'] == recipient).sum()
|
||||||
recipient_streak = max_consecutive[recipient] if recipient in max_consecutive.index else 0
|
recipient_streak = max_consecutive.get(recipient, 0)
|
||||||
party_1_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) &
|
party_1_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == sender)].reindex(columns=[recipient])
|
||||||
(longform_relationships["agent"]==sender)][recipient]
|
party_1_opinion = party_1_opinion_series.iloc[0,0] if not party_1_opinion_series.empty else ""
|
||||||
if party_1_opinion.empty:
|
party_2_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == recipient)].reindex(columns=[sender])
|
||||||
party_1_opinion = ""
|
party_2_opinion = party_2_opinion_series.iloc[0,0] if not party_2_opinion_series.empty else ""
|
||||||
else:
|
|
||||||
party_1_opinion = party_1_opinion.squeeze()
|
|
||||||
party_2_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) &
|
|
||||||
(longform_relationships["agent"]==recipient)][sender]
|
|
||||||
if party_2_opinion.empty:
|
|
||||||
party_2_opinion = ""
|
|
||||||
else:
|
|
||||||
party_2_opinion = party_2_opinion.squeeze()
|
|
||||||
|
|
||||||
conversation_data = {
|
conversation_data = {
|
||||||
"party_1": sender,
|
"party_1": sender,
|
||||||
|
|
@ -132,29 +133,30 @@ if __name__ == "__main__":
|
||||||
current_game_data_folder = Path(args.game_data_folder)
|
current_game_data_folder = Path(args.game_data_folder)
|
||||||
analysis_folder = Path(args.analysis_folder) / "conversations_data"
|
analysis_folder = Path(args.analysis_folder) / "conversations_data"
|
||||||
|
|
||||||
if not os.path.exists(analysis_folder):
|
if not analysis_folder.exists():
|
||||||
print(f"Output folder {analysis_folder} not found, creating it.")
|
print(f"Output folder {analysis_folder} not found, creating it.")
|
||||||
os.makedirs(analysis_folder)
|
analysis_folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
games_to_process = args.selected_game
|
games_to_process = args.selected_game
|
||||||
if not games_to_process:
|
if not games_to_process:
|
||||||
games_to_process = os.listdir(current_game_data_folder)
|
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
|
||||||
|
|
||||||
for game_name in tqdm(games_to_process):
|
|
||||||
if game_name == ".DS_Store":
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
for game_name in tqdm(games_to_process, desc="Processing games"):
|
||||||
game_path = current_game_data_folder / game_name
|
game_path = current_game_data_folder / game_name
|
||||||
if not os.path.isdir(game_path):
|
if not game_path.is_dir():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
game_data = process_standard_game_inputs(game_data_folder=game_path,
|
game_data = process_standard_game_inputs(game_path)
|
||||||
selected_game=game_name)
|
overview_df = game_data["overview"]
|
||||||
data = make_conversation_data(overview=game_data["overview"],
|
country_to_model = get_country_to_model_mapping(overview_df, game_data["all_responses"])
|
||||||
|
data = make_conversation_data(country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"])
|
lmvs_data=game_data["lmvs_data"])
|
||||||
output_path = analysis_folder / f"{game_name}_conversations_data.csv"
|
output_path = analysis_folder / f"{game_name}_conversations_data.csv"
|
||||||
data.to_csv(output_path, index=False)
|
data.to_csv(output_path, index=False)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print(f"Could not process {game_name}. Missing file: {e.filename}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An unexpected error occurred while processing {game_name}: {e}")
|
print(f"An unexpected error occurred while processing {game_name}: {e}")
|
||||||
print(f"Skipping {game_name}.")
|
print(f"Skipping {game_name}.")
|
||||||
|
traceback.print_exc()
|
||||||
|
|
@ -74,24 +74,39 @@ etc (a lot of possible combinations here)
|
||||||
'invalid_order_count', (number of invalid orders given)
|
'invalid_order_count', (number of invalid orders given)
|
||||||
'no_moves_extracted_flag', (flag for if no moves were extracted)
|
'no_moves_extracted_flag', (flag for if no moves were extracted)
|
||||||
'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted )
|
'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted )
|
||||||
|
'goals', (list of goals for the phase separated by \n\n)
|
||||||
|
'diary', (diary entry for the phase)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
import copy
|
import copy
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES
|
from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping
|
||||||
|
from analysis.schemas import COUNTRIES
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
import traceback
|
||||||
|
|
||||||
def make_phase_data(overview : pd.DataFrame,
|
def make_phase_data(country_to_model : pd.Series,
|
||||||
lmvs_data : pd.DataFrame,
|
lmvs_data : pd.DataFrame,
|
||||||
conversations_data : pd.DataFrame,
|
conversations_data : pd.DataFrame,
|
||||||
orders_data : pd.DataFrame) -> pd.DataFrame:
|
orders_data : pd.DataFrame) -> pd.DataFrame:
|
||||||
country_to_model = overview.loc[1, COUNTRIES]
|
"""
|
||||||
|
Takes the country-to-model mapping, game state (lmvs_data), conversations, and orders, and returns a dataframe with one row per (power, phase).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
country_to_model: mapping of country to model
|
||||||
|
lmvs_data: raw lmvs_data dataframe
|
||||||
|
conversations_data: dataframe of conversations
|
||||||
|
orders_data: dataframe of orders
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dataframe with one row per (power, phase) containing phase-level features, convos, relationships, and orders info.
|
||||||
|
"""
|
||||||
|
|
||||||
longform_conversations_complete = []
|
longform_conversations_complete = []
|
||||||
for c in COUNTRIES:
|
for c in COUNTRIES:
|
||||||
|
|
@ -109,12 +124,27 @@ def make_phase_data(overview : pd.DataFrame,
|
||||||
|
|
||||||
############ Relationships #############
|
############ Relationships #############
|
||||||
agent_relationship_matrix_over_time = {}
|
agent_relationship_matrix_over_time = {}
|
||||||
state_list = {}
|
|
||||||
for phase in lmvs_data["phases"]:
|
for phase in lmvs_data["phases"]:
|
||||||
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
|
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
|
||||||
|
|
||||||
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
||||||
|
|
||||||
|
if longform_relationships.empty:
|
||||||
|
# Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
|
||||||
|
agent_relationship_matrix_over_time = {}
|
||||||
|
for phase in lmvs_data["phases"]:
|
||||||
|
agent_state = phase.get("state_agents", {})
|
||||||
|
country_relationships = {}
|
||||||
|
for c in COUNTRIES:
|
||||||
|
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
|
||||||
|
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
|
||||||
|
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
|
||||||
|
|
||||||
|
|
||||||
|
longform_relationships.columns = longform_relationships.columns.str.lower()
|
||||||
|
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
||||||
|
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
|
||||||
|
'russia', 'turkey']].fillna("Self")
|
||||||
|
longform_relationships = longform_relationships.add_prefix("relationship_")
|
||||||
|
|
||||||
########### ORDERS DATA ###########
|
########### ORDERS DATA ###########
|
||||||
# adding results to lmvs
|
# adding results to lmvs
|
||||||
|
|
@ -200,7 +230,7 @@ def make_phase_data(overview : pd.DataFrame,
|
||||||
# lost a supply center
|
# lost a supply center
|
||||||
|
|
||||||
|
|
||||||
# territories held, territories moved to?
|
# territories held, territories moved to
|
||||||
|
|
||||||
orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"),
|
orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"),
|
||||||
immediate_outcomes.unstack().add_prefix("count_got_"),
|
immediate_outcomes.unstack().add_prefix("count_got_"),
|
||||||
|
|
@ -240,10 +270,33 @@ def make_phase_data(overview : pd.DataFrame,
|
||||||
state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders"))
|
state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders"))
|
||||||
state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1)
|
state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1)
|
||||||
|
|
||||||
|
# goals and diaries
|
||||||
|
goals_over_time = {}
|
||||||
|
diary_over_time = {}
|
||||||
|
for phase in lmvs_data["phases"]:
|
||||||
|
agent_state = phase.get("state_agents", {})
|
||||||
|
if agent_state: # Not all versions have this
|
||||||
|
country_goals = {}
|
||||||
|
country_diary = {}
|
||||||
|
for c in COUNTRIES:
|
||||||
|
country_goals[c] = "\n\n".join(agent_state.get(c, {}).get("goals", {}))
|
||||||
|
country_diary[c] = "\n\n".join(agent_state.get(c, {}).get("full_private_diary", []))
|
||||||
|
goals_over_time[phase["name"]] = pd.Series(country_goals)
|
||||||
|
diary_over_time[phase["name"]] = pd.Series(country_diary)
|
||||||
|
|
||||||
state_list = pd.concat(state_list, axis=0)
|
state_list = pd.concat(state_list, axis=0)
|
||||||
state_list.index.names = ["phase", "agent"]
|
state_list.index.names = ["phase", "agent"]
|
||||||
|
if goals_over_time:
|
||||||
|
goals_over_time = pd.DataFrame(goals_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"goal"}).set_index(["phase", "agent"])
|
||||||
|
state_list = pd.concat([state_list, goals_over_time], axis=1)
|
||||||
|
if diary_over_time:
|
||||||
|
diary_over_time = pd.DataFrame(diary_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"diary"}).set_index(["phase", "agent"])
|
||||||
|
state_list = pd.concat([state_list, diary_over_time], axis=1)
|
||||||
|
|
||||||
|
longform_relationships = longform_relationships.set_index(["relationship_phase", "relationship_agent"])
|
||||||
|
longform_relationships.index.names = ["phase", "agent"]
|
||||||
full_phase_data = pd.merge(state_list,
|
full_phase_data = pd.merge(state_list,
|
||||||
longform_relationships.set_index(["phase", "agent"]).add_prefix("relationship_to_").fillna("Self"),
|
longform_relationships,
|
||||||
left_index=True, right_index=True).reset_index()
|
left_index=True, right_index=True).reset_index()
|
||||||
full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x))
|
full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x))
|
||||||
full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x))
|
full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x))
|
||||||
|
|
@ -262,7 +315,7 @@ def make_phase_data(overview : pd.DataFrame,
|
||||||
"influence_count"]].diff()
|
"influence_count"]].diff()
|
||||||
|
|
||||||
full_phase_data = pd.merge(full_phase_data, longform_conversations_complete,
|
full_phase_data = pd.merge(full_phase_data, longform_conversations_complete,
|
||||||
left_on=["phase", "agent"], right_on=["phase", "power"]).drop(columns=["agent"])
|
left_on=["phase", "agent"], right_on=["phase", "power"])
|
||||||
full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"],
|
full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"],
|
||||||
right_index=True)
|
right_index=True)
|
||||||
full_phase_data["model"] = full_phase_data["power"].map(country_to_model)
|
full_phase_data["model"] = full_phase_data["power"].map(country_to_model)
|
||||||
|
|
@ -301,32 +354,39 @@ if __name__ == "__main__":
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
current_game_data_folder = Path(args.game_data_folder)
|
current_game_data_folder = Path(args.game_data_folder)
|
||||||
analysis_folder = args.analysis_folder
|
analysis_folder = Path(args.analysis_folder)
|
||||||
output_folder = Path(analysis_folder) / "phase_data"
|
output_folder = analysis_folder / "phase_data"
|
||||||
|
|
||||||
if not os.path.exists(output_folder):
|
if not output_folder.exists():
|
||||||
print(f"Output folder {output_folder} not found, creating it.")
|
print(f"Output folder {output_folder} not found, creating it.")
|
||||||
os.makedirs(output_folder)
|
output_folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
games_to_process = args.selected_game
|
games_to_process = args.selected_game
|
||||||
if not games_to_process:
|
if not games_to_process:
|
||||||
games_to_process = os.listdir(current_game_data_folder)
|
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
|
||||||
|
|
||||||
for game_name in tqdm(games_to_process):
|
for game_name in tqdm(games_to_process):
|
||||||
if game_name == ".DS_Store":
|
if game_name == ".DS_Store":
|
||||||
continue
|
continue
|
||||||
|
|
||||||
game_path = current_game_data_folder / game_name
|
game_path = current_game_data_folder / game_name
|
||||||
if not os.path.isdir(game_path):
|
if not game_path.is_dir():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
#try:
|
try:
|
||||||
game_data = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
|
game_data = process_standard_game_inputs(game_path)
|
||||||
orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
|
orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
|
||||||
conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
|
conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
|
||||||
data = make_phase_data(overview=game_data["overview"],
|
country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
|
||||||
|
data = make_phase_data(country_to_model=country_to_model,
|
||||||
lmvs_data=game_data["lmvs_data"],
|
lmvs_data=game_data["lmvs_data"],
|
||||||
conversations_data=conversations_data,
|
conversations_data=conversations_data,
|
||||||
orders_data=orders_data)
|
orders_data=orders_data)
|
||||||
output_path = output_folder / f"{game_name}_phase_data.csv"
|
output_path = output_folder / f"{game_name}_phase_data.csv"
|
||||||
data.to_csv(output_path, index=False)
|
data.to_csv(output_path, index=False)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print(f"Could not process {game_name}. Missing file: {e.filename}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred while processing {game_name}: {e}")
|
||||||
|
print(f"Skipping {game_name}.")
|
||||||
|
traceback.print_exc()
|
||||||
143
analysis/readme.md
Normal file
143
analysis/readme.md
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
# Analysis Pipeline
|
||||||
|
|
||||||
|
This folder contains the data processing pipeline for converting raw diplomacy game logs into structured analysis datasets.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This module contains pipelines that transform raw game log data (stored as json/csv files) into four analytical datasets:
|
||||||
|
|
||||||
|
1. **Orders Data** - one row per order given by each power in each phase
|
||||||
|
2. **Conversations Data** - one row per conversation between two powers in each phase
|
||||||
|
3. **Phase Data** - one row per power per phase with aggregated state and action summaries
|
||||||
|
4. **Game Data** - Summary of overall game features
|
||||||
|
|
||||||
|
## Main entry point
|
||||||
|
|
||||||
|
### `make_all_analysis_data.py` - Primary orchestrator
|
||||||
|
**main use case**: process all games in a data folder, create corresponding orders, conversations, and phase datasets. Supports batch and individual processing.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# process all games in a folder
|
||||||
|
python analysis/make_all_analysis_data.py \
|
||||||
|
--game_data_folder "/path/to/Game Data" \
|
||||||
|
--output_folder "/path/to/Game Data - Analysis"
|
||||||
|
|
||||||
|
# process specific games
|
||||||
|
python analysis/make_all_analysis_data.py \
|
||||||
|
--selected_game game1 game2 \
|
||||||
|
--game_data_folder "/path/to/Game Data" \
|
||||||
|
--output_folder "/path/to/Game Data - Analysis"
|
||||||
|
```
|
||||||
|
|
||||||
|
This script runs the three p1, p2 and p3 analysis scripts in sequence and saves outputs to organized subfolders.
|
||||||
|
|
||||||
|
### Individual analysis scripts
|
||||||
|
|
||||||
|
#### `p1_make_longform_orders_data.py`
|
||||||
|
**what it does**: creates detailed order-level data with one row per order given
|
||||||
|
**key outputs**:
|
||||||
|
- order classification (move, support, hold, etc.)
|
||||||
|
- unit locations and destinations
|
||||||
|
- support relationships and outcomes
|
||||||
|
- relationship matrices between powers
|
||||||
|
- llm reasoning for order generation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python analysis/p1_make_longform_orders_data.py \
|
||||||
|
--game_data_folder "/path/to/Game Data" \
|
||||||
|
--analysis_folder "/path/to/output"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `p2_make_convo_data.py`
|
||||||
|
**what it does**: extracts conversation data between all pairs of powers
|
||||||
|
**key outputs**:
|
||||||
|
- message counts and streaks per party
|
||||||
|
- conversation transcripts
|
||||||
|
- relationship context for each conversation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python analysis/p2_make_convo_data.py \
|
||||||
|
--game_data_folder "/path/to/Game Data" \
|
||||||
|
--analysis_folder "/path/to/output"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `p3_make_phase_data.py`
|
||||||
|
**what it does**: creates power-phase level summaries combining state, actions, and conversations
|
||||||
|
**key outputs**:
|
||||||
|
- current state (units, centers, influence counts)
|
||||||
|
- action summaries (command counts, outcomes)
|
||||||
|
- conversation transcripts with each power
|
||||||
|
- change metrics between phases
|
||||||
|
- llm reasoning and diary entries
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python analysis/p3_make_phase_data.py \
|
||||||
|
--game_data_folder "/path/to/Game Data" \
|
||||||
|
--analysis_folder "/path/to/output"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `statistical_game_analysis.py`
|
||||||
|
**what it does**: comprehensive statistical analysis of game results and llm performance
|
||||||
|
**key outputs**:
|
||||||
|
- game-level aggregated metrics and features
|
||||||
|
- response success/failure rates by type
|
||||||
|
- relationship dynamics and negotiation patterns
|
||||||
|
- phase-level analysis with response-type granularity
|
||||||
|
- comprehensive failure tracking and validation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# analyze single game folder
|
||||||
|
python analysis/statistical_game_analysis.py /path/to/game_folder
|
||||||
|
|
||||||
|
# batch analyze multiple games
|
||||||
|
python analysis/statistical_game_analysis.py /path/to/parent_folder --multiple
|
||||||
|
|
||||||
|
# specify output directory
|
||||||
|
python analysis/statistical_game_analysis.py /path/to/game_folder --output /path/to/output
|
||||||
|
```
|
||||||
|
|
||||||
|
**note**: this is a separate analysis tool that operates independently of the main pipeline
|
||||||
|
|
||||||
|
|
||||||
|
## supporting modules
|
||||||
|
|
||||||
|
### `analysis_helpers.py`
|
||||||
|
utility functions for:
|
||||||
|
- loading game data from folders or zip files
|
||||||
|
- mapping countries to their llm models
|
||||||
|
- standardizing data loading across scripts
|
||||||
|
|
||||||
|
### `schemas.py`
|
||||||
|
constants and regex patterns:
|
||||||
|
- supply center lists and coastal variants
|
||||||
|
- country names
|
||||||
|
- order parsing regexes
|
||||||
|
- phase naming patterns
|
||||||
|
|
||||||
|
## expected input data structure
|
||||||
|
|
||||||
|
each game folder should contain:
|
||||||
|
- `overview.jsonl` - maps countries to llm models
|
||||||
|
- `lmvsgame.json` - full turn-by-turn game state and actions
|
||||||
|
- `llm_responses.csv` - all llm prompts and responses
|
||||||
|
|
||||||
|
## output structure
|
||||||
|
|
||||||
|
the pipeline creates organized subfolders:
|
||||||
|
|
||||||
|
output_folder/
|
||||||
|
├── orders_data/
|
||||||
|
│ └── {game_name}_orders_data.csv
|
||||||
|
├── conversations_data/
|
||||||
|
│ └── {game_name}_conversations_data.csv
|
||||||
|
└── phase_data/
|
||||||
|
    └── {game_name}_phase_data.csv
|
||||||
|
|
||||||
|
## Use cases
|
||||||
|
|
||||||
|
- **game analysis**: examine specific games in detail
|
||||||
|
- **model comparison**: compare llm performance across games
|
||||||
|
- **relationship analysis**: study diplomatic dynamics
|
||||||
|
- **order validation**: check llm order generation success rates
|
||||||
|
- **conversation analysis**: study negotiation patterns
|
||||||
|
- **phase progression**: track game state evolution
|
||||||
19
analysis/requirements-analysis.txt
Normal file
19
analysis/requirements-analysis.txt
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
# Analysis Pipeline Requirements
|
||||||
|
# External packages needed for the analysis scripts
|
||||||
|
|
||||||
|
pandas>=1.5.0
|
||||||
|
numpy>=1.21.0
|
||||||
|
tqdm>=4.64.0
|
||||||
|
pydantic>=2.0.0
|
||||||
|
|
||||||
|
# Standard library modules (included with Python):
|
||||||
|
# - pathlib
|
||||||
|
# - typing
|
||||||
|
# - json
|
||||||
|
# - zipfile
|
||||||
|
# - copy
|
||||||
|
# - re
|
||||||
|
# - argparse
|
||||||
|
# - warnings
|
||||||
|
# - traceback
|
||||||
|
# - itertools
|
||||||
57
analysis/schemas.py
Normal file
57
analysis/schemas.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
""" separate module for constants"""
|
||||||
|
import re

# Public names of this module.  PLACE_CAPTURING_REGEX is defined below and is
# therefore exported alongside the other pattern strings.
__all__ = [
    "ALL_PROVINCES",
    "ALL_SUPPLY_CENTERS",
    "COASTAL_SCs",
    "COUNTRIES",
    "PHASE_REGEX",
    "PLACE_IDENTIFIER",
    "PLACE_CAPTURING_REGEX",
    "UNIT_IDENTIFIER",
    "UNIT_MOVE",
    "POSSIBLE_COMMANDS",
    "POSSIBLE_COMMAND_RESULTS",
    "COUNTRY_RE",
    "PHASE_RE",
    "UNIT_RE",
    "PLACE_RE",
    "COMMAND_PATTERNS",
    "ALLOWED_RESULTS",
    "ALLOWED_COUNTRIES",
]

# Province codes of the standard map, including coast-qualified spellings.
# The original ordering is preserved; entries that were missing are appended
# at the end so existing positions do not shift.
ALL_PROVINCES = [
    'BRE', 'PAR', 'MAR', 'PIC', 'BUR', 'GAS', 'SPA', 'POR', 'NAF',
    'TUN', 'LON', 'WAL', 'LVP', 'YOR', 'EDI', 'CLY', 'NWY', 'SWE',
    'DEN', 'FIN', 'STP', 'STP/NC', 'STP/SC', 'MOS', 'SEV', 'UKR',
    'WAR', 'LVN', 'BER', 'PRU', 'SIL', 'MUN', 'RUH', 'KIE', 'HOL',
    'BEL', 'VIE', 'BOH', 'GAL', 'TYR', 'TRI', 'BUD', 'SER', 'RUM',
    'BUL', 'BUL/EC', 'BUL/SC', 'GRE', 'ALB', 'CON', 'ANK', 'SMY',
    'ARM', 'SYR', 'VEN', 'PIE', 'TUS', 'ROM', 'NAP', 'APU', 'NTH',
    'ENG', 'IRI', 'MAO', 'WES', 'LYO', 'TYS', 'ION', 'ADR', 'AEG',
    'EAS', 'BLA', 'BAL', 'BOT', 'SKA', 'BAR', 'NWG', 'NAO',
    # previously missing: Helgoland Bight and the two coasts of Spain
    'HEL', 'SPA/NC', 'SPA/SC',
]

# The 34 supply centres of the standard map.  Armenia (ARM) is not a supply
# centre and has been removed; Brest (BRE) and Norway (NWY) were missing.
ALL_SUPPLY_CENTERS = [
    "ANK", "BEL", "BER", "BRE", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
    "HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "NWY", "PAR",
    "POR", "ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
    "VEN", "VIE", "WAR",
    "SPA", "STP",  # coastal provinces
]

# Coast-qualified spellings of the multi-coast provinces.
COASTAL_SCs = [
    "SPA/SC", "SPA/NC",
    "STP/SC", "STP/NC",
    'BUL/EC', 'BUL/SC',
]

COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']

# Phase names: one letter, a four-digit year, then M, R or A.
PHASE_REGEX = r"^[A-Z]\d{4}[MRA]$"

# Building blocks for the order patterns below.
PLACE_IDENTIFIER = r"[A-Z]{3}(?:/[A-Z]{2})?"
PLACE_CAPTURING_REGEX = r"([A-Z]{3})"
UNIT_IDENTIFIER = rf"[AF] {PLACE_IDENTIFIER}"
# NOTE(review): the "." between the two places is a regex wildcard, so any
# single character is accepted as the move separator, not only "-".
UNIT_MOVE = rf"{UNIT_IDENTIFIER} . {PLACE_IDENTIFIER}"

# Command name -> regex source.  Insertion order is the order in which
# COMMAND_PATTERNS is built.
POSSIBLE_COMMANDS = {
    "Move": rf"^{UNIT_MOVE}",  # distinguishing this from support
    "Support Move": rf"{UNIT_IDENTIFIER} S {UNIT_MOVE}",
    "Support Hold": rf"{UNIT_IDENTIFIER} S {UNIT_IDENTIFIER}(?!\s+[.\-]\s+{PLACE_IDENTIFIER})",
    "Convoy": rf"F {PLACE_IDENTIFIER} C {UNIT_MOVE}",  # No convoys in here?
    "Hold": rf"{UNIT_IDENTIFIER} H",
    "Build": rf"{UNIT_IDENTIFIER} B",
    "Disband": rf"{UNIT_IDENTIFIER} D",
    "Retreat": rf"{UNIT_IDENTIFIER} R",
}

POSSIBLE_COMMAND_RESULTS = [
    "void", "bounce", "cut", "dislodged", "disband", "no convoy",
]

# Pre-compiled patterns and lookup sets derived from the constants above.
COUNTRY_RE = re.compile(r"^[A-Z][A-Z]+$")
PHASE_RE = re.compile(PHASE_REGEX)
UNIT_RE = re.compile(r"^(?P<ut>A|F) (?P<ter>[A-Z]{3}(?:/(?:NC|SC|EC|WC))?)$")  # allow coasts
PLACE_RE = re.compile(rf"^{PLACE_IDENTIFIER}$")

COMMAND_PATTERNS = [(name, re.compile(p)) for name, p in POSSIBLE_COMMANDS.items()]
ALLOWED_RESULTS = set(POSSIBLE_COMMAND_RESULTS)
ALLOWED_COUNTRIES = set(COUNTRIES)
|
||||||
318
analysis/validation.py
Normal file
318
analysis/validation.py
Normal file
|
|
@ -0,0 +1,318 @@
|
||||||
|
"""
|
||||||
|
LMVS JSON Schema Validator
|
||||||
|
==========================
|
||||||
|
|
||||||
|
This module defines a Pydantic v2 schema for validating Diplomacy game logs
|
||||||
|
exported in the LMVS (Language Model vs. State) format.
|
||||||
|
|
||||||
|
Friendly overview
|
||||||
|
-----------------
|
||||||
|
An LMVS file should contain a top-level object with metadata (`id`, `map`,
|
||||||
|
`rules`) and a list of **PhaseData** objects under the key `"phases"`.
|
||||||
|
|
||||||
|
Each PhaseData object has the following important parts:
|
||||||
|
|
||||||
|
* `"name"`: a string like "S1901M" or "F1903A" that encodes season, year, and
|
||||||
|
sub-phase. The format is '^[A-Z]\d{4}[MRA]$' or the word 'COMPLETED'.
|
||||||
|
|
||||||
|
* `"state"`: a dictionary describing the game board at this phase, with keys:
|
||||||
|
|
||||||
|
- `"units"`: {country → [list of unit identifiers]}.
|
||||||
|
Each unit identifier looks like `"A BUD"` or `"F STP/SC"`.
|
||||||
|
This says which units each power currently controls and where they are.
|
||||||
|
|
||||||
|
- `"centers"`: {country → [list of supply center locations]}.
|
||||||
|
Shows which supply centers each power owns.
|
||||||
|
|
||||||
|
- `"influence"`: {country → [list of provinces]}.
|
||||||
|
Records which provinces each power currently controls.
|
||||||
|
|
||||||
|
- `"homes"`: {country → [list of home supply centers]}.
|
||||||
|
A country's build home centers.
|
||||||
|
|
||||||
|
- `"retreats"`: {country → {unit → [list of possible retreat provinces]}}.
|
||||||
|
Units that must retreat and their allowed destinations.
|
||||||
|
|
||||||
|
- `"civil_disorder"`: {country → integer flag}.
|
||||||
|
Records whether a country is in civil disorder.
|
||||||
|
|
||||||
|
- `"builds"`: {country → {count: int, homes: [list of places]}}.
|
||||||
|
Tracks build counts and home sites during adjustment phases.
|
||||||
|
|
||||||
|
* `"orders"`: {country → [list of orders]}.
|
||||||
|
Each order string must follow one of the canonical Diplomacy order formats,
|
||||||
|
such as Move, Hold, Support, Convoy, Build, Disband, or Retreat.
|
||||||
|
For example: `"A BUD - SER"`, `"F LON S F EDI - NTH"`.
|
||||||
|
|
||||||
|
* `"results"`: {unit identifier → [list of result codes]}.
|
||||||
|
Each result code is one of: `"void"`, `"bounce"`, `"cut"`, `"dislodged"`,
|
||||||
|
`"disband"`, or `"no convoy"`.
|
||||||
|
These describe how the order for that unit resolved.
|
||||||
|
|
||||||
|
* `"messages"` (optional): a list of dictionaries, each with:
|
||||||
|
- `"sender"`: a valid country
|
||||||
|
- `"recipient"`: a valid country
|
||||||
|
- `"phase"`: phase code like "S1901M"
|
||||||
|
- `"message"`: the text of the press message
|
||||||
|
|
||||||
|
Validation rules
|
||||||
|
----------------
|
||||||
|
The schema enforces:
|
||||||
|
- Country names must be one of: AUSTRIA, ENGLAND, FRANCE, GERMANY, ITALY, RUSSIA, TURKEY.
|
||||||
|
- Phase codes must match the required regex format.
|
||||||
|
- Units must be of the form "A XXX" or "F XXX[/COAST]".
|
||||||
|
- Orders must match one of the defined command regexes.
|
||||||
|
- Result codes must be from the allowed list.
|
||||||
|
- Orders must reference units that exist in the corresponding `state.units`.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
To use, call:
|
||||||
|
|
||||||
|
from analysis.validation import LMVSGame
|
||||||
|
import json
|
||||||
|
|
||||||
|
data = json.load(open("lmvs.json"))
|
||||||
|
game = LMVSGame.model_validate(data)
|
||||||
|
|
||||||
|
Any structural or semantic mismatches will raise a pydantic `ValidationError`.
|
||||||
|
|
||||||
|
This module can be extended with stricter checks by toggling options such as
|
||||||
|
strict territory membership, strict supply center membership, or coast handling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator, ConfigDict
|
||||||
|
from pydantic_core.core_schema import ValidationInfo
|
||||||
|
from analysis.schemas import ALL_PROVINCES, COASTAL_SCs, COUNTRY_RE, PHASE_RE, UNIT_RE, PLACE_RE, COMMAND_PATTERNS, ALLOWED_RESULTS, ALLOWED_COUNTRIES
|
||||||
|
|
||||||
|
|
||||||
|
# Territory codes accepted when strict territory checking is enabled:
# everything in ALL_PROVINCES plus the coast-qualified spellings in
# COASTAL_SCs, so coast variants such as "SPA/SC" are accepted even if the
# province list omits them.
STRICT_TERRITORY = set(ALL_PROVINCES) | set(COASTAL_SCs)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ValidationConfig:
    """Immutable set of switches that tune how strict validation is.

    An instance is looked up under the ``"cfg"`` key of the pydantic
    validation context; a default instance is used when none is supplied.
    """

    # NOTE(review): not read anywhere in the visible part of this module.
    strict_countries: bool = True
    # When True, place codes must be members of STRICT_TERRITORY.
    strict_territories: bool = False
    # Intended to enforce centers ⊆ ALL_SUPPLY_CENTERS(+coasts).
    # NOTE(review): not read anywhere in the visible part of this module.
    strict_sc: bool = False
|
||||||
|
|
||||||
|
# -------------------- validators --------------------
|
||||||
|
|
||||||
|
def _validate_country(c: str) -> str:
    """Return *c* unchanged when it is an allowed country name.

    Raises:
        ValueError: if *c* does not match COUNTRY_RE, or matches it but is
            not a member of ALLOWED_COUNTRIES.
    """
    if COUNTRY_RE.match(c) is None:
        raise ValueError(f"bad country name: {c!r}")
    if c in ALLOWED_COUNTRIES:
        return c
    raise ValueError(f"unknown country: {c!r}")
|
||||||
|
|
||||||
|
def _validate_place(code: str, info: ValidationInfo, allow_coast: bool = True) -> str:
    """Validate a place code and return it unchanged.

    Args:
        code: place code such as ``"BUD"`` or ``"STP/SC"``; the special
            value ``"WAIVE"`` is always accepted.
        info: pydantic validation info; an optional ``ValidationConfig`` is
            read from ``info.context["cfg"]``.
        allow_coast: when strict territory checking is on, ``True`` requires
            the full code (coast suffix included) to be in STRICT_TERRITORY,
            ``False`` checks only the part before ``"/"``.

    Raises:
        ValueError: if the code is malformed, or unknown under strict
            territory checking.
    """
    # "WAIVE" is not a territory, so it must bypass the strict membership
    # check as well as the format check; previously it was exempt from the
    # format check but always rejected once strict_territories was enabled.
    if code == "WAIVE":
        return code
    if not PLACE_RE.match(code):
        raise ValueError(f"bad place code: {code!r}")

    cfg: ValidationConfig = (info.context or {}).get("cfg", ValidationConfig())  # type: ignore
    if cfg.strict_territories:
        if allow_coast:
            if code not in STRICT_TERRITORY:
                raise ValueError(f"unknown territory: {code!r}")
        else:
            base = code.split("/")[0]
            if base not in STRICT_TERRITORY:
                raise ValueError(f"unknown base territory: {code!r}")
    return code
|
||||||
|
|
||||||
|
def _validate_unit(u: str, info: ValidationInfo) -> str:
    """Validate a unit identifier and return it without any ``*`` prefix.

    A leading ``*`` (used for dislodged units) is dropped before the string
    is matched against UNIT_RE; the territory part is then checked with
    ``_validate_place``.

    Raises:
        ValueError: if the (un-prefixed) string is not a valid unit.
    """
    unit = u[1:] if u.startswith("*") else u

    match = UNIT_RE.match(unit)
    if match is None:
        raise ValueError(f"bad unit: {unit!r} (expected 'A XXX' or 'F XXX[/COAST]')")

    _validate_place(match.group("ter"), info, allow_coast=True)
    return unit
|
||||||
|
|
||||||
|
def _order_kind(order: str) -> Optional[str]:
    """Return the command name that *order* matches, or ``None``.

    Anything ending in ``" B"`` is reported as ``"Build"`` without consulting
    the regexes; otherwise the first entry of COMMAND_PATTERNS whose pattern
    matches at the start of the string wins.

    NOTE(review): a bare ``"WAIVE"`` order yields ``None`` here, although the
    results and place validators in this module accept ``"WAIVE"`` — confirm
    whether orders can contain it.
    """
    if order.endswith(" B"):
        return "Build"
    return next(
        (name for name, pattern in COMMAND_PATTERNS if pattern.match(order)),
        None,
    )
|
||||||
|
|
||||||
|
def _unit_head(order: str) -> Optional[str]:
    """Return the leading unit of *order* (``"A XXX"`` / ``"F XXX/COAST"``).

    The first two space-separated tokens are joined and matched against
    UNIT_RE.  Returns ``None`` when no unit can be recognised.
    """
    # Split once, on the same separator that is used for indexing.  The old
    # code guarded with str.split() (any whitespace) but indexed the result
    # of split(" ", 2), so an order such as "A\tBUD" raised IndexError.
    tokens = order.split(" ", 2)
    if len(tokens) >= 2:
        m = UNIT_RE.match(f"{tokens[0]} {tokens[1]}")
        if m:
            return m.group(0)
    # fallback: the whole order string is itself a bare unit
    m = UNIT_RE.match(order)
    return m.group(0) if m else None
|
||||||
|
|
||||||
|
def _base_ter(u: str) -> str:
    """Return the territory of a unit string with any coast suffix removed.

    Examples: ``"A STP/NC"`` -> ``"STP"``; ``"F BUD"`` -> ``"BUD"``.  A
    string without a space is treated as a bare territory.
    """
    _, separator, remainder = u.partition(" ")
    territory = remainder if separator else u
    return territory.partition("/")[0]
|
||||||
|
|
||||||
|
def _unit_type(u: str) -> str:
    """Return everything before the first space of *u* (e.g. ``"A"`` or ``"F"``)."""
    unit_type, _, _ = u.partition(" ")
    return unit_type
|
||||||
|
|
||||||
|
# -------------------- models --------------------
|
||||||
|
|
||||||
|
class PhaseState(BaseModel):
    """Board state attached to one phase.

    ``units``, ``centers`` and ``influence`` are mappings keyed by country
    name; unknown extra keys in the input are ignored.
    """

    name: str
    phase: str
    game_id: str

    units: Dict[str, List[str]]
    centers: Dict[str, List[str]]
    influence: Dict[str, List[str]]

    model_config = ConfigDict(extra="ignore")

    @field_validator("name", "phase")
    @classmethod
    def _phase_format(cls, v: str, info: ValidationInfo) -> str:
        """Accept a phase code matching PHASE_RE or the literal 'COMPLETED'."""
        if v == "COMPLETED" or PHASE_RE.match(v):
            return v
        raise ValueError(f"bad phase: {v!r}")

    # every key of these mappings has to be a valid country name
    @field_validator("units", "centers", "influence", mode="after")
    @classmethod
    def _country_keys_ok(cls, mapping: Dict[str, Any], info: ValidationInfo) -> Dict[str, Any]:
        """Check that all keys are allowed country names."""
        for country in mapping:
            _validate_country(country)
        return mapping

    # every value of ``units`` has to be a list of unit strings
    @field_validator("units", mode="after")
    @classmethod
    def _units_ok(cls, u: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Check each country's entry is a list of well-formed units."""
        for country, unit_list in u.items():
            if not isinstance(unit_list, list):
                raise ValueError(f"units[{country}] must be a list")
            for unit in unit_list:
                _validate_unit(unit, info)
        return u

    # every value of these mappings has to be a list of place strings
    @field_validator("centers", "influence", mode="after")
    @classmethod
    def _place_lists_ok(cls, d: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Check each country's entry is a list of valid place codes."""
        for country, places in d.items():
            if not isinstance(places, list):
                raise ValueError(f"{country} values must be a list")
            for place in places:
                _validate_place(place, info, allow_coast=False)
        return d
|
||||||
|
|
||||||
|
class Phase(BaseModel):
    """One game phase: its name, board state, submitted orders and results.

    Unknown extra keys in the input are ignored.
    """

    name: str
    state: PhaseState
    orders: Dict[str, Optional[List[str]]]  # allow None values
    results: Dict[str, List[str]]

    model_config = ConfigDict(extra="ignore")

    @field_validator("name")
    @classmethod
    def _phase_format(cls, v: str, info: ValidationInfo) -> str:
        """Accept a phase code matching PHASE_RE or the literal 'COMPLETED'."""
        if v == "COMPLETED" or PHASE_RE.match(v):
            return v
        raise ValueError(f"bad phase: {v!r}")

    @field_validator("orders", mode="after")
    @classmethod
    def _orders_ok(cls, orders: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Validate every order and replace missing order lists with ``[]``.

        Keys must be valid countries; a value of ``None`` (or ``"null"``) is
        normalised to an empty list, any other non-list value is rejected, and
        each order string must be recognised by ``_order_kind``.
        """
        normalized = {}
        for country, order_list in orders.items():
            _validate_country(country)
            if order_list is None or order_list == "null":
                normalized[country] = []
                continue
            if not isinstance(order_list, list):
                raise ValueError(f"orders[{country}] must be a list")
            normalized[country] = order_list
            for order in order_list:
                if not _order_kind(order):
                    raise ValueError(f"order doesn't match any known command: {order!r}")
        return normalized

    @field_validator("results", mode="after")
    @classmethod
    def _results_ok(cls, res: Dict[str, List[str]], info: ValidationInfo) -> Dict[str, List[str]]:
        """Validate result keys (units) and their result codes.

        The key ``"WAIVE"`` is exempt from unit validation; the codes ``""``
        and ``"WAIVE"`` are accepted in addition to ALLOWED_RESULTS.
        """
        for unit, codes in res.items():
            if unit != "WAIVE":
                _validate_unit(unit, info)
            if not isinstance(codes, list):
                raise ValueError(f"results[{unit}] must be a list")
            for code in codes:
                # empty codes and WAIVE are tolerated as special cases
                if code in ("", "WAIVE") or code in ALLOWED_RESULTS:
                    continue
                raise ValueError(f"illegal result code {code!r} for {unit}")
        return res

    @model_validator(mode="after")
    def _orders_correspond_to_units(self) -> "Phase":
        """Check that each order's subject unit exists for the ordering country.

        Build, retreat and disband orders are skipped.  A unit matches either
        exactly or, coast-tolerantly, by unit type plus base territory.
        """
        for country, order_list in self.orders.items():
            known = set(self.state.units.get(country, ()))
            for order in order_list:
                # builds create new units; retreats/disbands act on dislodged
                # units, so none of them is checked against state.units
                if order.endswith((" B", " R", " D")) or " R " in order:
                    continue

                head = _unit_head(order)
                # no recognisable unit: already caught by the order regexes
                if not head or head in known:
                    continue

                # coast/base tolerant match
                wanted = (_unit_type(head), _base_ter(head))
                if all((_unit_type(u), _base_ter(u)) != wanted for u in known):
                    raise ValueError(
                        f"order {order!r} for {country} does not match any unit in state.units[{country}]"
                    )
        return self
|
||||||
|
|
||||||
|
class LMVSGame(BaseModel):
    """Top-level LMVS document: game metadata plus its list of phases.

    Unknown extra keys in the input are ignored.
    """

    id: str
    map: str
    rules: List[str]
    phases: List[Phase]

    model_config = ConfigDict(extra="ignore")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------- example usage --------------------
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: validate one lmvsgame.json file and report.
    import json
    import pathlib
    import sys

    if len(sys.argv) != 2:
        print("usage: python validation.py <path_to_lmvsgame.json>")
        sys.exit(1)

    cfg = ValidationConfig(strict_territories=False, strict_sc=False)
    p = pathlib.Path(sys.argv[1])
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # text encoding.
    data = json.loads(p.read_text(encoding="utf-8"))
    # A pydantic ValidationError propagates if the file does not conform.
    game = LMVSGame.model_validate(
        data,
        context={"cfg": cfg},
    )

    print(f"{p} is valid: game_id={game.id} phases={len(game.phases)}")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue