Patched for compatibility with the new data logging structure

This commit is contained in:
peregrinates 2025-08-15 12:02:13 -04:00
parent f7c3949e57
commit 975c4f59b9
No known key found for this signature in database
GPG key ID: 1CDDD19B6F8CC99D
5 changed files with 407 additions and 186 deletions

View file

@ -1,30 +1,93 @@
# analysis_constants.py
import os
import json
"""Utility functions and constants for loading Diplomacy analysis data.
This module provides helpers to read game data stored either as a folder on disk
or inside a zip archive, plus a few constant lists and regex patterns that are
used across the analysis scripts.
"""
from pathlib import Path
import pandas as pd
from typing import Dict, Union
import json
import zipfile
def process_standard_game_inputs(game_data_folder : Path, selected_game : str) -> dict[str, pd.DataFrame]:
path_to_folder = game_data_folder / selected_game
import pandas as pd
__all__: list[str] = [
"process_standard_game_inputs",
"process_game_inputs_in_zip",
"get_country_to_model_mapping",
"ALL_SUPPLY_CENTERS",
"COASTAL_SCs",
"COUNTRIES",
"PLACE_IDENTIFIER",
"PLACE_CAPTURING_REGEX",
"UNIT_IDENTIFIER",
"UNIT_MOVE",
"POSSIBLE_COMMANDS",
]
assert os.path.exists(path_to_folder / "overview.jsonl"), f"Overview file not found in {path_to_folder}"
overview = pd.read_json(path_to_folder / "overview.jsonl", lines=True)
# get all turn actions from lmvs
assert os.path.exists(path_to_folder / "lmvsgame.json"), f"LMVS file not found in {path_to_folder}"
path_to_file = path_to_folder / "lmvsgame.json"
def process_standard_game_inputs(path_to_folder: Path) -> Dict[str, Union[pd.DataFrame, dict]]:
"""
Read in a game folder and return the overview, lmvs_data, and all_responses
Args:
path_to_folder: Path to the game folder. Must contain overview.jsonl, lmvsgame.json, and llm_responses.csv files.
Returns:
Dictionary containing overview, lmvs_data, and all_responses
"""
# ----- check files exist -----
overview_path = path_to_folder / "overview.jsonl"
lmvsgame_path = path_to_folder / "lmvsgame.json"
llm_resp_path = path_to_folder / "llm_responses.csv"
# Use the standard `json` library to load the file into a Python object
with open(path_to_file, 'r') as f:
if not overview_path.exists():
raise FileNotFoundError(str(overview_path))
if overview_path.stat().st_size == 0:
raise FileNotFoundError(f"{overview_path} is empty")
if not lmvsgame_path.exists():
raise FileNotFoundError(str(lmvsgame_path))
if lmvsgame_path.stat().st_size == 0:
raise FileNotFoundError(f"{lmvsgame_path} is empty")
if not llm_resp_path.exists():
raise FileNotFoundError(str(llm_resp_path))
if llm_resp_path.stat().st_size == 0:
raise FileNotFoundError(f"{llm_resp_path} is empty")
# ----- load data -----
overview = pd.read_json(overview_path, lines=True)
with open(lmvsgame_path, "r") as f:
lmvs_data = json.load(f)
assert os.path.exists(path_to_folder / "llm_responses.csv"), f"LLM responses file not found in {path_to_folder}"
all_responses = pd.read_csv(path_to_folder / "llm_responses.csv")
all_responses = pd.read_csv(llm_resp_path)
return {"overview":overview, "lmvs_data":lmvs_data, "all_responses":all_responses}
def process_game_in_zip(zip_path: Path, selected_game: str) -> dict[str, pd.DataFrame]:
def get_country_to_model_mapping(overview_df : pd.DataFrame, llm_responses_df : pd.DataFrame) -> pd.Series:
    """Build a country -> model mapping for a game.

    The mapping is stored differently across data-log versions: newer logs
    carry it in row 1 of the overview frame, older ones in the LLM responses
    csv, so the overview is tried first and the responses used as a fallback.

    Args:
        overview_df: Overview frame; row 1 may hold country -> model values.
        llm_responses_df: Frame with "power" and "model" columns, consulted
            when the overview row is missing any country (may be None).

    Returns:
        A Series indexed by COUNTRIES giving the model that played each country.
    """
    mapping = overview_df.loc[1].reindex(COUNTRIES)
    overview_incomplete = pd.isnull(mapping).any()
    if overview_incomplete and llm_responses_df is not None:
        # fall back to the per-power model column from the responses csv
        mapping = llm_responses_df.set_index("power")["model"].reindex(COUNTRIES)
    return mapping
def process_game_inputs_in_zip(zip_path: Path, selected_game: str) -> Dict[str, Union[pd.DataFrame, dict]]:
"""
Read in a game folder and return the overview, lmvs_data, and all_responses
Args:
zip_path: Path to the zip file
selected_game: Name of the game to extract
Returns:
Dictionary containing overview, lmvs_data, and all_responses
"""
zip_name = zip_path.stem # Gets filename without extension
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
overview = pd.read_json(zip_ref.open(f"{zip_name}/{selected_game}/overview.jsonl"), lines=True)
@ -32,32 +95,32 @@ def process_game_in_zip(zip_path: Path, selected_game: str) -> dict[str, pd.Data
all_responses = pd.read_csv(zip_ref.open(f"{zip_name}/{selected_game}/llm_responses.csv"))
return {"overview": overview, "lmvs_data": lmvs_data, "all_responses": all_responses}
supply_centers = [
ALL_SUPPLY_CENTERS = [
"ANK", "ARM", "BEL", "BER", "BUD", "BUL", "CON", "DEN", "EDI", "GRE",
"HOL", "KIE", "LON", "LVP", "MAR", "MOS", "MUN", "NAP", "PAR", "POR",
"ROM", "RUM", "SER", "SEV", "SMY", "SWE", "TRI", "TUN",
"VEN", "VIE", "WAR",
"SPA", "STP", "BUL" # coastal provinces
"SPA", "STP", # coastal provinces
]
coastal_scs = ["SPA/SC", "SPA/NC",
COASTAL_SCs = ["SPA/SC", "SPA/NC",
"STP/SC", "STP/NC", 'BUL/EC',
'BUL/SC',]
COUNTRIES = ['AUSTRIA', 'ENGLAND', 'FRANCE', 'GERMANY', 'ITALY', 'RUSSIA', 'TURKEY']
place_identifier = "[A-Z]{3}(?:/[A-Z]{2})?"
place_capturing_regex = r"([A-Z]{3})"
unit_identifier = rf"[AF] {place_identifier}"
unit_move = rf"{unit_identifier} . {place_identifier}"
PLACE_IDENTIFIER = r"[A-Z]{3}(?:/[A-Z]{2})?"
PLACE_CAPTURING_REGEX = r"([A-Z]{3})"
UNIT_IDENTIFIER = rf"[AF] {PLACE_IDENTIFIER}"
UNIT_MOVE = rf"{UNIT_IDENTIFIER} . {PLACE_IDENTIFIER}"
possible_commands = {
"Move": f"^"+unit_move, # distinguishing this from support
"Support Move": f"{unit_identifier} S {unit_move}",
"Support Hold": fr"{unit_identifier} S {unit_identifier}(?!\s+[.\-]\s+{place_identifier})",
"Convoy": f"F {place_identifier} C {unit_move}", # No convoys in here?
"Hold": f"{unit_identifier} H",
"Build": f"{unit_identifier} B",
"Disband": f"{unit_identifier} D",
"Retreat": f"{unit_identifier} R",
POSSIBLE_COMMANDS = {
"Move": f"^"+UNIT_MOVE, # distinguishing this from support
"Support Move": f"{UNIT_IDENTIFIER} S {UNIT_MOVE}",
"Support Hold": fr"{UNIT_IDENTIFIER} S {UNIT_IDENTIFIER}(?!\s+[.\-]\s+{PLACE_IDENTIFIER})",
"Convoy": f"F {PLACE_IDENTIFIER} C {UNIT_MOVE}", # No convoys in here?
"Hold": f"{UNIT_IDENTIFIER} H",
"Build": f"{UNIT_IDENTIFIER} B",
"Disband": f"{UNIT_IDENTIFIER} D",
"Retreat": f"{UNIT_IDENTIFIER} R",
}

View file

@ -19,7 +19,7 @@ python analysis/make_all_analysis_data.py --selected_game game1 --game_data_fold
python analysis/make_all_analysis_data.py --game_data_folder "/path/to/Game Data" --output_folder "/path/to/Game Data - Analysis"
"""
import argparse
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
@ -27,49 +27,67 @@ from tqdm import tqdm
from analysis.p1_make_longform_orders_data import make_longform_order_data
from analysis.p2_make_convo_data import make_conversation_data
from analysis.p3_make_phase_data import make_phase_data
from analysis.analysis_helpers import process_standard_game_inputs, process_game_in_zip
from analysis.analysis_helpers import get_country_to_model_mapping, process_standard_game_inputs, process_game_inputs_in_zip, COUNTRIES
from typing import Dict
def process_game_data_from_folders(game_name : str, game_path : Path) -> Dict[str, pd.DataFrame]:
def process_game_data_from_folders(game_path : Path) -> Dict[str, pd.DataFrame]:
"""Reads log data from folder and makes analytic data sets"""
game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
game_data : dict[str, pd.DataFrame] = process_standard_game_inputs(path_to_folder=game_path)
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"],
country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
orders_data : pd.DataFrame = make_longform_order_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"],
all_responses=game_data["all_responses"])
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"],
lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data,
orders_data=orders_data)
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
conversations_data : pd.DataFrame = make_conversation_data(country_to_model=country_to_model, lmvs_data=game_data["lmvs_data"])
def process_game_data_from_zip(zip_path : Path, game_name : str) -> Dict[str, pd.DataFrame]:
"""Reads log data from zip and makes analytic data sets"""
game_data : dict[str, pd.DataFrame] = process_game_in_zip(zip_path=zip_path, selected_game=game_name)
orders_data : pd.DataFrame = make_longform_order_data(overview=game_data["overview"],
lmvs_data=game_data["lmvs_data"],
all_responses=game_data["all_responses"])
conversations_data : pd.DataFrame = make_conversation_data(overview=game_data["overview"], lmvs_data=game_data["lmvs_data"])
phase_data : pd.DataFrame = make_phase_data(overview=game_data["overview"],
phase_data : pd.DataFrame = make_phase_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data,
orders_data=orders_data)
return {"orders_data": orders_data, "conversations_data": conversations_data, "phase_data": phase_data}
return {"orders_data": orders_data,
"conversations_data": conversations_data,
"phase_data": phase_data}
def process_game_data_from_zip(zip_path: Path, game_name: str) -> Dict[str, pd.DataFrame]:
    """Read one game's logs out of a zip archive and build the analytic data sets.

    Args:
        zip_path: Path to the zip archive containing game folders.
        game_name: Name of the game folder inside the archive.

    Returns:
        Dict with "orders_data", "conversations_data" and "phase_data" frames.
    """
    inputs: dict[str, pd.DataFrame] = process_game_inputs_in_zip(zip_path=zip_path, selected_game=game_name)
    lmvs = inputs["lmvs_data"]
    # country -> model mapping lives in different places across log versions
    country_to_model = get_country_to_model_mapping(inputs["overview"], inputs["all_responses"])

    orders = make_longform_order_data(
        country_to_model=country_to_model,
        lmvs_data=lmvs,
        all_responses=inputs["all_responses"],
    )
    conversations = make_conversation_data(country_to_model=country_to_model, lmvs_data=lmvs)
    phases = make_phase_data(
        country_to_model=country_to_model,
        lmvs_data=lmvs,
        conversations_data=conversations,
        orders_data=orders,
    )
    return {"orders_data": orders, "conversations_data": conversations, "phase_data": phases}
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run all three analysis scripts in sequence with the same arguments.")
parser.add_argument(
"--selected_game",
type=str,
@ -98,16 +116,18 @@ if __name__ == "__main__":
games_to_process = args.selected_game
if not games_to_process:
games_to_process = os.listdir(args_dict["game_data_folder"])
games_to_process = [p.name for p in args_dict["game_data_folder"].iterdir() if p.is_dir()]
for game in tqdm(games_to_process, desc="Processing games"):
game_path = args_dict["game_data_folder"] / game
if not game_path.is_dir():
continue
try:
results = process_game_data_from_folders(game_name=game, game_path=args_dict["game_data_folder"])
results = process_game_data_from_folders(game_path=game_path)
for data_set, df in results.items():
output_path = args_dict["analysis_folder"] / data_set / f"{game}_{data_set}.csv"
output_dir = args_dict["analysis_folder"] / data_set
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{game}_{data_set}.csv"
df.to_csv(output_path, index=False)
except Exception as e:
print(f"Error processing game {game}: {e}")

View file

@ -53,24 +53,32 @@ Return / save
import pandas as pd
import numpy as np
import os
import copy
import re
import argparse
import warnings
from pathlib import Path
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES, supply_centers, coastal_scs, place_identifier, unit_identifier, unit_move, possible_commands
from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping, COUNTRIES, ALL_SUPPLY_CENTERS, COASTAL_SCs, PLACE_IDENTIFIER, UNIT_IDENTIFIER, UNIT_MOVE, POSSIBLE_COMMANDS
from tqdm import tqdm
import traceback
from typing import List, Optional, Dict, Any
# Suppress pandas warnings
warnings.filterwarnings('ignore', category=UserWarning, module='pandas.core.strings')
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
try:
country_to_model = overview.loc[1, COUNTRIES] # map countries to models
except:
country_to_model = {country: "not specified in overview.jsonl" for country in COUNTRIES}
def make_longform_order_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame, all_responses : pd.DataFrame) -> pd.DataFrame:
"""
Makes a dataframe with a row for each order given by every power, in every phase (see module docstring for more details).
Args:
country_to_model: A Series mapping country names to model names
lmvs_data: A DataFrame containing the game data
all_responses: A DataFrame containing the responses from the LLM responses csv
Returns:
A DataFrame with a row for each order given by every power, in every phase
"""
################## PART 1 ##################
# build `turn_actions` dataframe
@ -120,22 +128,22 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
# categorize each order based on regex
# note that this will overwrite if multiple regexes match, which is why we've split support into 2 commands
for possible_command, regex in possible_commands.items():
for possible_command, regex in POSSIBLE_COMMANDS.items():
all_orders_ever.loc[all_orders_ever.order.str.contains(regex, regex=True), "command"] = possible_command
all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({place_identifier})")
all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(supply_centers) | all_orders_ever["unit_location"].isin(coastal_scs)
all_orders_ever["unit_location"] = all_orders_ever["order"].str.extract(rf"({PLACE_IDENTIFIER})")
all_orders_ever["location_was_sc"] = all_orders_ever["unit_location"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["unit_location"].isin(COASTAL_SCs)
# only MOVE has a destination
all_orders_ever["destination"] = np.where(
all_orders_ever["command"]=="Move",
all_orders_ever["order"].str.extract(rf"{unit_identifier} . ({place_identifier})", expand=False),
all_orders_ever["order"].str.extract(rf"{UNIT_IDENTIFIER} . ({PLACE_IDENTIFIER})", expand=False),
np.nan
)
all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(supply_centers) | all_orders_ever["destination"].isin(coastal_scs)
all_orders_ever["destination_was_sc"] = all_orders_ever["destination"].isin(ALL_SUPPLY_CENTERS) | all_orders_ever["destination"].isin(COASTAL_SCs)
# Retreat also has a destination
all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{unit_identifier} R ({place_identifier})", expand=False)
all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "destination"] = all_orders_ever.loc[all_orders_ever["command"]=="Retreat", "order"].str.extract(rf"{UNIT_IDENTIFIER} R ({PLACE_IDENTIFIER})", expand=False)
all_orders_ever["immediate_result"] = all_orders_ever["order"].str.extract(r"\(([^)]+)\)")
all_orders_ever["immediate_result"] = all_orders_ever["immediate_result"].fillna("PASS")
@ -147,6 +155,16 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["country_model"] = all_orders_ever["country"] + " (" + all_orders_ever["model_short_name"] + ")"
def check_location_influence(phase_id, location):
"""
Helper - checks who owns a location at a given phase. Uses the `turn_actions` dataframe from overall context.
Args:
phase_id: The phase to check
location: The location to check
Returns:
The country that owns the location, or "Unowned" if no country owns it
"""
# checking who owns a location at `phase_id`
if pd.isnull(location):
return np.nan
@ -162,22 +180,45 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["destination_affiliation"] = all_orders_ever.apply(lambda row: check_location_influence(row["phase"],
row["destination"]), axis=1)
def find_supporting_country(unit_command, command_type, phase):
def find_supporting_country(unit_command, command_type, phase) -> Optional[str]:
"""
Helper - finds which orders support a given unit and records the supporting powers. Operating on the `all_orders_ever` dataframe.
Args:
unit_command: The unit command to find supporting orders for
command_type: The type of command ("Move" or "Hold")
phase: The phase to check
Returns:
A string containing a comma-separated list of countries that issued an order to support that unit, or None if no such orders exist
"""
if command_type == "Move" or command_type == "Hold": # commands that can be supported
potential_supports = all_orders_ever[(all_orders_ever["phase"] == phase) &
(all_orders_ever["command"].isin(["Support Move", "Support Hold"]))]
potential_supports = potential_supports[potential_supports["order"].str.contains(unit_command, regex=False)]
if potential_supports.empty:
return np.nan
return None
else:
return ",".join(potential_supports["country"].tolist())
return np.nan
return None
all_orders_ever["supported_by"] = all_orders_ever.apply(lambda row: find_supporting_country(row["order"], row["command"], row["phase"]), axis=1)
all_orders_ever["in_anothers_territory"] =( all_orders_ever["country"] != all_orders_ever["unit_location_affiliation"]) & (all_orders_ever["unit_location_affiliation"] != "Unowned")
all_orders_ever["moving_into_anothers_territory"] = (all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) & (all_orders_ever["destination_affiliation"].notnull()) & (all_orders_ever["destination_affiliation"] != "Unowned")
all_orders_ever["moving_into_anothers_territory"] = ((all_orders_ever["country"] != all_orders_ever["destination_affiliation"]) &
(all_orders_ever["destination_affiliation"].notnull()) &
(all_orders_ever["destination_affiliation"] != "Unowned"))
def find_owner_of_unit(unit_location, phase):
def find_owner_of_unit(unit_location, phase) -> Optional[str]:
"""
Helper - finds the owner of a unit at a given phase. Operating on the `turn_actions` dataframe from overall context.
Args:
unit_location: The location of the unit to find the owner of
phase: The phase to check
Returns:
The country that owns the unit, or None if no country owns it
"""
if pd.notnull(unit_location):
unit_status = turn_actions.loc[turn_actions.index.str.contains("_units"), phase]
unit_status.index = unit_status.index.str.replace("_units", "")
@ -186,18 +227,30 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
if re.match(f"[AF] {unit_location}", unit):
return country
# where were they going? what was their destination like?
def find_destination_info(destination, phase):
def find_destination_info(destination, phase) -> Optional[Dict[str, Any]]:
"""
Helper - finds information about the destination of a unit at a given phase.
Operating on the `all_orders_ever` dataframe from overall context.
Args:
destination: The location of the unit to find the owner of
phase: The phase to check
Returns:
A dictionary containing information about the destination unit, or None if no such unit exists
"""
if pd.notnull(destination):
country = find_owner_of_unit(destination, phase)
# there should only ever be one unit at a given location during a phase
destination_unit_orders = all_orders_ever[(all_orders_ever["country"] == country) &
(all_orders_ever["phase"] == phase) &
(all_orders_ever["unit_location"] == destination)]
if not destination_unit_orders.empty:
destination_unit_orders = destination_unit_orders.iloc[0] # safe conversion to a series
return {"destination_unit_owner": country,
"destination_unit_order": destination_unit_orders["command"].squeeze(),
"destination_unit_outcome":destination_unit_orders["immediate_result"].squeeze(),
"destination_unit_supported_by": destination_unit_orders["supported_by"].squeeze()}
"destination_unit_order": destination_unit_orders["command"],
"destination_unit_outcome":destination_unit_orders["immediate_result"],
"destination_unit_supported_by": destination_unit_orders["supported_by"]}
destination_unit_info = all_orders_ever.apply(lambda row: find_destination_info(row["destination"], row["phase"]), axis=1).apply(pd.Series)
destination_unit_info["destination_was_occupied"] = destination_unit_info["destination_unit_owner"].notnull()
@ -205,54 +258,90 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever = pd.concat([all_orders_ever, destination_unit_info], axis=1)
# if a Support action: who were they supporting? what was their support doing?
def find_support_recipient_info(unit_order, command, phase):
def find_support_recipient_info(unit_order, command, phase) -> Optional[Dict[str, Any]]:
"""
Helper - finds information about the recipient of a support action at a given phase.
Operating on the `all_orders_ever` dataframe from overall context.
Args:
unit_order: The order of the unit to find the recipient of support for
command: The type of command ("Support Move" or "Support Hold")
phase: The phase to check
Returns:
A dictionary containing information about the recipient of support, or None if no such recipient exists
"""
if "Support" in command:
recipient_location = re.match(rf"{unit_identifier} S [AF] ({place_identifier})", unit_order).group(1)
recipient_location = re.match(rf"{UNIT_IDENTIFIER} S [AF] ({PLACE_IDENTIFIER})", unit_order).group(1)
recipient_country = find_owner_of_unit(recipient_location, phase)
# there should only ever be one unit at a given location during a phase
recipient_order_info = all_orders_ever[(all_orders_ever["country"] == recipient_country) &
(all_orders_ever["phase"] == phase) &
(all_orders_ever["unit_location"] == recipient_location)]
return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"].squeeze(),
"recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"].squeeze(),
"recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"].squeeze(),
"recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"].squeeze()}
(all_orders_ever["unit_location"] == recipient_location)].iloc[0]
return {"recipient_unit_owner": recipient_country, "recipient_unit_outcome": recipient_order_info["immediate_result"],
"recipient_unit_in_anothers_territory": recipient_order_info["in_anothers_territory"],
"recipient_unit_moving_into_anothers_territory": recipient_order_info["moving_into_anothers_territory"],
"recipient_unit_destination_occupied": recipient_order_info["destination_was_occupied"]}
support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"], row["command"], row["phase"]), axis=1).apply(pd.Series)
support_recipient_info = all_orders_ever.apply(lambda row: find_support_recipient_info(row["order"],
row["command"],
row["phase"]), axis=1).apply(pd.Series)
# add support recipient info to all_orders_ever as additional columns
all_orders_ever = pd.concat([all_orders_ever, support_recipient_info], axis=1)
# add relationships with other countries
# if original v1
agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]:
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
if longform_relationships.empty:
print("Warning: no relationship data found in phase data")
else:
longform_relationships.columns = longform_relationships.columns.str.lower()
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']].fillna("Self")
longform_relationships = longform_relationships.add_prefix("relationship_")
all_orders_ever = pd.merge(all_orders_ever, longform_relationships,
left_on=["phase", "country"], right_on=["relationship_phase", "relationship_agent"]).drop(columns=["relationship_phase", "relationship_agent"])
alternate_relationship_view = pd.concat(agent_relationship_matrix_over_time)
alternate_relationship_view.index.names = ["phase", "agent"]
alternate_relationship_view = alternate_relationship_view.stack().reset_index().rename(columns={"level_2":"recipient",
0:"status"}).set_index(["phase", "recipient",
"agent"])["status"].unstack("agent").fillna("Self").add_suffix("s_relationship_rating").reset_index()
all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view,
left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
# Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
country_relationships = {}
for c in COUNTRIES:
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
longform_relationships.columns = longform_relationships.columns.str.lower()
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']].fillna("Self")
longform_relationships = longform_relationships.add_prefix("relationship_")
all_orders_ever = pd.merge(all_orders_ever, longform_relationships,
left_on=["phase", "country"], right_on=["relationship_phase", "relationship_agent"]).drop(columns=["relationship_phase", "relationship_agent"])
alternate_relationship_view = pd.concat(agent_relationship_matrix_over_time)
alternate_relationship_view.index.names = ["phase", "agent"]
alternate_relationship_view = alternate_relationship_view.stack().reset_index().rename(columns={"level_2":"recipient",
0:"status"}).set_index(["phase", "recipient",
"agent"])["status"].unstack("agent").fillna("Self").add_suffix("s_relationship_rating").reset_index()
all_orders_ever = pd.merge(all_orders_ever, alternate_relationship_view,
left_on=["phase", "country"], right_on=["phase", "recipient"]).drop(columns=["recipient"])
# if action was supporting
# if action was supporting, add flags
all_orders_ever["supporting_self"] = all_orders_ever["country"]==all_orders_ever["recipient_unit_owner"]
all_orders_ever["supporting_an_ally"] = (all_orders_ever["country"] !=all_orders_ever["recipient_unit_owner"]) & (all_orders_ever["recipient_unit_owner"].notnull())
def countries_aside_from(a_country):
def countries_aside_from(a_country : str) -> List[str]:
return [country for country in all_orders_ever["country"].unique() if country != a_country]
def check_country(supporters, country):
def check_country(supporters : List[str], country : str) -> bool:
"""
Helper - checks if a given country is in a list of supporters
Args:
supporters: The list of supporters to check
country: The country to check
Returns:
True if the country is in the list of supporters, False otherwise
"""
if pd.isnull(supporters):
return False
for other_countries in countries_aside_from(country):
@ -267,7 +356,7 @@ def make_longform_order_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame,
all_orders_ever["destination_unit_was_supported"] = all_orders_ever["destination_unit_supported_by"].notnull()
# add number of unit orders ever made
# add number of unit orders ever made during this game
unit_order_weight = 1 / all_orders_ever.groupby("country").size()
all_orders_ever["unit_order_weight"] = all_orders_ever["country"].map(unit_order_weight)
@ -317,30 +406,30 @@ if __name__ == "__main__":
current_game_data_folder = Path(args.game_data_folder)
analysis_folder = Path(args.analysis_folder) / "orders_data"
if not os.path.exists(analysis_folder):
if not analysis_folder.exists():
print(f"Output folder {analysis_folder} not found, creating it.")
os.makedirs(analysis_folder)
analysis_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game
if not games_to_process:
games_to_process = os.listdir(current_game_data_folder)
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process, desc="Processing games"):
if game_name == ".DS_Store":
continue
game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path):
if not game_path.is_dir():
continue
try:
game_source_data = process_standard_game_inputs(game_path, game_name)
data = make_longform_order_data(overview=game_source_data["overview"],
lmvs_data=game_source_data["lmvs_data"],
game_source_data = process_standard_game_inputs(game_path)
overview_df = game_source_data["overview"]
country_to_model = get_country_to_model_mapping(overview_df, game_source_data["all_responses"])
data = make_longform_order_data(country_to_model=country_to_model,
lmvs_data=game_source_data["lmvs_data"],
all_responses=game_source_data["all_responses"])
output_path = analysis_folder / f"{game_name}_orders_data.csv"
data.to_csv(output_path, index=False)
except FileNotFoundError as e:
print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}")
print(f"An unexpected error occurred while processing {game_name}: {e}")
traceback.print_exc()

View file

@ -1,5 +1,5 @@
"""
Make conversation data from diplomacy game logs.
Make conversation data from diplomacy game logs, for convenience in analyzing conversation data alone.
Resulting columns:
['phase',
@ -21,14 +21,23 @@ Resulting columns:
import pandas as pd
import itertools
import argparse
import os
from tqdm import tqdm
from pathlib import Path
from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs
def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) -> pd.DataFrame:
country_to_model = overview.loc[1, COUNTRIES]
from analysis.analysis_helpers import COUNTRIES, process_standard_game_inputs, get_country_to_model_mapping
import traceback
def make_conversation_data(country_to_model : pd.Series, lmvs_data : pd.DataFrame) -> pd.DataFrame:
"""
Make conversation data from diplomacy game logs.
Args:
country_to_model: A Series mapping country names to model names
lmvs_data: A DataFrame containing the game data
Returns:
A DataFrame containing the conversation data (a row for every conversation between 2 powers, at every phase)
"""
COUNTRY_COMBINATIONS = list(itertools.combinations(COUNTRIES, r=2))
# relationship data
@ -67,20 +76,12 @@ def make_conversation_data(overview : pd.DataFrame, lmvs_data : pd.DataFrame) ->
messages_from_sender = (messages_exchanged['sender']==sender).sum()
sender_streak = max_consecutive[sender] if sender in max_consecutive.index else 0
messages_from_recipient = (messages_exchanged['recipient']==sender).sum()
recipient_streak = max_consecutive[recipient] if recipient in max_consecutive.index else 0
party_1_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) &
(longform_relationships["agent"]==sender)][recipient]
if party_1_opinion.empty:
party_1_opinion = ""
else:
party_1_opinion = party_1_opinion.squeeze()
party_2_opinion = longform_relationships[(longform_relationships["phase"]==current_phase) &
(longform_relationships["agent"]==recipient)][sender]
if party_2_opinion.empty:
party_2_opinion = ""
else:
party_2_opinion = party_2_opinion.squeeze()
messages_from_recipient = (messages_exchanged['sender'] == recipient).sum()
recipient_streak = max_consecutive.get(recipient, 0)
party_1_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == sender)].reindex(columns=[recipient])
party_1_opinion = party_1_opinion_series.iloc[0,0] if not party_1_opinion_series.empty else ""
party_2_opinion_series = longform_relationships[(longform_relationships["phase"] == current_phase) & (longform_relationships["agent"] == recipient)].reindex(columns=[sender])
party_2_opinion = party_2_opinion_series.iloc[0,0] if not party_2_opinion_series.empty else ""
conversation_data = {
"party_1": sender,
@ -132,29 +133,30 @@ if __name__ == "__main__":
current_game_data_folder = Path(args.game_data_folder)
analysis_folder = Path(args.analysis_folder) / "conversations_data"
if not os.path.exists(analysis_folder):
if not analysis_folder.exists():
print(f"Output folder {analysis_folder} not found, creating it.")
os.makedirs(analysis_folder)
analysis_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game
if not games_to_process:
games_to_process = os.listdir(current_game_data_folder)
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process):
if game_name == ".DS_Store":
continue
for game_name in tqdm(games_to_process, desc="Processing games"):
game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path):
if not game_path.is_dir():
continue
try:
game_data = process_standard_game_inputs(game_data_folder=game_path,
selected_game=game_name)
data = make_conversation_data(overview=game_data["overview"],
game_data = process_standard_game_inputs(game_path)
overview_df = game_data["overview"]
country_to_model = get_country_to_model_mapping(overview_df, game_data["all_responses"])
data = make_conversation_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"])
output_path = analysis_folder / f"{game_name}_conversations_data.csv"
data.to_csv(output_path, index=False)
except FileNotFoundError as e:
print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}")
print(f"Skipping {game_name}.")
print(f"Skipping {game_name}.")
traceback.print_exc()

View file

@ -74,24 +74,26 @@ etc (a lot of possible combinations here)
'invalid_order_count', (number of invalid orders given)
'no_moves_extracted_flag', (flag for if no moves were extracted)
'valid_order_count', (number of valid orders, calculated as unit_count - invalid_order_count, unless no valid orders were extracted )
'goals', (list of goals for the phase separated by \n\n)
'diary', (diary entry for the phase)
"""
import pandas as pd
import numpy as np
import os
import json
import copy
import re
import argparse
from pathlib import Path
from analysis.analysis_helpers import process_standard_game_inputs, COUNTRIES
from analysis.analysis_helpers import process_standard_game_inputs, get_country_to_model_mapping, COUNTRIES
from tqdm import tqdm
import traceback
def make_phase_data(overview : pd.DataFrame,
def make_phase_data(country_to_model : pd.Series,
lmvs_data : pd.DataFrame,
conversations_data : pd.DataFrame,
orders_data : pd.DataFrame) -> pd.DataFrame:
country_to_model = overview.loc[1, COUNTRIES]
longform_conversations_complete = []
for c in COUNTRIES:
@ -109,12 +111,27 @@ def make_phase_data(overview : pd.DataFrame,
############ Relationships #############
agent_relationship_matrix_over_time = {}
state_list = {}
for phase in lmvs_data["phases"]:
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(phase.get("agent_relationships", {}))
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
if longform_relationships.empty:
# Then we have v2 of the data log where relationships are stored under state_agents and need a different approach
agent_relationship_matrix_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
country_relationships = {}
for c in COUNTRIES:
country_relationships[c] = agent_state.get(c, {}).get("relationships", {})
agent_relationship_matrix_over_time[phase["name"]] = pd.DataFrame(country_relationships)
longform_relationships = pd.concat(agent_relationship_matrix_over_time).reset_index(names=["phase", "agent"])
longform_relationships.columns = longform_relationships.columns.str.lower()
longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']] = longform_relationships[['austria', 'england', 'france', 'germany', 'italy',
'russia', 'turkey']].fillna("Self")
longform_relationships = longform_relationships.add_prefix("relationship_")
########### ORDERS DATA ###########
# adding results to lmvs
@ -200,7 +217,7 @@ def make_phase_data(overview : pd.DataFrame,
# lost a supply center
# territories held, territories moved to?
# territories held, territories moved to
orders_summary = pd.concat([commands_given.unstack().add_prefix("count_").add_suffix("_commands"),
immediate_outcomes.unstack().add_prefix("count_got_"),
@ -240,10 +257,33 @@ def make_phase_data(overview : pd.DataFrame,
state_list[phase["name"]].append(orders_over_time.loc[phase["name"]].rename("orders"))
state_list[phase["name"]] = pd.concat(state_list[phase["name"]], axis=1)
# goals and diaries
goals_over_time = {}
diary_over_time = {}
for phase in lmvs_data["phases"]:
agent_state = phase.get("state_agents", {})
if agent_state: # Not all versions have this
country_goals = {}
country_diary = {}
for c in COUNTRIES:
country_goals[c] = "\n\n".join(agent_state.get(c, {}).get("goals", {}))
country_diary[c] = "\n\n".join(agent_state.get(c, {}).get("full_private_diary", []))
goals_over_time[phase["name"]] = pd.Series(country_goals)
diary_over_time[phase["name"]] = pd.Series(country_diary)
state_list = pd.concat(state_list, axis=0)
state_list.index.names = ["phase", "agent"]
if goals_over_time:
goals_over_time = pd.DataFrame(goals_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"goal"}).set_index(["phase", "agent"])
state_list = pd.concat([state_list, goals_over_time], axis=1)
if diary_over_time:
diary_over_time = pd.DataFrame(diary_over_time).T.stack().reset_index().rename(columns={"level_0":"phase", "level_1":"agent", 0:"diary"}).set_index(["phase", "agent"])
state_list = pd.concat([state_list, diary_over_time], axis=1)
longform_relationships = longform_relationships.set_index(["relationship_phase", "relationship_agent"])
longform_relationships.index.names = ["phase", "agent"]
full_phase_data = pd.merge(state_list,
longform_relationships.set_index(["phase", "agent"]).add_prefix("relationship_to_").fillna("Self"),
longform_relationships,
left_index=True, right_index=True).reset_index()
full_phase_data["centers_count"] = full_phase_data["centers"].apply(lambda x: len(x))
full_phase_data["units_count"] = full_phase_data["units"].apply(lambda x: len(x))
@ -262,7 +302,7 @@ def make_phase_data(overview : pd.DataFrame,
"influence_count"]].diff()
full_phase_data = pd.merge(full_phase_data, longform_conversations_complete,
left_on=["phase", "agent"], right_on=["phase", "power"]).drop(columns=["agent"])
left_on=["phase", "agent"], right_on=["phase", "power"])
full_phase_data = pd.merge(full_phase_data, orders_summary, how="left", left_on=["power", "phase"],
right_index=True)
full_phase_data["model"] = full_phase_data["power"].map(country_to_model)
@ -301,32 +341,39 @@ if __name__ == "__main__":
args = parser.parse_args()
current_game_data_folder = Path(args.game_data_folder)
analysis_folder = args.analysis_folder
output_folder = Path(analysis_folder) / "phase_data"
analysis_folder = Path(args.analysis_folder)
output_folder = analysis_folder / "phase_data"
if not os.path.exists(output_folder):
if not output_folder.exists():
print(f"Output folder {output_folder} not found, creating it.")
os.makedirs(output_folder)
output_folder.mkdir(parents=True, exist_ok=True)
games_to_process = args.selected_game
if not games_to_process:
games_to_process = os.listdir(current_game_data_folder)
games_to_process = [p.name for p in current_game_data_folder.iterdir() if p.is_dir()]
for game_name in tqdm(games_to_process):
if game_name == ".DS_Store":
continue
game_path = current_game_data_folder / game_name
if not os.path.isdir(game_path):
if not game_path.is_dir():
continue
#try:
game_data = process_standard_game_inputs(game_data_folder=game_path, selected_game=game_name)
orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
data = make_phase_data(overview=game_data["overview"],
lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data,
orders_data=orders_data)
output_path = output_folder / f"{game_name}_phase_data.csv"
data.to_csv(output_path, index=False)
try:
game_data = process_standard_game_inputs(game_path)
orders_data = pd.read_csv(analysis_folder / "orders_data" / f"{game_name}_orders_data.csv")
conversations_data = pd.read_csv(analysis_folder / "conversations_data" / f"{game_name}_conversations_data.csv")
country_to_model = get_country_to_model_mapping(game_data["overview"], game_data["all_responses"])
data = make_phase_data(country_to_model=country_to_model,
lmvs_data=game_data["lmvs_data"],
conversations_data=conversations_data,
orders_data=orders_data)
output_path = output_folder / f"{game_name}_phase_data.csv"
data.to_csv(output_path, index=False)
except FileNotFoundError as e:
print(f"Could not process {game_name}. Missing file: {e.filename}")
except Exception as e:
print(f"An unexpected error occurred while processing {game_name}: {e}")
print(f"Skipping {game_name}.")
traceback.print_exc()