# Mirror of https://github.com/GoodStartLabs/AI_Diplomacy.git
# Synced 2026-04-19 12:58:09 +00:00
# File info: 361 lines, no EOL at end of file, 15 KiB, Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyze hold reduction experiment results comparing baseline vs intervention.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
import json
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
def analyze_orders_for_experiment(exp_dir: Path):
    """
    Collect per-phase order-type counts for every run of an experiment.

    Every ``runs/run_*/lmvsgame.json`` file below ``exp_dir`` is loaded (run
    directories without that file are skipped). Each movement phase, i.e. a
    phase whose name ends in ``'M'``, contributes one entry to each of the
    returned lists, provided at least one unit was counted for it.

    Args:
        exp_dir: Experiment directory containing a ``runs/`` sub-directory.

    Returns:
        Dict with the keys ``'holds'``, ``'supports'``, ``'moves'``,
        ``'convoys'`` and ``'total_units'``. Each value is a list of ints with
        one element per counted phase; the lists are index-aligned.
    """
    order_stats = {
        'holds': [],
        'supports': [],
        'moves': [],
        'convoys': [],
        'total_units': [],
    }
    # Key used inside a power's order_results entry -> key in order_stats.
    stat_key_for = {
        'hold': 'holds',
        'support': 'supports',
        'move': 'moves',
        'convoy': 'convoys',
    }

    for run_dir in sorted(exp_dir.glob("runs/run_*")):
        game_file = run_dir / "lmvsgame.json"
        if not game_file.exists():
            continue

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(game_file, 'r', encoding='utf-8') as f:
            game_data = json.load(f)

        for phase in game_data.get('phases', []):
            # The phase name lookup already treats 'state' as optional, so the
            # unit lookup below must tolerate a missing/null state as well.
            state = phase.get('state') or {}
            phase_name = phase.get('name', state.get('name', ''))

            # Only movement phases are analysed. A name ending in 'R' can
            # never end in 'M', so a single suffix test is sufficient.
            if not isinstance(phase_name, str) or not phase_name.endswith('M'):
                continue

            units_by_power = state.get('units') or {}
            phase_counts = dict.fromkeys(stat_key_for, 0)
            phase_units = 0

            # Units are only counted for powers that have an order_results
            # entry, mirroring how the orders themselves are counted.
            for power, power_orders in (phase.get('order_results') or {}).items():
                phase_units += len(units_by_power.get(power) or [])

                if not isinstance(power_orders, dict):
                    # No per-type breakdown for this power: nothing to count.
                    continue
                for order_type in phase_counts:
                    phase_counts[order_type] += len(power_orders.get(order_type) or [])

            # Phases without any counted unit cannot yield a per-unit rate.
            if phase_units > 0:
                for order_type, stat_key in stat_key_for.items():
                    order_stats[stat_key].append(phase_counts[order_type])
                order_stats['total_units'].append(phase_units)

    return order_stats
|
|
|
|
def calculate_rates(order_stats):
    """
    Compute the mean per-unit rate and its standard error per order type.

    For every phase the order count is divided by that phase's unit count;
    the mean of those per-phase ratios is reported as ``<type>_rate`` and
    ``std / sqrt(n_phases)`` as ``<type>_se``. The standard deviation is
    NumPy's default (population, ``ddof=0``).

    Args:
        order_stats: Dict with index-aligned sequences under the keys
            ``'holds'``, ``'supports'``, ``'moves'``, ``'convoys'`` and
            ``'total_units'``.

    Returns:
        Dict with ``hold_rate``, ``support_rate``, ``move_rate``,
        ``convoy_rate``, ``n_phases`` (number of phases with at least one
        unit) and the matching ``*_se`` entries. When no phase has units,
        every rate and standard error is NaN and ``n_phases`` is 0.
    """
    order_types = (
        ('hold', 'holds'),
        ('support', 'supports'),
        ('move', 'moves'),
        ('convoy', 'convoys'),
    )

    total_units = np.asarray(order_stats['total_units'], dtype=float)

    # Phases without units are excluded to avoid division by zero.
    mask = total_units > 0
    n_phases = int(np.count_nonzero(mask))
    units = total_units[mask]

    # Per-phase ratios are computed once and reused for mean and std.
    per_phase = {
        prefix: np.asarray(order_stats[key], dtype=float)[mask] / units
        for prefix, key in order_types
    }

    nan = float('nan')
    rates = {}
    for prefix, _ in order_types:
        # Explicit NaN for empty input instead of letting NumPy warn about
        # the mean of an empty slice.
        rates[f'{prefix}_rate'] = np.mean(per_phase[prefix]) if n_phases else nan
    rates['n_phases'] = n_phases

    for prefix, _ in order_types:
        rates[f'{prefix}_se'] = (
            np.std(per_phase[prefix]) / np.sqrt(n_phases) if n_phases else nan
        )

    return rates
|
|
|
|
# Substring (matched case-insensitively) -> display name of the model family.
_MODEL_KEYWORDS = (('mistral', 'Mistral'), ('gemini', 'Gemini'), ('kimi', 'Kimi'))
# Left-to-right order of experiment versions in plots and summaries.
_VERSION_ORDER = ['Baseline', 'V1', 'V2', 'V3']
# Directory that receives every plot and CSV written by this script.
_OUTPUT_DIR = 'experiments'


def _print_rates(label, rates):
    """Print the per-unit rate and standard error of each order type."""
    print(f"\n{label} Results (n={rates['n_phases']} phases):")
    print(f" Hold rate: {rates['hold_rate']:.3f} ± {rates['hold_se']:.3f}")
    print(f" Support rate: {rates['support_rate']:.3f} ± {rates['support_se']:.3f}")
    print(f" Move rate: {rates['move_rate']:.3f} ± {rates['move_se']:.3f}")
    print(f" Convoy rate: {rates['convoy_rate']:.3f} ± {rates['convoy_se']:.3f}")


def _percent_change(value, reference):
    """Return the change of ``value`` relative to ``reference`` in percent.

    NaN is returned when the reference is zero or not finite, because the
    relative change is undefined in that case.
    """
    if not np.isfinite(reference) or reference == 0:
        return float('nan')
    return (value - reference) / reference * 100


def _classify_experiment(exp_name):
    """Derive ``(model, version)`` from an experiment name.

    Returns ``None`` when the name mentions none of the known models. All
    substring checks are case-insensitive.
    """
    lowered = exp_name.lower()
    model = next((name for key, name in _MODEL_KEYWORDS if key in lowered), None)
    if model is None:
        return None

    if 'baseline' in lowered:
        version = 'Baseline'
    elif '_v3_' in lowered:
        version = 'V3'
    elif '_v2_' in lowered:
        version = 'V2'
    else:
        # Explicit '_v1_' names and unversioned first interventions are both V1.
        version = 'V1'
    return model, version


def _label_bars(ax, bars, fmt, offset, min_height=None):
    """Write each bar's height above it, skipping NaN and too-small bars."""
    for bar in bars:
        height = bar.get_height()
        if not np.isfinite(height):
            continue
        if min_height is not None and not height > min_height:
            continue
        ax.annotate(format(height, fmt),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, offset),
                    textcoords="offset points",
                    ha='center', va='bottom',
                    fontsize=8)


def _plot_model_comparison(models):
    """Save one grouped bar chart per model comparing its versions."""
    n_models = len(models)
    # One panel per model actually present; squeeze=False keeps `axes` 2-D
    # even for a single model.
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 6), squeeze=False)

    width = 0.25
    series = (
        ('hold', 'Hold', '#ff7f0e', -width),
        ('support', 'Support', '#2ca02c', 0.0),
        ('move', 'Move', '#1f77b4', width),
    )

    for ax, (model, versions) in zip(axes[0], sorted(models.items())):
        ordered = [(v, versions[v]) for v in _VERSION_ORDER if v in versions]
        version_names = [name for name, _ in ordered]
        x = np.arange(len(version_names))

        bar_groups = []
        for key, label, color, shift in series:
            bar_groups.append(
                ax.bar(x + shift,
                       [rates[f'{key}_rate'] for _, rates in ordered],
                       width,
                       yerr=[rates[f'{key}_se'] for _, rates in ordered],
                       label=label, capsize=3, color=color)
            )

        ax.set_xlabel('Version')
        ax.set_ylabel('Orders per Unit')
        ax.set_title(f'{model} - Hold Reduction Progression')
        ax.set_xticks(x)
        ax.set_xticklabels(version_names)
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        ax.set_ylim(0, 1.0)

        for bars in bar_groups:
            # Only label visible bars
            _label_bars(ax, bars, '.2f', 2, min_height=0.02)

    fig.suptitle('Hold Reduction Experiment Results Across Models', fontsize=16, y=1.02)
    fig.tight_layout()
    fig.savefig('experiments/hold_reduction_all_models_comparison.png', dpi=150, bbox_inches='tight')
    plt.close(fig)  # release the figure once it is on disk
    print("\nComparison plot saved to experiments/hold_reduction_all_models_comparison.png")


def _save_model_results_csv(models):
    """Write one CSV row per (model, version) with rates and standard errors."""
    csv_data = [
        {
            'Model': model,
            'Version': version,
            'Hold_Rate': rates['hold_rate'],
            'Hold_SE': rates['hold_se'],
            'Support_Rate': rates['support_rate'],
            'Support_SE': rates['support_se'],
            'Move_Rate': rates['move_rate'],
            'Move_SE': rates['move_se'],
            'N_Phases': rates['n_phases'],
        }
        for model, versions in models.items()
        for version, rates in versions.items()
    ]
    df = pd.DataFrame(csv_data).sort_values(['Model', 'Version'])
    df.to_csv('experiments/hold_reduction_all_results.csv', index=False)
    print("Results saved to experiments/hold_reduction_all_results.csv")


def _print_hold_rate_summary(models):
    """Print each version's hold rate and its change relative to the baseline."""
    print("\n" + "=" * 60)
    print("SUMMARY: Hold Rate Changes from Baseline")
    print("=" * 60)
    for model in sorted(models):
        print(f"\n{model}:")
        if 'Baseline' not in models[model]:
            continue
        baseline = models[model]['Baseline']['hold_rate']
        for version in ['V1', 'V2', 'V3']:
            if version in models[model]:
                rate = models[model][version]['hold_rate']
                change = _percent_change(rate, baseline)
                print(f" {version}: {rate:.3f} ({change:+.1f}% from baseline)")


def _run_named_experiments(exp_paths):
    """Analyse the experiment directories given on the command line."""
    experiments = []
    for exp_path in exp_paths:
        exp_dir = Path(exp_path)
        if exp_dir.exists():
            experiments.append((exp_dir.name, exp_dir))
        else:
            # Report the skip so a typo does not silently shrink the analysis.
            print(f"Warning: skipping missing experiment directory {exp_path}")

    print(f"Analyzing {len(experiments)} experiments")
    print("=" * 50)

    results = {}
    for exp_name, exp_dir in experiments:
        print(f"\nAnalyzing {exp_name}...")
        rates = calculate_rates(analyze_orders_for_experiment(exp_dir))
        results[exp_name] = rates
        _print_rates(exp_name, rates)

    # NOTE(review): the cross-model comparison is only produced for MORE than
    # two experiments; confirm that exactly two should really be excluded.
    if len(results) <= 2:
        return

    # Group by model, then by version.
    models = {}
    for exp_name, rates in results.items():
        classified = _classify_experiment(exp_name)
        if classified is None:
            continue
        model, version = classified
        by_version = models.setdefault(model, {})
        if version in by_version:
            print(f"Warning: {exp_name} replaces an earlier {model} {version} result")
        by_version[version] = rates

    if not models:
        # Nothing to plot or tabulate; sorting an empty frame by 'Model'
        # would raise.
        print("\nNo experiment name matched a known model; skipping comparison outputs.")
        return

    Path(_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    _plot_model_comparison(models)
    _save_model_results_csv(models)
    _print_hold_rate_summary(models)


def _run_baseline_vs_intervention():
    """Compare the default baseline and intervention experiments."""
    baseline_dir = Path("experiments/hold_reduction_baseline_S1911M")
    intervention_dir = Path("experiments/hold_reduction_intervention_S1911M")

    print("Analyzing Hold Reduction Experiment")
    print("=" * 50)

    print("\nAnalyzing baseline experiment...")
    baseline_rates = calculate_rates(analyze_orders_for_experiment(baseline_dir))
    _print_rates("Baseline", baseline_rates)

    print("\nAnalyzing intervention experiment...")
    intervention_rates = calculate_rates(analyze_orders_for_experiment(intervention_dir))
    _print_rates("Intervention", intervention_rates)

    print("\nChanges from Baseline to Intervention:")
    hold_change = _percent_change(intervention_rates['hold_rate'], baseline_rates['hold_rate'])
    support_change = _percent_change(intervention_rates['support_rate'], baseline_rates['support_rate'])
    move_change = _percent_change(intervention_rates['move_rate'], baseline_rates['move_rate'])

    print(f" Hold rate: {hold_change:+.1f}%")
    print(f" Support rate: {support_change:+.1f}%")
    print(f" Move rate: {move_change:+.1f}%")

    # Make sure the output directory exists before anything is written.
    Path(_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    order_types = ['hold', 'support', 'move', 'convoy']
    baseline_means = [baseline_rates[f'{t}_rate'] for t in order_types]
    baseline_errors = [baseline_rates[f'{t}_se'] for t in order_types]
    intervention_means = [intervention_rates[f'{t}_rate'] for t in order_types]
    intervention_errors = [intervention_rates[f'{t}_se'] for t in order_types]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(4)
    width = 0.35

    bars1 = ax.bar(x - width / 2, baseline_means, width, yerr=baseline_errors,
                   label='Baseline', capsize=5)
    bars2 = ax.bar(x + width / 2, intervention_means, width, yerr=intervention_errors,
                   label='Hold Reduction', capsize=5)

    ax.set_xlabel('Order Type')
    ax.set_ylabel('Orders per Unit')
    ax.set_title('Hold Reduction Experiment: Order Type Distribution')
    ax.set_xticks(x)
    ax.set_xticklabels(['Hold', 'Support', 'Move', 'Convoy'])
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    for bars in [bars1, bars2]:
        _label_bars(ax, bars, '.3f', 3)  # 3 points vertical offset

    fig.tight_layout()
    fig.savefig('experiments/hold_reduction_analysis.png', dpi=150)
    plt.close(fig)
    print("\nPlot saved to experiments/hold_reduction_analysis.png")

    results_df = pd.DataFrame({
        'Experiment': ['Baseline', 'Intervention'],
        'Hold_Rate': [baseline_rates['hold_rate'], intervention_rates['hold_rate']],
        'Support_Rate': [baseline_rates['support_rate'], intervention_rates['support_rate']],
        'Move_Rate': [baseline_rates['move_rate'], intervention_rates['move_rate']],
        'Convoy_Rate': [baseline_rates['convoy_rate'], intervention_rates['convoy_rate']],
        'N_Phases': [baseline_rates['n_phases'], intervention_rates['n_phases']],
    })
    results_df.to_csv('experiments/hold_reduction_results.csv', index=False)
    print("Results saved to experiments/hold_reduction_results.csv")


def main():
    """Run the hold-reduction analysis.

    With command-line arguments, each argument is treated as an experiment
    directory: per-experiment rates are printed and, for more than two
    experiments, a per-model comparison plot, CSV and summary are written
    under ``experiments/``. Without arguments, the default baseline and
    intervention experiments are compared and their plot and CSV are written
    under ``experiments/``.
    """
    import sys

    if len(sys.argv) > 1:
        _run_named_experiments(sys.argv[1:])
        return

    _run_baseline_vs_intervention()
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()