From 4e7d9296eede1db362ab48d4bd6ab7969aca31ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sun, 16 Mar 2025 12:26:27 +0100 Subject: [PATCH] feat: Add comprehensive visualization script for evaluation results (#376) --- eval/requirements-eval.txt | 1 + eval/visualize_results.py | 704 ++++++++++++++++++ ...struct.yaml => llama-3.1-8b-instruct.yaml} | 0 3 files changed, 705 insertions(+) create mode 100644 eval/visualize_results.py rename eval/yaml/{llama-3.3-8b-instruct.yaml => llama-3.1-8b-instruct.yaml} (100%) diff --git a/eval/requirements-eval.txt b/eval/requirements-eval.txt index 8650b14a..2f01c1cf 100644 --- a/eval/requirements-eval.txt +++ b/eval/requirements-eval.txt @@ -1,3 +1,4 @@ openai>=1.64.0 PyYAML>=6.0 tqdm>=4.66.0 +matplotlib>=3.10.0 diff --git a/eval/visualize_results.py b/eval/visualize_results.py new file mode 100644 index 00000000..18f0d084 --- /dev/null +++ b/eval/visualize_results.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +""" +Visualization script for reasoning gym evaluation results. + +This script generates visualizations from evaluation results stored in summary.json files. + +Usage: + python visualize_results.py --results-dir results/ [options] + +Options: + --output-dir DIR Directory to save visualizations (default: visualizations) + --plots PLOTS Comma-separated list of plots to generate (default: all) + Available: radar,bar,violin,heatmap,dashboard,distribution,top_datasets + --top-n N Number of datasets to show in top datasets plot (default: 15) + --top-mode MODE Mode for top datasets plot: hardest, easiest, variable (default: hardest) + --format FORMAT Output format for plots: png, pdf, svg (default: png) + --dpi DPI DPI for output images (default: 300) + --no-show Don't display plots, just save them + --debug Enable debug logging +""" + +import argparse +import json +import logging +import os +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.figure import Figure +from matplotlib.patches import Patch + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) +logger = logging.getLogger("visualize_results") + + +def load_summaries(results_dir: str) -> Dict[str, Dict[str, Any]]: + """Load all summary.json files from subdirectories. 
+ + Args: + results_dir: Directory containing model evaluation results + + Returns: + Dictionary mapping model names to their summary data + """ + summaries = {} + results_path = Path(results_dir) + + if not results_path.exists(): + logger.error(f"Results directory {results_dir} does not exist") + return {} + + # Find all summary.json files + for model_dir in results_path.iterdir(): + if not model_dir.is_dir(): + continue + + summary_path = model_dir / "summary.json" + if not summary_path.exists(): + logger.warning(f"No summary.json found in {model_dir}") + continue + + try: + # Extract model name from directory name (remove timestamp) + model_name = re.sub(r"_\d{8}_\d{6}$", "", model_dir.name) + # Replace underscores with slashes in model name for better display + model_name = model_name.replace("_", "/") + + with open(summary_path, "r") as f: + summary_data = json.load(f) + + # Check if summary has required fields + if "dataset_best_scores" not in summary_data: + logger.warning(f"Summary in {model_dir} is missing required fields") + continue + + summaries[model_name] = summary_data + logger.info(f"Loaded summary for {model_name}") + + except Exception as e: + logger.error(f"Error loading summary from {model_dir}: {str(e)}") + + if not summaries: + logger.error("No valid summary files found") + + return summaries + + +def get_dataset_categories(results_dir: str, summaries: Dict[str, Dict[str, Any]]) -> Dict[str, List[str]]: + """Group datasets by their categories based on directory structure. + + Args: + results_dir: Directory containing model evaluation results + summaries: Dictionary of model summaries + + Returns: + Dictionary mapping category names to lists of dataset names + """ + categories = {} + results_path = Path(results_dir) + + # Get all dataset names from the first summary + if not summaries: + return {} + + first_summary = next(iter(summaries.values())) + all_datasets = set(first_summary["dataset_best_scores"].keys()) + + # Find categories by looking at directory structure + for model_dir in results_path.iterdir(): + if not model_dir.is_dir(): + continue + + # Look for category directories + for category_dir in model_dir.iterdir(): + if not category_dir.is_dir(): + continue + + category_name = category_dir.name + if category_name not in categories: + categories[category_name] = [] + + # Find all dataset JSON files in this category + for dataset_file in category_dir.glob("*.json"): + dataset_name = dataset_file.stem + if dataset_name in all_datasets and dataset_name not in categories[category_name]: + categories[category_name].append(dataset_name) + + # Check if we found categories for all datasets + categorized_datasets = set() + for datasets in categories.values(): + categorized_datasets.update(datasets) + + uncategorized = all_datasets - categorized_datasets + if uncategorized: + logger.warning(f"Found {len(uncategorized)} datasets without categories") + categories["uncategorized"] = list(uncategorized) + + return categories + + +def create_category_radar(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, List[str]]) -> Figure: + """Create a radar chart showing performance by category. 
+ + Args: + summaries: Dictionary of model summaries + categories: Dictionary mapping categories to dataset lists + + Returns: + Matplotlib figure + """ + # Calculate average score per category for each model + category_scores = {} + for model_name, summary in summaries.items(): + category_scores[model_name] = {} + for category, datasets in categories.items(): + scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset in datasets] + if scores: # Avoid division by zero + category_scores[model_name][category] = np.mean(scores) + else: + category_scores[model_name][category] = 0 + + # Create radar chart + categories_list = sorted(categories.keys()) + angles = np.linspace(0, 2 * np.pi, len(categories_list), endpoint=False).tolist() + angles += angles[:1] # Close the loop + + fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(polar=True)) + + # Use a color cycle for different models + colors = plt.cm.tab10.colors + + for i, (model_name, scores) in enumerate(category_scores.items()): + color = colors[i % len(colors)] + values = [scores[cat] for cat in categories_list] + values += values[:1] # Close the loop + + ax.plot(angles, values, linewidth=2, label=model_name, color=color) + ax.fill(angles, values, alpha=0.1, color=color) + + # Set category labels + ax.set_xticks(angles[:-1]) + ax.set_xticklabels(categories_list) + + # Add radial grid lines at 0.2, 0.4, 0.6, 0.8 + ax.set_rticks([0.2, 0.4, 0.6, 0.8]) + ax.set_yticklabels(["0.2", "0.4", "0.6", "0.8"]) + ax.set_rlabel_position(0) # Move radial labels away from plotted line + + # Add legend and title + plt.legend(loc="upper right", bbox_to_anchor=(0.1, 0.1)) + plt.title("Model Performance by Category", size=15) + + return fig + + +def create_overall_performance_bar(summaries: Dict[str, Dict[str, Any]]) -> Figure: + """Create a bar chart of overall model performance. + + Args: + summaries: Dictionary of model summaries + + Returns: + Matplotlib figure + """ + # Calculate overall average score for each model + overall_scores = {} + for model_name, summary in summaries.items(): + scores = list(summary["dataset_best_scores"].values()) + overall_scores[model_name] = np.mean(scores) + + # Sort models by performance + sorted_models = sorted(overall_scores.items(), key=lambda x: x[1], reverse=True) + + # Create bar chart + fig, ax = plt.subplots(figsize=(12, 6)) + + models = [m[0] for m in sorted_models] + scores = [m[1] for m in sorted_models] + + # Use a color gradient based on performance + colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(models))) + + bars = ax.bar(models, scores, color=colors) + + # Add value labels on top of bars + for bar in bars: + height = bar.get_height() + ax.text(bar.get_x() + bar.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", va="bottom") + + ax.set_ylabel("Average Score") + ax.set_ylim(0, max(scores) * 1.1) # Add some space for labels + plt.xticks(rotation=45, ha="right") + + plt.title("Overall Model Performance", size=15) + plt.tight_layout() + + return fig + + +def create_top_datasets_comparison(summaries: Dict[str, Dict[str, Any]], n: int = 15, mode: str = "hardest") -> Figure: + """Create a bar chart comparing performance on top N datasets. 
+ + Args: + summaries: Dictionary of model summaries + n: Number of datasets to show + mode: Selection mode - 'hardest', 'easiest', or 'variable' + + Returns: + Matplotlib figure + """ + if not summaries: + logger.error("No summaries provided") + return plt.figure() + + # Calculate average score across all models for each dataset + dataset_avg_scores = {} + for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys(): + scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()] + dataset_avg_scores[dataset] = np.mean(scores) + + # Select top N datasets based on mode + if mode == "hardest": + # Select datasets with lowest average scores + selected_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1])[:n] + elif mode == "easiest": + # Select datasets with highest average scores + selected_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1], reverse=True)[:n] + else: # 'variable' + # Select datasets with highest variance in scores + dataset_variances = {} + for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys(): + scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()] + dataset_variances[dataset] = np.var(scores) + selected_datasets = sorted(dataset_variances.items(), key=lambda x: x[1], reverse=True)[:n] + selected_datasets = [(dataset, dataset_avg_scores[dataset]) for dataset, _ in selected_datasets] + + # Create horizontal bar chart + fig, ax = plt.subplots(figsize=(12, n * 0.5)) + + datasets = [d[0] for d in selected_datasets] + x = np.arange(len(datasets)) + width = 0.8 / len(summaries) + + # Use a color cycle for different models + colors = plt.cm.tab10.colors + + for i, (model_name, summary) in enumerate(summaries.items()): + scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset, _ in selected_datasets] + ax.barh( + x + i * width - 0.4 + width / 2, scores, width, label=model_name, color=colors[i % len(colors)], alpha=0.8 + ) + + ax.set_yticks(x) + ax.set_yticklabels(datasets) + ax.set_xlabel("Score") + ax.set_xlim(0, 1) + + # Add legend and title + plt.legend(loc="upper right") + title = f'Model Performance on {n} {"Hardest" if mode=="hardest" else "Easiest" if mode=="easiest" else "Most Variable"} Datasets' + plt.title(title, size=15) + + plt.tight_layout() + return fig + + +def create_performance_distribution_violin(summaries: Dict[str, Dict[str, Any]]) -> Figure: + """Create a violin plot showing score distribution for each model. 
+ + Args: + summaries: Dictionary of model summaries + + Returns: + Matplotlib figure + """ + # Prepare data for violin plot + data = [] + labels = [] + + for model_name, summary in summaries.items(): + scores = list(summary["dataset_best_scores"].values()) + data.append(scores) + labels.append(model_name) + + # Create violin plot + fig, ax = plt.subplots(figsize=(12, 6)) + + # Use a color cycle + colors = plt.cm.tab10.colors + + parts = ax.violinplot(data, showmeans=True, showmedians=True) + + # Customize violin plot + for i, pc in enumerate(parts["bodies"]): + pc.set_facecolor(colors[i % len(colors)]) + pc.set_alpha(0.7) + + # Add labels + ax.set_xticks(np.arange(1, len(labels) + 1)) + ax.set_xticklabels(labels, rotation=45, ha="right") + ax.set_ylabel("Score Distribution") + ax.set_ylim(0, 1) + + # Add grid for better readability + ax.yaxis.grid(True) + + # Add mean and median to legend + legend_elements = [ + Patch(facecolor="black", edgecolor="black", label="Mean", alpha=0.3), + Patch(facecolor="white", edgecolor="black", label="Median"), + ] + ax.legend(handles=legend_elements, loc="upper right") + + plt.title("Distribution of Scores Across All Datasets", size=15) + plt.tight_layout() + + return fig + + +def create_performance_heatmap(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, List[str]]) -> Figure: + """Create a heatmap of model performance across datasets. + + Args: + summaries: Dictionary of model summaries + categories: Dictionary mapping categories to dataset lists + + Returns: + Matplotlib figure + """ + if not summaries: + logger.error("No summaries provided") + return plt.figure() + + # Get all dataset names + all_datasets = [] + for category, datasets in sorted(categories.items()): + all_datasets.extend(sorted(datasets)) + + models = list(summaries.keys()) + + # Create score matrix + score_matrix = np.zeros((len(models), len(all_datasets))) + + for i, model in enumerate(models): + for j, dataset in enumerate(all_datasets): + score_matrix[i, j] = summaries[model]["dataset_best_scores"].get(dataset, 0) + + # Create heatmap + fig, ax = plt.subplots(figsize=(max(20, len(all_datasets) * 0.25), max(8, len(models) * 0.5))) + + im = ax.imshow(score_matrix, cmap="viridis", aspect="auto", vmin=0, vmax=1) + + # Add colorbar + cbar = ax.figure.colorbar(im, ax=ax) + cbar.ax.set_ylabel("Score", rotation=-90, va="bottom") + + # Set ticks and labels + ax.set_xticks(np.arange(len(all_datasets))) + ax.set_yticks(np.arange(len(models))) + ax.set_xticklabels(all_datasets, rotation=90, fontsize=8) + ax.set_yticklabels(models) + + # Add category separators and labels + current_idx = 0 + for category, datasets in sorted(categories.items()): + if datasets: + # Add vertical line after each category + next_idx = current_idx + len(datasets) + if next_idx < len(all_datasets): + ax.axvline(x=next_idx - 0.5, color="white", linestyle="-", linewidth=2) + + # Add category label + middle_idx = current_idx + len(datasets) / 2 - 0.5 + ax.text( + middle_idx, + -0.5, + category, + ha="center", + va="top", + fontsize=10, + bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"), + ) + + current_idx = next_idx + + # Add grid lines + ax.set_xticks(np.arange(-0.5, len(all_datasets), 1), minor=True) + ax.set_yticks(np.arange(-0.5, len(models), 1), minor=True) + ax.grid(which="minor", color="w", linestyle="-", linewidth=0.5) + + plt.title("Model Performance Heatmap", size=15) + plt.tight_layout() + + return fig + + +def create_dashboard(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, 
List[str]]) -> Figure: + """Create a comprehensive dashboard with multiple visualizations. + + Args: + summaries: Dictionary of model summaries + categories: Dictionary mapping categories to dataset lists + + Returns: + Matplotlib figure + """ + if not summaries: + logger.error("No summaries provided") + return plt.figure() + + fig = plt.figure(figsize=(20, 15)) + + # 1. Overall performance comparison + ax1 = plt.subplot2grid((2, 2), (0, 0)) + models = [] + scores = [] + for model_name, summary in summaries.items(): + models.append(model_name) + scores.append(np.mean(list(summary["dataset_best_scores"].values()))) + + # Sort by performance + sorted_indices = np.argsort(scores)[::-1] + models = [models[i] for i in sorted_indices] + scores = [scores[i] for i in sorted_indices] + + # Use a color gradient based on performance + colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(models))) + + bars = ax1.bar(models, scores, color=colors) + for bar in bars: + height = bar.get_height() + ax1.text( + bar.get_x() + bar.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", va="bottom", fontsize=8 + ) + + ax1.set_ylabel("Average Score") + ax1.set_ylim(0, max(scores) * 1.1) + plt.setp(ax1.get_xticklabels(), rotation=45, ha="right", fontsize=8) + ax1.set_title("Overall Model Performance", size=12) + + # 2. Top 10 hardest datasets comparison + ax2 = plt.subplot2grid((2, 2), (0, 1)) + # Calculate average score across all models for each dataset + dataset_avg_scores = {} + for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys(): + scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()] + dataset_avg_scores[dataset] = np.mean(scores) + + # Select 10 hardest datasets + hardest_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1])[:10] + + datasets = [d[0] for d in hardest_datasets] + x = np.arange(len(datasets)) + width = 0.8 / len(summaries) + + # Use a color cycle for different models + colors = plt.cm.tab10.colors + + for i, (model_name, summary) in enumerate(summaries.items()): + scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset, _ in hardest_datasets] + ax2.barh( + x + i * width - 0.4 + width / 2, scores, width, label=model_name, color=colors[i % len(colors)], alpha=0.8 + ) + + ax2.set_yticks(x) + ax2.set_yticklabels(datasets, fontsize=8) + ax2.set_xlabel("Score") + ax2.set_xlim(0, 1) + ax2.set_title("Performance on 10 Hardest Datasets", size=12) + ax2.legend(fontsize=8) + + # 3. 
Category radar chart + ax3 = plt.subplot2grid((2, 2), (1, 0), polar=True) + + # Calculate average score per category for each model + category_scores = {} + for model_name, summary in summaries.items(): + category_scores[model_name] = {} + for category, datasets in categories.items(): + scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset in datasets] + if scores: # Avoid division by zero + category_scores[model_name][category] = np.mean(scores) + else: + category_scores[model_name][category] = 0 + + # Create radar chart + categories_list = sorted(categories.keys()) + angles = np.linspace(0, 2 * np.pi, len(categories_list), endpoint=False).tolist() + angles += angles[:1] # Close the loop + + for i, (model_name, scores) in enumerate(category_scores.items()): + color = colors[i % len(colors)] + values = [scores.get(cat, 0) for cat in categories_list] + values += values[:1] # Close the loop + + ax3.plot(angles, values, linewidth=2, label=model_name, color=color) + ax3.fill(angles, values, alpha=0.1, color=color) + + ax3.set_xticks(angles[:-1]) + ax3.set_xticklabels(categories_list, fontsize=8) + ax3.set_title("Performance by Category", size=12) + + # 4. Performance distribution violin plot + ax4 = plt.subplot2grid((2, 2), (1, 1)) + data = [] + labels = [] + + for model_name, summary in summaries.items(): + scores = list(summary["dataset_best_scores"].values()) + data.append(scores) + labels.append(model_name) + + parts = ax4.violinplot(data, showmeans=True, showmedians=True) + + for i, pc in enumerate(parts["bodies"]): + pc.set_facecolor(colors[i % len(colors)]) + pc.set_alpha(0.7) + + ax4.set_xticks(np.arange(1, len(labels) + 1)) + ax4.set_xticklabels(labels, rotation=45, ha="right", fontsize=8) + ax4.set_ylabel("Score Distribution") + ax4.set_ylim(0, 1) + ax4.yaxis.grid(True) + ax4.set_title("Distribution of Scores", size=12) + + plt.tight_layout() + plt.suptitle("Model Evaluation Dashboard", size=16, y=0.98) + plt.subplots_adjust(top=0.9) + + return fig + + +def save_figure(fig: Figure, output_dir: str, name: str, fmt: str = "png", dpi: int = 300) -> str: + """Save a figure to a file. 
+ + Args: + fig: Matplotlib figure to save + output_dir: Directory to save the figure + name: Base name for the figure file + fmt: File format (png, pdf, svg) + dpi: DPI for raster formats + + Returns: + Path to the saved file + """ + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Create filename + filename = f"{name}.{fmt}" + filepath = os.path.join(output_dir, filename) + + # Save figure + fig.savefig(filepath, dpi=dpi, bbox_inches="tight") + logger.info(f"Saved {filepath}") + + return filepath + + +def main(): + """Main function.""" + parser = argparse.ArgumentParser(description="Generate visualizations from evaluation results") + parser.add_argument("--results-dir", required=True, help="Directory containing evaluation results") + parser.add_argument("--output-dir", default="visualizations", help="Directory to save visualizations") + parser.add_argument("--plots", default="all", help="Comma-separated list of plots to generate") + parser.add_argument("--top-n", type=int, default=15, help="Number of datasets to show in top datasets plot") + parser.add_argument( + "--top-mode", default="hardest", choices=["hardest", "easiest", "variable"], help="Mode for top datasets plot" + ) + parser.add_argument("--format", default="png", choices=["png", "pdf", "svg"], help="Output format for plots") + parser.add_argument("--dpi", type=int, default=300, help="DPI for output images") + parser.add_argument("--no-show", action="store_true", help="Don't display plots, just save them") + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + args = parser.parse_args() + + # Configure logging + if args.debug: + logger.setLevel(logging.DEBUG) + + # Load summaries + logger.info(f"Loading summaries from {args.results_dir}") + summaries = load_summaries(args.results_dir) + + if not summaries: + logger.error("No valid summaries found. 
Exiting.") + return 1 + + logger.info(f"Found {len(summaries)} model summaries") + + # Get dataset categories + categories = get_dataset_categories(args.results_dir, summaries) + logger.info(f"Found {len(categories)} dataset categories") + + # Determine which plots to generate + if args.plots.lower() == "all": + plots_to_generate = ["radar", "bar", "violin", "heatmap", "dashboard", "top_datasets"] + else: + plots_to_generate = [p.strip().lower() for p in args.plots.split(",")] + + logger.info(f"Generating plots: {', '.join(plots_to_generate)}") + + # Generate and save plots + for plot_type in plots_to_generate: + try: + if plot_type == "radar": + fig = create_category_radar(summaries, categories) + save_figure(fig, args.output_dir, "category_radar", args.format, args.dpi) + + elif plot_type == "bar": + fig = create_overall_performance_bar(summaries) + save_figure(fig, args.output_dir, "overall_performance", args.format, args.dpi) + + elif plot_type == "violin": + fig = create_performance_distribution_violin(summaries) + save_figure(fig, args.output_dir, "score_distribution", args.format, args.dpi) + + elif plot_type == "heatmap": + fig = create_performance_heatmap(summaries, categories) + save_figure(fig, args.output_dir, "performance_heatmap", args.format, args.dpi) + + elif plot_type == "dashboard": + fig = create_dashboard(summaries, categories) + save_figure(fig, args.output_dir, "evaluation_dashboard", args.format, args.dpi) + + elif plot_type == "top_datasets": + fig = create_top_datasets_comparison(summaries, args.top_n, args.top_mode) + save_figure(fig, args.output_dir, f"top_{args.top_n}_{args.top_mode}_datasets", args.format, args.dpi) + + else: + logger.warning(f"Unknown plot type: {plot_type}") + continue + + # Show plot if requested + if not args.no_show: + plt.show() + else: + plt.close(fig) + + except Exception as e: + logger.error(f"Error generating {plot_type} plot: {str(e)}") + if args.debug: + import traceback + + traceback.print_exc() + + logger.info(f"All visualizations saved to {args.output_dir}") + return 0 + + +if __name__ == "__main__": + exit_code = main() + import sys + + sys.exit(exit_code) diff --git a/eval/yaml/llama-3.3-8b-instruct.yaml b/eval/yaml/llama-3.1-8b-instruct.yaml similarity index 100% rename from eval/yaml/llama-3.3-8b-instruct.yaml rename to eval/yaml/llama-3.1-8b-instruct.yaml