mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-05-01 17:45:20 +00:00
Fix horizon bug, multi-provider support, add Sonnet vs Gemini benchmark results
Bug fixes:
- CLI --horizon-years defaulted to 3, silently overriding config presets. Now defaults to None so config value (1yr for medium/hard/nightmare) is used.
- Runtime passed a single api_key kwarg regardless of provider, breaking Gemini. Now lets LiteLLM resolve keys from provider-specific env vars.
- Removed temperature+top_p from LLM calls (Anthropic rejects both together).
- DB and result filenames now include config name to prevent cross-config collisions.

Benchmark results (1yr horizon, 3 seeds each):
- Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3
- Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3

Gemini has higher win rates (93-98% vs 40-83% on medium). Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K).

New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py

Updated README with detailed comparison tables and failure analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d1d7bc97b5
commit
5d2962073d
38 changed files with 16654 additions and 34 deletions
|
|
@@ -4,6 +4,7 @@ import json
|
|||
import sys
|
||||
from contextlib import contextmanager
|
||||
from decimal import Decimal
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
import typer
|
||||
|
|
@@ -80,7 +81,7 @@ app.add_typer(scratchpad_app, name="scratchpad")
|
|||
def run_command_cli(
|
||||
model: str = typer.Option(..., help="LiteLLM model string (e.g. openrouter/z-ai/glm-5)"),
|
||||
seed: int = typer.Option(..., help="Random seed for deterministic world generation"),
|
||||
horizon_years: int = typer.Option(3, help="Simulation horizon in years"),
|
||||
horizon_years: Optional[int] = typer.Option(None, help="Simulation horizon in years (default from config)"),
|
||||
company_name: str = typer.Option("BenchCo", help="Name of the simulated company"),
|
||||
start_date: str = typer.Option("2025-01-01", help="Simulation start date (YYYY-MM-DD)"),
|
||||
config_name: str = typer.Option(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue