mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-04-29 17:35:12 +00:00
Fix horizon bug, multi-provider support, add Sonnet vs Gemini benchmark results
Bug fixes:
- CLI --horizon-years defaulted to 3, silently overriding config presets. Now defaults to None so config value (1yr for medium/hard/nightmare) is used.
- Runtime passed a single api_key kwarg regardless of provider, breaking Gemini. Now lets LiteLLM resolve keys from provider-specific env vars.
- Removed temperature+top_p from LLM calls (Anthropic rejects both together).
- DB and result filenames now include config name to prevent cross-config collisions.

Benchmark results (1yr horizon, 3 seeds each):
Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3
Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3

Gemini has higher win rates (93-98% vs 40-83% on medium). Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K).

New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py
Updated README with detailed comparison tables and failure analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d1d7bc97b5
commit
5d2962073d
38 changed files with 16654 additions and 34 deletions
|
|
@ -64,11 +64,12 @@ class LiteLLMRuntime(AgentRuntime):
|
|||
if self._retry_backoff_seconds <= 0:
|
||||
raise ValueError("retry_backoff_seconds must be > 0")
|
||||
|
||||
# API key: prefer OPENAI_API_KEY, fall back to OPENROUTER_API_KEY.
|
||||
# For openrouter/ prefixed models LiteLLM also reads OPENROUTER_API_KEY
|
||||
# automatically, so either approach works.
|
||||
# API key: check provider-specific env vars, then generic fallbacks.
|
||||
# LiteLLM reads these natively for their respective providers, so the
|
||||
# completion call no longer passes this key explicitly via kwargs.
|
||||
self._api_key = (
|
||||
os.environ.get("OPENAI_API_KEY")
|
||||
os.environ.get("ANTHROPIC_API_KEY")
|
||||
or os.environ.get("OPENAI_API_KEY")
|
||||
or os.environ.get("OPENROUTER_API_KEY")
|
||||
or None
|
||||
)
|
||||
|
|
@ -159,14 +160,13 @@ class LiteLLMRuntime(AgentRuntime):
|
|||
messages=messages,
|
||||
tools=[_RUN_COMMAND_TOOL],
|
||||
tool_choice="auto",
|
||||
temperature=self._settings.temperature,
|
||||
top_p=self._settings.top_p,
|
||||
timeout=self._request_timeout_seconds,
|
||||
)
|
||||
if self._api_base:
|
||||
kwargs["api_base"] = self._api_base
|
||||
if self._api_key:
|
||||
kwargs["api_key"] = self._api_key
|
||||
# Let LiteLLM resolve API keys from provider-specific env vars
|
||||
# (ANTHROPIC_API_KEY, GEMINI_API_KEY, OPENROUTER_API_KEY, etc.)
|
||||
# rather than passing a single key that may not match the provider.
|
||||
|
||||
response = litellm.completion(**kwargs)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue