diff --git a/eval/README.md b/eval/README.md
index 8b18fec0..f7f89c7c 100644
--- a/eval/README.md
+++ b/eval/README.md
@@ -28,10 +28,19 @@
 pip install -e ..
 pip install -r requirements-eval.txt
 ```
-3. Set your OpenRouter API key as an environment variable:
-```bash
-export OPENROUTER_API_KEY=your-api-key
-```
+3. Set your API key (if required by the API):
+
+   For OpenRouter, you can set it as an environment variable:
+   ```bash
+   export OPENROUTER_API_KEY=your-api-key
+   ```
+
+   Or provide it directly when running the script:
+   ```bash
+   python eval.py --config your_config.yaml --api-key your-api-key
+   ```
+
+   Note: API key is optional for some APIs (e.g., local deployments).
 
 4. Prepare your evaluation configuration in YAML or JSON format (see example in `example_config.yaml`):
 
@@ -132,6 +141,12 @@ For example:
 ```bash
 python eval.py --config example_config.yaml --full-results
 ```
 
+You can specify a different API base URL if needed:
+
+```bash
+python eval.py --config example_config.yaml --base-url "https://api.together.xyz/v1" --api-key "your-together-api-key"
+```
+
 The results will be stored in a directory named after the model and timestamp, containing:
 - `summary.json` - Summary of all results
diff --git a/eval/eval.py b/eval/eval.py
index 5626aa79..999325a3 100755
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -11,6 +11,7 @@ Options:
     --model MODEL         Override model specified in config
     --output-dir DIR      Override output directory specified in config
     --max-concurrent NUM  Maximum number of concurrent API calls
+    --base-url URL        API base URL (default: https://openrouter.ai/api/v1)
     --save-metadata       Save entry metadata in results
     --full-results        Save the full results file
     --verbose             Print detailed model responses
@@ -29,7 +30,7 @@ import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 from eval_config import CategoryConfig, DatasetConfig, EvalConfig
 from openai import AsyncOpenAI
@@ -61,15 +62,25 @@ def get_git_hash() -> str:
 class AsyncModelEvaluator:
     """Evaluates models on reasoning datasets with async API calls via OpenRouter."""
 
-    def __init__(self, config: EvalConfig, verbose: bool = False, debug: bool = False):
+    def __init__(
+        self,
+        config: EvalConfig,
+        api_key: Optional[str] = None,
+        base_url: str = "https://openrouter.ai/api/v1",
+        verbose: bool = False,
+        debug: bool = False,
+    ):
         """Initialize the evaluator with configuration.
 
         Args:
             config: Evaluation configuration
+            api_key: API key for the service (optional for some APIs)
+            base_url: API base URL
             verbose: Whether to print detailed model responses
             debug: Whether to enable debug logging
         """
         self.config = config
+        self.base_url = base_url
         self.verbose = verbose
         self.debug = debug
 
@@ -83,12 +94,8 @@ class AsyncModelEvaluator:
         # Suppress httpx logs in normal mode
         logging.getLogger("httpx").setLevel(logging.WARNING)
 
-        # Set up OpenRouter API client
-        api_key = os.getenv("OPENROUTER_API_KEY")
-        if not api_key:
-            raise ValueError("OPENROUTER_API_KEY environment variable is not set")
-
-        self.client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
+        # Set up API client
+        self.client = AsyncOpenAI(base_url=self.base_url, api_key=api_key)
 
         # Concurrency control
         self.semaphore = asyncio.Semaphore(config.max_concurrent)
@@ -474,6 +481,11 @@ async def main_async():
     parser.add_argument("--model", help="Override model specified in config")
     parser.add_argument("--output-dir", help="Override output directory specified in config")
     parser.add_argument("--max-concurrent", type=int, help="Maximum number of concurrent API calls")
+    parser.add_argument("--base-url", default="https://openrouter.ai/api/v1", help="API base URL")
+    parser.add_argument(
+        "--api-key",
+        help="API key for the service (optional for some APIs, defaults to OPENROUTER_API_KEY env var for OpenRouter URLs)",
+    )
     parser.add_argument("--save-metadata", action="store_true", help="Save entry metadata in results")
     parser.add_argument("--full-results", action="store_true", help="Save the full results file")
     parser.add_argument("--verbose", action="store_true", help="Print detailed model responses")
@@ -481,11 +493,17 @@ async def main_async():
 
     args = parser.parse_args()
 
-    # Check for required API key
-    if not os.getenv("OPENROUTER_API_KEY"):
-        print("Error: OPENROUTER_API_KEY environment variable is not set")
-        print("Please set it using: export OPENROUTER_API_KEY=your-api-key")
-        return 1
+    # Get API key from command line or environment variable
+    api_key = args.api_key
+    if api_key is None:
+        # If base_url is OpenRouter, try to get API key from environment
+        if args.base_url.startswith("https://openrouter.ai/api"):
+            api_key = os.getenv("OPENROUTER_API_KEY")
+            if not api_key:
+                print("Warning: OPENROUTER_API_KEY environment variable is not set")
+                print("Please set it using: export OPENROUTER_API_KEY=your-api-key")
+                print("Or provide it directly with --api-key")
+                print("Continuing without API key...")
 
     # Load configuration
     config_path = args.config
@@ -510,7 +528,9 @@ async def main_async():
         config.save_full_results = True
 
     # Create evaluator
-    evaluator = AsyncModelEvaluator(config=config, verbose=args.verbose, debug=args.debug)
+    evaluator = AsyncModelEvaluator(
+        config=config, api_key=api_key, base_url=args.base_url, verbose=args.verbose, debug=args.debug
+    )
 
     # Run evaluation
     try: