Eval script consolidation (#238)

The script now supports:
   - YAML and JSON configurations
   - Dataset-specific parameters
   - Overriding configuration via command line
   - Detailed logging and error handling
This commit is contained in:
Andreas Köpf 2025-02-27 17:39:14 +01:00 committed by GitHub
parent 8a66d2a216
commit 850c1cf6f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
40 changed files with 1111 additions and 670 deletions

33
eval/example_config.yaml Normal file
View file

@@ -0,0 +1,33 @@
# Example configuration for the evaluation script
model: "meta-llama/llama-3.3-70b-instruct"
provider: "Hyperbolic"
output_dir: "results"
max_concurrent: 10
default_size: 20  # Default size for all datasets
default_seed: 42  # Default seed for all datasets

categories:
  - category: "algebra"
    datasets:
      - dataset: "complex_arithmetic"
        params:
          min_real: -10
          max_real: 10
          min_imag: -10
          max_imag: 10

  - category: "arithmetic"
    datasets:
      - dataset: "products"
        size: 10  # presumably overrides default_size for this dataset; confirm
        seed: 43  # presumably overrides default_seed for this dataset; confirm
        params:
          min_digits: 2
          allow_negation: true

      - dataset: "chain_sum"
        size: 12
        seed: 43
        params:
          min_digits: 2
          allow_negation: true