inter-domain generalisation evaluation configs (#424)

* add inter-domain generalisation eval config for algebra

* add algorithmic eval cfg

* vllm infer

* add arithmetic eval cfg

* add geometry eval cfg

* add arc cfg

* add games eval cfg

* add cognition eval cfg

* add graphs eval cfg
This commit is contained in:
Oliver Stanley 2025-04-22 17:32:35 +01:00 committed by GitHub
parent 98e976642d
commit 10863ea12b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 385 additions and 18 deletions

View file

@ -0,0 +1,68 @@
# Config used for evaluating inter-domain generalisation experiment models on arithmetic test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data)
model_path: ../models/inter_algorithmic_qwen_3b_500 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: arithmetic
datasets:
- dataset: basic_arithmetic
size: 100
seed: 42
- dataset: bitwise_arithmetic
size: 100
seed: 42
- dataset: calendar_arithmetic
size: 100
seed: 42
- dataset: chain_sum
size: 100
seed: 42
- dataset: count_bits
size: 100
seed: 42
- dataset: decimal_arithmetic
size: 100
seed: 42
- dataset: decimal_chain_sum
size: 100
seed: 42
- dataset: dice
size: 100
seed: 42
- dataset: fraction_simplification
size: 100
seed: 42
- dataset: gcd
size: 100
seed: 42
- dataset: lcm
size: 100
seed: 42
- dataset: power_function
size: 100
seed: 42
- dataset: prime_factorization
size: 100
seed: 42
- dataset: products
size: 100
seed: 42
- dataset: time_intervals
size: 100
seed: 42