reasoning-gym/training/evaluations/inter_generalisation/cognition.yaml
Oliver Stanley 10863ea12b
inter-domain generalisation evaluation configs (#424)
* add inter-domain generalisation eval config for algebra

* add algorithmic eval cfg

* vllm infer

* add arithmetic eval cfg

* add geometry eval cfg

* add arc cfg

* add games eval cfg

* add cognition eval cfg

* add graphs eval cfg
2025-04-22 17:32:35 +01:00

44 lines
1.1 KiB
YAML

# Config used for evaluating inter-domain generalisation experiment models
# on cognition test data.
# Models evaluated on this config:
#   Qwen/Qwen2.5-3B-Instruct (original model)
#   inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data)

# Change to the model to be evaluated
model_path: ../models/inter_logic_qwen_3b_400

# Sampling parameters
max_tokens: 2048  # From max_response_length in training config
top_p: 0.9  # From rollout top_p
temperature: 0.6  # Lower temperature for more focused responses

# Prompt setup
# NOTE(review): DeepSeekZero is presumably the name of a prompt template
# known to the evaluation script -- confirm against the consumer.
developer_prompt: DeepSeekZero
developer_role: system

# Output options
output_dir: results
save_metadata: true
save_full_results: true

# NOTE(review): presumably the number of times each evaluation is repeated --
# confirm against the evaluation script.
eval_repeats: 3

# Datasets to evaluate, grouped by category. Every dataset below uses the
# same size (100) and seed (42).
# NOTE(review): size/seed are presumably the number of generated examples and
# the generator seed -- confirm against the dataset loader.
categories:
  - category: cognition
    datasets:
      - dataset: color_cube_rotation
        size: 100
        seed: 42
      - dataset: figlet_font
        size: 100
        seed: 42
      - dataset: modulo_grid
        size: 100
        seed: 42
      - dataset: needle_haystack
        size: 100
        seed: 42
      - dataset: number_sequence
        size: 100
        seed: 42
      - dataset: rectangle_count
        size: 100
        seed: 42
      - dataset: rubiks_cube
        size: 100
        seed: 42