inter-domain generalisation evaluation configs (#424)

* add inter-domain generalisation eval config for algebra

* add algorithmic eval cfg

* vllm infer

* add arithmetic eval cfg

* add geometry eval cfg

* add arc cfg

* add games eval cfg

* add cognition eval cfg

* add graphs eval cfg
This commit is contained in:
Oliver Stanley 2025-04-22 17:32:35 +01:00 committed by GitHub
parent 98e976642d
commit 10863ea12b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 385 additions and 18 deletions

View file

@ -84,10 +84,15 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_
```
# Run evaluations
From here you may want to run evaluations of your trained model. In the `training/evaluation` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or a remote model. For example, the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a Hugging Face checkpoint at `training/utils/qwen3b_500` (note that you have to convert the FSDP checkpoint to a HF checkpoint for the evaluation script to work, as shown in the previous step).
## Run the script
```
export VLLM_ATTENTION_BACKEND=XFORMERS
```
Navigate to evaluations directory:
```
python evaluate_model.py --config path-to-yaml

View file

@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
import torch
import yaml
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams
import reasoning_gym
from reasoning_gym.utils import SYSTEM_PROMPTS, extract_answer
@ -82,12 +82,13 @@ class LocalModelEvaluator:
self.verbose = verbose
# Load model and tokenizer
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16 if "cuda" in device else torch.float32,
self.llm = LLM(model=model_path)
self.tokenizer = self.llm.get_tokenizer()
self.sampling_params = SamplingParams(
temperature=config.temperature,
top_p=config.top_p,
max_tokens=config.max_tokens,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model.to(device)
self.start_time = datetime.now()
# If you have a system prompt, retrieve it from SYSTEM_PROMPTS
@ -110,18 +111,9 @@ class LocalModelEvaluator:
# Some Hugging Face chat-friendly models use a convenience method like below:
prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=self.config.max_tokens,
temperature=self.config.temperature,
top_p=self.config.top_p,
do_sample=True if self.config.temperature > 0 else False,
pad_token_id=self.tokenizer.eos_token_id,
)
# Decode the *new* tokens only:
response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True).strip()
response = self.llm.generate(prompt, self.sampling_params, use_tqdm=False)
# Extract the text from the response
response = response[0].outputs[0].text
if self.verbose:
print(f"[Prompt]\n{question}\n[Response]\n{response}\n{'-'*60}")

View file

@ -0,0 +1,41 @@
# Config used for evaluating inter-domain generalisation experiment models on algebra test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data),
# stored locally at ../models/inter_algorithmic_qwen_3b_500
model_path: Qwen/Qwen2.5-3B-Instruct # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: algebra
datasets:
- dataset: complex_arithmetic
size: 100
seed: 42
- dataset: intermediate_integration
size: 100
seed: 42
- dataset: polynomial_equations
size: 100
seed: 42
- dataset: polynomial_multiplication
size: 100
seed: 42
- dataset: simple_equations
size: 100
seed: 42
- dataset: simple_integration
size: 100
seed: 42

View file

@ -0,0 +1,70 @@
# Config used for evaluating inter-domain generalisation experiment models on algorithmic test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data)
model_path: ../models/inter_algebra_qwen_3b_500 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: algorithmic
datasets:
- dataset: ab
size: 100
seed: 42
- dataset: base_conversion
size: 100
seed: 42
- dataset: binary_alternation
size: 100
seed: 42
params:
p_solvable: 0.9
- dataset: binary_matrix
size: 100
seed: 42
params:
min_n: 2
max_n: 6
- dataset: caesar_cipher
size: 100
seed: 42
params:
max_words: 10
- dataset: cryptarithm
size: 100
seed: 42
- dataset: isomorphic_strings
size: 100
seed: 42
params:
max_string_length: 8
- dataset: jugs
size: 100
seed: 42
params:
difficulty: 6
- dataset: rotate_matrix
size: 100
seed: 42
params:
min_n: 2
max_n: 6
- dataset: string_manipulation
size: 100
seed: 42
params:
max_string_length: 15
max_num_rules: 6

View file

@ -0,0 +1,32 @@
# Config used for evaluating inter-domain generalisation experiment models on ARC test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data)
model_path: ../models/inter_algebra_qwen_3b_500 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: arc
datasets:
- dataset: arc_1d
size: 100
seed: 42
- dataset: arc_agi
size: 100
seed: 42
- dataset: rearc
size: 100
seed: 42

View file

@ -0,0 +1,68 @@
# Config used for evaluating inter-domain generalisation experiment models on arithmetic test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data)
model_path: ../models/inter_algorithmic_qwen_3b_500 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: arithmetic
datasets:
- dataset: basic_arithmetic
size: 100
seed: 42
- dataset: bitwise_arithmetic
size: 100
seed: 42
- dataset: calendar_arithmetic
size: 100
seed: 42
- dataset: chain_sum
size: 100
seed: 42
- dataset: count_bits
size: 100
seed: 42
- dataset: decimal_arithmetic
size: 100
seed: 42
- dataset: decimal_chain_sum
size: 100
seed: 42
- dataset: dice
size: 100
seed: 42
- dataset: fraction_simplification
size: 100
seed: 42
- dataset: gcd
size: 100
seed: 42
- dataset: lcm
size: 100
seed: 42
- dataset: power_function
size: 100
seed: 42
- dataset: prime_factorization
size: 100
seed: 42
- dataset: products
size: 100
seed: 42
- dataset: time_intervals
size: 100
seed: 42

View file

@ -0,0 +1,44 @@
# Config used for evaluating inter-domain generalisation experiment models on cognition test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data)
model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: cognition
datasets:
- dataset: color_cube_rotation
size: 100
seed: 42
- dataset: figlet_font
size: 100
seed: 42
- dataset: modulo_grid
size: 100
seed: 42
- dataset: needle_haystack
size: 100
seed: 42
- dataset: number_sequence
size: 100
seed: 42
- dataset: rectangle_count
size: 100
seed: 42
- dataset: rubiks_cube
size: 100
seed: 42

View file

@ -0,0 +1,48 @@
# Config used for evaluating inter-domain generalisation experiment models on games test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data)
# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data)
model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: games
datasets:
- dataset: knight_swap
size: 100
seed: 42
- dataset: mahjong_puzzle
size: 100
seed: 42
- dataset: maze
size: 100
seed: 42
- dataset: mini_sudoku
size: 100
seed: 42
- dataset: n_queens
size: 100
seed: 42
- dataset: rush_hour
size: 100
seed: 42
- dataset: sokoban
size: 100
seed: 42
- dataset: tsumego
size: 100
seed: 42

View file

@ -0,0 +1,29 @@
# Config used for evaluating inter-domain generalisation experiment models on geometry test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data)
model_path: ../models/inter_algorithmic_qwen_3b_500 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: geometry
datasets:
- dataset: advanced_geometry
size: 100
seed: 42
- dataset: simple_geometry
size: 100
seed: 42

View file

@ -0,0 +1,38 @@
# Config used for evaluating inter-domain generalisation experiment models on graphs test data
# Models evaluated on this config:
# Qwen/Qwen2.5-3B-Instruct (original model)
# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data)
model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated
max_tokens: 2048 # From max_response_length in training config
top_p: 0.9 # From rollout top_p
temperature: 0.6 # Lower temperature for more focused responses
developer_prompt: DeepSeekZero
developer_role: system
output_dir: results
save_metadata: true
save_full_results: true
eval_repeats: 3
categories:
- category: graphs
datasets:
- dataset: course_schedule
size: 100
seed: 42
- dataset: family_relationships
size: 100
seed: 42
- dataset: largest_island
size: 100
seed: 42
- dataset: quantum_lock
size: 100
seed: 42
- dataset: shortest_path
size: 100
seed: 42