From 10863ea12b1adf566c1f42c665efa7b10f85f4a1 Mon Sep 17 00:00:00 2001 From: Oliver Stanley Date: Tue, 22 Apr 2025 17:32:35 +0100 Subject: [PATCH] inter-domain generalisation evaluation configs (#424) * add inter-domain generalisation eval config for algebra * add algorithmic eval cfg * vllm infer * add arithmetic eval cfg * add geometry eval cfg * add arc cfg * add games eval cfg * add cognition eval cfg * add graphs eval cfg --- training/README.md | 5 ++ training/evaluations/evaluate_model.py | 28 +++----- .../inter_generalisation/algebra.yaml | 41 +++++++++++ .../inter_generalisation/algorithmic.yaml | 70 +++++++++++++++++++ .../evaluations/inter_generalisation/arc.yaml | 32 +++++++++ .../inter_generalisation/arithmetic.yaml | 68 ++++++++++++++++++ .../inter_generalisation/cognition.yaml | 44 ++++++++++++ .../inter_generalisation/games.yaml | 48 +++++++++++++ .../inter_generalisation/geometry.yaml | 29 ++++++++ .../inter_generalisation/graphs.yaml | 38 ++++++++++ 10 files changed, 385 insertions(+), 18 deletions(-) create mode 100644 training/evaluations/inter_generalisation/algebra.yaml create mode 100644 training/evaluations/inter_generalisation/algorithmic.yaml create mode 100644 training/evaluations/inter_generalisation/arc.yaml create mode 100644 training/evaluations/inter_generalisation/arithmetic.yaml create mode 100644 training/evaluations/inter_generalisation/cognition.yaml create mode 100644 training/evaluations/inter_generalisation/games.yaml create mode 100644 training/evaluations/inter_generalisation/geometry.yaml create mode 100644 training/evaluations/inter_generalisation/graphs.yaml diff --git a/training/README.md b/training/README.md index 552b8fba..22413a50 100644 --- a/training/README.md +++ b/training/README.md @@ -84,10 +84,15 @@ python utils/load_fsdp_to_hf.py checkpoints/rg-test/intra_reasoning_algorithmic_ ``` # Run evaluations + From here you may want to run evaluations of your trained model. 
In the `training/evaluation` directory there is a script `evaluate_model.py` which you can run to evaluate your trained model on a specific dataset. You specify evaluation parameters in a yaml file. This evaluation can point to either a local or remote model. For example the configuration file `training/evaluation/eval_algorithmic_composite.yaml` specifies the path to a local model which is stored as a Hugging Face checkpoint at `training/utils/qwen3b_500` (note that you have to convert the FSDP checkpoint to a HF checkpoint for the evaluation script to work as shown in the previous step). ## Run the script + +``` export VLLM_ATTENTION_BACKEND=XFORMERS +``` + Navigate to evaluations directory: ``` python evaluate_model.py --config path-to-yaml diff --git a/training/evaluations/evaluate_model.py b/training/evaluations/evaluate_model.py index b7ed1da6..5b51ee48 100644 --- a/training/evaluations/evaluate_model.py +++ b/training/evaluations/evaluate_model.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional import torch import yaml from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer +from vllm import LLM, SamplingParams import reasoning_gym from reasoning_gym.utils import SYSTEM_PROMPTS, extract_answer @@ -82,12 +82,13 @@ class LocalModelEvaluator: self.verbose = verbose # Load model and tokenizer - self.model = AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype=torch.bfloat16 if "cuda" in device else torch.float32, + self.llm = LLM(model=model_path) + self.tokenizer = self.llm.get_tokenizer() + self.sampling_params = SamplingParams( + temperature=config.temperature, + top_p=config.top_p, + max_tokens=config.max_tokens, ) - self.tokenizer = AutoTokenizer.from_pretrained(model_path) - self.model.to(device) self.start_time = datetime.now() # If you have a system prompt, retrieve it from SYSTEM_PROMPTS @@ -110,18 +111,9 @@ class LocalModelEvaluator: # Some Hugging Face chat-friendly models use a convenience method like 
below: prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) - inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) - with torch.no_grad(): - outputs = self.model.generate( - **inputs, - max_new_tokens=self.config.max_tokens, - temperature=self.config.temperature, - top_p=self.config.top_p, - do_sample=True if self.config.temperature > 0 else False, - pad_token_id=self.tokenizer.eos_token_id, - ) - # Decode the *new* tokens only: - response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True).strip() + response = self.llm.generate(prompt, self.sampling_params, use_tqdm=False) + # Extract the text from the response + response = response[0].outputs[0].text if self.verbose: print(f"[Prompt]\n{question}\n[Response]\n{response}\n{'-'*60}") diff --git a/training/evaluations/inter_generalisation/algebra.yaml b/training/evaluations/inter_generalisation/algebra.yaml new file mode 100644 index 00000000..22bd6f4e --- /dev/null +++ b/training/evaluations/inter_generalisation/algebra.yaml @@ -0,0 +1,41 @@ +# Config used for evaluating inter-domain generalisation experiment models on algebra test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data) +# ../models/inter_algorithmic_qwen_3b_500 +model_path: Qwen/Qwen2.5-3B-Instruct # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: algebra + datasets: + - dataset: complex_arithmetic + size: 100 + seed: 42 + - dataset: intermediate_integration + size: 100 + seed: 42 + - dataset: polynomial_equations + size: 
100 + seed: 42 + - dataset: polynomial_multiplication + size: 100 + seed: 42 + - dataset: simple_equations + size: 100 + seed: 42 + - dataset: simple_integration + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/algorithmic.yaml b/training/evaluations/inter_generalisation/algorithmic.yaml new file mode 100644 index 00000000..519eb8cc --- /dev/null +++ b/training/evaluations/inter_generalisation/algorithmic.yaml @@ -0,0 +1,70 @@ +# Config used for evaluating inter-domain generalisation experiment models on algorithmic test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data) + +model_path: ../models/inter_algebra_qwen_3b_500 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: algorithmic + datasets: + - dataset: ab + size: 100 + seed: 42 + - dataset: base_conversion + size: 100 + seed: 42 + - dataset: binary_alternation + size: 100 + seed: 42 + params: + p_solvable: 0.9 + - dataset: binary_matrix + size: 100 + seed: 42 + params: + min_n: 2 + max_n: 6 + - dataset: caesar_cipher + size: 100 + seed: 42 + params: + max_words: 10 + - dataset: cryptarithm + size: 100 + seed: 42 + - dataset: isomorphic_strings + size: 100 + seed: 42 + params: + max_string_length: 8 + - dataset: jugs + size: 100 + seed: 42 + params: + difficulty: 6 + - dataset: rotate_matrix + size: 100 + seed: 42 + params: + min_n: 2 + max_n: 6 + - dataset: string_manipulation + size: 100 + seed: 42 + params: + max_string_length: 15 + max_num_rules: 6 diff --git a/training/evaluations/inter_generalisation/arc.yaml 
b/training/evaluations/inter_generalisation/arc.yaml new file mode 100644 index 00000000..3888eb62 --- /dev/null +++ b/training/evaluations/inter_generalisation/arc.yaml @@ -0,0 +1,32 @@ +# Config used for evaluating inter-domain generalisation experiment models on ARC test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data) + +model_path: ../models/inter_algebra_qwen_3b_500 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: arc + datasets: + - dataset: arc_1d + size: 100 + seed: 42 + - dataset: arc_agi + size: 100 + seed: 42 + - dataset: rearc + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/arithmetic.yaml b/training/evaluations/inter_generalisation/arithmetic.yaml new file mode 100644 index 00000000..4125a956 --- /dev/null +++ b/training/evaluations/inter_generalisation/arithmetic.yaml @@ -0,0 +1,68 @@ +# Config used for evaluating inter-domain generalisation experiment models on arithmetic test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on algorithmic RG data) + +model_path: ../models/inter_algorithmic_qwen_3b_500 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: arithmetic + 
datasets: + - dataset: basic_arithmetic + size: 100 + seed: 42 + - dataset: bitwise_arithmetic + size: 100 + seed: 42 + - dataset: calendar_arithmetic + size: 100 + seed: 42 + - dataset: chain_sum + size: 100 + seed: 42 + - dataset: count_bits + size: 100 + seed: 42 + - dataset: decimal_arithmetic + size: 100 + seed: 42 + - dataset: decimal_chain_sum + size: 100 + seed: 42 + - dataset: dice + size: 100 + seed: 42 + - dataset: fraction_simplification + size: 100 + seed: 42 + - dataset: gcd + size: 100 + seed: 42 + - dataset: lcm + size: 100 + seed: 42 + - dataset: power_function + size: 100 + seed: 42 + - dataset: prime_factorization + size: 100 + seed: 42 + - dataset: products + size: 100 + seed: 42 + - dataset: time_intervals + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/cognition.yaml b/training/evaluations/inter_generalisation/cognition.yaml new file mode 100644 index 00000000..9d25c260 --- /dev/null +++ b/training/evaluations/inter_generalisation/cognition.yaml @@ -0,0 +1,44 @@ +# Config used for evaluating inter-domain generalisation experiment models on cognition test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data) + +model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: cognition + datasets: + - dataset: color_cube_rotation + size: 100 + seed: 42 + - dataset: figlet_font + size: 100 + seed: 42 + - dataset: modulo_grid + size: 100 + seed: 42 + - dataset: needle_haystack + size: 100 + seed: 42 + - dataset: number_sequence + size: 100 + seed: 42 + - dataset: 
rectangle_count + size: 100 + seed: 42 + - dataset: rubiks_cube + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/games.yaml b/training/evaluations/inter_generalisation/games.yaml new file mode 100644 index 00000000..f07a8463 --- /dev/null +++ b/training/evaluations/inter_generalisation/games.yaml @@ -0,0 +1,48 @@ +# Config used for evaluating inter-domain generalisation experiment models on games test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algebra_qwen_3b_500 (original + 500 GRPO steps on algebra RG data) +# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data) + +model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: games + datasets: + - dataset: knight_swap + size: 100 + seed: 42 + - dataset: mahjong_puzzle + size: 100 + seed: 42 + - dataset: maze + size: 100 + seed: 42 + - dataset: mini_sudoku + size: 100 + seed: 42 + - dataset: n_queens + size: 100 + seed: 42 + - dataset: rush_hour + size: 100 + seed: 42 + - dataset: sokoban + size: 100 + seed: 42 + - dataset: tsumego + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/geometry.yaml b/training/evaluations/inter_generalisation/geometry.yaml new file mode 100644 index 00000000..7bd97a80 --- /dev/null +++ b/training/evaluations/inter_generalisation/geometry.yaml @@ -0,0 +1,29 @@ +# Config used for evaluating inter-domain generalisation experiment models on geometry test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_algorithmic_qwen_3b_500 (original + 500 GRPO steps on 
algorithmic RG data) + +model_path: ../models/inter_algorithmic_qwen_3b_500 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: geometry + datasets: + - dataset: advanced_geometry + size: 100 + seed: 42 + - dataset: simple_geometry + size: 100 + seed: 42 diff --git a/training/evaluations/inter_generalisation/graphs.yaml b/training/evaluations/inter_generalisation/graphs.yaml new file mode 100644 index 00000000..c28151d2 --- /dev/null +++ b/training/evaluations/inter_generalisation/graphs.yaml @@ -0,0 +1,38 @@ +# Config used for evaluating inter-domain generalisation experiment models on graphs test data + +# Models evaluated on this config: +# Qwen/Qwen2.5-3B-Instruct (original model) +# inter_logic_qwen_3b_400 (original + 400 GRPO steps on logic RG data) + +model_path: ../models/inter_logic_qwen_3b_400 # Change to the model to be evaluated + +max_tokens: 2048 # From max_response_length in training config +top_p: 0.9 # From rollout top_p +temperature: 0.6 # Lower temperature for more focused responses + +developer_prompt: DeepSeekZero +developer_role: system + +output_dir: results +save_metadata: true +save_full_results: true +eval_repeats: 3 + +categories: + - category: graphs + datasets: + - dataset: course_schedule + size: 100 + seed: 42 + - dataset: family_relationships + size: 100 + seed: 42 + - dataset: largest_island + size: 100 + seed: 42 + - dataset: quantum_lock + size: 100 + seed: 42 + - dataset: shortest_path + size: 100 + seed: 42