mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
add graphs eval configs
This commit is contained in:
parent
1ab6cf3a12
commit
523d56f019
2 changed files with 43 additions and 1 deletions
40
training/evaluations/curriculum/graphs.yml
Normal file
40
training/evaluations/curriculum/graphs.yml
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Config used for evaluating curriculum experiment models on graphs composite data
|
||||
|
||||
# Models evaluated on this config:
|
||||
# Qwen/Qwen2.5-3B-Instruct (original model)
|
||||
# qwen3b_graphs_noncurriculum_300 (original + 300 GRPO steps on non-curriculum graphs data)
|
||||
# qwen3b_graphs_curriculum_300 (original + 300 GRPO steps on curriculum graphs data)
|
||||
|
||||
model_path: Qwen/Qwen2.5-3B-Instruct # Default model path
|
||||
|
||||
max_tokens: 2048 # From max_response_length in training config
|
||||
top_p: 1.0
|
||||
temperature: 1.0 # Sampling temperature (matches training config; 1.0 = no sharpening)
|
||||
dtype: bfloat16
|
||||
|
||||
developer_prompt: DeepSeekZero
|
||||
developer_role: system
|
||||
|
||||
output_dir: results
|
||||
save_metadata: true
|
||||
save_full_results: true
|
||||
eval_repeats: 1
|
||||
|
||||
categories:
|
||||
- category: graphs
|
||||
datasets:
|
||||
- dataset: course_schedule
|
||||
size: 50
|
||||
seed: 42
|
||||
- dataset: family_relationships
|
||||
size: 50
|
||||
seed: 42
|
||||
- dataset: largest_island
|
||||
size: 50
|
||||
seed: 42
|
||||
- dataset: quantum_lock
|
||||
size: 50
|
||||
seed: 42
|
||||
- dataset: shortest_path
|
||||
size: 50
|
||||
seed: 42
|
||||
|
|
@ -45,6 +45,7 @@ class EvalConfig:
|
|||
model_path: str
|
||||
max_tokens: int
|
||||
temperature: float
|
||||
dtype: str
|
||||
top_p: float
|
||||
output_dir: str
|
||||
save_metadata: bool
|
||||
|
|
@ -82,7 +83,7 @@ class LocalModelEvaluator:
|
|||
self.verbose = verbose
|
||||
|
||||
# Load model and tokenizer
|
||||
self.llm = LLM(model=model_path)
|
||||
self.llm = LLM(model=model_path, dtype=config.dtype)
|
||||
self.tokenizer = self.llm.get_tokenizer()
|
||||
self.sampling_params = SamplingParams(
|
||||
temperature=config.temperature,
|
||||
|
|
@ -214,6 +215,7 @@ class LocalModelEvaluator:
|
|||
"duration_seconds": (datetime.now() - self.start_time).total_seconds(),
|
||||
"max_tokens": self.config.max_tokens,
|
||||
"temperature": self.config.temperature,
|
||||
"dtype": self.config.dtype,
|
||||
"top_p": self.config.top_p,
|
||||
"eval_repeats": self.config.eval_repeats,
|
||||
},
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue