diff --git a/training/evaluations/curriculum/knights_knaves.yaml b/training/evaluations/curriculum/knights_knaves.yaml
index 976442f6..4d084356 100644
--- a/training/evaluations/curriculum/knights_knaves.yaml
+++ b/training/evaluations/curriculum/knights_knaves.yaml
@@ -2,14 +2,17 @@
 # Models evaluated on this config:
 # Qwen/Qwen2.5-3B-Instruct (original model)
-# noncurriculum_kk_qwen_3b_400 (original + 400 GRPO steps on non-curriculum Knights and Knaves data)
-# curriculum_kk_qwen_3b_400 (original + 400 GRPO steps on curriculum Knights and Knaves data)
+# qwen3b_knights-knaves_noncurriculum (original + 300 GRPO steps on non-curriculum Knights and Knaves data)
+# qwen3b_knights-knaves_curriculum (original + 300 GRPO steps on curriculum Knights and Knaves data)
 
-model_path: ../models/curriculum_kk_qwen_3b_400 # Change to the model to be evaluated
+model_path: Qwen/Qwen2.5-3B-Instruct # Default model path
+# model_path: /workspace/reasoning-gym/training/qwen3b_knights-knaves_noncurriculum
+# model_path: /workspace/reasoning-gym/training/qwen3b_knights-knaves_curriculum
 
 max_tokens: 2048 # From max_response_length in training config
-top_p: 0.9 # From rollout top_p
-temperature: 0.6 # Lower temperature for more focused responses
+top_p: 1.0
+temperature: 1.0 # Sampling temperature for evaluation runs
+dtype: bfloat16
 
 developer_prompt: DeepSeekZero
 developer_role: system
 
@@ -17,7 +20,7 @@ developer_role: system
 output_dir: results
 save_metadata: true
 save_full_results: true
-eval_repeats: 3
+eval_repeats: 1
 
 categories:
   - category: logic
@@ -25,6 +28,7 @@ categories:
       - dataset: knights_knaves
         size: 100
         seed: 42
-        n_people: 5
-        depth_constraint: 5
-        width_constraint: 5
+        params:
+          n_people: 5
+          depth_constraint: 3
+          width_constraint: 3
diff --git a/training/evaluations/evaluate_model.py b/training/evaluations/evaluate_model.py
index 5b51ee48..a54b229c 100644
--- a/training/evaluations/evaluate_model.py
+++ b/training/evaluations/evaluate_model.py
@@ -45,6 +45,7 @@ 
class EvalConfig:
     model_path: str
     max_tokens: int
     temperature: float
+    dtype: str
     top_p: float
     output_dir: str
     save_metadata: bool
@@ -82,7 +83,7 @@ class LocalModelEvaluator:
         self.verbose = verbose
 
         # Load model and tokenizer
-        self.llm = LLM(model=model_path)
+        self.llm = LLM(model=model_path, dtype=config.dtype)
         self.tokenizer = self.llm.get_tokenizer()
         self.sampling_params = SamplingParams(
             temperature=config.temperature,
@@ -132,7 +133,6 @@ class LocalModelEvaluator:
                 raw_response = self.get_model_response(entry["question"])
                 model_answer = extract_answer(raw_response)
                 score = dataset.score_answer(answer=model_answer, entry=entry)
-                score = 0.0 if score < 1 else score
                 all_completions.append(
                     {
                         "model_answer": model_answer,
@@ -214,6 +214,7 @@ class LocalModelEvaluator:
                 "duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                 "max_tokens": self.config.max_tokens,
                 "temperature": self.config.temperature,
+                "dtype": self.config.dtype,
                 "top_p": self.config.top_p,
                 "eval_repeats": self.config.eval_repeats,
             },