diff --git a/training/evaluations/curriculum/knights_knaves.yaml b/training/evaluations/curriculum/knights_knaves.yaml
index 976442f6..4d084356 100644
--- a/training/evaluations/curriculum/knights_knaves.yaml
+++ b/training/evaluations/curriculum/knights_knaves.yaml
@@ -2,14 +2,17 @@
 # Models evaluated on this config:
 # Qwen/Qwen2.5-3B-Instruct (original model)
-# noncurriculum_kk_qwen_3b_400 (original + 400 GRPO steps on non-curriculum Knights and Knaves data)
-# curriculum_kk_qwen_3b_400 (original + 400 GRPO steps on curriculum Knights and Knaves data)
+# qwen3b_knights-knaves_noncurriculum (original + 300 GRPO steps on non-curriculum Knights and Knaves data)
+# qwen3b_knights-knaves_curriculum (original + 300 GRPO steps on curriculum Knights and Knaves data)
 
-model_path: ../models/curriculum_kk_qwen_3b_400 # Change to the model to be evaluated
+model_path: Qwen/Qwen2.5-3B-Instruct # Default model path
+# model_path: /workspace/reasoning-gym/training/qwen3b_knights-knaves_noncurriculum
+# model_path: /workspace/reasoning-gym/training/qwen3b_knights-knaves_curriculum
 
 max_tokens: 2048 # From max_response_length in training config
-top_p: 0.9 # From rollout top_p
-temperature: 0.6 # Lower temperature for more focused responses
+top_p: 1.0
+temperature: 1.0 # Sampling temperature for evaluation runs
+dtype: bfloat16
 
 developer_prompt: DeepSeekZero
 developer_role: system
 
@@ -17,7 +20,7 @@ developer_role: system
 output_dir: results
 save_metadata: true
 save_full_results: true
-eval_repeats: 3
+eval_repeats: 1
 
 categories:
   - category: logic
@@ -25,6 +28,7 @@ categories:
       - dataset: knights_knaves
         size: 100
         seed: 42
-        n_people: 5
-        depth_constraint: 5
-        width_constraint: 5
+        params:
+          n_people: 5
+          depth_constraint: 3
+          width_constraint: 3
diff --git a/training/evaluations/evaluate_model.py b/training/evaluations/evaluate_model.py
index 5b51ee48..a54b229c 100644
--- a/training/evaluations/evaluate_model.py
+++ b/training/evaluations/evaluate_model.py
@@ -45,6 +45,7 @@ 
class EvalConfig:
     model_path: str
     max_tokens: int
     temperature: float
+    dtype: str
     top_p: float
     output_dir: str
     save_metadata: bool
@@ -82,7 +83,7 @@ class LocalModelEvaluator:
         self.verbose = verbose
 
         # Load model and tokenizer
-        self.llm = LLM(model=model_path)
+        self.llm = LLM(model=model_path, dtype=config.dtype)
         self.tokenizer = self.llm.get_tokenizer()
         self.sampling_params = SamplingParams(
             temperature=config.temperature,
@@ -132,7 +133,6 @@ class LocalModelEvaluator:
                 raw_response = self.get_model_response(entry["question"])
                 model_answer = extract_answer(raw_response)
                 score = dataset.score_answer(answer=model_answer, entry=entry)
-                score = 0.0 if score < 1 else score
                 all_completions.append(
                     {
                         "model_answer": model_answer,
@@ -214,6 +214,7 @@ class LocalModelEvaluator:
                 "duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                 "max_tokens": self.config.max_tokens,
                 "temperature": self.config.temperature,
+                "dtype": self.config.dtype,
                 "top_p": self.config.top_p,
                 "eval_repeats": self.config.eval_repeats,
             },