fix score function

This commit is contained in:
Zafir Stojanovski 2025-02-15 17:40:37 +01:00
parent 95d86464f2
commit 85b4601b57
3 changed files with 31 additions and 4 deletions

View file

@ -5,7 +5,7 @@ https://github.com/yongchao98/CodeSteer-v1.0/blob/main/create_dataset/create_dat
from dataclasses import dataclass
from random import Random
from typing import Optional
from typing import Dict, Optional
from ..factory import ProceduralDataset, register_dataset
@ -26,6 +26,7 @@ Example
- First, we insert A after ABCD.
- Even though with the newly inserted 'A' we can obtain the substring BCD[A], we can't use it to insert another character.
- Lastly, we insert D after DEAB.
- Therefore, the final answer is DDABCDAEEDEABD (represented as a string, instead of a list of characters).
Given the following string, provide the answer after inserting the characters according to the pattern: {string}
"""
@ -79,12 +80,28 @@ class StringInsertionDataset(ProceduralDataset):
i += 1
return "".join(output)
def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
    """Score a model answer against the oracle answer stored in ``entry``.

    Args:
        answer: The model's answer, or ``None`` if no answer was produced.
        entry: Dataset entry; only ``entry["answer"]`` (the oracle string)
            is read.

    Returns:
        1.0 if ``answer`` equals the oracle string exactly;
        0.5 if ``answer`` is a Python literal (e.g. a list of characters)
        whose elements join to the oracle string;
        0.01 if ``answer`` cannot be parsed as a literal or its elements
        cannot be joined as strings;
        0.0 if ``answer`` is ``None`` or parses but joins to a different
        string.
    """
    # Local import so this method does not depend on a module-level import.
    import ast

    oracle_answer = entry["answer"]
    if answer is None:
        return 0.0
    if answer == oracle_answer:
        return 1.0

    try:
        # SECURITY: the answer is untrusted model output. Parse it with
        # ast.literal_eval instead of eval() so that only plain literals
        # (lists, tuples, strings, ...) are accepted and arbitrary
        # expressions are never executed.
        joined = "".join(ast.literal_eval(answer))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal, or not an iterable of strings.
        return 0.01

    # Right content in the wrong format earns partial credit only.
    return 0.5 if joined == oracle_answer else 0.0
def __getitem__(self, idx: int) -> dict:
"""Generate a single String Insertion question"""
rng = Random(self.seed + idx)
string_length = rng.randint(self.config.min_string_length, self.config.max_string_length)
string = [rng.choice(self.vocabulary) for _ in range(string_length)]
string = "".join(rng.choice(self.vocabulary) for _ in range(string_length))
answer = self._get_answer(string)

View file

@ -8,12 +8,12 @@ SYSTEM_PROMPTS = {
"DeepSeekZero": """A conversation between User and Assistant. The user asks a question, and the Assistant solves it.
The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>
<answer>answer here</answer>
Do not explain your reasoning inside the answer tags, provide only the final answer.
Do not explain your reasoning inside the answer tags, provide only the final answer. When an example is provided, you should strictly follow the format of the output/answer in that example.
""",
"default": """Given a problem, your task is to answer the question by thinking step-by-step in a clear and specific manner.
Once you have thought about the reasoning process, provide the answer in the following format:
<answer>answer here</answer>
Do not explain your reasoning inside the answer tags, provide only the final answer.
Do not explain your reasoning inside the answer tags, provide only the final answer. When an example is provided, you should strictly follow the format of the output/answer in that example.
""",
}

View file

@ -92,3 +92,13 @@ def test_string_insertion_answer():
# No reuse of newly inserted characters
assert dataset._get_answer("ABCDBCD") == "ABCDABCD"
# Test score_answer with correct answer
answer = "AABCDAEEEEEEEBCDEBAAAAA"
entry = {"answer": "AABCDAEEEEEEEBCDEBAAAAA"}
assert dataset.score_answer(answer, entry) == 1.0
# Test score_answer with the correct answer given as a Python list of characters (partial credit)
answer = "['A', 'A', 'B', 'C', 'D', 'A', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'C', 'D', 'E', 'B', 'A', 'A', 'A', 'A', 'A']"
entry = {"answer": "AABCDAEEEEEEEBCDEBAAAAA"}
assert dataset.score_answer(answer, entry) == 0.5