mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-26 17:13:17 +00:00
update training dir with external eval details (#437)
* added games * added llama 3b training conf * update readme with details of external evals * readme update --------- Co-authored-by: joesharratt1229 <joesharratt1229@gmail.com>
This commit is contained in:
parent
5961a10145
commit
add527ada1
5 changed files with 374 additions and 0 deletions
88
training/evaluations/lmeh/gsm8k_cot_rg.yaml
Normal file
88
training/evaluations/lmeh/gsm8k_cot_rg.yaml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
dataset_name: main
|
||||
dataset_path: gsm8k
|
||||
doc_to_target: '{{answer.split(''####'')[-1].strip() if answer is defined else target}}'
|
||||
doc_to_text: "<|im_start|>system\nYou are a helpful AI Assistant that provides well-reasoned and detailed responses.\nYou first think about the reasoning process as an internal monologue and then provide the user with the answer.\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer><|im_end|>\n<|im_start|>user\nGiven the following problem, reason and give a final answer to the problem.\nProblem: {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.<|im_end|>\n<|im_start|>assistant\n"
|
||||
fewshot_config:
|
||||
sampler: first_n
|
||||
samples:
|
||||
- question: There are 15 trees in the grove. Grove workers will plant trees in the
|
||||
grove today. After they are done, there will be 21 trees. How many trees did
|
||||
the grove workers plant today?
|
||||
target: There are 15 trees originally. Then there were 21 trees after some more
|
||||
were planted. So there must have been 21 - 15 = 6. The final answer is 6
|
||||
- question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
|
||||
cars are in the parking lot?
|
||||
target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
|
||||
is 5
|
||||
- question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
|
||||
pieces do they have left in total?
|
||||
target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
|
||||
had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
|
||||
- question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
|
||||
lollipops. How many lollipops did Jason give to Denny?
|
||||
target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
|
||||
So he gave Denny 20 - 12 = 8. The final answer is 8
|
||||
- question: Shawn has five toys. For Christmas, he got two toys each from his mom and
|
||||
dad. How many toys does he have now?
|
||||
target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
|
||||
then that is 4 more toys. 5 + 4 = 9. The final answer is 9
|
||||
- question: There were nine computers in the server room. Five more computers were
|
||||
installed each day, from monday to thursday. How many computers are now in the
|
||||
server room?
|
||||
target: There were originally 9 computers. For each of 4 days, 5 more computers
|
||||
were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
|
||||
29
|
||||
- question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
|
||||
he lost 2 more. How many golf balls did he have at the end of wednesday?
|
||||
target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
|
||||
58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
|
||||
is 33
|
||||
- question: Olivia has $23. She bought five bagels for $3 each. How much money does
|
||||
she have left?
|
||||
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
|
||||
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
|
||||
filter_list:
|
||||
- filter:
|
||||
- function: regex
|
||||
group_select: -1
|
||||
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
|
||||
- function: take_first
|
||||
name: strict-match
|
||||
- filter:
|
||||
- function: regex
|
||||
group_select: -1
|
||||
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
|
||||
- function: take_first
|
||||
name: flexible-extract
|
||||
generation_kwargs:
|
||||
do_sample: true
|
||||
temperature: 0.6
|
||||
top_p: 0.95
|
||||
max_gen_toks: 4096
|
||||
until:
|
||||
- '<|eot_id|>'
|
||||
- '<|start_header_id|>user<|end_header_id|>'
|
||||
- 'Q:'
|
||||
- </s>
|
||||
- <|im_end|>
|
||||
- </answer>
|
||||
tag:
|
||||
- chain_of_thought
|
||||
metadata:
|
||||
version: 3.0
|
||||
metric_list:
|
||||
- aggregation: mean
|
||||
higher_is_better: true
|
||||
ignore_case: true
|
||||
ignore_punctuation: false
|
||||
metric: exact_match
|
||||
regexes_to_ignore:
|
||||
- ','
|
||||
- \$
|
||||
- '(?s).*#### '
|
||||
- \.$
|
||||
num_fewshot: 8
|
||||
output_type: generate_until
|
||||
repeats: 1
|
||||
task: gsm8k_cot_rg
|
||||
test_split: test
|
||||
Loading…
Add table
Add a link
Reference in a new issue