mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
add eval demo for generated script
This commit is contained in:
parent
d6c9a534af
commit
7c0509db7a
1 changed file with 52 additions and 22 deletions
|
|
@ -2,13 +2,13 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompt_template = \"\"\"You need to generate python code for a synthetic procedural dataset. The dataset is similar to OpenAI's GSM8K which contains grade-school level math questions in natural language.\n",
|
||||
"\n",
|
||||
"Here is a the SOURCE item from the dataset which you should translate into a python generator:\n",
|
||||
"Here is the SOURCE item which you should translate into a python generator:\n",
|
||||
"\n",
|
||||
"```json\n",
|
||||
"{0}\n",
|
||||
|
|
@ -42,7 +42,7 @@
|
|||
"Your task:\n",
|
||||
"\n",
|
||||
"- Generate reasonable random values for all the variables\n",
|
||||
"- Ensure mathematical consistency (total distance is divisible by distance per interval)\n",
|
||||
"- Ensure mathematical consistency (results of divisions need to be integers)\n",
|
||||
"- Create natural language question and answer texts\n",
|
||||
"- Include metadata about the variables and solution\n",
|
||||
"\n",
|
||||
|
|
@ -118,7 +118,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -132,12 +132,16 @@
|
|||
],
|
||||
"source": [
|
||||
"# Create the OpenRouter client; place your OPENROUTER_API_KEY in a .env file.\n",
|
||||
"# .env contents:\n",
|
||||
"# OPENROUTER_API_KEY=sk-or-v1- ...\n",
|
||||
"\n",
|
||||
"%load_ext dotenv\n",
|
||||
"%dotenv\n",
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"from pathlib import Path\n",
|
||||
"from typing import Any, Iterable, Optional\n",
|
||||
"import json\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from openai.types.chat import ChatCompletion, ChatCompletionMessageParam\n",
|
||||
"import time\n",
|
||||
|
|
@ -174,25 +178,16 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/koepf/code/open-thought/reasoning-gym/notebooks/../../../ml-gsm-symbolic/templates/symbolic\n"
|
||||
"Reading templates from path: /home/koepf/code/open-thought/reasoning-gym/notebooks/../../../ml-gsm-symbolic/templates/symbolic\n",
|
||||
"len of python source: 2389\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'from random import Random\\nfrom typing import Dict, Any\\n\\ndef generate_from_variables(item: str, n1: int, p: int, c1: str, c2: str, c3: str) -> Dict[str, Any]:\\n more_cards = int(p/100 * n1)\\n n2 = n1 + more_cards\\n n3 = n1 + n2\\n total = n3 + n3\\n\\n question = f\"In a set of {item}\\'s cards, there are {n1} {c1} cards, and {p}% more {c2} cards. {c3} cards are as many as the sum of {c1} and {c2} cards. How many cards of all mentioned colors are there?\"\\n\\n answer_cot = f\"There are {p}/100 * {n1} = {more_cards} more {c2} cards than {c1} cards.\\\\n\" \\\\\\n f\"Which means there are {n1} + {more_cards} = {n2} {c2} cards.\\\\n\" \\\\\\n f\"{c3} cards make up to {n1} + {n2} = {n3} cards.\\\\n\" \\\\\\n f\"So in total, there are {n3} + {n3} = {total} cards of different colors.\\\\n\" \\\\\\n f\"#### {total}\"\\n\\n return {\\n \\'question\\': question,\\n \\'answer\\': str(total),\\n \\'answer_cot\\': answer_cot,\\n \\'answer_value\\': total,\\n \\'variables\\': {\\n \\'item\\': item,\\n \\'n1\\': n1,\\n \\'p\\': p,\\n \\'c1\\': c1,\\n \\'c2\\': c2, \\n \\'c3\\': c3,\\n \\'more_cards\\': more_cards,\\n \\'total\\': total\\n }\\n }\\n\\ndef generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:\\n items = [\"magician\", \"artist\", \"chef\", \"scientist\", \"athlete\"]\\n colors = [\"red\", \"blue\", \"green\", \"yellow\", \"purple\", \"orange\"]\\n \\n item = rng.choice(items)\\n c1, c2, c3 = rng.sample(colors, 3)\\n \\n n1 = int(rng.randint(20, int(81 * difficulty)))\\n \\n # Generate p ensuring p/100 * n1 is an integer\\n while True:\\n p = int(rng.randint(20, min(90, int(90 * difficulty))))\\n if (p/100 * n1).is_integer():\\n break\\n \\n result = generate_from_variables(item, n1, p, c1, c2, c3)\\n \\n return {\\n \\'question\\': result[\\'question\\'],\\n \\'answer\\': result[\\'answer\\'],\\n \\'metadata\\': {\\n \\'difficulty\\': difficulty,\\n \\'answer_value\\': result[\\'answer_value\\'],\\n \\'answer_cot\\': 
result[\\'answer_cot\\'],\\n \\'variables\\': result[\\'variables\\']\\n }\\n }\\n\\ndef original_example() -> Dict[str, Any]:\\n return generate_from_variables(\"magician\", 15, 60, \"red\", \"green\", \"yellow\")\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
|
@ -205,9 +200,9 @@
|
|||
" return prompt\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def produce_generator(json_path: Path):\n",
|
||||
" json = json_path.read_text()\n",
|
||||
" user_request = prompt_template.format(json)\n",
|
||||
"def eval_prompt_template(input: str):\n",
|
||||
" \n",
|
||||
" user_request = prompt_template.format(input)\n",
|
||||
" \n",
|
||||
" input_messages = generate_simple_request(user_prompt=user_request)\n",
|
||||
" output = llm_generate(open_router_client, input_messages, sampling_params)\n",
|
||||
|
|
@ -224,13 +219,48 @@
|
|||
"template_files = list(path_to_gsmsym.glob(\"*.json\"))\n",
|
||||
"\n",
|
||||
"# for testing just do it for the first entry\n",
|
||||
"response_text = produce_generator(template_files[0])\n",
|
||||
"response_text = eval_prompt_template(template_files[0].read_text())\n",
|
||||
"\n",
|
||||
"# extract python source section\n",
|
||||
"result_match = re.search(r\"^```.*\\n((.*\\n)+)```\", response_text, flags=re.MULTILINE)\n",
|
||||
"\n",
|
||||
"pytho_source = result_match.group(1)\n",
|
||||
"pytho_source"
|
||||
"\n",
|
||||
"python_source = result_match.group(1)\n",
|
||||
"print(\"len of python source: \", len(python_source))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"In a set of magician's cards, there are 15 red cards, and 60% more green cards. yellow cards are as many as the sum of red and green cards. How many cards of all mentioned colors are there?\n",
|
||||
"In a set of magicians cards, there are 15 red cards, and 60% more green cards. Yellow cards are as many, as the sum of red and green cards. How many cards of all mentioned colors are there?\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# WARNING: We are now executing the LLM response without a sandbox environment!\n",
|
||||
"\n",
|
||||
"scope = {} # eval generated python code here\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" exec(python_source, scope, scope)\n",
|
||||
"except Exception as err:\n",
|
||||
" raise\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"exec(\"output = original_example()\", scope, scope)\n",
|
||||
"generated_data = scope[\"output\"]\n",
|
||||
"print(generated_data['question'])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"original_data = json.loads(template_files[0].read_text())\n",
|
||||
"print(original_data['question'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue