add eval demo for generated script

2026-04-19 12:58:07 +00:00 · 2025-01-29 18:28:17 +01:00 · 2025-01-29 18:28:17 +01:00 · 7c0509db7a
commit 7c0509db7a
parent d6c9a534af
1 changed files with 52 additions and 22 deletions
--- a/notebooks/gsm_symbolic.ipynb
+++ b/notebooks/gsm_symbolic.ipynb
@ -2,13 +2,13 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_template = \"\"\"You need to generate python code for a synthetic procedural dataset. The dataset is similar to OpenAI's GSM8K which contains grade-school level math questions in natural language.\n",
    "\n",
-    "Here is a the SOURCE item from the dataset which you should translate into a python generator:\n",
+    "Here is the SOURCE item which you should translate into a python generator:\n",
    "\n",
    "```json\n",
    "{0}\n",
@ -42,7 +42,7 @@
    "Your task:\n",
    "\n",
    "- Generate reasonable random values for all the variables\n",
-    "- Ensure mathematical consistency (total distance is divisible by distance per interval)\n",
+    "- Ensure mathematical consistency (results of divisions need to be integers)\n",
    "- Create natural language question and answer texts\n",
    "- Include metadata about the variables and solution\n",
    "\n",
@ -118,7 +118,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
@ -132,12 +132,16 @@
   ],
   "source": [
    "# create open-router client, place your OPENROUTER_API_KEY in .env file\n",
+    "# .env contents:\n",
+    "# OPENROUTER_API_KEY=sk-or-v1- ...\n",
+    "\n",
    "%load_ext dotenv\n",
    "%dotenv\n",
    "import os\n",
    "import re\n",
    "from pathlib import Path\n",
    "from typing import Any, Iterable, Optional\n",
+    "import json\n",
    "from openai import OpenAI\n",
    "from openai.types.chat import ChatCompletion, ChatCompletionMessageParam\n",
    "import time\n",
@ -174,25 +178,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "/home/koepf/code/open-thought/reasoning-gym/notebooks/../../../ml-gsm-symbolic/templates/symbolic\n"
+      "Reading templates from path:  /home/koepf/code/open-thought/reasoning-gym/notebooks/../../../ml-gsm-symbolic/templates/symbolic\n",
+      "len of python source:  2389\n"
     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'from random import Random\\nfrom typing import Dict, Any\\n\\ndef generate_from_variables(item: str, n1: int, p: int, c1: str, c2: str, c3: str) -> Dict[str, Any]:\\n    more_cards = int(p/100 * n1)\\n    n2 = n1 + more_cards\\n    n3 = n1 + n2\\n    total = n3 + n3\\n\\n    question = f\"In a set of {item}\\'s cards, there are {n1} {c1} cards, and {p}% more {c2} cards. {c3} cards are as many as the sum of {c1} and {c2} cards. How many cards of all mentioned colors are there?\"\\n\\n    answer_cot = f\"There are {p}/100 * {n1} = {more_cards} more {c2} cards than {c1} cards.\\\\n\" \\\\\\n                 f\"Which means there are {n1} + {more_cards} = {n2} {c2} cards.\\\\n\" \\\\\\n                 f\"{c3} cards make up to {n1} + {n2} = {n3} cards.\\\\n\" \\\\\\n                 f\"So in total, there are {n3} + {n3} = {total} cards of different colors.\\\\n\" \\\\\\n                 f\"#### {total}\"\\n\\n    return {\\n        \\'question\\': question,\\n        \\'answer\\': str(total),\\n        \\'answer_cot\\': answer_cot,\\n        \\'answer_value\\': total,\\n        \\'variables\\': {\\n            \\'item\\': item,\\n            \\'n1\\': n1,\\n            \\'p\\': p,\\n            \\'c1\\': c1,\\n            \\'c2\\': c2, \\n            \\'c3\\': c3,\\n            \\'more_cards\\': more_cards,\\n            \\'total\\': total\\n        }\\n    }\\n\\ndef generate_example(rng: Random, difficulty: float = 1.0) -> Dict[str, Any]:\\n    items = [\"magician\", \"artist\", \"chef\", \"scientist\", \"athlete\"]\\n    colors = [\"red\", \"blue\", \"green\", \"yellow\", \"purple\", \"orange\"]\\n    \\n    item = rng.choice(items)\\n    c1, c2, c3 = rng.sample(colors, 3)\\n    \\n    n1 = int(rng.randint(20, int(81 * difficulty)))\\n    \\n    # Generate p ensuring p/100 * n1 is an integer\\n    while True:\\n        p = int(rng.randint(20, min(90, int(90 * difficulty))))\\n        if (p/100 * n1).is_integer():\\n            break\\n            \\n    
result = generate_from_variables(item, n1, p, c1, c2, c3)\\n    \\n    return {\\n        \\'question\\': result[\\'question\\'],\\n        \\'answer\\': result[\\'answer\\'],\\n        \\'metadata\\': {\\n            \\'difficulty\\': difficulty,\\n            \\'answer_value\\': result[\\'answer_value\\'],\\n            \\'answer_cot\\': result[\\'answer_cot\\'],\\n            \\'variables\\': result[\\'variables\\']\\n        }\\n    }\\n\\ndef original_example() -> Dict[str, Any]:\\n    return generate_from_variables(\"magician\", 15, 60, \"red\", \"green\", \"yellow\")\\n'"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
@ -205,9 +200,9 @@
    "    return prompt\n",
    "    \n",
    "\n",
-    "def produce_generator(json_path: Path):\n",
-    "    json = json_path.read_text()\n",
-    "    user_request = prompt_template.format(json)\n",
+    "def eval_prompt_template(input: str):\n",
+    "    \n",
+    "    user_request = prompt_template.format(input)\n",
    "    \n",
    "    input_messages = generate_simple_request(user_prompt=user_request)\n",
    "    output =  llm_generate(open_router_client, input_messages, sampling_params)\n",
@ -224,13 +219,48 @@
    "template_files = list(path_to_gsmsym.glob(\"*.json\"))\n",
    "\n",
    "# for testing just do it for the first entry\n",
-    "response_text = produce_generator(template_files[0])\n",
+    "response_text = eval_prompt_template(template_files[0].read_text())\n",
    "\n",
    "# extract python source section\n",
    "result_match = re.search(r\"^```.*\\n((.*\\n)+)```\", response_text, flags=re.MULTILINE)\n",
    "\n",
-    "pytho_source = result_match.group(1)\n",
-    "pytho_source"
+    "\n",
+    "python_source = result_match.group(1)\n",
+    "print(\"len of python source: \", len(python_source))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "In a set of magician's cards, there are 15 red cards, and 60% more green cards. yellow cards are as many as the sum of red and green cards. How many cards of all mentioned colors are there?\n",
+      "In a set of magicians cards, there are 15 red cards, and 60% more green cards. Yellow cards are as many, as the sum of red and green cards. How many cards of all mentioned colors are there?\n"
+     ]
+    }
+   ],
+   "source": [
+    "# WARNING: We are now executing the LLM response without a sandbox environment!\n",
+    "\n",
+    "scope = {}  # eval generated python code here\n",
+    "\n",
+    "try:\n",
+    "    exec(python_source, scope, scope)\n",
+    "except Exception as err:\n",
+    "    raise\n",
+    "\n",
+    "\n",
+    "exec(\"output = original_example()\", scope, scope)\n",
+    "generated_data = scope[\"output\"]\n",
+    "print(generated_data['question'])\n",
+    "\n",
+    "\n",
+    "original_data = json.loads(template_files[0].read_text())\n",
+    "print(original_data['question'])\n"
   ]
  }
 ],