mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
e2b testing
This commit is contained in:
parent
b47bf882ce
commit
2ce450486d
1 changed files with 167 additions and 31 deletions
|
|
@ -2,23 +2,32 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import abc\n",
|
||||
"import asyncio\n",
|
||||
"from collections import defaultdict\n",
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import random\n",
|
||||
"from random import Random\n",
|
||||
"import re\n",
|
||||
"import signal\n",
|
||||
"from typing import Union\n",
|
||||
"\n",
|
||||
"import aiohttp\n",
|
||||
"import datasets\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import numpy as np\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"from tenacity import (\n",
|
||||
|
|
@ -28,7 +37,11 @@
|
|||
" wait_exponential,\n",
|
||||
")\n",
|
||||
"import torch\n",
|
||||
"from tqdm.notebook import tqdm"
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"from e2b_code_interpreter import Sandbox\n",
|
||||
"from e2b import TimeoutException\n",
|
||||
"\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -351,8 +364,6 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"SYSTEM_PROMPT = \"\"\"You are a helpful assistant that generates valid Python functions that act as input generators for a given code snippet.\n",
|
||||
"\n",
|
||||
"You have access to `random.Random`, therefore you SHOULD NOT import it again. You should use this random number generator to make the input generation process stochastic on each call.\n",
|
||||
|
|
@ -481,9 +492,26 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Filter out invalid input generators\n",
|
||||
"## Filter out invalid input generators"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to install a template with custom packages, see:\n",
|
||||
"\n",
|
||||
"**NOTE**: The code below is buggy because we have a memory leak (I think) - every time you run the `exec` with some code snippet, it stores the variables in the global scope. Over time, this will consume all the memory. And besides, running `exec` on untrusted code is not smart."
|
||||
"https://e2b.dev/docs/quickstart/install-custom-packages\n",
|
||||
"\n",
|
||||
"An example e2b.Dockerfile looks like this:\n",
|
||||
"\n",
|
||||
"```Dockerfile\n",
|
||||
"FROM e2bdev/code-interpreter:latest\n",
|
||||
"\n",
|
||||
"RUN pip install numpy matplotlib scipy pandas scikit-learn sympy networkx requests pillow bs4 cryptography spacy numba pyyaml regex\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"However, I am going with the default pre-installed libraries: https://e2b.dev/docs/code-interpreting/analyze-data-with-ai/pre-installed-libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -492,33 +520,141 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# def timeout_handler(signum, frame):\n",
|
||||
"# raise TimeoutError(\"Function call timed out\")\n",
|
||||
"# Example usage of the Sandbox class\n",
|
||||
"with Sandbox() as sandbox:\n",
|
||||
"\n",
|
||||
"# def get_input_generator_func(code_sample: str, input_generator_str: str) -> dict:\n",
|
||||
"# env = globals().copy()\n",
|
||||
"# exec(code_sample, env, env)\n",
|
||||
"# exec(input_generator_str, env, env)\n",
|
||||
"# return env['generate_input']\n",
|
||||
" # First initialize the sandbox\n",
|
||||
" execution = sandbox.run_code(\"\"\"\n",
|
||||
"from random import Random # <----- ALWAYS PREPEND THIS LINE TO YOUR CODE SNIPPET\n",
|
||||
"\n",
|
||||
"# def execute_code_sample(code_sample: str, input_dict: dict) -> dict:\n",
|
||||
"# env = globals().copy()\n",
|
||||
"# exec(code_sample, env, env)\n",
|
||||
"# main_solution = env['main_solution']\n",
|
||||
"# return main_solution(**input_dict)\n",
|
||||
"def hello_world():\n",
|
||||
" return {\"a\": 5, \"b\": 10}\n",
|
||||
"\n",
|
||||
"# NUM_INPUT_GENERATE = 1_000 # how many inputs to try and generate\n",
|
||||
"# ALARM_TOLERANCE = 1 # in seconds\n",
|
||||
"# PERCENT_UNIQUE_INPUTS = 0.30 # what fraction of generated inputs should be unique\n",
|
||||
"# PERCENT_UNIQUE_OUTPUTS = 0.30 # what fraction of generated outputs should be unique\n",
|
||||
"def multiple_hello_worlds(rng: Random):\n",
|
||||
" return [\n",
|
||||
" {\"a\": rng.randint(1, 10), \"b\": rng.randint(10, 20)},\n",
|
||||
" {\"a\": 10, \"b\": 20},\n",
|
||||
" ]\n",
|
||||
"\"\"\"\n",
|
||||
" )\n",
|
||||
" try:\n",
|
||||
" # Run the code snippet\n",
|
||||
" execution = sandbox.run_code(\"rng = Random(53);multiple_hello_worlds(rng)\", timeout=5)\n",
|
||||
" print(execution)\n",
|
||||
" if execution.error:\n",
|
||||
" print(\"[!! FOUND ERROR !!]\")\n",
|
||||
" else:\n",
|
||||
" print(type(execution.text))\n",
|
||||
" print(execution.text)\n",
|
||||
" except TimeoutException as e:\n",
|
||||
" print(e)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CODE_TEMPLATE = \"\"\"from random import Random\n",
|
||||
"{code_sample}\n",
|
||||
"\n",
|
||||
"# signal.signal(signal.SIGALRM, timeout_handler)\n",
|
||||
"{input_generator}\n",
|
||||
"\n",
|
||||
"# rng = random.Random()\n",
|
||||
"# rng.seed(42)\n",
|
||||
"def multiple_eval(num_generations: int, seed: int = 42) -> tuple:\n",
|
||||
" rng = Random(seed)\n",
|
||||
" inputs = [generate_input(rng) for _ in range(num_generations)]\n",
|
||||
" outputs = [main_solution(**inp) for inp in inputs]\n",
|
||||
" return inputs, outputs\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# errors = defaultdict(int)\n",
|
||||
"# total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n",
|
||||
"SAMPLING_TEMPLATE = \"multiple_eval({num_generations})\"\n",
|
||||
"\n",
|
||||
"WARMUP_GENERATIONS = 5\n",
|
||||
"TOTAL_GENERATIONS = 1_000\n",
|
||||
"TIMEOUT_CODE_INIT = 10\n",
|
||||
"TIMEOUT_PER_SAMPLE = 2\n",
|
||||
"\n",
|
||||
"errors = defaultdict(int)\n",
|
||||
"total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n",
|
||||
"\n",
|
||||
"with open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\") as f_in, \\\n",
|
||||
" open(\"data/codeio-pyedu-with-input-generator-filtered.jsonl\", \"w+\") as f_out:\n",
|
||||
"\n",
|
||||
" iterator = tqdm(enumerate(f_in), total=total_entries)\n",
|
||||
"\n",
|
||||
" for i, line in iterator:\n",
|
||||
" iterator.set_description(f\"Processing {i}/{total_entries} | \" + \" | \".join(f\"{k}: {v}\" for k, v in errors.items()) + f\" | total: {sum(errors.values())}\")\n",
|
||||
" entry = json.loads(line)\n",
|
||||
"\n",
|
||||
" if not \"input_generator\" in entry:\n",
|
||||
" errors[\"missing_input_generator\"] += 1\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" with Sandbox() as sandbox:\n",
|
||||
" # 1. Initialize the sandbox\n",
|
||||
" try: \n",
|
||||
" execution = sandbox.run_code(\n",
|
||||
" code=CODE_TEMPLATE.format(**entry), \n",
|
||||
" timeout=TIMEOUT_CODE_INIT\n",
|
||||
" )\n",
|
||||
" assert not execution.error, \"Error in code snippet\"\n",
|
||||
" except Exception as e:\n",
|
||||
" errors[\"cannot_initialize_code\"] += 1\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # 2. Warm up the sampling\n",
|
||||
" try:\n",
|
||||
" execution = sandbox.run_code(\n",
|
||||
" code=SAMPLING_TEMPLATE.format(num_generations=WARMUP_GENERATIONS),\n",
|
||||
" timeout=TIMEOUT_PER_SAMPLE * WARMUP_GENERATIONS\n",
|
||||
" )\n",
|
||||
" assert not execution.error, \"Error in input generator (warmup)\"\n",
|
||||
" assert execution.text, \"Empty input generator output (warmup)\"\n",
|
||||
" inputs, outputs = eval(execution.text)\n",
|
||||
" except Exception as e:\n",
|
||||
" errors[\"warmup_fails\"] += 1\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # 3. Run the full sampling\n",
|
||||
" try:\n",
|
||||
" execution = sandbox.run_code(\n",
|
||||
" code=SAMPLING_TEMPLATE.format(num_generations=TOTAL_GENERATIONS),\n",
|
||||
" timeout=TIMEOUT_PER_SAMPLE * TOTAL_GENERATIONS\n",
|
||||
" )\n",
|
||||
" assert not execution.error, \"Error in input generator (full)\"\n",
|
||||
" assert execution.text, \"Empty input generator output (full)\"\n",
|
||||
" inputs, outputs = eval(execution.text)\n",
|
||||
" assert len(inputs) == TOTAL_GENERATIONS, \"Mismatch in input generations\"\n",
|
||||
" assert len(outputs) == TOTAL_GENERATIONS, \"Mismatch in output generations\"\n",
|
||||
" unique_inputs = len(set(hash(json.dumps(inp, sort_keys=True)) for inp in inputs))\n",
|
||||
" unique_outputs = len(set(hash(json.dumps(out, sort_keys=True)) for out in outputs))\n",
|
||||
" except:\n",
|
||||
" errors[\"full_sampling_fails\"] += 1\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # 4. Save the entry\n",
|
||||
" entry = entry | {\n",
|
||||
" \"unique_inputs\": unique_inputs,\n",
|
||||
" \"unique_outputs\": unique_outputs,\n",
|
||||
" \"total_generations\": TOTAL_GENERATIONS,\n",
|
||||
" }\n",
|
||||
" f_out.write(json.dumps(entry))\n",
|
||||
" f_out.write(\"\\n\")\n",
|
||||
"\n",
|
||||
"for k, v in errors.items():\n",
|
||||
" print(f\"{k}: {v}\")\n",
|
||||
"print(f\"Total errors: {sum(errors.values())}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"errors = defaultdict(int)\n",
|
||||
"total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n",
|
||||
"\n",
|
||||
"# with open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\") as f_in, \\\n",
|
||||
"# open(\"data/codeio-pyedu-with-input-generator-filtered.jsonl\", \"w+\") as f_out:\n",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue