diff --git a/notebooks/codeio.ipynb b/notebooks/codeio.ipynb index 63abb71b..623fff2a 100644 --- a/notebooks/codeio.ipynb +++ b/notebooks/codeio.ipynb @@ -2,23 +2,32 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import abc\n", "import asyncio\n", "from collections import defaultdict\n", "import json\n", "import os\n", - "import random\n", - "from random import Random\n", "import re\n", - "import signal\n", "from typing import Union\n", "\n", "import aiohttp\n", "import datasets\n", + "from dotenv import load_dotenv\n", "import numpy as np\n", "from sentence_transformers import SentenceTransformer\n", "from tenacity import (\n", @@ -28,7 +37,11 @@ " wait_exponential,\n", ")\n", "import torch\n", - "from tqdm.notebook import tqdm" + "from tqdm.notebook import tqdm\n", + "from e2b_code_interpreter import Sandbox\n", + "from e2b import TimeoutException\n", + "\n", + "load_dotenv()" ] }, { @@ -351,8 +364,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "\n", "SYSTEM_PROMPT = \"\"\"You are a helpful assistant that generates valid Python functions that act as input generators for a given code snippet.\n", "\n", "You have access to `random.Random`, therefore you SHOULD NOT import it again. You should use this random number generator to make the input generation process stochastic on each call.\n", @@ -481,9 +492,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Filter out invalid input generators\n", + "## Filter out invalid input generators" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to install a template with custom package\n", "\n", - "**NOTE**: The code below is buggy because we have a memory leak (I think) - every time you run the `exec` with some code snippet, it stores the variables in the global scope. Over time, this will consume all the memory. And besides, running `exec` on untrusted code is not smart." + "https://e2b.dev/docs/quickstart/install-custom-packages\n", + "\n", + "An example e2b.Dockerfile looks like this:\n", + "\n", + "```Dockerfile\n", + "FROM e2bdev/code-interpreter:latest\n", + "\n", + "RUN pip install numpy matplotlib scipy pandas scikit-learn sympy networkx requests pillow bs4 cryptography spacy numba pyyaml regex\n", + "```\n", + "\n", + "However, I am going with the default installed libraries: https://e2b.dev/docs/code-interpreting/analyze-data-with-ai/pre-installed-libraries " ] }, { @@ -492,33 +520,141 @@ "metadata": {}, "outputs": [], "source": [ - "# def timeout_handler(signum, frame):\n", - "# raise TimeoutError(\"Function call timed out\")\n", + "# Example usage of the Sandbox class\n", + "with Sandbox() as sandbox:\n", "\n", - "# def get_input_generator_func(code_sample: str, input_generator_str: str) -> dict:\n", - "# env = globals().copy()\n", - "# exec(code_sample, env, env)\n", - "# exec(input_generator_str, env, env)\n", - "# return env['generate_input']\n", + " # First initialize the sandbox\n", + " execution = sandbox.run_code(\"\"\"\n", + "from random import Random # <----- ALWAYS PREPEND THIS LINE TO YOUR CODE SNIPPET\n", "\n", - "# def execute_code_sample(code_sample: str, input_dict: dict) -> dict:\n", - "# env = globals().copy()\n", - "# exec(code_sample, env, env)\n", - "# main_solution = env['main_solution']\n", - "# return main_solution(**input_dict)\n", + "def hello_world():\n", + " return {\"a\": 5, \"b\": 10}\n", "\n", - "# NUM_INPUT_GENERATE = 1_000 # how many inputs to try and generate\n", - "# ALARM_TOLERANCE = 1 # in seconds\n", - "# PERCENT_UNIQUE_INPUTS = 0.30 # what fraction of generated inputs should be unique\n", - "# PERCENT_UNIQUE_OUTPUTS = 0.30 # what fraction of generated outputs should be unique\n", + "def multiple_hello_worlds(rng: Random):\n", + " return [\n", + " {\"a\": rng.randint(1, 10), \"b\": rng.randint(10, 20)},\n", + " {\"a\": 10, \"b\": 20},\n", + " ]\n", + "\"\"\"\n", + " )\n", + " try:\n", + " # Run the code snippet\n", + " execution = sandbox.run_code(\"rng = Random(53);multiple_hello_worlds(rng)\", timeout=5)\n", + " print(execution)\n", + " if execution.error:\n", + " print(\"[!! FOUND ERROR !!]\")\n", + " else:\n", + " print(type(execution.text))\n", + " print(execution.text)\n", + " except TimeoutException as e:\n", + " print(e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CODE_TEMPLATE = \"\"\"from random import Random\n", + "{code_sample}\n", "\n", - "# signal.signal(signal.SIGALRM, timeout_handler)\n", + "{input_generator}\n", "\n", - "# rng = random.Random()\n", - "# rng.seed(42)\n", + "def multiple_eval(num_generations: int, seed: int = 42) -> tuple:\n", + " rng = Random(seed)\n", + " inputs = [generate_input(rng) for _ in range(num_generations)]\n", + " outputs = [main_solution(**inp) for inp in inputs]\n", + " return inputs, outputs\n", + "\"\"\"\n", "\n", - "# errors = defaultdict(int)\n", - "# total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n", + "SAMPLING_TEMPLATE = \"multiple_eval({num_generations})\"\n", + "\n", + "WARMUP_GENERATIONS = 5\n", + "TOTAL_GENERATIONS = 1_000\n", + "TIMEOUT_CODE_INIT = 10\n", + "TIMEOUT_PER_SAMPLE = 2\n", + "\n", + "errors = defaultdict(int)\n", + "total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n", + "\n", + "with open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\") as f_in, \\\n", + " open(\"data/codeio-pyedu-with-input-generator-filtered.jsonl\", \"w+\") as f_out:\n", + "\n", + " iterator = tqdm(enumerate(f_in), total=total_entries)\n", + "\n", + " for i, line in iterator:\n", + " iterator.set_description(f\"Processing {i}/{total_entries} | \" + \" | \".join(f\"{k}: {v}\" for k, v in errors.items()) + f\" | total: {sum(errors.values())}\")\n", + " entry = json.loads(line)\n", + "\n", + " if not \"input_generator\" in entry:\n", + " errors[\"missing_input_generator\"] += 1\n", + " continue\n", + " \n", + " with Sandbox() as sandbox:\n", + " # 1. Initialize the sandbox\n", + " try: \n", + " execution = sandbox.run_code(\n", + " code=CODE_TEMPLATE.format(**entry), \n", + " timeout=TIMEOUT_CODE_INIT\n", + " )\n", + " assert not execution.error, \"Error in code snippet\"\n", + " except Exception as e:\n", + " errors[\"cannot_initialize_code\"] += 1\n", + " continue\n", + " \n", + " # 2. Warmup the sampling\n", + " try:\n", + " execution = sandbox.run_code(\n", + " code=SAMPLING_TEMPLATE.format(num_generations=WARMUP_GENERATIONS),\n", + " timeout=TIMEOUT_PER_SAMPLE * WARMUP_GENERATIONS\n", + " )\n", + " assert not execution.error, \"Error in input generator (warmup)\"\n", + " assert execution.text, \"Empty input generator output (warmup)\"\n", + " inputs, outputs = eval(execution.text)\n", + " except Exception as e:\n", + " errors[\"warmup_fails\"] += 1\n", + " continue\n", + "\n", + " # 3. Run the full sampling\n", + " try:\n", + " execution = sandbox.run_code(\n", + " code=SAMPLING_TEMPLATE.format(num_generations=TOTAL_GENERATIONS),\n", + " timeout=TIMEOUT_PER_SAMPLE * TOTAL_GENERATIONS\n", + " )\n", + " assert not execution.error, \"Error in input generator (full)\"\n", + " assert execution.text, \"Empty input generator output (full)\"\n", + " inputs, outputs = eval(execution.text)\n", + " assert len(inputs) == TOTAL_GENERATIONS, \"Mismatch in input generations\"\n", + " assert len(outputs) == TOTAL_GENERATIONS, \"Mismatch in output generations\"\n", + " unique_inputs = len(set(hash(json.dumps(inp, sort_keys=True)) for inp in inputs))\n", + " unique_outputs = len(set(hash(json.dumps(out, sort_keys=True)) for out in outputs))\n", + " except:\n", + " errors[\"full_sampling_fails\"] += 1\n", + " continue\n", + " \n", + " # 4. Save the entry\n", + " entry = entry | {\n", + " \"unique_inputs\": unique_inputs,\n", + " \"unique_outputs\": unique_outputs,\n", + " \"total_generations\": TOTAL_GENERATIONS,\n", + " }\n", + " f_out.write(json.dumps(entry))\n", + " f_out.write(\"\\n\")\n", + "\n", + "for k, v in errors.items():\n", + " print(f\"{k}: {v}\")\n", + "print(f\"Total errors: {sum(errors.values())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "errors = defaultdict(int)\n", + "total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n", "\n", "# with open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\") as f_in, \\\n", "# open(\"data/codeio-pyedu-with-input-generator-filtered.jsonl\", \"w+\") as f_out:\n",