filtering

This commit is contained in:
Zafir Stojanovski 2025-02-25 22:21:26 +01:00
parent f19498edb8
commit 8a0423f185

View file

@ -8,11 +8,13 @@
"source": [ "source": [
"import abc\n", "import abc\n",
"import asyncio\n", "import asyncio\n",
"from collections import defaultdict\n",
"import json\n", "import json\n",
"import os\n", "import os\n",
"import random\n", "import random\n",
"from random import Random\n", "from random import Random\n",
"import re\n", "import re\n",
"import signal\n",
"from typing import Union\n", "from typing import Union\n",
"\n", "\n",
"import aiohttp\n", "import aiohttp\n",
@ -345,66 +347,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3a63a9127ee24e039b91c83a714ee994",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/7053 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n",
"Could not find <function>...</function> block in response\n"
]
}
],
"source": [ "source": [
"\n", "\n",
"\n", "\n",
@ -532,21 +477,106 @@
")" ")"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Filter out invalid input generators\n",
"\n",
"**NOTE**: The code below is buggy: it appears to have a memory leak (I think). Every time `exec` is run on a code snippet, the variables that snippet defines are stored in the global scope, so over time this will consume all available memory. Besides, running `exec` on untrusted code is unsafe."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Example of how to execute the generated code\n", "# def timeout_handler(signum, frame):\n",
"# local_dict = {}\n", "# raise TimeoutError(\"Function call timed out\")\n",
"# exec(data['input_generator'], globals(), local_dict)\n",
"# generate_input_func = local_dict['generate_input']\n",
"# rng = random.Random()\n",
"\n", "\n",
"# for i in range(5):\n", "# def get_input_generator_func(code_sample: str, input_generator_str: str) -> dict:\n",
"# random_input = generate_input_func(rng)\n", "# env = globals().copy()\n",
"# print(f\"[{i}]: {random_input}\")" "# exec(code_sample, env, env)\n",
"# exec(input_generator_str, env, env)\n",
"# return env['generate_input']\n",
"\n",
"# def execute_code_sample(code_sample: str, input_dict: dict) -> dict:\n",
"# env = globals().copy()\n",
"# exec(code_sample, env, env)\n",
"# main_solution = env['main_solution']\n",
"# return main_solution(**input_dict)\n",
"\n",
"# NUM_INPUT_GENERATE = 1_000 # how many inputs to try and generate\n",
"# ALARM_TOLERANCE = 1 # in seconds\n",
"# PERCENT_UNIQUE_INPUTS = 0.30 # what fraction of generated inputs should be unique\n",
"# PERCENT_UNIQUE_OUTPUTS = 0.30 # what fraction of generated outputs should be unique\n",
"\n",
"# signal.signal(signal.SIGALRM, timeout_handler)\n",
"\n",
"# rng = random.Random()\n",
"# rng.seed(42)\n",
"\n",
"# errors = defaultdict(int)\n",
"# total_entries = sum(1 for _ in open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\"))\n",
"\n",
"# with open(\"data/codeio-pyedu-with-input-generator.jsonl\", \"r\") as f_in, \\\n",
"# open(\"data/codeio-pyedu-with-input-generator-filtered.jsonl\", \"w+\") as f_out:\n",
"\n",
"# iterator = tqdm(enumerate(f_in), total=total_entries)\n",
"\n",
"# for i, line in iterator:\n",
"# iterator.set_description(f\"Processing {i}/{total_entries} | \" + \" | \".join(f\"{k}: {v}\" for k, v in errors.items()) + f\" | total: {sum(errors.values())}\")\n",
"# entry = json.loads(line)\n",
"# # Check if input generator is present\n",
"# if not \"input_generator\" in entry:\n",
"# errors[\"missing_input_generator\"] += 1\n",
"# continue\n",
" \n",
"# # Check if input generator is valid function\n",
"# try:\n",
"# input_generator_func = get_input_generator_func(entry['code_sample'], entry['input_generator'])\n",
"# except Exception as e:\n",
"# errors[\"cannot_instantiate_input_generator\"] += 1\n",
"# continue\n",
"\n",
"# skip = False\n",
"# seen_inputs, seen_outputs = set(), set()\n",
"\n",
"# for _ in range(NUM_INPUT_GENERATE):\n",
"# try:\n",
"# # Check if you can generate input\n",
"# signal.alarm(ALARM_TOLERANCE)\n",
"# random_input = input_generator_func(rng)\n",
"# signal.alarm(0)\n",
"# seen_inputs.add(hash(json.dumps(random_input)))\n",
"\n",
"# # Check if code snippet can execute with generated input\n",
"# signal.alarm(ALARM_TOLERANCE)\n",
"# random_output = execute_code_sample(entry[\"code_sample\"], random_input)\n",
"# signal.alarm(0)\n",
"# seen_outputs.add(hash(json.dumps(random_output)))\n",
"# except Exception as e:\n",
"# signal.alarm(0)\n",
"# errors[\"unreliable_input_generator\"] += 1\n",
"# skip = True\n",
"# break\n",
"# if skip: \n",
"# continue\n",
" \n",
"# if len(seen_inputs) / NUM_INPUT_GENERATE < PERCENT_UNIQUE_INPUTS:\n",
"# errors[\"insufficient_unique_inputs\"] += 1\n",
"# continue\n",
" \n",
"# if len(seen_outputs) / NUM_INPUT_GENERATE < PERCENT_UNIQUE_OUTPUTS:\n",
"# errors[\"insufficient_unique_outputs\"] += 1\n",
"# continue\n",
"\n",
"# f_out.write(json.dumps(entry))\n",
"# f_out.write(\"\\n\")\n",
"\n",
"# for k, v in errors.items():\n",
"# print(f\"{k}: {v}\")"
] ]
}, },
{ {