mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-22 16:49:06 +00:00
461 lines
18 KiB
Text
461 lines
18 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import abc\n",
|
|
"import os\n",
|
|
"from typing import Union\n",
|
|
"import pickle\n",
|
|
"import re\n",
|
|
"import random\n",
|
|
"from random import Random\n",
|
|
"import requests\n",
|
|
"import json\n",
|
|
"from tqdm import tqdm\n",
|
|
"\n",
|
|
"import datasets\n",
|
|
"import numpy as np\n",
|
|
"import torch\n",
|
|
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
|
|
"from sentence_transformers import SentenceTransformer\n",
|
|
"import tqdm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"dataset = datasets.load_dataset(\"hkust-nlp/CodeIO-PyEdu-Reasoning\")['train']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Extract the relevant parts of the prompt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"100%|██████████| 1630607/1630607 [01:20<00:00, 20302.13it/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"There were 1489543 out of 1630607 duplicate entries\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"pattern = re.compile(\n",
|
|
" r'(?s)' # DOTALL so . matches newlines\n",
|
|
" r'You are given a question that requires some input and output variables as follows:\\s*(.*?)'\n",
|
|
" r'\\s*The input and output requirements are as follows:\\s*(.*?)'\n",
|
|
" r'\\s*Given the following.*?Tip: Here is a reference code snippet for this question\\. '\n",
|
|
" r'You can refer to this code to guide your reasoning but not copy spans of code directly\\.\\s*(.*)'\n",
|
|
")\n",
|
|
"\n",
|
|
"seen = set()\n",
|
|
"duplicate = 0\n",
|
|
"\n",
|
|
"with open(\"data/codeio-pyedu-extracted.jsonl\", \"w+\") as f:\n",
|
|
" for i, item in tqdm(enumerate(dataset), total=len(dataset)):\n",
|
|
" match = pattern.search(item[\"prompt\"])\n",
|
|
" if match:\n",
|
|
" # Extract relevant info\n",
|
|
" task_description = match.group(1).strip()\n",
|
|
" input_output_spec = match.group(2).strip()\n",
|
|
" code_sample = match.group(3).strip()\n",
|
|
"\n",
|
|
" # Check if code sample is unique\n",
|
|
" hash_entry = f\"{hash(task_description)}-{hash(input_output_spec)}-{hash(code_sample)}\"\n",
|
|
" if hash_entry in seen:\n",
|
|
" duplicate += 1\n",
|
|
" continue\n",
|
|
" seen.add(hash_entry)\n",
|
|
"\n",
|
|
" # Save to disk\n",
|
|
" json.dump({\n",
|
|
" \"task_description\": task_description,\n",
|
|
" \"input_output_spec\": input_output_spec,\n",
|
|
" \"code_sample\": code_sample\n",
|
|
" }, f)\n",
|
|
" f.write(\"\\n\")\n",
|
|
" else:\n",
|
|
" print(f\"No match found for item {i}\")\n",
|
|
"\n",
|
|
"print(f\"There were {duplicate} out of {len(dataset)} duplicate entries\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Subsample the data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class IdentitySampler:\n",
|
|
" def run(\n",
|
|
" self, features: Union[torch.Tensor, np.ndarray]\n",
|
|
" ) -> Union[torch.Tensor, np.ndarray]:\n",
|
|
" return features\n",
|
|
"\n",
|
|
"\n",
|
|
"class BaseSampler(abc.ABC):\n",
|
|
" def __init__(self, percentage: float):\n",
|
|
" if not 0 < percentage < 1:\n",
|
|
" raise ValueError(\"Percentage value not in (0, 1).\")\n",
|
|
" self.percentage = percentage\n",
|
|
"\n",
|
|
" @abc.abstractmethod\n",
|
|
" def run(\n",
|
|
" self, features: Union[torch.Tensor, np.ndarray]\n",
|
|
" ) -> Union[torch.Tensor, np.ndarray]:\n",
|
|
" pass\n",
|
|
"\n",
|
|
" def _store_type(self, features: Union[torch.Tensor, np.ndarray]) -> None:\n",
|
|
" self.features_is_numpy = isinstance(features, np.ndarray)\n",
|
|
" if not self.features_is_numpy:\n",
|
|
" self.features_device = features.device\n",
|
|
"\n",
|
|
" def _restore_type(self, features: torch.Tensor) -> Union[torch.Tensor, np.ndarray]:\n",
|
|
" if self.features_is_numpy:\n",
|
|
" return features.cpu().numpy()\n",
|
|
" return features.to(self.features_device)\n",
|
|
"\n",
|
|
"\n",
|
|
"class GreedyCoresetSampler(BaseSampler):\n",
|
|
" def __init__(\n",
|
|
" self,\n",
|
|
" percentage: float,\n",
|
|
" device: torch.device,\n",
|
|
" dtype: torch.dtype = torch.float32,\n",
|
|
" dimension_to_project_features_to=128,\n",
|
|
" ):\n",
|
|
" \"\"\"Greedy Coreset sampling base class.\"\"\"\n",
|
|
" super().__init__(percentage)\n",
|
|
"\n",
|
|
" self.device = device\n",
|
|
" self.dtype = dtype\n",
|
|
" self.dimension_to_project_features_to = dimension_to_project_features_to\n",
|
|
"\n",
|
|
" def _reduce_features(self, features):\n",
|
|
" if features.shape[1] == self.dimension_to_project_features_to:\n",
|
|
" return features\n",
|
|
" mapper = torch.nn.Linear(\n",
|
|
" features.shape[1], self.dimension_to_project_features_to, bias=False, dtype=self.dtype,\n",
|
|
" )\n",
|
|
" _ = mapper.to(self.device)\n",
|
|
" features = features.to(self.device)\n",
|
|
" return mapper(features)\n",
|
|
"\n",
|
|
" def run(\n",
|
|
" self, features: Union[torch.Tensor, np.ndarray]\n",
|
|
" ) -> Union[torch.Tensor, np.ndarray]:\n",
|
|
" \"\"\"Subsamples features using Greedy Coreset.\n",
|
|
"\n",
|
|
" Args:\n",
|
|
" features: [N x D]\n",
|
|
" \"\"\"\n",
|
|
" if self.percentage == 1:\n",
|
|
" return features\n",
|
|
" self._store_type(features)\n",
|
|
" if isinstance(features, np.ndarray):\n",
|
|
" features = torch.from_numpy(features)\n",
|
|
" reduced_features = self._reduce_features(features)\n",
|
|
" sample_indices = self._compute_greedy_coreset_indices(reduced_features)\n",
|
|
" return sample_indices\n",
|
|
"\n",
|
|
" @staticmethod\n",
|
|
" def _compute_batchwise_differences(\n",
|
|
" matrix_a: torch.Tensor, matrix_b: torch.Tensor\n",
|
|
" ) -> torch.Tensor:\n",
|
|
" \"\"\"Computes batchwise Euclidean distances using PyTorch.\"\"\"\n",
|
|
" a_times_a = matrix_a.unsqueeze(1).bmm(matrix_a.unsqueeze(2)).reshape(-1, 1)\n",
|
|
" b_times_b = matrix_b.unsqueeze(1).bmm(matrix_b.unsqueeze(2)).reshape(1, -1)\n",
|
|
" a_times_b = matrix_a.mm(matrix_b.T)\n",
|
|
"\n",
|
|
" return (-2 * a_times_b + a_times_a + b_times_b).clamp(0, None).sqrt()\n",
|
|
"\n",
|
|
" def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n",
|
|
" \"\"\"Runs iterative greedy coreset selection.\n",
|
|
"\n",
|
|
" Args:\n",
|
|
" features: [NxD] input feature bank to sample.\n",
|
|
" \"\"\"\n",
|
|
" distance_matrix = self._compute_batchwise_differences(features, features)\n",
|
|
" coreset_anchor_distances = torch.norm(distance_matrix, dim=1)\n",
|
|
"\n",
|
|
" coreset_indices = []\n",
|
|
" num_coreset_samples = int(len(features) * self.percentage)\n",
|
|
"\n",
|
|
" for _ in range(num_coreset_samples):\n",
|
|
" select_idx = torch.argmax(coreset_anchor_distances).item()\n",
|
|
" coreset_indices.append(select_idx)\n",
|
|
"\n",
|
|
" coreset_select_distance = distance_matrix[\n",
|
|
" :, select_idx : select_idx + 1 # noqa E203\n",
|
|
" ]\n",
|
|
" coreset_anchor_distances = torch.cat(\n",
|
|
" [coreset_anchor_distances.unsqueeze(-1), coreset_select_distance], dim=1\n",
|
|
" )\n",
|
|
" coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values\n",
|
|
"\n",
|
|
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
|
|
"\n",
|
|
"\n",
|
|
"class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):\n",
|
|
" def __init__(\n",
|
|
" self,\n",
|
|
" percentage: float,\n",
|
|
" device: torch.device,\n",
|
|
" dtype: torch.dtype = torch.float32,\n",
|
|
" number_of_starting_points: int = 10,\n",
|
|
" dimension_to_project_features_to: int = 128,\n",
|
|
" ):\n",
|
|
" \"\"\"Approximate Greedy Coreset sampling base class.\"\"\"\n",
|
|
" self.number_of_starting_points = number_of_starting_points\n",
|
|
" super().__init__(percentage, device, dtype, dimension_to_project_features_to)\n",
|
|
"\n",
|
|
" def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n",
|
|
" \"\"\"Runs approximate iterative greedy coreset selection.\n",
|
|
"\n",
|
|
" This greedy coreset implementation does not require computation of the\n",
|
|
" full N x N distance matrix and thus requires a lot less memory, however\n",
|
|
" at the cost of increased sampling times.\n",
|
|
"\n",
|
|
" Args:\n",
|
|
" features: [NxD] input feature bank to sample.\n",
|
|
" \"\"\"\n",
|
|
" number_of_starting_points = np.clip(\n",
|
|
" self.number_of_starting_points, None, len(features)\n",
|
|
" )\n",
|
|
" start_points = np.random.choice(\n",
|
|
" len(features), number_of_starting_points, replace=False\n",
|
|
" ).tolist()\n",
|
|
"\n",
|
|
" approximate_distance_matrix = self._compute_batchwise_differences(\n",
|
|
" features, features[start_points]\n",
|
|
" )\n",
|
|
" approximate_coreset_anchor_distances = torch.mean(\n",
|
|
" approximate_distance_matrix, axis=-1\n",
|
|
" ).reshape(-1, 1)\n",
|
|
" coreset_indices = []\n",
|
|
" num_coreset_samples = int(len(features) * self.percentage)\n",
|
|
"\n",
|
|
" with torch.no_grad():\n",
|
|
" for _ in tqdm.tqdm(range(num_coreset_samples), desc=\"Subsampling...\"):\n",
|
|
" select_idx = torch.argmax(approximate_coreset_anchor_distances).item()\n",
|
|
" coreset_indices.append(select_idx)\n",
|
|
" coreset_select_distance = self._compute_batchwise_differences(\n",
|
|
" features, features[select_idx : select_idx + 1] # noqa: E203\n",
|
|
" )\n",
|
|
" approximate_coreset_anchor_distances = torch.cat(\n",
|
|
" [approximate_coreset_anchor_distances, coreset_select_distance],\n",
|
|
" dim=-1,\n",
|
|
" )\n",
|
|
" approximate_coreset_anchor_distances = torch.min(\n",
|
|
" approximate_coreset_anchor_distances, dim=1\n",
|
|
" ).values.reshape(-1, 1)\n",
|
|
"\n",
|
|
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
|
|
"\n",
|
|
"\n",
|
|
"class RandomSampler(BaseSampler):\n",
|
|
" def __init__(self, percentage: float):\n",
|
|
" super().__init__(percentage)\n",
|
|
"\n",
|
|
" def run(\n",
|
|
" self, features: Union[torch.Tensor, np.ndarray]\n",
|
|
" ) -> Union[torch.Tensor, np.ndarray]:\n",
|
|
" \"\"\"Randomly samples input feature collection.\n",
|
|
"\n",
|
|
" Args:\n",
|
|
" features: [N x D]\n",
|
|
" \"\"\"\n",
|
|
" num_random_samples = int(len(features) * self.percentage)\n",
|
|
" subset_indices = np.random.choice(\n",
|
|
" len(features), num_random_samples, replace=False\n",
|
|
" )\n",
|
|
" return torch.tensor(subset_indices, device=features.device, dtype=torch.int64)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# I ran this cell on Google Colab because I don't have a GPU on my local machine,\n",
|
|
"# hence why you see the Google Drive paths\n",
|
|
"\n",
|
|
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
|
"model = SentenceTransformer(\"nomic-ai/modernbert-embed-base\")\n",
|
|
"print(model)\n",
|
|
"\n",
|
|
"def get_entry_info(entry) -> str:\n",
|
|
" return entry['task_description']\n",
|
|
"\n",
|
|
"def get_embeddings(text) -> torch.Tensor:\n",
|
|
" return torch.from_numpy(model.encode(text)).to(torch.bfloat16)\n",
|
|
"\n",
|
|
"embeddings = []\n",
|
|
"\n",
|
|
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\") as f:\n",
|
|
" for line in tqdm.tqdm(f):\n",
|
|
" entry = json.loads(line)\n",
|
|
" entry_info = get_entry_info(entry)\n",
|
|
" embeddings.append(get_embeddings(entry_info))\n",
|
|
"\n",
|
|
"embeddings = torch.stack(embeddings).to(torch.bfloat16).to(device)\n",
|
|
"print(embeddings.shape)\n",
|
|
"\n",
|
|
"sampler = ApproximateGreedyCoresetSampler(\n",
|
|
" percentage=0.05, \n",
|
|
" device=device, \n",
|
|
" dtype=torch.bfloat16,\n",
|
|
" dimension_to_project_features_to=768,\n",
|
|
")\n",
|
|
"subsampled = sampler.run(embeddings)\n",
|
|
"\n",
|
|
"indices = set(subsampled.cpu().tolist())\n",
|
|
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\", \"r\") as f_in, \\\n",
|
|
" open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-best-coverage.jsonl\", \"w+\") as f_out:\n",
|
|
" for i, line in enumerate(f_in):\n",
|
|
" if i in indices:\n",
|
|
" f_out.write(line)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create input generators for each problem separately"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"SYSTEM_PROMPT = \"\"\"You are a helpful assistant that generates valid Python functions that act as input generators for a given code snippet.\n",
|
|
"\n",
|
|
"You have access to `random.Random`, therefore you SHOULD NOT import it again. You should use this random number generator to make the input generation process stochastic on each call.\n",
|
|
"\n",
|
|
"When the user asks you to generate an input for a code snippet, you should strictly respond in the following format:\n",
|
|
"<function>\n",
|
|
"def generate_input(rng: Random) -> dict:\n",
|
|
" # Your code here\n",
|
|
" pass\n",
|
|
"</function>\n",
|
|
"\n",
|
|
"The output of the function should be a dictionary where the keys are the variable names and the values are the generated values.\n",
|
|
"\n",
|
|
"It must contain all the variables that listed in the user's input specification, or more precisely in the `main_solution` function signature. \n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"USER_PROMPT = \"\"\"Following are a task description, input/output specification, and relevant code snippet for a Python programming task.\n",
|
|
"\n",
|
|
"<task_description>\n",
|
|
"{task_description}\n",
|
|
"</task_description>\n",
|
|
"\n",
|
|
"<input_output_spec>\n",
|
|
"{input_output_spec}\n",
|
|
"</input_output_spec>\n",
|
|
"\n",
|
|
"<code_sample>\n",
|
|
"{code_sample}\n",
|
|
"</code_sample>\n",
|
|
"\n",
|
|
"Your task is to write a Python function `generate_input(rng: Random) -> dict` that generates valid inputs for the given code snippet, based on the provided information.\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"with open(\"data/codeio-pyedu-extracted.jsonl\", \"r\") as f:\n",
|
|
" for i in range(1):\n",
|
|
" entry = json.loads(f.readline())\n",
|
|
" response = requests.post(\n",
|
|
" url=\"https://openrouter.ai/api/v1/chat/completions\",\n",
|
|
" headers={\n",
|
|
" \"Authorization\": f\"Bearer {os.getenv('OPENROUTER_API_KEY')}\",\n",
|
|
" \"Content-Type\": \"application/json\",\n",
|
|
" },\n",
|
|
" data = json.dumps({\n",
|
|
" \"model\": \"deepseek/deepseek-chat\",\n",
|
|
" \"messages\": [\n",
|
|
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
|
|
" {\"role\": \"user\", \"content\": USER_PROMPT.format(**entry)}\n",
|
|
" ]\n",
|
|
" })\n",
|
|
" )\n",
|
|
" full_response = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
|
|
" input_generator = re.search(r\"<function>(.*?)</function>\", full_response, re.DOTALL).group(1).strip()\n",
|
|
"\n",
|
|
" # Example of how to execute the generated code\n",
|
|
" # local_dict = {}\n",
|
|
" # exec(input_generator, globals(), local_dict)\n",
|
|
" # generate_input_func = local_dict['generate_input']\n",
|
|
" # rng = random.Random()\n",
|
|
"\n",
|
|
" # for i in range(5):\n",
|
|
" # random_input = generate_input_func(rng)\n",
|
|
" # print(f\"[{i}]: {random_input}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "reasoning_gym",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|