mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-24 17:05:03 +00:00
sampling code
This commit is contained in:
parent
e84cec26ed
commit
0d07746a4e
1 changed files with 65 additions and 42 deletions
|
|
@ -2,18 +2,27 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import datasets\n",
|
||||
"import abc\n",
|
||||
"import os\n",
|
||||
"from typing import Union\n",
|
||||
"import pickle\n",
|
||||
"import re\n",
|
||||
"import random\n",
|
||||
"from random import Random\n",
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"import random\n",
|
||||
"from random import Random"
|
||||
"\n",
|
||||
"import datasets\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"import tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -113,14 +122,6 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import abc\n",
|
||||
"from typing import Union\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import tqdm\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class IdentitySampler:\n",
|
||||
" def run(\n",
|
||||
" self, features: Union[torch.Tensor, np.ndarray]\n",
|
||||
|
|
@ -156,19 +157,21 @@
|
|||
" self,\n",
|
||||
" percentage: float,\n",
|
||||
" device: torch.device,\n",
|
||||
" dtype: torch.dtype = torch.float32,\n",
|
||||
" dimension_to_project_features_to=128,\n",
|
||||
" ):\n",
|
||||
" \"\"\"Greedy Coreset sampling base class.\"\"\"\n",
|
||||
" super().__init__(percentage)\n",
|
||||
"\n",
|
||||
" self.device = device\n",
|
||||
" self.dtype = dtype\n",
|
||||
" self.dimension_to_project_features_to = dimension_to_project_features_to\n",
|
||||
"\n",
|
||||
" def _reduce_features(self, features):\n",
|
||||
" if features.shape[1] == self.dimension_to_project_features_to:\n",
|
||||
" return features\n",
|
||||
" mapper = torch.nn.Linear(\n",
|
||||
" features.shape[1], self.dimension_to_project_features_to, bias=False\n",
|
||||
" features.shape[1], self.dimension_to_project_features_to, bias=False, dtype=self.dtype,\n",
|
||||
" )\n",
|
||||
" _ = mapper.to(self.device)\n",
|
||||
" features = features.to(self.device)\n",
|
||||
|
|
@ -189,8 +192,7 @@
|
|||
" features = torch.from_numpy(features)\n",
|
||||
" reduced_features = self._reduce_features(features)\n",
|
||||
" sample_indices = self._compute_greedy_coreset_indices(reduced_features)\n",
|
||||
" features = features[sample_indices]\n",
|
||||
" return self._restore_type(features)\n",
|
||||
" return sample_indices\n",
|
||||
"\n",
|
||||
" @staticmethod\n",
|
||||
" def _compute_batchwise_differences(\n",
|
||||
|
|
@ -227,7 +229,7 @@
|
|||
" )\n",
|
||||
" coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values\n",
|
||||
"\n",
|
||||
" return np.array(coreset_indices)\n",
|
||||
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):\n",
|
||||
|
|
@ -235,12 +237,13 @@
|
|||
" self,\n",
|
||||
" percentage: float,\n",
|
||||
" device: torch.device,\n",
|
||||
" dtype: torch.dtype = torch.float32,\n",
|
||||
" number_of_starting_points: int = 10,\n",
|
||||
" dimension_to_project_features_to: int = 128,\n",
|
||||
" ):\n",
|
||||
" \"\"\"Approximate Greedy Coreset sampling base class.\"\"\"\n",
|
||||
" self.number_of_starting_points = number_of_starting_points\n",
|
||||
" super().__init__(percentage, device, dimension_to_project_features_to)\n",
|
||||
" super().__init__(percentage, device, dtype, dimension_to_project_features_to)\n",
|
||||
"\n",
|
||||
" def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n",
|
||||
" \"\"\"Runs approximate iterative greedy coreset selection.\n",
|
||||
|
|
@ -283,7 +286,7 @@
|
|||
" approximate_coreset_anchor_distances, dim=1\n",
|
||||
" ).values.reshape(-1, 1)\n",
|
||||
"\n",
|
||||
" return np.array(coreset_indices)\n",
|
||||
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class RandomSampler(BaseSampler):\n",
|
||||
|
|
@ -302,8 +305,7 @@
|
|||
" subset_indices = np.random.choice(\n",
|
||||
" len(features), num_random_samples, replace=False\n",
|
||||
" )\n",
|
||||
" subset_indices = np.array(subset_indices)\n",
|
||||
" return features[subset_indices]"
|
||||
" return torch.tensor(subset_indices, device=features.device, dtype=torch.int64)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -311,7 +313,46 @@
|
|||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"# I ran this cell on Google Colab because I don't have a GPU on my local machine,\n",
|
||||
"# hence why you see the Google Drive paths\n",
|
||||
"\n",
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
||||
"model = SentenceTransformer(\"nomic-ai/modernbert-embed-base\")\n",
|
||||
"print(model)\n",
|
||||
"\n",
|
||||
"def get_entry_info(entry) -> str:\n",
|
||||
" return entry['task_description']\n",
|
||||
"\n",
|
||||
"def get_embeddings(text) -> torch.Tensor:\n",
|
||||
" return torch.from_numpy(model.encode(text)).to(torch.bfloat16)\n",
|
||||
"\n",
|
||||
"embeddings = []\n",
|
||||
"\n",
|
||||
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\") as f:\n",
|
||||
" for line in tqdm.tqdm(f):\n",
|
||||
" entry = json.loads(line)\n",
|
||||
" entry_info = get_entry_info(entry)\n",
|
||||
" embeddings.append(get_embeddings(entry_info))\n",
|
||||
"\n",
|
||||
"embeddings = torch.stack(embeddings).to(torch.bfloat16).to(device)\n",
|
||||
"print(embeddings.shape)\n",
|
||||
"\n",
|
||||
"sampler = ApproximateGreedyCoresetSampler(\n",
|
||||
" percentage=0.05, \n",
|
||||
" device=device, \n",
|
||||
" dtype=torch.bfloat16,\n",
|
||||
" dimension_to_project_features_to=768,\n",
|
||||
")\n",
|
||||
"subsampled = sampler.run(embeddings)\n",
|
||||
"\n",
|
||||
"indices = set(subsampled.cpu().tolist())\n",
|
||||
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\", \"r\") as f_in, \\\n",
|
||||
" open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-best-coverage.jsonl\", \"w+\") as f_out:\n",
|
||||
" for i, line in enumerate(f_in):\n",
|
||||
" if i in indices:\n",
|
||||
" f_out.write(line)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
|
|
@ -379,6 +420,7 @@
|
|||
" full_response = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
|
||||
" input_generator = re.search(r\"<function>(.*?)</function>\", full_response, re.DOTALL).group(1).strip()\n",
|
||||
"\n",
|
||||
" # Example of how to execute the generated code\n",
|
||||
" # local_dict = {}\n",
|
||||
" # exec(input_generator, globals(), local_dict)\n",
|
||||
" # generate_input_func = local_dict['generate_input']\n",
|
||||
|
|
@ -389,25 +431,6 @@
|
|||
" # print(f\"[{i}]: {random_input}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}\n",
|
||||
"[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}\n",
|
||||
"[2]: {'board': [[0]]}\n",
|
||||
"[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}\n",
|
||||
"[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue