sampling code

This commit is contained in:
Zafir Stojanovski 2025-02-23 00:40:11 +01:00
parent e84cec26ed
commit 0d07746a4e

View file

@@ -2,18 +2,27 @@
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datasets\n",
"import abc\n",
"import os\n",
"from typing import Union\n",
"import pickle\n",
"import re\n",
"import random\n",
"from random import Random\n",
"import requests\n",
"import json\n",
"from tqdm import tqdm\n",
"import os\n",
"import requests\n",
"import random\n",
"from random import Random"
"\n",
"import datasets\n",
"import numpy as np\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
"from sentence_transformers import SentenceTransformer\n",
"import tqdm"
]
},
{
@@ -113,14 +122,6 @@
"metadata": {},
"outputs": [],
"source": [
"import abc\n",
"from typing import Union\n",
"\n",
"import numpy as np\n",
"import torch\n",
"import tqdm\n",
"\n",
"\n",
"class IdentitySampler:\n",
" def run(\n",
" self, features: Union[torch.Tensor, np.ndarray]\n",
@@ -156,19 +157,21 @@
" self,\n",
" percentage: float,\n",
" device: torch.device,\n",
" dtype: torch.dtype = torch.float32,\n",
" dimension_to_project_features_to=128,\n",
" ):\n",
" \"\"\"Greedy Coreset sampling base class.\"\"\"\n",
" super().__init__(percentage)\n",
"\n",
" self.device = device\n",
" self.dtype = dtype\n",
" self.dimension_to_project_features_to = dimension_to_project_features_to\n",
"\n",
" def _reduce_features(self, features):\n",
" if features.shape[1] == self.dimension_to_project_features_to:\n",
" return features\n",
" mapper = torch.nn.Linear(\n",
" features.shape[1], self.dimension_to_project_features_to, bias=False\n",
" features.shape[1], self.dimension_to_project_features_to, bias=False, dtype=self.dtype,\n",
" )\n",
" _ = mapper.to(self.device)\n",
" features = features.to(self.device)\n",
@@ -189,8 +192,7 @@
" features = torch.from_numpy(features)\n",
" reduced_features = self._reduce_features(features)\n",
" sample_indices = self._compute_greedy_coreset_indices(reduced_features)\n",
" features = features[sample_indices]\n",
" return self._restore_type(features)\n",
" return sample_indices\n",
"\n",
" @staticmethod\n",
" def _compute_batchwise_differences(\n",
@@ -227,7 +229,7 @@
" )\n",
" coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values\n",
"\n",
" return np.array(coreset_indices)\n",
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
"\n",
"\n",
"class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):\n",
@@ -235,12 +237,13 @@
" self,\n",
" percentage: float,\n",
" device: torch.device,\n",
" dtype: torch.dtype = torch.float32,\n",
" number_of_starting_points: int = 10,\n",
" dimension_to_project_features_to: int = 128,\n",
" ):\n",
" \"\"\"Approximate Greedy Coreset sampling base class.\"\"\"\n",
" self.number_of_starting_points = number_of_starting_points\n",
" super().__init__(percentage, device, dimension_to_project_features_to)\n",
" super().__init__(percentage, device, dtype, dimension_to_project_features_to)\n",
"\n",
" def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n",
" \"\"\"Runs approximate iterative greedy coreset selection.\n",
@@ -283,7 +286,7 @@
" approximate_coreset_anchor_distances, dim=1\n",
" ).values.reshape(-1, 1)\n",
"\n",
" return np.array(coreset_indices)\n",
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
"\n",
"\n",
"class RandomSampler(BaseSampler):\n",
@@ -302,8 +305,7 @@
" subset_indices = np.random.choice(\n",
" len(features), num_random_samples, replace=False\n",
" )\n",
" subset_indices = np.array(subset_indices)\n",
" return features[subset_indices]"
" return torch.tensor(subset_indices, device=features.device, dtype=torch.int64)"
]
},
{
@@ -311,7 +313,46 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# I ran this cell on Google Colab because I don't have a GPU on my local machine,\n",
"# hence why you see the Google Drive paths\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"model = SentenceTransformer(\"nomic-ai/modernbert-embed-base\")\n",
"print(model)\n",
"\n",
"def get_entry_info(entry) -> str:\n",
" return entry['task_description']\n",
"\n",
"def get_embeddings(text) -> torch.Tensor:\n",
" return torch.from_numpy(model.encode(text)).to(torch.bfloat16)\n",
"\n",
"embeddings = []\n",
"\n",
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\") as f:\n",
" for line in tqdm.tqdm(f):\n",
" entry = json.loads(line)\n",
" entry_info = get_entry_info(entry)\n",
" embeddings.append(get_embeddings(entry_info))\n",
"\n",
"embeddings = torch.stack(embeddings).to(torch.bfloat16).to(device)\n",
"print(embeddings.shape)\n",
"\n",
"sampler = ApproximateGreedyCoresetSampler(\n",
" percentage=0.05, \n",
" device=device, \n",
" dtype=torch.bfloat16,\n",
" dimension_to_project_features_to=768,\n",
")\n",
"subsampled = sampler.run(embeddings)\n",
"\n",
"indices = set(subsampled.cpu().tolist())\n",
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\", \"r\") as f_in, \\\n",
" open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-best-coverage.jsonl\", \"w+\") as f_out:\n",
" for i, line in enumerate(f_in):\n",
" if i in indices:\n",
" f_out.write(line)"
]
},
{
"cell_type": "markdown",
@@ -379,6 +420,7 @@
" full_response = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
" input_generator = re.search(r\"<function>(.*?)</function>\", full_response, re.DOTALL).group(1).strip()\n",
"\n",
" # Example of how to execute the generated code\n",
" # local_dict = {}\n",
" # exec(input_generator, globals(), local_dict)\n",
" # generate_input_func = local_dict['generate_input']\n",
@@ -389,25 +431,6 @@
" # print(f\"[{i}]: {random_input}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}\n",
"[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}\n",
"[2]: {'board': [[0]]}\n",
"[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}\n",
"[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}\n"
]
}
],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},