sampling code

This commit is contained in:
Zafir Stojanovski 2025-02-23 00:40:11 +01:00
parent e84cec26ed
commit 0d07746a4e

View file

@@ -2,18 +2,27 @@
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datasets\n",
"import abc\n",
"import os\n",
"from typing import Union\n",
"import pickle\n",
"import re\n",
"import random\n",
"from random import Random\n",
"import requests\n",
"import json\n",
"from tqdm import tqdm\n",
"import os\n",
"import requests\n",
"import random\n",
"from random import Random"
"\n",
"import datasets\n",
"import numpy as np\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
"from sentence_transformers import SentenceTransformer\n",
"import tqdm"
]
},
{
@@ -113,14 +122,6 @@
"metadata": {},
"outputs": [],
"source": [
"import abc\n",
"from typing import Union\n",
"\n",
"import numpy as np\n",
"import torch\n",
"import tqdm\n",
"\n",
"\n",
"class IdentitySampler:\n",
" def run(\n",
" self, features: Union[torch.Tensor, np.ndarray]\n",
@@ -156,19 +157,21 @@
" self,\n",
" percentage: float,\n",
" device: torch.device,\n",
" dtype: torch.dtype = torch.float32,\n",
" dimension_to_project_features_to=128,\n",
" ):\n",
" \"\"\"Greedy Coreset sampling base class.\"\"\"\n",
" super().__init__(percentage)\n",
"\n",
" self.device = device\n",
" self.dtype = dtype\n",
" self.dimension_to_project_features_to = dimension_to_project_features_to\n",
"\n",
" def _reduce_features(self, features):\n",
" if features.shape[1] == self.dimension_to_project_features_to:\n",
" return features\n",
" mapper = torch.nn.Linear(\n",
" features.shape[1], self.dimension_to_project_features_to, bias=False\n",
" features.shape[1], self.dimension_to_project_features_to, bias=False, dtype=self.dtype,\n",
" )\n",
" _ = mapper.to(self.device)\n",
" features = features.to(self.device)\n",
@@ -189,8 +192,7 @@
" features = torch.from_numpy(features)\n",
" reduced_features = self._reduce_features(features)\n",
" sample_indices = self._compute_greedy_coreset_indices(reduced_features)\n",
" features = features[sample_indices]\n",
" return self._restore_type(features)\n",
" return sample_indices\n",
"\n",
" @staticmethod\n",
" def _compute_batchwise_differences(\n",
@@ -227,7 +229,7 @@
" )\n",
" coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values\n",
"\n",
" return np.array(coreset_indices)\n",
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
"\n",
"\n",
"class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):\n",
@@ -235,12 +237,13 @@
" self,\n",
" percentage: float,\n",
" device: torch.device,\n",
" dtype: torch.dtype = torch.float32,\n",
" number_of_starting_points: int = 10,\n",
" dimension_to_project_features_to: int = 128,\n",
" ):\n",
" \"\"\"Approximate Greedy Coreset sampling base class.\"\"\"\n",
" self.number_of_starting_points = number_of_starting_points\n",
" super().__init__(percentage, device, dimension_to_project_features_to)\n",
" super().__init__(percentage, device, dtype, dimension_to_project_features_to)\n",
"\n",
" def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n",
" \"\"\"Runs approximate iterative greedy coreset selection.\n",
@@ -283,7 +286,7 @@
" approximate_coreset_anchor_distances, dim=1\n",
" ).values.reshape(-1, 1)\n",
"\n",
" return np.array(coreset_indices)\n",
" return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n",
"\n",
"\n",
"class RandomSampler(BaseSampler):\n",
@@ -302,8 +305,7 @@
" subset_indices = np.random.choice(\n",
" len(features), num_random_samples, replace=False\n",
" )\n",
" subset_indices = np.array(subset_indices)\n",
" return features[subset_indices]"
" return torch.tensor(subset_indices, device=features.device, dtype=torch.int64)"
]
},
{
@@ -311,7 +313,46 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# I ran this cell on Google Colab because I don't have a GPU on my local machine,\n",
"# hence why you see the Google Drive paths\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"model = SentenceTransformer(\"nomic-ai/modernbert-embed-base\")\n",
"print(model)\n",
"\n",
"def get_entry_info(entry) -> str:\n",
" return entry['task_description']\n",
"\n",
"def get_embeddings(text) -> torch.Tensor:\n",
" return torch.from_numpy(model.encode(text)).to(torch.bfloat16)\n",
"\n",
"embeddings = []\n",
"\n",
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\") as f:\n",
" for line in tqdm.tqdm(f):\n",
" entry = json.loads(line)\n",
" entry_info = get_entry_info(entry)\n",
" embeddings.append(get_embeddings(entry_info))\n",
"\n",
"embeddings = torch.stack(embeddings).to(torch.bfloat16).to(device)\n",
"print(embeddings.shape)\n",
"\n",
"sampler = ApproximateGreedyCoresetSampler(\n",
" percentage=0.05, \n",
" device=device, \n",
" dtype=torch.bfloat16,\n",
" dimension_to_project_features_to=768,\n",
")\n",
"subsampled = sampler.run(embeddings)\n",
"\n",
"indices = set(subsampled.cpu().tolist())\n",
"with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\", \"r\") as f_in, \\\n",
" open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-best-coverage.jsonl\", \"w+\") as f_out:\n",
" for i, line in enumerate(f_in):\n",
" if i in indices:\n",
" f_out.write(line)"
]
},
{
"cell_type": "markdown",
@@ -379,6 +420,7 @@
" full_response = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
" input_generator = re.search(r\"<function>(.*?)</function>\", full_response, re.DOTALL).group(1).strip()\n",
"\n",
" # Example of how to execute the generated code\n",
" # local_dict = {}\n",
" # exec(input_generator, globals(), local_dict)\n",
" # generate_input_func = local_dict['generate_input']\n",
@@ -389,25 +431,6 @@
" # print(f\"[{i}]: {random_input}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}\n",
"[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}\n",
"[2]: {'board': [[0]]}\n",
"[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}\n",
"[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}\n"
]
}
],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},