diff --git a/notebooks/codeio.ipynb b/notebooks/codeio.ipynb index 95e409f2..ac80e76c 100644 --- a/notebooks/codeio.ipynb +++ b/notebooks/codeio.ipynb @@ -2,18 +2,27 @@ "cells": [ { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import datasets\n", + "import abc\n", + "import os\n", + "from typing import Union\n", + "import pickle\n", "import re\n", + "import random\n", + "from random import Random\n", + "import requests\n", "import json\n", "from tqdm import tqdm\n", - "import os\n", - "import requests\n", - "import random\n", - "from random import Random" + "\n", + "import datasets\n", + "import numpy as np\n", + "import torch\n", + "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", + "from sentence_transformers import SentenceTransformer\n", + "import tqdm" ] }, { @@ -113,14 +122,6 @@ "metadata": {}, "outputs": [], "source": [ - "import abc\n", - "from typing import Union\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import tqdm\n", - "\n", - "\n", "class IdentitySampler:\n", " def run(\n", " self, features: Union[torch.Tensor, np.ndarray]\n", @@ -156,19 +157,21 @@ " self,\n", " percentage: float,\n", " device: torch.device,\n", + " dtype: torch.dtype = torch.float32,\n", " dimension_to_project_features_to=128,\n", " ):\n", " \"\"\"Greedy Coreset sampling base class.\"\"\"\n", " super().__init__(percentage)\n", "\n", " self.device = device\n", + " self.dtype = dtype\n", " self.dimension_to_project_features_to = dimension_to_project_features_to\n", "\n", " def _reduce_features(self, features):\n", " if features.shape[1] == self.dimension_to_project_features_to:\n", " return features\n", " mapper = torch.nn.Linear(\n", - " features.shape[1], self.dimension_to_project_features_to, bias=False\n", + " features.shape[1], self.dimension_to_project_features_to, bias=False, dtype=self.dtype,\n", " )\n", " _ = mapper.to(self.device)\n", " features = features.to(self.device)\n", @@ -189,8 +192,7 @@ " features = torch.from_numpy(features)\n", " reduced_features = self._reduce_features(features)\n", " sample_indices = self._compute_greedy_coreset_indices(reduced_features)\n", - " features = features[sample_indices]\n", - " return self._restore_type(features)\n", + " return sample_indices\n", "\n", " @staticmethod\n", " def _compute_batchwise_differences(\n", @@ -227,7 +229,7 @@ " )\n", " coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values\n", "\n", - " return np.array(coreset_indices)\n", + " return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n", "\n", "\n", "class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):\n", @@ -235,12 +237,13 @@ " self,\n", " percentage: float,\n", " device: torch.device,\n", + " dtype: torch.dtype = torch.float32,\n", " number_of_starting_points: int = 10,\n", " dimension_to_project_features_to: int = 128,\n", " ):\n", " \"\"\"Approximate Greedy Coreset sampling base class.\"\"\"\n", " self.number_of_starting_points = number_of_starting_points\n", - " super().__init__(percentage, device, dimension_to_project_features_to)\n", + " super().__init__(percentage, device, dtype, dimension_to_project_features_to)\n", "\n", " def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:\n", " \"\"\"Runs approximate iterative greedy coreset selection.\n", @@ -283,7 +286,7 @@ " approximate_coreset_anchor_distances, dim=1\n", " ).values.reshape(-1, 1)\n", "\n", - " return np.array(coreset_indices)\n", + " return torch.tensor(coreset_indices, device=features.device, dtype=torch.int64)\n", "\n", "\n", "class RandomSampler(BaseSampler):\n", @@ -302,8 +305,7 @@ " subset_indices = np.random.choice(\n", " len(features), num_random_samples, replace=False\n", " )\n", - " subset_indices = np.array(subset_indices)\n", - " return features[subset_indices]" + " return torch.tensor(subset_indices, device=features.device, dtype=torch.int64)" ] }, { @@ -311,7 +313,46 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# I ran this cell on Google Colab because I don't have a GPU on my local machine,\n", + "# hence why you see the Google Drive paths\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model = SentenceTransformer(\"nomic-ai/modernbert-embed-base\")\n", + "print(model)\n", + "\n", + "def get_entry_info(entry) -> str:\n", + " return entry['task_description']\n", + "\n", + "def get_embeddings(text) -> torch.Tensor:\n", + " return torch.from_numpy(model.encode(text)).to(torch.bfloat16)\n", + "\n", + "embeddings = []\n", + "\n", + "with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\") as f:\n", + " for line in tqdm.tqdm(f):\n", + " entry = json.loads(line)\n", + " entry_info = get_entry_info(entry)\n", + " embeddings.append(get_embeddings(entry_info))\n", + "\n", + "embeddings = torch.stack(embeddings).to(torch.bfloat16).to(device)\n", + "print(embeddings.shape)\n", + "\n", + "sampler = ApproximateGreedyCoresetSampler(\n", + " percentage=0.05, \n", + " device=device, \n", + " dtype=torch.bfloat16,\n", + " dimension_to_project_features_to=768,\n", + ")\n", + "subsampled = sampler.run(embeddings)\n", + "\n", + "indices = set(subsampled.cpu().tolist())\n", + "with open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-extracted.jsonl\", \"r\") as f_in, \\\n", + " open(\"./drive/MyDrive/reasoning-gym/codeio-pyedu-best-coverage.jsonl\", \"w+\") as f_out:\n", + " for i, line in enumerate(f_in):\n", + " if i in indices:\n", + " f_out.write(line)" + ] }, { "cell_type": "markdown", @@ -379,6 +420,7 @@ " full_response = response.json()[\"choices\"][0][\"message\"][\"content\"]\n", " input_generator = re.search(r\"(.*?)\", full_response, re.DOTALL).group(1).strip()\n", "\n", + " # Example of how to execute the generated code\n", " # local_dict = {}\n", " # exec(input_generator, globals(), local_dict)\n", " # generate_input_func = local_dict['generate_input']\n", @@ -389,25 +431,6 @@ " # print(f\"[{i}]: {random_input}\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}\n", - "[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}\n", - "[2]: {'board': [[0]]}\n", - "[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}\n", - "[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}\n" - ] - } - ], - "source": [] - }, { "cell_type": "markdown", "metadata": {},