{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the anagram dataset\n",
    "\n",
    "Reads the word list in `words_alpha.txt`, groups the words by their letter counts, and writes every group that contains more than one word to `anagrams.jsonl` (one JSON object per line)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
     ]
    }
   ],
   "source": [
    "# Lowercase alphabet, in the same order as the letter-count tuples built by `group_anagrams`.\n",
    "letters = [chr(letter) for letter in range(ord(\"a\"), ord(\"z\") + 1)]\n",
    "print(letters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "370105\n"
     ]
    }
   ],
   "source": [
    "# The file `words_alpha.txt` has been obtained from https://github.com/dwyl/english-words\n",
    "with open(\"./reasoning_gym/data/words_alpha.txt\", encoding=\"utf-8\") as f:\n",
    "    words = f.read().splitlines()\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30177\n"
     ]
    }
   ],
   "source": [
    "def group_anagrams(words: list[str]) -> dict[tuple[int, ...], list[str]]:\n",
    "    \"\"\"Group words that are anagrams of one another.\n",
    "\n",
    "    Each word is reduced to a 26-element tuple of letter counts (index 0 is\n",
    "    'a', index 25 is 'z'); words with equal tuples end up in the same list,\n",
    "    in the order they appear in `words`.\n",
    "\n",
    "    Args:\n",
    "        words: Words made up of lowercase ASCII letters only.\n",
    "\n",
    "    Returns:\n",
    "        A defaultdict mapping each letter-count tuple to its list of words.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a word contains a character outside 'a'-'z'.\n",
    "    \"\"\"\n",
    "\n",
    "    def _codify(word: str) -> tuple[int, ...]:\n",
    "        code = [0] * 26\n",
    "        for c in word:\n",
    "            index = ord(c) - ord(\"a\")\n",
    "            # Reject out-of-range characters explicitly: a negative index would\n",
    "            # otherwise wrap around and be counted as a different letter.\n",
    "            if not 0 <= index < 26:\n",
    "                raise ValueError(f\"{word!r} contains {c!r}, which is not a lowercase letter a-z\")\n",
    "            code[index] += 1\n",
    "        return tuple(code)\n",
    "\n",
    "    res = defaultdict(list)\n",
    "\n",
    "    for word in words:\n",
    "        res[_codify(word)].append(word)\n",
    "    return res\n",
    "\n",
    "# only keep anagrams with more than 1 word\n",
    "anagrams = {code: group for code, group in group_anagrams(words).items() if len(group) > 1}\n",
    "print(len(anagrams))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"./reasoning_gym/data/anagrams.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
    "    # The loop variable is `group`, not `words`, so the full word list keeps its\n",
    "    # name and the grouping cell can be re-run after this one.\n",
    "    for counts, group in anagrams.items():\n",
    "        letter_counts = dict(zip(letters, counts))\n",
    "        f.write(json.dumps({\"letter_counts\": letter_counts, \"words\": group}) + \"\\n\")"
   ]
  },
  {
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "reasoning_gym", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }