mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-30 17:40:45 +00:00
Merge pull request #65 from zafstojano/env/group-anagrams
Group Anagrams together
This commit is contained in:
commit
d2bef8d30f
7 changed files with 30644 additions and 0 deletions
126
notebooks/generate_anagrams.ipynb
Normal file
126
notebooks/generate_anagrams.ipynb
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from collections import defaultdict"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"letters = [chr(letter) for letter in range(ord(\"a\"), ord(\"z\") + 1)]\n",
|
||||
"print(letters)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"370105\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The file `words_alpha.txt` has been obtained from https://github.com/dwyl/english-words \n",
|
||||
"with open(\"./reasoning_gym/data/words_alpha.txt\") as f:\n",
|
||||
" words = f.read().splitlines()\n",
|
||||
"print(len(words))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"30177\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def group_anagrams(words: list[str]) -> dict[tuple[int], list[str]]:\n",
|
||||
" \n",
|
||||
" def _codify(word):\n",
|
||||
" code = [0] * 26\n",
|
||||
" for c in word:\n",
|
||||
" code[ord(c)-ord('a')] += 1\n",
|
||||
" return tuple(code)\n",
|
||||
"\n",
|
||||
" res = defaultdict(list)\n",
|
||||
"\n",
|
||||
" for word in words:\n",
|
||||
" code = _codify(word)\n",
|
||||
" res[code].append(word)\n",
|
||||
" return res\n",
|
||||
"\n",
|
||||
"anagrams = group_anagrams(words)\n",
|
||||
"anagrams = {k: v for k, v in anagrams.items() if len(v) > 1} # only keep anagrams with more than 1 word\n",
|
||||
"print(len(anagrams))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"./reasoning_gym/data/anagrams.jsonl\", \"w\") as f:\n",
|
||||
" for counts, words in anagrams.items():\n",
|
||||
" letter_counts = {letter: count for letter, count in zip(letters, counts)}\n",
|
||||
" f.write(json.dumps({\"letter_counts\": letter_counts, \"words\": words}) + \"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "reasoning_gym",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue