Parallel tokenization and file reading
devforfu committed May 21, 2019
1 parent 6f44065 commit e20bb13
Showing 2 changed files with 153 additions and 84 deletions.
31 changes: 31 additions & 0 deletions dev/00c_utils.ipynb
@@ -340,6 +340,37 @@
"assert combine(0, a) == 1\n",
"assert combine(0, a, a, a) == 3"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def chunks(arr, sz=10):\n",
" \"\"\"Splits list into list of lists with specific size or maybe less (for the last chunk).\"\"\"\n",
" n = len(arr)\n",
" n_chunks = n // sz\n",
" for i in range(n_chunks):\n",
" yield arr[i*sz:(i + 1)*sz]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"assert list(chunks(list(range(9)), 3)) == [[0, 1, 2], [3, 4, 5], [6, 7, 8]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
206 changes: 122 additions & 84 deletions dev/05a_text.ipynb
@@ -2,9 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 83,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ImportError",
"evalue": "cannot import name 'chunks' from 'loop.utils' (/home/ck/code/loop/dev/loop/utils.py)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-83-518dbfd6219a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mloop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mannotations\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mMaybeList\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCallable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mloop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcombine\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'chunks' from 'loop.utils' (/home/ck/code/loop/dev/loop/utils.py)"
]
}
],
"source": [
"#export\n",
"\"\"\"\n",
@@ -14,18 +26,22 @@
"NLP models. The texts are cleaned and converted into list of tokens.\n",
"\"\"\"\n",
"import html\n",
"from multiprocessing import cpu_count\n",
"from pathlib import Path\n",
"import re\n",
"\n",
"from joblib import Parallel, delayed\n",
"import pandas as pd\n",
"import spacy\n",
"from spacy.lang.en import English\n",
"\n",
"from loop.annotations import MaybeList, Callable\n",
"from loop.utils import combine"
"from loop.utils import combine, chunks"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
@@ -310,6 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -397,7 +337,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -419,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@@ -429,13 +369,111 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def print_tokens(tokens, n=500): print(format_tokens(tokens[:n]))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"xxmaj•english•text•that•should•be•tokenized•.•\n",
" \n",
" •xxmaj•the•text•contains•\"•quoted•names•\"•,•commas•,•dots•.•xxmaj•it•also•has•some•shortcuts•,•like•\"•does•n't•\"\n",
" •and•\"•do•n't•\"•,•if•you•'d•like•.•\n",
" \n",
" •xxmaj•also•,•we•'ve•xxup•some•xxup•capslock•here•.•\n",
" \n"
]
}
],
"source": [
"print_tokens(expected)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def read_files(root, labels=None, ext='txt', as_pandas=False):\n",
" \"\"\"Reads files from folders, using each one as a label name.\"\"\"\n",
" texts = []\n",
" for path in Path(root).iterdir():\n",
" if path.is_dir():\n",
" label = path.stem\n",
" if labels is not None and label in labels:\n",
" continue\n",
" items = [\n",
" {'text': fn.open().read(), 'name': fn.stem, 'label': label}\n",
" for fn in path.glob(f'*.{ext}')]\n",
" texts += items\n",
" return pd.DataFrame(texts) if as_pandas else texts"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"imdb = read_files('/home/ck/data/imdb/train', as_pandas=True)"
]
},
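{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the `labels` filter, assuming the usual IMDB layout where `pos`, `neg` and `unsup` are the sub-folder names: only the listed folders are read, everything else is skipped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical usage: read only the labelled reviews, skipping the 'unsup' folder\n",
"labelled = read_files('/home/ck/data/imdb/train', labels=('pos', 'neg'), as_pandas=True)\n",
"assert set(labelled.label) == {'pos', 'neg'}"
]
},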
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def parallel_tokenizer(texts, tokenizer_fn, chunk_size=10000, n_jobs=None,\n",
" backend=None, as_pandas=False):\n",
" n_jobs = n_jobs or cpu_count()\n",
" with Parallel(n_jobs=n_jobs, backend=backend) as parallel:\n",
" results = parallel(delayed(tokenizer_fn)(ch) for ch in chunks(texts))\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'chunks' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-82-cab6d0d9498f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel_tokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenize_english\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-80-0d2258e06752>\u001b[0m in \u001b[0;36mparallel_tokenizer\u001b[0;34m(texts, tokenizer_fn, chunk_size, n_jobs, backend, as_pandas)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mcpu_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mParallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbackend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbackend\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mch\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'chunks' is not defined"
]
}
],
"source": [
"tokens = parallel_tokenizer(imdb.text, tokenize_english)"
]
},
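{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch, assuming the IMDB frame above is loaded): the tokenizer should return exactly one token list per input text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# every review should yield exactly one list of tokens\n",
"assert len(tokens) == len(imdb)\n",
"print_tokens(tokens[0], n=20)"
]
},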
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
