diff --git a/dev/00c_utils.ipynb b/dev/00c_utils.ipynb
index 7dacf77..59ea650 100644
--- a/dev/00c_utils.ipynb
+++ b/dev/00c_utils.ipynb
@@ -340,6 +340,37 @@
     "assert combine(0, a) == 1\n",
     "assert combine(0, a, a, a) == 3"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#export\n",
+    "def chunks(arr, sz=10):\n",
+    "    \"\"\"Splits a list into chunks of size `sz`; the last chunk may be shorter.\"\"\"\n",
+    "    # Stride through the list so the trailing partial chunk is yielded too,\n",
+    "    # instead of being silently dropped when len(arr) % sz != 0.\n",
+    "    for i in range(0, len(arr), sz):\n",
+    "        yield arr[i:i + sz]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert list(chunks(list(range(9)), 3)) == [[0, 1, 2], [3, 4, 5], [6, 7, 8]]"
+   ]
+  },
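+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extra check, not in the original notebook: when the length is not a\n",
+    "# multiple of `sz`, the trailing partial chunk should be kept.\n",
+    "assert list(chunks(list(range(10)), 3)) == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]"
+   ]
+  },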
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
  ],
  "metadata": {
diff --git a/dev/05a_text.ipynb b/dev/05a_text.ipynb
index d812182..bff2b74 100644
--- a/dev/05a_text.ipynb
+++ b/dev/05a_text.ipynb
@@ -2,9 +2,21 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 83,
    "metadata": {},
    "outputs": [],
    "source": [
     "#export\n",
     "\"\"\"\n",
@@ -14,18 +26,22 @@
     "NLP models. The texts are cleaned and converted into lists of tokens.\n",
     "\"\"\"\n",
     "import html\n",
+    "from multiprocessing import cpu_count\n",
+    "from pathlib import Path\n",
     "import re\n",
     "\n",
+    "from joblib import Parallel, delayed\n",
+    "import pandas as pd\n",
     "import spacy\n",
     "from spacy.lang.en import English\n",
     "\n",
     "from loop.annotations import MaybeList, Callable\n",
-    "from loop.utils import combine"
+    "from loop.utils import combine, chunks"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -310,83 +326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['xxmaj',\n",
-       " 'english',\n",
-       " 'text',\n",
-       " 'that',\n",
-       " 'should',\n",
-       " 'be',\n",
-       " 'tokenized',\n",
-       " '.',\n",
-       " '\\n \\n ',\n",
-       " 'xxmaj',\n",
-       " 'the',\n",
-       " 'text',\n",
-       " 'contains',\n",
-       " '\"',\n",
-       " 'quoted',\n",
-       " 'names',\n",
-       " '\"',\n",
-       " ',',\n",
-       " 'commas',\n",
-       " ',',\n",
-       " 'dots',\n",
-       " '.',\n",
-       " 'xxmaj',\n",
-       " 'it',\n",
-       " 'also',\n",
-       " 'has',\n",
-       " 'some',\n",
-       " 'shortcuts',\n",
-       " ',',\n",
-       " 'like',\n",
-       " '\"',\n",
-       " 'does',\n",
-       " \"n't\",\n",
-       " '\"',\n",
-       " '\\n ',\n",
-       " 'and',\n",
-       " '\"',\n",
-       " 'do',\n",
-       " \"n't\",\n",
-       " '\"',\n",
-       " ',',\n",
-       " 'if',\n",
-       " 'you',\n",
-       " \"'d\",\n",
-       " 'like',\n",
-       " '.',\n",
-       " '\\n \\n ',\n",
-       " 'xxmaj',\n",
-       " 'also',\n",
-       " ',',\n",
-       " 'we',\n",
-       " \"'ve\",\n",
-       " 'xxup',\n",
-       " 'some',\n",
-       " 'xxup',\n",
-       " 'capslock',\n",
-       " 'here',\n",
-       " '.',\n",
-       " '\\n ']"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -397,7 +337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -419,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -429,13 +369,111 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
     "#export\n",
     "def print_tokens(tokens, n=500): print(format_tokens(tokens[:n]))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "xxmaj•english•text•that•should•be•tokenized•.•\n",
+      " \n",
+      " •xxmaj•the•text•contains•\"•quoted•names•\"•,•commas•,•dots•.•xxmaj•it•also•has•some•shortcuts•,•like•\"•does•n't•\"•\n",
+      " •and•\"•do•n't•\"•,•if•you•'d•like•.•\n",
+      " \n",
+      " •xxmaj•also•,•we•'ve•xxup•some•xxup•capslock•here•.•\n",
+      " \n"
+     ]
+    }
+   ],
+   "source": [
+    "print_tokens(expected)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#export\n",
+    "def read_files(root, labels=None, ext='txt', as_pandas=False):\n",
+    "    \"\"\"Reads text files from subfolders of `root`, using each folder's name as a label.\"\"\"\n",
+    "    texts = []\n",
+    "    for path in Path(root).iterdir():\n",
+    "        if path.is_dir():\n",
+    "            label = path.stem\n",
+    "            # skip folders that are not in the requested subset of labels\n",
+    "            if labels is not None and label not in labels:\n",
+    "                continue\n",
+    "            items = [\n",
+    "                {'text': fn.read_text(), 'name': fn.stem, 'label': label}\n",
+    "                for fn in path.glob(f'*.{ext}')]\n",
+    "            texts += items\n",
+    "    return pd.DataFrame(texts) if as_pandas else texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb = read_files('/home/ck/data/imdb/train', as_pandas=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#export\n",
+    "def parallel_tokenizer(texts, tokenizer_fn, chunk_size=10000, n_jobs=None,\n",
+    "                       backend=None, as_pandas=False):\n",
+    "    \"\"\"Tokenizes texts in chunks, spreading the work across `n_jobs` workers.\"\"\"\n",
+    "    texts = list(texts)  # accept any iterable, including a pandas Series\n",
+    "    n_jobs = n_jobs or cpu_count()\n",
+    "    with Parallel(n_jobs=n_jobs, backend=backend) as parallel:\n",
+    "        results = parallel(delayed(tokenizer_fn)(ch)\n",
+    "                           for ch in chunks(texts, chunk_size))\n",
+    "    # flatten the per-chunk results back into one list of token lists\n",
+    "    tokens = [toks for chunk in results for toks in chunk]\n",
+    "    return pd.Series(tokens) if as_pandas else tokens"
+   ]
+  },
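+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick smoke test, not in the original notebook: a trivial whitespace\n",
+    "# tokenizer on a toy list checks the chunking and flattening behaviour\n",
+    "# without loading the IMDB data; n_jobs=1 keeps joblib sequential.\n",
+    "toy = parallel_tokenizer(['a b', 'c d', 'e f'],\n",
+    "                         lambda ch: [t.split() for t in ch],\n",
+    "                         chunk_size=2, n_jobs=1)\n",
+    "assert toy == [['a', 'b'], ['c', 'd'], ['e', 'f']]"
+   ]
+  },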
else texts" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "imdb = read_files('/home/ck/data/imdb/train', as_pandas=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "def parallel_tokenizer(texts, tokenizer_fn, chunk_size=10000, n_jobs=None,\n", + " backend=None, as_pandas=False):\n", + " n_jobs = n_jobs or cpu_count()\n", + " with Parallel(n_jobs=n_jobs, backend=backend) as parallel:\n", + " results = parallel(delayed(tokenizer_fn)(ch) for ch in chunks(texts))\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'chunks' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel_tokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenize_english\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mparallel_tokenizer\u001b[0;34m(texts, tokenizer_fn, chunk_size, n_jobs, backend, as_pandas)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mcpu_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mParallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbackend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbackend\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mch\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'chunks' is not defined" + ] + } + ], + "source": [ + "tokens = parallel_tokenizer(imdb.text, tokenize_english)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {