Parallel tokenization and file reading
devforfu committed May 21, 2019
1 parent 6f44065 commit e20bb13
Showing 2 changed files with 153 additions and 84 deletions.
31 changes: 31 additions & 0 deletions dev/00c_utils.ipynb
@@ -340,6 +340,37 @@
"assert combine(0, a) == 1\n",
"assert combine(0, a, a, a) == 3"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def chunks(arr, sz=10):\n",
" \"\"\"Splits list into list of lists with specific size or maybe less (for the last chunk).\"\"\"\n",
" n = len(arr)\n",
" n_chunks = n // sz\n",
" for i in range(n_chunks):\n",
" yield arr[i*sz:(i + 1)*sz]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"assert list(chunks(list(range(9)), 3)) == [[0, 1, 2], [3, 4, 5], [6, 7, 8]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
206 changes: 122 additions & 84 deletions dev/05a_text.ipynb
@@ -2,9 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 83,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ImportError",
"evalue": "cannot import name 'chunks' from 'loop.utils' (/home/ck/code/loop/dev/loop/utils.py)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-83-518dbfd6219a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mloop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mannotations\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mMaybeList\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCallable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mloop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcombine\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'chunks' from 'loop.utils' (/home/ck/code/loop/dev/loop/utils.py)"
]
}
],
"source": [
"#export\n",
"\"\"\"\n",
@@ -14,18 +26,22 @@
"NLP models. The texts are cleaned and converted into list of tokens.\n",
"\"\"\"\n",
"import html\n",
"from multiprocessing import cpu_count\n",
"from pathlib import Path\n",
"import re\n",
"\n",
"from joblib import Parallel, delayed\n",
"import pandas as pd\n",
"import spacy\n",
"from spacy.lang.en import English\n",
"\n",
"from loop.annotations import MaybeList, Callable\n",
"from loop.utils import combine"
"from loop.utils import combine, chunks"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
@@ -310,6 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -397,7 +337,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -419,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@@ -429,13 +369,111 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def print_tokens(tokens, n=500): print(format_tokens(tokens[:n]))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"xxmaj•english•text•that•should•be•tokenized•.•\n",
" \n",
" •xxmaj•the•text•contains•\"•quoted•names•\"•,•commas•,•dots•.•xxmaj•it•also•has•some•shortcuts•,•like•\"•does•n't•\"\n",
" •and•\"•do•n't•\"•,•if•you•'d•like•.•\n",
" \n",
" •xxmaj•also•,•we•'ve•xxup•some•xxup•capslock•here•.•\n",
" \n"
]
}
],
"source": [
"print_tokens(expected)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def read_files(root, labels=None, ext='txt', as_pandas=False):\n",
" \"\"\"Reads files from folders, using each one as a label name.\"\"\"\n",
" texts = []\n",
" for path in Path(root).iterdir():\n",
" if path.is_dir():\n",
" label = path.stem\n",
" if labels is not None and label in labels:\n",
" continue\n",
" items = [\n",
" {'text': fn.open().read(), 'name': fn.stem, 'label': label}\n",
" for fn in path.glob(f'*.{ext}')]\n",
" texts += items\n",
" return pd.DataFrame(texts) if as_pandas else texts"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"imdb = read_files('/home/ck/data/imdb/train', as_pandas=True)"
]
},
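{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the `labels` filter, assuming the usual IMDB layout where `pos`, `neg` and `unsup` are the sub-folder names: only the listed folders are read, everything else is skipped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical usage: read only the labelled reviews, skipping the 'unsup' folder\n",
"labelled = read_files('/home/ck/data/imdb/train', labels=('pos', 'neg'), as_pandas=True)\n",
"assert set(labelled.label) == {'pos', 'neg'}"
]
},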
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"#export\n",
"def parallel_tokenizer(texts, tokenizer_fn, chunk_size=10000, n_jobs=None,\n",
" backend=None, as_pandas=False):\n",
" n_jobs = n_jobs or cpu_count()\n",
" with Parallel(n_jobs=n_jobs, backend=backend) as parallel:\n",
" results = parallel(delayed(tokenizer_fn)(ch) for ch in chunks(texts))\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'chunks' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-82-cab6d0d9498f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel_tokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenize_english\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-80-0d2258e06752>\u001b[0m in \u001b[0;36mparallel_tokenizer\u001b[0;34m(texts, tokenizer_fn, chunk_size, n_jobs, backend, as_pandas)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mcpu_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mParallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbackend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbackend\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mch\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'chunks' is not defined"
]
}
],
"source": [
"tokens = parallel_tokenizer(imdb.text, tokenize_english)"
]
},
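{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch, assuming the IMDB frame above is loaded): the tokenizer should return exactly one token list per input text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# every review should yield exactly one list of tokens\n",
"assert len(tokens) == len(imdb)\n",
"print_tokens(tokens[0], n=20)"
]
},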
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
