From e83bf05a4d115176f5f3bf7f17041fc208b76eac Mon Sep 17 00:00:00 2001 From: saidbleik Date: Wed, 23 Oct 2019 19:26:17 +0000 Subject: [PATCH 1/8] edits and updates --- .../text_classification/tc_mnli_bert.ipynb | 819 --- .../tc_mnli_transformers.ipynb | 6083 +---------------- tests/conftest.py | 4 +- .../test_notebooks_text_classification.py | 18 +- utils_nlp/dataset/squad.py | 10 +- utils_nlp/models/transformers/common.py | 37 +- .../transformers/named_entity_recognition.py | 99 +- .../models/transformers/question_answering.py | 14 +- .../transformers/sequence_classification.py | 14 +- 9 files changed, 328 insertions(+), 6770 deletions(-) delete mode 100644 examples/text_classification/tc_mnli_bert.ipynb diff --git a/examples/text_classification/tc_mnli_bert.ipynb b/examples/text_classification/tc_mnli_bert.ipynb deleted file mode 100644 index 7712416a4..000000000 --- a/examples/text_classification/tc_mnli_bert.ipynb +++ /dev/null @@ -1,819 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*Copyright (c) Microsoft Corporation. All rights reserved.*\n", - "\n", - "*Licensed under the MIT License.*\n", - "\n", - "# Text Classification of MultiNLI Sentences using BERT" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Before You Start\n", - "\n", - "> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n", - "\n", - "The table below provides some reference running time on different machine configurations. \n", - "\n", - "|QUICK_RUN|Machine Configurations|Running time|\n", - "|:---------|:----------------------|:------------|\n", - "|True|4 **CPU**s, 14GB memory| ~ 15 minutes|\n", - "|False|4 **CPU**s, 14GB memory| ~19.5 hours|\n", - "|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 3 minutes |\n", - "|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 1.5 hours|\n", - "\n", - "If you run into CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_LEN`, but note that model performance will be compromised. " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n", - "QUICK_RUN = False" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(\"../../\")\n", - "import os\n", - "import json\n", - "import pandas as pd\n", - "import numpy as np\n", - "import scrapbook as sb\n", - "from sklearn.metrics import classification_report, accuracy_score\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.model_selection import train_test_split\n", - "import torch\n", - "import torch.nn as nn\n", - "\n", - "from utils_nlp.dataset.multinli import load_pandas_df\n", - "from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n", - "from utils_nlp.models.bert.common import Language, Tokenizer\n", - "from utils_nlp.common.timer import Timer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.\n", - "\n", - "We use a [sequence classifier](../../utils_nlp/models/bert/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert)." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "TRAIN_DATA_FRACTION = 1\n", - "TEST_DATA_FRACTION = 1\n", - "NUM_EPOCHS = 1\n", - "\n", - "if QUICK_RUN:\n", - " TRAIN_DATA_FRACTION = 0.01\n", - " TEST_DATA_FRACTION = 0.01\n", - " NUM_EPOCHS = 1\n", - "\n", - "if torch.cuda.is_available():\n", - " BATCH_SIZE = 32\n", - "else:\n", - " BATCH_SIZE = 8\n", - "\n", - "DATA_FOLDER = \"./temp\"\n", - "BERT_CACHE_DIR = \"./temp\"\n", - "LANGUAGE = Language.ENGLISH\n", - "TO_LOWER = True\n", - "MAX_LEN = 150\n", - "BATCH_SIZE_PRED = 512\n", - "TRAIN_SIZE = 0.6\n", - "LABEL_COL = \"genre\"\n", - "TEXT_COL = \"sentence1\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Dataset\n", - "We start by loading a subset of the data. The following function also downloads and extracts the files, if they don't exist in the data folder.\n", - "\n", - "The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.\n", - "\n", - "For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = load_pandas_df(DATA_FOLDER, \"train\")\n", - "df = df[df[\"gold_label\"]==\"neutral\"] # get unique sentences" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
genresentence1
0governmentConceptually cream skimming has two basic dime...
4telephoneyeah i tell you what though if you go price so...
6travelBut a few Christian mosaics survive above the ...
12slateIt's not that the questions they asked weren't...
13travelThebes held onto power until the 12th Dynasty,...
\n", - "
" - ], - "text/plain": [ - " genre sentence1\n", - "0 government Conceptually cream skimming has two basic dime...\n", - "4 telephone yeah i tell you what though if you go price so...\n", - "6 travel But a few Christian mosaics survive above the ...\n", - "12 slate It's not that the questions they asked weren't...\n", - "13 travel Thebes held onto power until the 12th Dynasty,..." - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[[LABEL_COL, TEXT_COL]].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examples in the dataset are grouped into 5 genres:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "telephone 27783\n", - "government 25784\n", - "travel 25783\n", - "fiction 25782\n", - "slate 25768\n", - "Name: genre, dtype: int64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[LABEL_COL].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We split the data for training and testing, and encode the class labels:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", - " FutureWarning)\n" - ] - } - ], - "source": [ - "# split\n", - "df_train, df_test = train_test_split(df, train_size = TRAIN_SIZE, random_state=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df_train = df_train.sample(frac=TRAIN_DATA_FRACTION).reset_index(drop=True)\n", - "df_test = df_test.sample(frac=TEST_DATA_FRACTION).reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# encode labels\n", - "label_encoder = LabelEncoder()\n", - "labels_train = label_encoder.fit_transform(df_train[LABEL_COL])\n", - "labels_test = label_encoder.transform(df_test[LABEL_COL])\n", - "\n", - "num_labels = len(np.unique(labels_train))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of unique labels: 5\n", - "Number of training examples: 78540\n", - "Number of testing examples: 52360\n" - ] - } - ], - "source": [ - "print(\"Number of unique labels: {}\".format(num_labels))\n", - "print(\"Number of training examples: {}\".format(df_train.shape[0]))\n", - "print(\"Number of testing examples: {}\".format(df_test.shape[0]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tokenize and Preprocess" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 78540/78540 [00:27<00:00, 2841.38it/s]\n", - "100%|██████████| 52360/52360 [00:18<00:00, 2834.92it/s]\n" - ] - } - ], - "source": [ - "tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)\n", - "\n", - "tokens_train = tokenizer.tokenize(list(df_train[TEXT_COL]))\n", - "tokens_test = tokenizer.tokenize(list(df_test[TEXT_COL]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition, we perform the following preprocessing steps in the cell below:\n", - "- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n", - "- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n", - "- Pad or truncate the token lists to the specified max length\n", - "- Return mask lists that indicate paddings' positions\n", - "- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence classification)\n", - "\n", - "*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(\n", - " tokens_train, MAX_LEN\n", - ")\n", - "tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(\n", - " tokens_test, MAX_LEN\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Model\n", - "Next, we create a sequence classifier that loads a pre-trained BERT model, given the language and number of labels." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "classifier = BERTSequenceClassifier(\n", - " language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train\n", - "We train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "t_total value of -1 results in schedule not being applied\n", - "Iteration: 0%| | 0/2455 [00:00246/2455; average training loss:1.653734\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 10%|█ | 247/2455 [07:39<1:09:04, 1.88s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:247->492/2455; average training loss:0.376494\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 20%|██ | 493/2455 [15:23<1:01:48, 1.89s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:493->738/2455; average training loss:0.314981\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 30%|███ | 739/2455 [23:06<53:42, 1.88s/it] " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:739->984/2455; average training loss:0.286209\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 40%|████ | 985/2455 [30:50<46:17, 1.89s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:985->1230/2455; average training loss:0.265873\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 50%|█████ | 1231/2455 [38:33<38:29, 1.89s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:1231->1476/2455; average training loss:0.252521\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 60%|██████ | 1477/2455 [46:16<30:38, 1.88s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:1477->1722/2455; average training loss:0.243316\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 70%|███████ | 1723/2455 [54:00<23:04, 1.89s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:1723->1968/2455; average training loss:0.235114\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 80%|████████ | 1969/2455 [1:01:44<15:14, 1.88s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:1969->2214/2455; average training loss:0.229056\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 90%|█████████ | 2215/2455 [1:09:26<07:30, 1.88s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch:1/1; batch:2215->2455/2455; average training loss:0.223192\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 100%|██████████| 2455/2455 [1:16:56<00:00, 1.57s/it]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Training time: 1.283 hrs]\n" - ] - } - ], - "source": [ - "with Timer() as t:\n", - " classifier.fit(\n", - " token_ids=tokens_train,\n", - " input_mask=mask_train,\n", - " labels=labels_train, \n", - " num_epochs=NUM_EPOCHS,\n", - " batch_size=BATCH_SIZE, \n", - " verbose=True,\n", - " ) \n", - "print(\"[Training time: {:.3f} hrs]\".format(t.interval / 3600))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Score\n", - "We score the test set using the trained classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: Only 1 CUDA device is available. Data parallelism is not possible.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 100%|██████████| 103/103 [18:00<00:00, 8.24s/it]\n" - ] - } - ], - "source": [ - "preds = classifier.predict(token_ids=tokens_test, \n", - " input_mask=mask_test, \n", - " batch_size=BATCH_SIZE_PRED)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluate Results\n", - "Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "accuracy: 0.9421504965622612\n", - "{\n", - " \"fiction\": {\n", - " \"f1-score\": 0.924482109227872,\n", - " \"precision\": 0.8953944368445053,\n", - " \"recall\": 0.9555231143552312,\n", - " \"support\": 10275\n", - " },\n", - " \"government\": {\n", - " \"f1-score\": 0.948873653281097,\n", - " \"precision\": 0.9565560821484992,\n", - " \"recall\": 0.9413136416634279,\n", - " \"support\": 10292\n", - " },\n", - " \"macro avg\": {\n", - " \"f1-score\": 0.9408187527049234,\n", - " \"precision\": 0.9413336757882582,\n", - " \"recall\": 0.9411302847360989,\n", - " \"support\": 52360\n", - " },\n", - " \"micro avg\": {\n", - " \"f1-score\": 0.9421504965622612,\n", - " \"precision\": 0.9421504965622612,\n", - " \"recall\": 0.9421504965622612,\n", - " \"support\": 52360\n", - " },\n", - " \"slate\": {\n", - " \"f1-score\": 0.8725352112676057,\n", - " \"precision\": 0.9031552639800062,\n", - " \"recall\": 0.8439233239272161,\n", - " \"support\": 10277\n", - " },\n", - " \"telephone\": {\n", - " \"f1-score\": 0.9935128410201723,\n", - " \"precision\": 0.9892929829218653,\n", - " \"recall\": 0.99776885319054,\n", - " \"support\": 11205\n", - " },\n", - " \"travel\": {\n", - " \"f1-score\": 0.9646899487278707,\n", - " \"precision\": 0.9622696130464151,\n", - " \"recall\": 0.9671224905440792,\n", - " \"support\": 10311\n", - " },\n", - " \"weighted avg\": {\n", - " \"f1-score\": 0.9417711062461178,\n", - " \"precision\": 0.942203390713011,\n", - " \"recall\": 0.9421504965622612,\n", - " \"support\": 52360\n", - " }\n", - "}\n" - ] - } - ], - "source": [ - "report = classification_report(labels_test, preds, target_names=label_encoder.classes_, output_dict=True) \n", - "accuracy = accuracy_score(labels_test, preds )\n", - "print(\"accuracy: {}\".format(accuracy))\n", - "print(json.dumps(report, indent=4, sort_keys=True))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9421504965622612, - "encoder": "json", - "name": "accuracy", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "accuracy" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9413336757882582, - "encoder": "json", - "name": "precision", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "precision" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9411302847360989, - "encoder": "json", - "name": "recall", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "recall" - } - }, - "output_type": "display_data" - }, - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9408187527049234, - "encoder": "json", - "name": "f1", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "f1" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# for testing\n", - "sb.glue(\"accuracy\", accuracy)\n", - "sb.glue(\"precision\", report[\"macro avg\"][\"precision\"])\n", - "sb.glue(\"recall\", report[\"macro avg\"][\"recall\"])\n", - "sb.glue(\"f1\", report[\"macro avg\"][\"f1-score\"])\n" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "nlp_gpu", - "language": "python", - "name": "nlp_gpu" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb index 00ddedf87..bb6bcbffe 100644 --- a/examples/text_classification/tc_mnli_transformers.ipynb +++ b/examples/text_classification/tc_mnli_transformers.ipynb @@ -13,28 +13,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "import os\n", "import json\n", - "import pandas as pd\n", + "import os\n", + "import sys\n", + "from tempfile import TemporaryDirectory\n", + "\n", "import numpy as np\n", + "import pandas as pd\n", "import scrapbook as sb\n", - "from sklearn.metrics import classification_report, accuracy_score\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.model_selection import train_test_split\n", "import torch\n", "import torch.nn as nn\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", "from tqdm import tqdm\n", + "from utils_nlp.common.timer import Timer\n", "from utils_nlp.dataset.multinli import load_pandas_df\n", "from utils_nlp.models.transformers.sequence_classification import (\n", - " SequenceClassifier,\n", - " Processor,\n", - ")\n", - "from utils_nlp.common.timer import Timer" + " Processor, SequenceClassifier)" ] }, { @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -58,18 +58,18 @@ "outputs": [], "source": [ "# notebook parameters\n", - "DATA_FOLDER = \"./temp\"\n", - "CACHE_DIR = \"./temp\"\n", - "DEVICE = \"cuda\"\n", + "DATA_FOLDER = TemporaryDirectory().name\n", + "CACHE_DIR = TemporaryDirectory().name\n", "NUM_EPOCHS = 1\n", "BATCH_SIZE = 16\n", "NUM_GPUS = 2\n", - "MAX_LEN = 150\n", - "TRAIN_DATA_FRACTION = 0.15\n", - "TEST_DATA_FRACTION = 0.15\n", + "MAX_LEN = 100\n", + "TRAIN_DATA_FRACTION = 0.05\n", + "TEST_DATA_FRACTION = 0.05\n", "TRAIN_SIZE = 0.75\n", "LABEL_COL = \"genre\"\n", - "TEXT_COL = \"sentence1\"" + "TEXT_COL = \"sentence1\"\n", + "MODEL_NAMES = [\"distilbert-base-uncased\", \"roberta-base\", \"xlnet-base-cased\"]" ] }, { @@ -86,9 +86,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 222k/222k [02:38<00:00, 1.40kKB/s] \n" + ] + } + ], "source": [ "df = load_pandas_df(DATA_FOLDER, \"train\")\n", "df = df[df[\"gold_label\"]==\"neutral\"] # get unique sentences" @@ -96,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -163,7 +171,7 @@ "13 travel Thebes held onto power until the 12th Dynasty,..." ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -200,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -218,21 +226,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "telephone 3146\n", - "fiction 2960\n", - "slate 2901\n", - "government 2893\n", - "travel 2826\n", + "slate 1055\n", + "fiction 1019\n", + "telephone 968\n", + "government 939\n", + "travel 928\n", "Name: genre, dtype: int64" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -257,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -265,8 +273,8 @@ "output_type": "stream", "text": [ "Number of unique labels: 5\n", - "Number of training examples: 14726\n", - "Number of testing examples: 4909\n" + "Number of training examples: 4909\n", + "Number of testing examples: 1636\n" ] } ], @@ -287,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -369,30 +377,38 @@ " \n", " \n", " 13\n", - " roberta-base\n", + " bert-base-german-dbmdz-cased\n", " \n", " \n", " 14\n", - " roberta-large\n", + " bert-base-german-dbmdz-uncased\n", " \n", " \n", " 15\n", - " roberta-large-mnli\n", + " roberta-base\n", " \n", " \n", " 16\n", - " xlnet-base-cased\n", + " roberta-large\n", " \n", " \n", " 17\n", - " xlnet-large-cased\n", + " roberta-large-mnli\n", " \n", " \n", " 18\n", - " distilbert-base-uncased\n", + " xlnet-base-cased\n", " \n", " \n", " 19\n", + " xlnet-large-cased\n", + " \n", + " \n", + " 20\n", + " distilbert-base-uncased\n", + " \n", + " \n", + " 21\n", " distilbert-base-uncased-distilled-squad\n", " \n", " \n", @@ -414,16 +430,18 @@ "10 bert-large-uncased-whole-word-masking-finetune...\n", "11 bert-large-cased-whole-word-masking-finetuned-...\n", "12 bert-base-cased-finetuned-mrpc\n", - "13 roberta-base\n", - "14 roberta-large\n", - "15 roberta-large-mnli\n", - "16 xlnet-base-cased\n", - "17 xlnet-large-cased\n", - "18 distilbert-base-uncased\n", - "19 distilbert-base-uncased-distilled-squad" + "13 bert-base-german-dbmdz-cased\n", + "14 bert-base-german-dbmdz-uncased\n", + "15 roberta-base\n", + "16 roberta-large\n", + "17 roberta-large-mnli\n", + "18 xlnet-base-cased\n", + "19 xlnet-large-cased\n", + "20 distilbert-base-uncased\n", + "21 distilbert-base-uncased-distilled-squad" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -443,11 +461,19 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['distilbert-base-uncased', 'roberta-base', 'xlnet-base-cased']\n" + ] + } + ], "source": [ - "model_names = [\"distilbert-base-uncased\", \"roberta-base\", \"xlnet-base-cased\"]" + "print(MODEL_NAMES)" ] }, { @@ -459,5803 +485,180 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}\n", + "\n", + "for model_name in tqdm(MODEL_NAMES):\n", + " \n", + " # preprocess\n", + " processor = Processor(model_name=model_name, cache_dir=CACHE_DIR)\n", + " ds_train = processor.preprocess(\n", + " df_train[TEXT_COL], labels_train, max_len=MAX_LEN\n", + " )\n", + " ds_test = processor.preprocess(df_test[TEXT_COL], None, max_len=MAX_LEN)\n", + "\n", + " # fine-tune\n", + " classifier = SequenceClassifier(\n", + " model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR\n", + " )\n", + " with Timer() as t:\n", + " classifier.fit(\n", + " ds_train,\n", + " num_epochs=NUM_EPOCHS,\n", + " batch_size=BATCH_SIZE,\n", + " num_gpus=NUM_GPUS,\n", + " verbose=False,\n", + " )\n", + " train_time = t.interval / 3600\n", + "\n", + " # predict\n", + " preds = classifier.predict(\n", + " ds_test, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, verbose=False\n", + " )\n", + "\n", + " # eval\n", + " accuracy = accuracy_score(labels_test, preds)\n", + " class_report = classification_report(\n", + " labels_test, preds, target_names=label_encoder.classes_, output_dict=True\n", + " )\n", + "\n", + " # save results\n", + " results[model_name] = {\n", + " \"accuracy\": accuracy,\n", + " \"f1-score\": class_report[\"macro avg\"][\"f1-score\"],\n", + " \"time(hrs)\": train_time,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate\n", + "\n", + "Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours." + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " 0%| | 0/3 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distilbert-base-uncasedroberta-basexlnet-base-cased
accuracy0.8704160.8991440.911369
f1-score0.8703050.8976140.910810
time(hrs)0.0218280.0353250.046363
\n", + "" + ], + "text/plain": [ + " distilbert-base-uncased roberta-base xlnet-base-cased\n", + "accuracy 0.870416 0.899144 0.911369\n", + "f1-score 0.870305 0.897614 0.910810\n", + "time(hrs) 0.021828 0.035325 0.046363" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_results = pd.DataFrame(results)\n", + "df_results" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loss:0.402193\n" - ] + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.8936430317848411, + "encoder": "json", + "name": "accuracy", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "accuracy" + } + }, + "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \n", - " 0%| | 0/3 [01:42\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
distilbert-base-uncasedroberta-basexlnet-base-cased
accuracy0.9014060.9195360.925647
f1-score0.8978290.9167930.923171
time0.1119360.1895810.270957
\n", - "" - ], - "text/plain": [ - " distilbert-base-uncased roberta-base xlnet-base-cased\n", - "accuracy 0.901406 0.919536 0.925647\n", - "f1-score 0.897829 0.916793 0.923171\n", - "time 0.111936 0.189581 0.270957" - ] + "application/scrapbook.scrap.json+json": { + "data": 0.8929098953149991, + "encoder": "json", + "name": "f1", + "version": 1 + } }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "f1" + } + }, + "output_type": "display_data" } ], "source": [ - "pd.DataFrame(results)" + "# for testing\n", + "sb.glue(\"accuracy\", df_results.iloc[0, :].mean())\n", + "sb.glue(\"f1\", df_results.iloc[1, :].mean())" ] } ], diff --git a/tests/conftest.py b/tests/conftest.py index 580e0c826..111d2ae34 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,7 +69,9 @@ def notebooks(): "tc_bert_azureml": os.path.join( folder_notebooks, "text_classification", "tc_bert_azureml.ipynb" ), - "tc_mnli_bert": os.path.join(folder_notebooks, "text_classification", "tc_mnli_bert.ipynb"), + "tc_mnli_transformers": os.path.join( + folder_notebooks, "text_classification", "tc_mnli_transformers.ipynb" + ), "tc_dac_bert_ar": os.path.join( folder_notebooks, "text_classification", "tc_dac_bert_ar.ipynb" ), diff --git a/tests/integration/test_notebooks_text_classification.py b/tests/integration/test_notebooks_text_classification.py index 7411445fa..a631eead5 100644 --- a/tests/integration/test_notebooks_text_classification.py +++ b/tests/integration/test_notebooks_text_classification.py @@ -15,8 +15,8 @@ @pytest.mark.gpu @pytest.mark.integration -def test_tc_mnli_bert(notebooks, tmp): - notebook_path = notebooks["tc_mnli_bert"] +def test_tc_mnli_transformers(notebooks, tmp): + notebook_path = notebooks["tc_mnli_transformers"] pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -24,17 +24,17 @@ def test_tc_mnli_bert(notebooks, tmp): parameters=dict( NUM_GPUS=1, DATA_FOLDER=tmp, - BERT_CACHE_DIR=tmp, - BATCH_SIZE=32, - BATCH_SIZE_PRED=512, + CACHE_DIR=tmp, + BATCH_SIZE=16, NUM_EPOCHS=1, + TRAIN_DATA_FRACTION=0.05, + TEST_DATA_FRACTION=0.05, + MODEL_NAMES=["distilbert-base-uncased"], ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - assert pytest.approx(result["accuracy"], 0.93, abs=ABS_TOL) - assert pytest.approx(result["precision"], 0.93, abs=ABS_TOL) - assert pytest.approx(result["recall"], 0.93, abs=ABS_TOL) - assert pytest.approx(result["f1"], 0.93, abs=ABS_TOL) + assert pytest.approx(result["accuracy"], 0.87, abs=ABS_TOL) + assert pytest.approx(result["f1"], 0.87, abs=ABS_TOL) @pytest.mark.gpu diff --git a/utils_nlp/dataset/squad.py b/utils_nlp/dataset/squad.py index cdb07ff05..f807f878c 100644 --- a/utils_nlp/dataset/squad.py +++ b/utils_nlp/dataset/squad.py @@ -23,9 +23,7 @@ } -def load_pandas_df( - local_cache_path=".", squad_version="v1.1", file_split="train" -): +def load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="train"): """Loads the SQuAD dataset in pandas data frame. Args: @@ -34,8 +32,12 @@ def load_pandas_df( squad_version (str, optional): Version of the SQuAD dataset, accepted values are: "v1.1" and "v2.0". Defaults to "v1.1". file_split (str, optional): Dataset split to load, accepted values are: "train" and "dev". - Defaults to "train". + Defaults to "train". """ + + if file_split not in ["train", "dev"]: + raise ValueError("file_split should be either train or dev") + URL = URL_DICT[squad_version][file_split] file_name = URL.split("/")[-1] maybe_download(URL, file_name, local_cache_path) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index acd5571c1..36024d842 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -34,9 +34,9 @@ logger = logging.getLogger(__name__) -def get_device(device, num_gpus, local_rank): +def get_device(num_gpus=None, local_rank=-1): if local_rank == -1: - device = torch.device("cuda" if torch.cuda.is_available() and device == "cuda" else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu") num_gpus = ( min(num_gpus, torch.cuda.device_count()) if num_gpus else torch.cuda.device_count() ) @@ -45,7 +45,6 @@ def get_device(device, num_gpus, local_rank): device = torch.device("cuda", local_rank) torch.distributed.init_process_group(backend="nccl") num_gpus = 1 - return device, num_gpus @@ -58,35 +57,31 @@ def __init__( cache_dir=".", load_model_from_dir=None, ): - self.model_name = model_name + + if model_name not in self.list_supported_models(): + raise ValueError( + "Model name {0} is not supported by {1}. " + "Call '{2}.list_supported_models()' to get all supported model " + "names.".format(value, self.__class__.__name__, self.__class__.__name__) + ) + self._model_name = model_name + self._model_type = model_name.split("-")[0] self.cache_dir = cache_dir self.load_model_from_dir = load_model_from_dir if load_model_from_dir is None: self.model = model_class[model_name].from_pretrained( - model_name, cache_dir=cache_dir, num_labels=num_labels + model_name, cache_dir=cache_dir, num_labels=num_labels, output_loading_info=False ) else: logger.info("Loading cached model from {}".format(load_model_from_dir)) self.model = model_class[model_name].from_pretrained( - load_model_from_dir, num_labels=num_labels + load_model_from_dir, num_labels=num_labels, output_loading_info=False ) @property def model_name(self): return self._model_name - @model_name.setter - def model_name(self, value): - if value not in self.list_supported_models(): - raise ValueError( - "Model name {0} is not supported by {1}. " - "Call '{2}.list_supported_models()' to get all supported model " - "names.".format(value, self.__class__.__name__, self.__class__.__name__) - ) - - self._model_name = value - self._model_type = value.split("-")[0] - @property def model_type(self): return self._model_type @@ -263,10 +258,8 @@ def predict( def save_model(self): output_model_dir = os.path.join(self.cache_dir, "fine_tuned") - if not os.path.exists(self.cache_dir): - os.makedirs(self.cache_dir) - if not os.path.exists(output_model_dir): - os.makedirs(output_model_dir) + os.makedirs(self.cache_dir, exist_ok=True) + os.makedirs(output_model_dir, exist_ok=True) logger.info("Saving model checkpoint to %s", output_model_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py index 1a6370053..a5d140709 100644 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -1,19 +1,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import torch import logging -import numpy as np - -from torch.utils.data import TensorDataset -from cached_property import cached_property from collections import Iterable -from transformers.modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BertForTokenClassification -) - +import numpy as np +import torch +from torch.utils.data import TensorDataset +from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification from utils_nlp.models.transformers.common import ( MAX_SEQ_LEN, TOKENIZER_CLASS, @@ -21,10 +15,11 @@ get_device, ) + TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP} -class TokenClassificationProcessor(): +class TokenClassificationProcessor: """ Process raw dataset for training and testing. @@ -42,7 +37,7 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): self.to_lower = to_lower self.cache_dir = cache_dir self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained( - model_name, do_lower_case=to_lower, cache_dir=cache_dir + model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False ) @staticmethod @@ -55,20 +50,19 @@ def get_inputs(batch, model_name, train_mode=True): train_mode (bool, optional): Whether it's for model training. Set it to False if it's for testing and it won't have the 'labels' data field. Defaults to True, for model training. - + Returns: dict: A dictionary object contains all needed information for training or testing. """ if model_name.split("-")[0] not in ["bert"]: raise ValueError("Model not supported: {}".format(model_name)) - + if train_mode: return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} else: return {"input_ids": batch[0], "attention_mask": batch[1]} - @staticmethod def create_label_map(label_lists, trailing_piece_tag="X"): """ @@ -79,9 +73,9 @@ def create_label_map(label_lists, trailing_piece_tag="X"): which presents class of each token. trailing_piece_tag (str, optional): Tag used to label trailing word pieces. Defaults to "X". - + Returns: - dict: A dictionary object to map a label (str) to an ID (int). + dict: A dictionary object to map a label (str) to an ID (int). """ label_set = set() @@ -94,14 +88,8 @@ def create_label_map(label_lists, trailing_piece_tag="X"): label_map[trailing_piece_tag] = len(label_set) return label_map - def preprocess_for_bert( - self, - text, - max_len=MAX_SEQ_LEN, - labels=None, - label_map=None, - trailing_piece_tag="X" + self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X" ): """ Tokenize and preprocesses input word lists, involving the following steps @@ -157,7 +145,9 @@ def _is_iterable_but_not_string(obj): return isinstance(obj, Iterable) and not isinstance(obj, str) if max_len > MAX_SEQ_LEN: - logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) + logging.warning( + "Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN) + ) max_len = MAX_SEQ_LEN if not _is_iterable_but_not_string(text): @@ -168,7 +158,7 @@ def _is_iterable_but_not_string(obj): # list of lists for later iteration if not _is_iterable_but_not_string(text[0]): text = [text] - + if labels is not None: if not _is_iterable_but_not_string(labels): raise ValueError("labels must be an iterable and not a string.") @@ -206,7 +196,11 @@ def _is_iterable_but_not_string(obj): new_tokens.append(sub_word) if len(new_tokens) > max_len: - logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens))) + logging.warn( + "Text after tokenization with length {} has been truncated".format( + len(new_tokens) + ) + ) new_tokens = new_tokens[:max_len] new_labels = new_labels[:max_len] input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens) @@ -241,13 +235,13 @@ def _is_iterable_but_not_string(obj): torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), torch.tensor(trailing_token_mask_all, dtype=torch.bool), - torch.tensor(label_ids_all, dtype=torch.long) + torch.tensor(label_ids_all, dtype=torch.long), ) else: td = TensorDataset( torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), - torch.tensor(trailing_token_mask_all, dtype=torch.bool) + torch.tensor(trailing_token_mask_all, dtype=torch.bool), ) return td @@ -280,7 +274,6 @@ def list_supported_models(): def fit( self, train_dataset, - device="cuda", num_epochs=1, batch_size=32, num_gpus=None, @@ -297,14 +290,13 @@ def fit( Args: train_dataset (Dataset): Dataset for training. - device (torch.device, optional): A PyTorch device. - Defaults to 'cuda'. num_epochs (int, optional): Number of training epochs. Defaults to 1. batch_size (int, optional): Training batch size. Defaults to 32. - num_gpus (int, optional): The number of GPUs to be used. - Defaults to None, all gpus are used. + num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will + be used. Defaults to None. local_rank (int, optional): Whether need to do distributed training. Defaults to -1, no distributed training. weight_decay (float, optional): Weight decay rate. @@ -321,7 +313,7 @@ def fit( Defaults to None, use the default seed. """ - device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank) + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) self.model.to(device) super().fine_tune( @@ -339,32 +331,22 @@ def fit( seed=seed, ) - - def predict( - self, - eval_dataset, - device="cuda", - batch_size=32, - num_gpus=None, - local_rank=-1, - verbose=False, - ): + def predict(self, eval_dataset, batch_size=32, num_gpus=None, local_rank=-1, verbose=False): """ Test on an evaluation dataset and get the token label predictions. Args: eval_dataset (TensorDataset): A TensorDataset for evaluation. - device (torch.device, optional): A PyTorch device. - Defaults to 'cuda'. batch_size (int, optional): The batch size for evaluation. Defaults to 32. - num_gpus (int, optional): The number of GPUs to be used. - Defaults to None, all gpus are used. + num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will + be used. Defaults to None. local_rank (int, optional): Whether need to do distributed training. Defaults to -1, no distributed training. verbose (bool, optional): Verbose model. Defaults to False. - + Returns: ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels]. Each @@ -372,6 +354,7 @@ def predict( to get the probability for each class label. """ + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) preds = list( super().predict( eval_dataset=eval_dataset, @@ -386,13 +369,7 @@ def predict( preds_np = np.concatenate(preds) return preds_np - - def get_predicted_token_labels( - self, - predictions, - label_map, - dataset - ): + def get_predicted_token_labels(self, predictions, label_map, dataset): """ Post-process the raw prediction values and get the class label for each token. @@ -409,9 +386,9 @@ def get_predicted_token_labels( num_samples = len(dataset.tensors[0]) if num_samples != predictions.shape[0]: - raise ValueError("Predictions have {0} samples, but got {1} samples in dataset".format( - predictions.shape[0], - num_samples + raise ValueError( + "Predictions have {0} samples, but got {1} samples in dataset".format( + predictions.shape[0], num_samples ) ) @@ -430,7 +407,7 @@ def get_predicted_token_labels( for sid in range(seq_len): if attention_mask[sid] == 0: break - + if not trailing_mask[sid]: continue diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py index 4c11ef2a4..f33add222 100644 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -96,7 +96,7 @@ def __init__( self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="." ): self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained( - model_name, do_lower_case=to_lower, cache_dir=cache_dir + model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False ) self.custom_tokenize = custom_tokenize self.model_name = model_name @@ -371,7 +371,6 @@ def list_supported_models(): def fit( self, train_dataset, - device="cuda", num_gpus=None, per_gpu_batch_size=8, num_epochs=1, @@ -395,9 +394,8 @@ def fit( Args: train_dataset (QADataset): Training dataset of type :class:`utils_nlp.dataset.pytorch.QADataset`. - device (str, optional): Device to use. Accepted values are "cuda" and "cpu". - Defaults to "cuda". num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. per_gpu_batch_size (int, optional): Training batch size on each GPU. Defaults to 8. num_epochs (int, optional): Number of training epochs. Defaults to 1. @@ -428,7 +426,7 @@ def fit( """ - device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank) + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) self.model.to(device) super().fine_tune( @@ -458,7 +456,6 @@ def predict( self, test_dataset, per_gpu_batch_size=16, - device="cuda", num_gpus=None, local_rank=-1, verbose=True, @@ -471,9 +468,8 @@ def predict( test_dataset (QADataset): Testing dataset of type :class:`utils_nlp.dataset.pytorch.QADataset`. per_gpu_batch_size (int, optional): Testing batch size on each GPU. Defaults to 8. - device (str, optional): Device to use. Accepted values are "cuda" and "cpu". - Defaults to "cuda". num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed. @@ -485,7 +481,7 @@ def predict( def _to_list(tensor): return tensor.detach().cpu().tolist() - device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank) + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) batch_size = per_gpu_batch_size * max(1, num_gpus) self.model.to(device) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 49576ee3f..3f7305362 100644 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -42,7 +42,7 @@ class Processor: def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained( - model_name, do_lower_case=to_lower, cache_dir=cache_dir + model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False ) @staticmethod @@ -106,7 +106,6 @@ def list_supported_models(): def fit( self, train_dataset, - device="cuda", num_epochs=1, batch_size=32, num_gpus=None, @@ -118,7 +117,11 @@ def fit( verbose=True, seed=None, ): - device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank) + """ + Fine-tunes a pre-trained sequence classification model. + """ + + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) self.model.to(device) super().fine_tune( train_dataset=train_dataset, @@ -135,7 +138,8 @@ def fit( seed=seed, ) - def predict(self, eval_dataset, device="cuda", batch_size=16, num_gpus=1, verbose=True): + def predict(self, eval_dataset, batch_size=16, num_gpus=1, verbose=True): + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) preds = list( super().predict( eval_dataset=eval_dataset, @@ -143,7 +147,7 @@ def predict(self, eval_dataset, device="cuda", batch_size=16, num_gpus=1, verbos device=device, per_gpu_eval_batch_size=batch_size, n_gpu=num_gpus, - verbose=True, + verbose=verbose, ) ) preds = np.concatenate(preds) From b2415e755f90b5cf247a12bd705f1e700ced41ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?= <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 24 Oct 2019 12:52:21 +0100 Subject: [PATCH 2/8] optimizer and scheduler out --- utils_nlp/models/transformers/common.py | 43 ++++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 36024d842..8e83c810c 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -105,6 +105,8 @@ def fine_tune( gradient_accumulation_steps=1, per_gpu_train_batch_size=8, n_gpu=1, + optimizer=None, + scheduler=None, weight_decay=0.0, learning_rate=5e-5, adam_epsilon=1e-8, @@ -134,25 +136,28 @@ def fine_tune( else: t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in self.model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": weight_decay, - }, - { - "params": [ - p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) + if optimizer is None: + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p + for n, p in self.model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": weight_decay, + }, + { + "params": [ + p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) + + if scheduler is None: + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if fp16: try: From 6ff9b2dd3c084e505d91cf19524d49437ae0faa0 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 25 Oct 2019 14:24:24 +0100 Subject: [PATCH 3/8] removed repeated get_device --- tests/unit/test_common_pytorch_utils.py | 82 +++++++++++++------------ utils_nlp/common/pytorch_utils.py | 37 ++++++----- utils_nlp/models/transformers/common.py | 20 ++---- 3 files changed, 69 insertions(+), 70 deletions(-) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index efbbd5e97..416d2c01d 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -8,81 +8,91 @@ from torch.nn.modules.container import Sequential from utils_nlp.common.pytorch_utils import get_device, move_to_device - + @pytest.fixture def model(): - return nn.Sequential( - nn.Linear(24, 8), nn.ReLU(), nn.Linear(8, 2), nn.Sigmoid() - ) + return nn.Sequential(nn.Linear(24, 8), nn.ReLU(), nn.Linear(8, 2), nn.Sigmoid()) + - def test_get_device_cpu(): - device = get_device("cpu") + device, gpus = get_device(num_gpus=0) assert isinstance(device, torch.device) assert device.type == "cpu" - - -def test_get_device_exception(): - with pytest.raises(ValueError): - get_device("abc") + assert gpus == 0 @pytest.mark.gpu def test_machine_is_gpu_machine(): assert torch.cuda.is_available() is True - - + + @pytest.mark.gpu def test_get_device_gpu(): - device = get_device() + device, gpus = get_device(num_gpus=1) assert isinstance(device, torch.device) assert device.type == "cuda" - + assert gpus == 1 + + +@pytest.mark.gpu +def test_get_device_all_gpus(): + device, gpus = get_device() + assert isinstance(device, torch.device) + assert device.type == "cuda" + assert gpus == torch.cuda.device_count() + + +@pytest.mark.gpu +def test_get_device_local_rank(): + device, gpus = get_device(local_rank=1) + assert isinstance(device, torch.device) + assert device.type == "cuda" + assert device.index == 1 + assert gpus == 1 + def test_move_to_device_cpu(model): # test when device.type="cpu" model_cpu = move_to_device(model, torch.device("cpu")) assert isinstance(model_cpu, nn.modules.container.Sequential) - + def test_move_to_device_cpu_parallelized(model): # test when input model is parallelized model_parallelized = nn.DataParallel(model) - model_parallelized_output = move_to_device( - model_parallelized, torch.device("cpu") - ) - assert isinstance( - model_parallelized_output, nn.modules.container.Sequential - ) - - + model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu")) + assert isinstance(model_parallelized_output, nn.modules.container.Sequential) + + def test_move_to_device_exception_not_torch_device(model): # test when device is not torch.device with pytest.raises(ValueError): move_to_device(model, "abc") - - + + def test_move_to_device_exception_wrong_type(model): # test when device.type is not "cuda" or "cpu" with pytest.raises(Exception): move_to_device(model, torch.device("opengl")) -@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine") +@pytest.mark.skipif( + torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine" +) def test_move_to_device_exception_gpu_model_on_cpu_machine(model): # test when the model is moved to a gpu but it is a cpu machine with pytest.raises(Exception): move_to_device(model, torch.device("cuda")) - - -@pytest.mark.gpu + + +@pytest.mark.gpu def test_move_to_device_exception_cuda_zero_gpus(model): # test when device.type is cuda, but num_gpus is 0 with pytest.raises(ValueError): move_to_device(model, torch.device("cuda"), num_gpus=0) - - + + @pytest.mark.gpu def test_move_to_device_gpu(model): # test when device.type="cuda" @@ -94,9 +104,7 @@ def test_move_to_device_gpu(model): else: assert isinstance(model_cuda, Sequential) - model_cuda_1_gpu = move_to_device( - model, torch.device("cuda"), num_gpus=1 - ) + model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1) assert isinstance(model_cuda_1_gpu, Sequential) model_cuda_1_more_gpu = move_to_device( @@ -107,9 +115,7 @@ def test_move_to_device_gpu(model): else: assert isinstance(model_cuda_1_more_gpu, Sequential) - model_cuda_same_gpu = move_to_device( - model, torch.device("cuda"), num_gpus=num_cuda_devices - ) + model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices) if num_cuda_devices > 1: assert isinstance(model_cuda_same_gpu, DataParallel) else: diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 07c3b504e..ea09f8768 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -8,23 +8,28 @@ import warnings -def get_device(device="gpu"): - """Gets a PyTorch device. - - Args: - device (str, optional): Device string: "cpu" or "gpu". Defaults to "gpu". - - Returns: - torch.device: A PyTorch device (cpu or gpu). - """ - if device == "gpu": - if torch.cuda.is_available(): - return torch.device("cuda:0") - raise Exception("CUDA device not available") - elif device == "cpu": - return torch.device("cpu") +def get_device( + num_gpus=None, + local_rank=-1, + # backend="nccl", + # rank=0, + # world_size=1, + # init_method="file:///distributed", +): + if local_rank == -1: + num_gpus = ( + min(num_gpus, torch.cuda.device_count()) + if num_gpus is not None + else torch.cuda.device_count() + ) + device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu") else: - raise ValueError("Only 'cpu' and 'gpu' devices are supported.") + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + # torch.distributed.init_process_group(backend="nccl") + # torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size, init_method=init_method) + num_gpus = 1 + return device, num_gpus def move_to_device(model, device, num_gpus=None): diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 8e83c810c..1c82ae9e0 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -34,20 +34,6 @@ logger = logging.getLogger(__name__) -def get_device(num_gpus=None, local_rank=-1): - if local_rank == -1: - device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu") - num_gpus = ( - min(num_gpus, torch.cuda.device_count()) if num_gpus else torch.cuda.device_count() - ) - else: - torch.cuda.set_device(local_rank) - device = torch.device("cuda", local_rank) - torch.distributed.init_process_group(backend="nccl") - num_gpus = 1 - return device, num_gpus - - class Transformer: def __init__( self, @@ -149,13 +135,15 @@ def fine_tune( }, { "params": [ - p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) + p + for n, p in self.model.named_parameters() + if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) - + if scheduler is None: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) From d5c10c1e679668ed03429cbfaf44b76754269549 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 25 Oct 2019 14:37:05 +0100 Subject: [PATCH 4/8] refactored get_device --- .../models/bert/sequence_classification.py | 75 +++++-------------- .../sequence_classification_distributed.py | 11 ++- utils_nlp/models/bert/sequence_encoding.py | 6 +- utils_nlp/models/bert/token_classification.py | 69 +++++------------ .../models/xlnet/sequence_classification.py | 4 +- 5 files changed, 46 insertions(+), 119 deletions(-) diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py index ced02acc6..03a324604 100644 --- a/utils_nlp/models/bert/sequence_classification.py +++ b/utils_nlp/models/bert/sequence_classification.py @@ -7,12 +7,7 @@ import numpy as np import torch import torch.nn as nn -from torch.utils.data import ( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset, -) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from pytorch_pretrained_bert.modeling import BertForSequenceClassification from pytorch_pretrained_bert.optimization import BertAdam from tqdm import tqdm @@ -22,6 +17,7 @@ from cached_property import cached_property + class BERTSequenceClassifier: """BERT-based sequence classifier""" @@ -55,7 +51,7 @@ def cuda(self): self.has_cuda = torch.cuda.is_available() return self.has_cuda - + def fit( self, token_ids, @@ -93,9 +89,8 @@ def fit( loss values. Defaults to True. """ - device = get_device( - "cpu" if num_gpus == 0 or not self.cuda else "gpu" - ) + device, num_gpus = get_device(num_gpus) + self.model = move_to_device(self.model, device, num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) @@ -103,42 +98,25 @@ def fit( labels_tensor = torch.tensor(labels, dtype=torch.long) if token_type_ids: - token_type_ids_tensor = torch.tensor( - token_type_ids, dtype=torch.long - ) + token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long) train_dataset = TensorDataset( - token_ids_tensor, - input_mask_tensor, - token_type_ids_tensor, - labels_tensor, + token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor ) else: - train_dataset = TensorDataset( - token_ids_tensor, input_mask_tensor, labels_tensor - ) + train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=batch_size - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) # define optimizer and model parameters param_optimizer = list(self.model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [ - p - for n, p in param_optimizer - if not any(nd in n for nd in no_decay) - ], + "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01, }, { - "params": [ - p - for n, p in param_optimizer - if any(nd in n for nd in no_decay) - ], + "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] @@ -164,18 +142,14 @@ def fit( for epoch in range(num_epochs): training_loss = 0 - for i, batch in enumerate( - tqdm(train_dataloader, desc="Iteration") - ): + for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if token_type_ids: x_batch, mask_batch, token_type_ids_batch, y_batch = tuple( t.to(device) for t in batch ) else: token_type_ids_batch = None - x_batch, mask_batch, y_batch = tuple( - t.to(device) for t in batch - ) + x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch) opt.zero_grad() @@ -236,9 +210,7 @@ def predict( 1darray, namedtuple(1darray, ndarray): Predicted classes or (classes, probabilities) if probabilities is True. """ - device = get_device( - "cpu" if num_gpus == 0 or not self.cuda else "gpu" - ) + device, num_gpus = get_device(num_gpus) self.model = move_to_device(self.model, device, num_gpus) # score @@ -248,26 +220,18 @@ def predict( input_mask_tensor = torch.tensor(input_mask, dtype=torch.long) if token_type_ids: - token_type_ids_tensor = torch.tensor( - token_type_ids, dtype=torch.long - ) - test_dataset = TensorDataset( - token_ids_tensor, input_mask_tensor, token_type_ids_tensor - ) + token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long) + test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor) else: test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor) test_sampler = SequentialSampler(test_dataset) - test_dataloader = DataLoader( - test_dataset, sampler=test_sampler, batch_size=batch_size - ) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size) preds = [] for i, batch in enumerate(tqdm(test_dataloader, desc="Iteration")): if token_type_ids: - x_batch, mask_batch, token_type_ids_batch = tuple( - t.to(device) for t in batch - ) + x_batch, mask_batch, token_type_ids_batch = tuple(t.to(device) for t in batch) else: token_type_ids_batch = None x_batch, mask_batch = tuple(t.to(device) for t in batch) @@ -285,8 +249,7 @@ def predict( if probabilities: return namedtuple("Predictions", "classes probabilities")( - preds.argmax(axis=1), - nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(), + preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy() ) else: return preds.argmax(axis=1) diff --git a/utils_nlp/models/bert/sequence_classification_distributed.py b/utils_nlp/models/bert/sequence_classification_distributed.py index 5c3faa07a..ee5061158 100644 --- a/utils_nlp/models/bert/sequence_classification_distributed.py +++ b/utils_nlp/models/bert/sequence_classification_distributed.py @@ -167,7 +167,7 @@ def fit( epoch, bert_optimizer=None, num_epochs=1, - num_gpus=0, + num_gpus=None, lr=2e-5, warmup_proportion=None, fp16_allreduce=False, @@ -181,7 +181,7 @@ def fit( epoch(int): Current epoch number of training. bert_optimizer(optimizer): optimizer can be BERTAdam for local and Dsitributed if Horovod num_epochs(int): the number of epochs to run - num_gpus(int): the number of gpus + num_gpus(int): the number of gpus. If None is specified, all available GPUs will be used. lr (float): learning rate of the adam optimizer. defaults to 2e-5. warmup_proportion (float, optional): proportion of training to perform linear learning rate warmup for. e.g., 0.1 = 10% of @@ -190,10 +190,9 @@ def fit( num_train_optimization_steps: number of steps the optimizer should take. """ - device = get_device("cpu" if num_gpus == 0 else "gpu") + device, num_gpus = get_device(num_gpus) - if device: - self.model.cuda() + self.model = move_to_device(self.model, device, num_gpus) if bert_optimizer is None: bert_optimizer = self.create_optimizer( @@ -277,7 +276,7 @@ def predict(self, test_loader, num_gpus=None, probabilities=False): 1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or a dictionary with classes, target labels, probabilities) if probabilities is True. """ - device = get_device("cpu" if num_gpus == 0 else "gpu") + device, num_gpus = get_device(num_gpus) self.model = move_to_device(self.model, device, num_gpus) # score diff --git a/utils_nlp/models/bert/sequence_encoding.py b/utils_nlp/models/bert/sequence_encoding.py index 7a747c963..088a6310d 100644 --- a/utils_nlp/models/bert/sequence_encoding.py +++ b/utils_nlp/models/bert/sequence_encoding.py @@ -18,6 +18,7 @@ from utils_nlp.models.bert.common import Language, Tokenizer from cached_property import cached_property + class PoolingStrategy(str, Enum): """Enumerate pooling strategies""" @@ -79,12 +80,11 @@ def layer_index(self, layer_index): self._layer_index = [layer_index] else: self.layer_index = layer_index - @cached_property def cuda(self): """ cache the output of torch.cuda.is_available() """ - + self.has_cuda = torch.cuda.is_available() return self.has_cuda @@ -106,7 +106,7 @@ def get_hidden_states(self, text, batch_size=32): Returns: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]). """ - device = get_device("cpu" if self.num_gpus == 0 or self.cuda else "gpu") + device, num_gpus = get_device(self.num_gpus) self.model = move_to_device(self.model, device, self.num_gpus) self.model.eval() diff --git a/utils_nlp/models/bert/token_classification.py b/utils_nlp/models/bert/token_classification.py index bce7de8b1..ce98357bc 100644 --- a/utils_nlp/models/bert/token_classification.py +++ b/utils_nlp/models/bert/token_classification.py @@ -20,6 +20,7 @@ from cached_property import cached_property + class BERTTokenClassifier: """BERT-based token classifier.""" @@ -64,9 +65,7 @@ def cuda(self): self.has_cuda = torch.cuda.is_available() return self.has_cuda - def _get_optimizer( - self, learning_rate, num_train_optimization_steps, warmup_proportion - ): + def _get_optimizer(self, learning_rate, num_train_optimization_steps, warmup_proportion): """ Initializes the optimizer and configure parameters to apply weight decay on. @@ -77,26 +76,18 @@ def _get_optimizer( optimizer_grouped_parameters = [ { "params": [ - p - for n, p in param_optimizer - if not any(nd in n for nd in no_decay_params) + p for n, p in param_optimizer if not any(nd in n for nd in no_decay_params) ], "weight_decay": params_weight_decay, }, { - "params": [ - p - for n, p in param_optimizer - if any(nd in n for nd in no_decay_params) - ], + "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay_params)], "weight_decay": 0.0, }, ] if warmup_proportion is None: - optimizer = BertAdam( - optimizer_grouped_parameters, lr=learning_rate - ) + optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate) else: optimizer = BertAdam( optimizer_grouped_parameters, @@ -151,9 +142,8 @@ def fit( batch_size=batch_size, ) - device = get_device( - "cpu" if num_gpus == 0 or not self.cuda else "gpu" - ) + device, num_gpus = get_device(num_gpus) + self.model = move_to_device(self.model, device, num_gpus) if num_gpus is None: @@ -161,9 +151,7 @@ def fit( else: num_gpus_used = min(num_gpus, torch.cuda.device_count()) - num_train_optimization_steps = max( - (int(len(token_ids) / batch_size) * num_epochs), 1 - ) + num_train_optimization_steps = max((int(len(token_ids) / batch_size) * num_epochs), 1) optimizer = self._get_optimizer( learning_rate=learning_rate, num_train_optimization_steps=num_train_optimization_steps, @@ -174,16 +162,12 @@ def fit( for _ in trange(int(num_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 - for step, batch in enumerate( - tqdm(train_dataloader, desc="Iteration", mininterval=30) - ): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=30)): batch = tuple(t.to(device) for t in batch) b_token_ids, b_input_mask, b_label_ids = batch loss = self.model( - input_ids=b_token_ids, - attention_mask=b_input_mask, - labels=b_label_ids, + input_ids=b_token_ids, attention_mask=b_input_mask, labels=b_label_ids ) if num_gpus_used > 1: @@ -206,13 +190,7 @@ def fit( torch.cuda.empty_cache() def predict( - self, - token_ids, - input_mask, - labels=None, - batch_size=32, - num_gpus=None, - probabilities=False, + self, token_ids, input_mask, labels=None, batch_size=32, num_gpus=None, probabilities=False ): """ Predict token labels on the testing data. @@ -248,18 +226,14 @@ def predict( batch_size=batch_size, sample_method="sequential", ) - device = get_device( - "cpu" if num_gpus == 0 or not self.cuda else "gpu" - ) + device, num_gpus = get_device(num_gpus) self.model = move_to_device(self.model, device, num_gpus) self.model.eval() eval_loss = 0 nb_eval_steps = 0 - for step, batch in enumerate( - tqdm(test_dataloader, desc="Iteration", mininterval=10) - ): + for step, batch in enumerate(tqdm(test_dataloader, desc="Iteration", mininterval=10)): batch = tuple(t.to(device) for t in batch) true_label_available = False if labels: @@ -272,9 +246,7 @@ def predict( logits = self.model(b_input_ids, attention_mask=b_input_mask) if true_label_available: active_loss = b_input_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[ - active_loss - ] + active_logits = logits.view(-1, self.num_labels)[active_loss] active_labels = b_labels.view(-1)[active_loss] loss_fct = nn.CrossEntropyLoss() tmp_eval_loss = loss_fct(active_logits, active_labels) @@ -298,8 +270,7 @@ def predict( if probabilities: return namedtuple("Predictions", "classes probabilities")( - predictions, - np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2), + predictions, np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2) ) else: return predictions @@ -315,11 +286,7 @@ def create_label_map(label_list, trailing_piece_tag="X"): def postprocess_token_labels( - labels, - input_mask, - label_map=None, - remove_trailing_word_pieces=False, - trailing_token_mask=None, + labels, input_mask, label_map=None, remove_trailing_word_pieces=False, trailing_token_mask=None ): """ Postprocesses token classification output: @@ -372,9 +339,7 @@ def postprocess_token_labels( labels_no_trailing_pieces = [ [label for label, mask in zip(label_list, mask_list) if mask] - for label_list, mask_list in zip( - labels_org_no_padding, token_mask_no_padding - ) + for label_list, mask_list in zip(labels_org_no_padding, token_mask_no_padding) ] return labels_no_trailing_pieces else: diff --git a/utils_nlp/models/xlnet/sequence_classification.py b/utils_nlp/models/xlnet/sequence_classification.py index 055fad50f..90c514747 100644 --- a/utils_nlp/models/xlnet/sequence_classification.py +++ b/utils_nlp/models/xlnet/sequence_classification.py @@ -113,7 +113,7 @@ def fit( loss values. Defaults to True. """ - device = get_device("cpu" if self.num_gpus == 0 or not torch.cuda.is_available() else "gpu") + device, num_gpus = get_device(self.num_gpus) self.model = move_to_device(self.model, device, self.num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) @@ -329,7 +329,7 @@ def predict( (classes, probabilities) if probabilities is True. """ - device = get_device("cpu" if num_gpus == 0 or not torch.cuda.is_available() else "gpu") + device, num_gpus = get_device(num_gpus) self.model = move_to_device(self.model, device, num_gpus) self.model.eval() From fc800398353b97ff5021c1af47e22e358a5ca9bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?= <3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 8 Nov 2019 20:36:12 +0000 Subject: [PATCH 5/8] trigger tests --- tests/unit/test_common_pytorch_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index 416d2c01d..e2fce1e10 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. + import pytest import torch import torch.nn as nn From e50811f75cd58b907e0edac9c4967d3fae987ad8 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 8 Nov 2019 22:07:52 +0000 Subject: [PATCH 6/8] :bug: in imports --- .../models/transformers/named_entity_recognition.py | 8 ++------ utils_nlp/models/transformers/question_answering.py | 5 +---- .../models/transformers/sequence_classification.py | 10 ++++------ 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py index a5d140709..d3d82e3b6 100644 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -8,12 +8,8 @@ import torch from torch.utils.data import TensorDataset from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification -from utils_nlp.models.transformers.common import ( - MAX_SEQ_LEN, - TOKENIZER_CLASS, - Transformer, - get_device, -) +from utils_nlp.common.pytorch_utils import get_device +from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP} diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py index 2a24b0818..41ace6b42 100644 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -29,24 +29,21 @@ from torch.utils.data import TensorDataset, SequentialSampler, DataLoader from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize - from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering - from transformers.modeling_xlnet import ( XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering, ) - from transformers.modeling_distilbert import ( DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering, ) +from utils_nlp.common.pytorch_utils import get_device from utils_nlp.models.transformers.common import ( MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer, - get_device, ) MODEL_CLASS = {} diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 3f7305362..76f530203 100644 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -4,6 +4,7 @@ import numpy as np import torch from torch.utils.data import TensorDataset + from transformers.modeling_bert import ( BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification, @@ -21,12 +22,9 @@ XLNetForSequenceClassification, ) -from utils_nlp.models.transformers.common import ( - MAX_SEQ_LEN, - TOKENIZER_CLASS, - Transformer, - get_device, -) +from utils_nlp.common.pytorch_utils import get_device +from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer + MODEL_CLASS = {} MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) From 3cbe8465d7462496e6a1ec3450a2ebd69c675131 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Wed, 13 Nov 2019 05:40:54 +0000 Subject: [PATCH 7/8] updates to finetuning --- utils_nlp/models/transformers/common.py | 39 +--- utils_nlp/models/transformers/datasets.py | 215 ++++++++++++++++++ .../transformers/sequence_classification.py | 93 ++++---- 3 files changed, 272 insertions(+), 75 deletions(-) create mode 100644 utils_nlp/models/transformers/datasets.py diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 1c82ae9e0..26d387b38 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -10,8 +10,6 @@ import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange from transformers import AdamW, WarmupLinearSchedule from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -47,8 +45,8 @@ def __init__( if model_name not in self.list_supported_models(): raise ValueError( "Model name {0} is not supported by {1}. " - "Call '{2}.list_supported_models()' to get all supported model " - "names.".format(value, self.__class__.__name__, self.__class__.__name__) + "Call '{1}.list_supported_models()' to get all supported model " + "names.".format(value, self.__class__.__name__) ) self._model_name = model_name self._model_type = model_name.split("-")[0] @@ -82,14 +80,13 @@ def set_seed(seed, cuda=True): def fine_tune( self, - train_dataset, + train_dataloader, get_inputs, device, max_steps=-1, num_train_epochs=1, max_grad_norm=1.0, gradient_accumulation_steps=1, - per_gpu_train_batch_size=8, n_gpu=1, optimizer=None, scheduler=None, @@ -106,14 +103,6 @@ def fine_tune( if seed is not None: Transformer.set_seed(seed, n_gpu > 0) - train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) - train_sampler = ( - RandomSampler(train_dataset) if local_rank == -1 else DistributedSampler(train_dataset) - ) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=train_batch_size - ) - if max_steps > 0: t_total = max_steps num_train_epochs = ( @@ -191,7 +180,7 @@ def fine_tune( loss = loss / gradient_accumulation_steps if step % 10 == 0 and verbose: - tqdm.write("Loss:{:.6f}".format(loss / train_batch_size)) + tqdm.write("Loss:{:.6f}".format(loss)) if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: @@ -220,24 +209,7 @@ def fine_tune( torch.cuda.empty_cache() return global_step, tr_loss / global_step - def predict( - self, - eval_dataset, - get_inputs, - device, - per_gpu_eval_batch_size=16, - n_gpu=1, - local_rank=-1, - verbose=True, - ): - eval_batch_size = per_gpu_eval_batch_size * max(1, n_gpu) - eval_sampler = ( - SequentialSampler(eval_dataset) - if local_rank == -1 - else DistributedSampler(eval_dataset) - ) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size) - + def predict(self, eval_dataloader, get_inputs, device, verbose=True): for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): self.model.eval() batch = tuple(t.to(device) for t in batch) @@ -245,7 +217,6 @@ def predict( inputs = get_inputs(batch, self.model_name, train_mode=False) outputs = self.model(**inputs) logits = outputs[0] - yield logits.detach().cpu().numpy() def save_model(self): diff --git a/utils_nlp/models/transformers/datasets.py b/utils_nlp/models/transformers/datasets.py new file mode 100644 index 000000000..a2170c88d --- /dev/null +++ b/utils_nlp/models/transformers/datasets.py @@ -0,0 +1,215 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import collections +import torch +from torch.utils.data import Dataset + + +class SCDataSet(Dataset): + """Dataset for single sequence classification tasks""" + + def __init__(self, df, text_col, label_col, max_len, transform): + self.df = df + cols = list(df.columns) + self.transform = transform + self.max_len = max_len + + if isinstance(text_col, int): + self.text_col = text_col + elif isinstance(text_col, str): + self.text_col = cols.index(text_col) + else: + raise TypeError("text_col must be of type int or str") + + if label_col is None: + self.label_col = None + elif isinstance(label_col, int): + self.label_col = label_col + elif isinstance(label_col, str): + self.label_col = cols.index(label_col) + else: + raise TypeError("label_col must be of type int or str") + + def __getitem__(self, idx): + input_ids, attention_mask = self.transform( + self.df.iloc[idx, self.text_col], max_len=self.max_len + ) + if self.label_col is None: + return tuple( + [ + torch.tensor(input_ids, dtype=torch.long), + torch.tensor(attention_mask, dtype=torch.long), + ] + ) + labels = self.df.iloc[idx, self.label_col] + return tuple( + [ + torch.tensor(input_ids, dtype=torch.long), # input_ids + torch.tensor(attention_mask, dtype=torch.long), # attention_mask + torch.tensor(labels, dtype=torch.long), # labels + ] + ) + + def __len__(self): + return self.df.shape[0] + + +class SPCDataSet(Dataset): + """Dataset for sequence pair classification tasks""" + + def __init__(self, df, text1_col, text2_col, label_col, max_len, transform): + self.df = df + cols = list(df.columns) + self.transform = transform + self.max_len = max_len + + if isinstance(text1_col, int): + self.text1_col = text1_col + elif isinstance(text1_col, str): + self.text1_col = cols.index(text1_col) + else: + raise TypeError("text1_col must be of type int or str") + + if isinstance(text2_col, int): + self.text2_col = text2_col + elif isinstance(text2_col, str): + self.text2_col = cols.index(text2_col) + else: + raise TypeError("text2_col must be of type int or str") + + if label_col is None: + self.label_col = None + elif isinstance(label_col, int): + self.label_col = label_col + elif isinstance(label_col, str): + self.label_col = cols.index(label_col) + else: + raise TypeError("label_col must be of type int or str") + + def __getitem__(self, idx): + input1_ids, attention1_mask = self.transform( + self.df.iloc[idx, self.text1_col], max_len=self.max_len + ) + input2_ids, attention2_mask = transform( + self.df.iloc[idx, self.text2_col], max_len=self.max_len + ) + + if self.label_col is None: + return tuple( + [ + torch.tensor(input1_ids + input2_ids, dtype=torch.long), + torch.tensor(attention1_mask + attention2_mask, dtype=torch.long), + torch.tensor([0] * len(input1_ids) + [1] * len(input2_ids), dtype=torch.long), + ] + ) + + labels = self.df.iloc[idx, self.label_col] + return tuple( + [ + torch.tensor(input1_ids + input2_ids, dtype=torch.long), + torch.tensor(attention1_mask + attention2_mask, dtype=torch.long), + torch.tensor([0] * len(input1_ids) + [1] * len(input2_ids), dtype=torch.long), + torch.tensor(labels, dtype=torch.long), + ] + ) + + def __len__(self): + return self.df.shape[0] + + +# QAInput is a data structure representing an unique document-question-answer triplet. +# Args: +# doc_text (str): Input document text. +# question_text(str): Input question text. +# qa_id (int or str): An unique id identifying a document-question-answer sample. +# is_impossible (bool): If the question is impossible to answer based on the input document. +# answer_start (int or list): Index of the answer start word in doc_text. For testing data, +# this can be a list of integers for multiple ground truth answers. +# answer_text (str or list): Text of the answer. For testing data, this can be a list of strings +# for multiple ground truth answers. +QAInput = collections.namedtuple( + "QAInput", + ["doc_text", "question_text", "qa_id", "is_impossible", "answer_start", "answer_text"], +) + + +class QADataset(Dataset): + def __init__( + self, + df, + doc_text_col, + question_text_col, + qa_id_col=None, + answer_start_col=None, + answer_text_col=None, + is_impossible_col=None, + ): + """ + A standard dataset structure for question answering that can be processed by + :meth:`utils_nlp.models.transformers.question_answering.QAProcessor.preprocess` + + Args: + df (pandas.DataFrame): Input data frame. + doc_text_col (str): Name of the column containing the document texts. + question_text_col (str): Name of the column containing the question texts. + qa_id_col (str, optional): Name of the column containing the unique ids identifying + document-question-answer samples. If not provided, a "qa_id" column is + automatically created. Defaults to None. + answer_start_col (str, optional): Name of the column containing answer start indices. + For testing data, each value in the column can be a list of integers for multiple + ground truth answers. Defaults to None. + answer_text_col (str, optional): Name of the column containing answer texts. For + testing data, each value in the column can be a list of strings for multiple + ground truth answers. Defaults to None. + is_impossible_col (str, optional): Name of the column containing boolean values + indicating if the question is impossible to answer. If not provided, + a "is_impossible" column is automatically created and populated with False. + Defaults to None. + """ + self.df = df.copy() + self.doc_text_col = doc_text_col + self.question_text_col = question_text_col + + if qa_id_col is None: + self.qa_id_col = "qa_id" + self.df[self.qa_id_col] = list(range(self.df.shape[0])) + else: + self.qa_id_col = qa_id_col + + if is_impossible_col is None: + self.is_impossible_col = "is_impossible" + self.df[self.is_impossible_col] = False + else: + self.is_impossible_col = is_impossible_col + + if answer_start_col is not None and answer_text_col is not None: + self.actual_answer_available = True + else: + self.actual_answer_available = False + self.answer_start_col = answer_start_col + self.answer_text_col = answer_text_col + + def __getitem__(self, idx): + current_item = self.df.iloc[idx,] + if self.actual_answer_available: + return QAInput( + doc_text=current_item[self.doc_text_col], + question_text=current_item[self.question_text_col], + qa_id=current_item[self.qa_id_col], + is_impossible=current_item[self.is_impossible_col], + answer_start=current_item[self.answer_start_col], + answer_text=current_item[self.answer_text_col], + ) + else: + return QAInput( + doc_text=current_item[self.doc_text_col], + question_text=current_item[self.question_text_col], + qa_id=current_item[self.qa_id_col], + is_impossible=current_item[self.is_impossible_col], + answer_start=-1, + answer_text="", + ) + + def __len__(self): + return self.df.shape[0] diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 3f7305362..cc26f0c14 100644 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -2,8 +2,9 @@ # Licensed under the MIT License. import numpy as np -import torch -from torch.utils.data import TensorDataset + +from torch.utils.data import DataLoader, SequentialSampler, RandomSampler +from torch.utils.data.distributed import DistributedSampler from transformers.modeling_bert import ( BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification, @@ -20,13 +21,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification, ) - -from utils_nlp.models.transformers.common import ( - MAX_SEQ_LEN, - TOKENIZER_CLASS, - Transformer, - get_device, -) +from utils_nlp.common.pytorch_utils import get_device +from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet +from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer MODEL_CLASS = {} MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -55,39 +52,57 @@ def get_inputs(batch, model_name, train_mode=True): else: raise ValueError("Model not supported: {}".format(model_name)) - def preprocess(self, text, labels=None, max_len=MAX_SEQ_LEN): - """preprocess data or batches""" + def text_transform(self, text, max_len=MAX_SEQ_LEN): + """preprocess text""" if max_len > MAX_SEQ_LEN: print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) max_len = MAX_SEQ_LEN - - tokens = [self.tokenizer.tokenize(x) for x in text] - # truncate and add CLS & SEP markers - tokens = [ - [self.tokenizer.cls_token] + x[0 : max_len - 2] + [self.tokenizer.sep_token] - for x in tokens - ] + tokens = ( + [self.tokenizer.cls_token] + + self.tokenizer.tokenize(text)[0 : max_len - 2] + + [self.tokenizer.sep_token] + ) # get input ids - input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens] + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) # pad sequence - input_ids = [x + [0] * (max_len - len(x)) for x in input_ids] + input_ids = input_ids + [0] * (max_len - len(input_ids)) # create input mask - input_mask = [[min(1, x) for x in y] for y in input_ids] - # create segment ids - # segment_ids = None - if labels is None: - td = TensorDataset( - torch.tensor(input_ids, dtype=torch.long), - torch.tensor(input_mask, dtype=torch.long), - ) + attention_mask = [min(1, x) for x in input_ids] + return input_ids, attention_mask + + def create_dataloader_from_df( + self, + df, + text_col, + label_col, + max_len=MAX_SEQ_LEN, + text2_col=None, + batch_size=32, + num_gpus=None, + shuffle=True, + distributed=False, + ): + if text2_col is None: + ds = SCDataSet(df, text_col, label_col, max_len=max_len, transform=self.text_transform) else: - td = TensorDataset( - torch.tensor(input_ids, dtype=torch.long), - torch.tensor(input_mask, dtype=torch.long), - torch.tensor(labels, dtype=torch.long), + ds = SPCDataSet( + df, text_col, text2_col, label_col, max_len=max_len, transform=self.text_transform ) - return td + if num_gpus is not None: + batch_size = batch_size * max(1, num_gpus) + if distributed: + sampler = DistributedSampler(dataset) + else: + sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds) + + return DataLoader(ds, sampler=sampler, batch_size=batch_size) + + # def get_eval_dataloader(dataset, batch_size, num_gpus): + # if num_gpus is not None: + # batch_size = batch_size * max(1, num_gpus) + # sampler = SequentialSampler(dataset) + # return DataLoader(dataset, sampler=sampler, batch_size=batch_size) class SequenceClassifier(Transformer): @@ -105,9 +120,8 @@ def list_supported_models(): def fit( self, - train_dataset, + train_dataloader, num_epochs=1, - batch_size=32, num_gpus=None, local_rank=-1, weight_decay=0.0, @@ -124,10 +138,9 @@ def fit( device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) self.model.to(device) super().fine_tune( - train_dataset=train_dataset, + train_dataloader=train_dataloader, get_inputs=Processor.get_inputs, device=device, - per_gpu_train_batch_size=batch_size, n_gpu=num_gpus, num_train_epochs=num_epochs, weight_decay=weight_decay, @@ -138,15 +151,13 @@ def fit( seed=seed, ) - def predict(self, eval_dataset, batch_size=16, num_gpus=1, verbose=True): + def predict(self, eval_dataloader, num_gpus=1, verbose=True): device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) preds = list( super().predict( - eval_dataset=eval_dataset, + eval_dataloader=eval_dataloader, get_inputs=Processor.get_inputs, device=device, - per_gpu_eval_batch_size=batch_size, - n_gpu=num_gpus, verbose=verbose, ) ) From 304551544e0d63c302d7846cf26e9255922e62b5 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Wed, 13 Nov 2019 06:02:06 +0000 Subject: [PATCH 8/8] update text classification notebook --- examples/text_classification/README.md | 3 +- .../tc_mnli_transformers.ipynb | 98 ++++++++++++------- 2 files changed, 62 insertions(+), 39 deletions(-) diff --git a/examples/text_classification/README.md b/examples/text_classification/README.md index 2283c73b3..f1b09cc29 100644 --- a/examples/text_classification/README.md +++ b/examples/text_classification/README.md @@ -18,10 +18,9 @@ The following summarizes each notebook for Text Classification. Each notebook pr |Notebook|Environment|Description|Dataset| |---|---|---|---| -|[BERT for text classification with MNLI](tc_mnli_bert.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on a subset of the MultiNLI dataset|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| |[BERT for text classification on AzureML](tc_bert_azureml.ipynb) |Azure ML|A notebook which walks through fine-tuning and evaluating pre-trained BERT model on a distributed setup with AzureML. |[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| |[XLNet for text classification with MNLI](tc_mnli_xlnet.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained XLNet model on a subset of the MultiNLI dataset|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| |[BERT for text classification of Hindi BBC News](tc_bbc_bert_hi.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on Hindi BBC news data|[BBC Hindi News](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1)| |[BERT for text classification of Arabic News](tc_dac_bert_ar.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on Arabic news articles|[DAC](https://data.mendeley.com/datasets/v524p5dhpj/2)| -|[Text Classification of MultiNLI Sentences using Different Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| +|[Text Classification of MultiNLI Sentences using Multiple Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| |[Text Classification Pipelines with Azure Machine Learning](tc_transformers_azureml_pipelines/tc_transformers_azureml_pipelines.ipynb)|Azure ML| A notebook which walks through building Azure ML pipelines for fine-tuning multiple transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)| diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb index bb6bcbffe..952f2bafa 100644 --- a/examples/text_classification/tc_mnli_transformers.ipynb +++ b/examples/text_classification/tc_mnli_transformers.ipynb @@ -8,7 +8,7 @@ "\n", "*Licensed under the MIT License.*\n", "\n", - "# Text Classification of MultiNLI Sentences using Different Transformer Models" + "# Text Classification of MultiNLI Sentences using Multiple Transformer Models" ] }, { @@ -93,7 +93,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 222k/222k [02:38<00:00, 1.40kKB/s] \n" + "100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n" ] } ], @@ -232,11 +232,11 @@ { "data": { "text/plain": [ - "slate 1055\n", - "fiction 1019\n", - "telephone 968\n", - "government 939\n", - "travel 928\n", + "telephone 1055\n", + "slate 1003\n", + "travel 961\n", + "fiction 952\n", + "government 938\n", "Name: genre, dtype: int64" ] }, @@ -257,10 +257,10 @@ "source": [ "# encode labels\n", "label_encoder = LabelEncoder()\n", - "labels_train = label_encoder.fit_transform(df_train[LABEL_COL])\n", - "labels_test = label_encoder.transform(df_test[LABEL_COL])\n", + "df_train[LABEL_COL] = label_encoder.fit_transform(df_train[LABEL_COL])\n", + "df_test[LABEL_COL] = label_encoder.transform(df_test[LABEL_COL])\n", "\n", - "num_labels = len(np.unique(labels_train))" + "num_labels = len(np.unique(df_train[LABEL_COL]))" ] }, { @@ -485,20 +485,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n", + "100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n", + "100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n", + "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n", + "100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n", + "100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n", + "100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n", + "100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n", + "100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n", + "100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n", + "100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n" + ] + } + ], "source": [ "results = {}\n", "\n", - "for model_name in tqdm(MODEL_NAMES):\n", - " \n", + "for model_name in tqdm(MODEL_NAMES, disable=True):\n", + "\n", " # preprocess\n", - " processor = Processor(model_name=model_name, cache_dir=CACHE_DIR)\n", - " ds_train = processor.preprocess(\n", - " df_train[TEXT_COL], labels_train, max_len=MAX_LEN\n", + " processor = Processor(\n", + " model_name=model_name,\n", + " to_lower=model_name.endswith(\"uncased\"),\n", + " cache_dir=CACHE_DIR,\n", + " )\n", + " train_dataloader = processor.create_dataloader_from_df(\n", + " df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n", + " )\n", + " test_dataloader = processor.create_dataloader_from_df(\n", + " df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n", " )\n", - " ds_test = processor.preprocess(df_test[TEXT_COL], None, max_len=MAX_LEN)\n", "\n", " # fine-tune\n", " classifier = SequenceClassifier(\n", @@ -506,9 +531,8 @@ " )\n", " with Timer() as t:\n", " classifier.fit(\n", - " ds_train,\n", + " train_dataloader,\n", " num_epochs=NUM_EPOCHS,\n", - " batch_size=BATCH_SIZE,\n", " num_gpus=NUM_GPUS,\n", " verbose=False,\n", " )\n", @@ -516,13 +540,13 @@ "\n", " # predict\n", " preds = classifier.predict(\n", - " ds_test, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, verbose=False\n", + " test_dataloader, num_gpus=NUM_GPUS, verbose=False\n", " )\n", "\n", " # eval\n", - " accuracy = accuracy_score(labels_test, preds)\n", + " accuracy = accuracy_score(df_test[LABEL_COL], preds)\n", " class_report = classification_report(\n", - " labels_test, preds, target_names=label_encoder.classes_, output_dict=True\n", + " df_test[LABEL_COL], preds, target_names=label_encoder.classes_, output_dict=True\n", " )\n", "\n", " # save results\n", @@ -576,21 +600,21 @@ " \n", " \n", " accuracy\n", - " 0.870416\n", - " 0.899144\n", - " 0.911369\n", + " 0.895477\n", + " 0.879584\n", + " 0.894866\n", " \n", " \n", " f1-score\n", - " 0.870305\n", - " 0.897614\n", - " 0.910810\n", + " 0.896656\n", + " 0.881218\n", + " 0.896108\n", " \n", " \n", " time(hrs)\n", - " 0.021828\n", - " 0.035325\n", - " 0.046363\n", + " 0.021865\n", + " 0.035351\n", + " 0.046295\n", " \n", " \n", "\n", @@ -598,9 +622,9 @@ ], "text/plain": [ " distilbert-base-uncased roberta-base xlnet-base-cased\n", - "accuracy 0.870416 0.899144 0.911369\n", - "f1-score 0.870305 0.897614 0.910810\n", - "time(hrs) 0.021828 0.035325 0.046363" + "accuracy 0.895477 0.879584 0.894866\n", + "f1-score 0.896656 0.881218 0.896108\n", + "time(hrs) 0.021865 0.035351 0.046295" ] }, "execution_count": 13, @@ -621,7 +645,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8936430317848411, + "data": 0.8899755501222494, "encoder": "json", "name": "accuracy", "version": 1 @@ -639,7 +663,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8929098953149991, + "data": 0.8913273009038569, "encoder": "json", "name": "f1", "version": 1