From e83bf05a4d115176f5f3bf7f17041fc208b76eac Mon Sep 17 00:00:00 2001
From: saidbleik <saidbleik@outlook.com>
Date: Wed, 23 Oct 2019 19:26:17 +0000
Subject: [PATCH 1/8] edits and updates

---
 .../text_classification/tc_mnli_bert.ipynb    |  819 ---
 .../tc_mnli_transformers.ipynb                | 6083 +----------------
 tests/conftest.py                             |    4 +-
 .../test_notebooks_text_classification.py     |   18 +-
 utils_nlp/dataset/squad.py                    |   10 +-
 utils_nlp/models/transformers/common.py       |   37 +-
 .../transformers/named_entity_recognition.py  |   99 +-
 .../models/transformers/question_answering.py |   14 +-
 .../transformers/sequence_classification.py   |   14 +-
 9 files changed, 328 insertions(+), 6770 deletions(-)
 delete mode 100644 examples/text_classification/tc_mnli_bert.ipynb

diff --git a/examples/text_classification/tc_mnli_bert.ipynb b/examples/text_classification/tc_mnli_bert.ipynb
deleted file mode 100644
index 7712416a4..000000000
--- a/examples/text_classification/tc_mnli_bert.ipynb
+++ /dev/null
@@ -1,819 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "*Copyright (c) Microsoft Corporation. All rights reserved.*\n",
-    "\n",
-    "*Licensed under the MIT License.*\n",
-    "\n",
-    "# Text Classification of MultiNLI Sentences using BERT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Before You Start\n",
-    "\n",
-    "> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
-    "\n",
-    "The table below provides some reference running time on different machine configurations.  \n",
-    "\n",
-    "|QUICK_RUN|Machine Configurations|Running time|\n",
-    "|:---------|:----------------------|:------------|\n",
-    "|True|4 **CPU**s, 14GB memory| ~ 15 minutes|\n",
-    "|False|4 **CPU**s, 14GB memory| ~19.5 hours|\n",
-    "|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 3 minutes |\n",
-    "|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 1.5 hours|\n",
-    "\n",
-    "If you run into CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_LEN`, but note that model performance will be compromised. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
-    "QUICK_RUN = False"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.append(\"../../\")\n",
-    "import os\n",
-    "import json\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import scrapbook as sb\n",
-    "from sklearn.metrics import classification_report, accuracy_score\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "\n",
-    "from utils_nlp.dataset.multinli import load_pandas_df\n",    
-    "from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
-    "from utils_nlp.models.bert.common import Language, Tokenizer\n",
-    "from utils_nlp.common.timer import Timer"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Introduction\n",
-    "In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.\n",
-    "\n",
-    "We use a [sequence classifier](../../utils_nlp/models/bert/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "tags": [
-     "parameters"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "TRAIN_DATA_FRACTION = 1\n",
-    "TEST_DATA_FRACTION = 1\n",
-    "NUM_EPOCHS = 1\n",
-    "\n",
-    "if QUICK_RUN:\n",
-    "    TRAIN_DATA_FRACTION = 0.01\n",
-    "    TEST_DATA_FRACTION = 0.01\n",
-    "    NUM_EPOCHS = 1\n",
-    "\n",
-    "if torch.cuda.is_available():\n",
-    "    BATCH_SIZE = 32\n",
-    "else:\n",
-    "    BATCH_SIZE = 8\n",
-    "\n",
-    "DATA_FOLDER = \"./temp\"\n",
-    "BERT_CACHE_DIR = \"./temp\"\n",
-    "LANGUAGE = Language.ENGLISH\n",
-    "TO_LOWER = True\n",
-    "MAX_LEN = 150\n",
-    "BATCH_SIZE_PRED = 512\n",
-    "TRAIN_SIZE = 0.6\n",
-    "LABEL_COL = \"genre\"\n",
-    "TEXT_COL = \"sentence1\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Read Dataset\n",
-    "We start by loading a subset of the data. The following function also downloads and extracts the files, if they don't exist in the data folder.\n",
-    "\n",
-    "The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.\n",
-    "\n",
-    "For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = load_pandas_df(DATA_FOLDER, \"train\")\n",
-    "df = df[df[\"gold_label\"]==\"neutral\"]  # get unique sentences"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>genre</th>\n",
-       "      <th>sentence1</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>government</td>\n",
-       "      <td>Conceptually cream skimming has two basic dime...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>telephone</td>\n",
-       "      <td>yeah i tell you what though if you go price so...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>travel</td>\n",
-       "      <td>But a few Christian mosaics survive above the ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>slate</td>\n",
-       "      <td>It's not that the questions they asked weren't...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>travel</td>\n",
-       "      <td>Thebes held onto power until the 12th Dynasty,...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         genre                                          sentence1\n",
-       "0   government  Conceptually cream skimming has two basic dime...\n",
-       "4    telephone  yeah i tell you what though if you go price so...\n",
-       "6       travel  But a few Christian mosaics survive above the ...\n",
-       "12       slate  It's not that the questions they asked weren't...\n",
-       "13      travel  Thebes held onto power until the 12th Dynasty,..."
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[[LABEL_COL, TEXT_COL]].head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The examples in the dataset are grouped into 5 genres:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "telephone     27783\n",
-       "government    25784\n",
-       "travel        25783\n",
-       "fiction       25782\n",
-       "slate         25768\n",
-       "Name: genre, dtype: int64"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[LABEL_COL].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We split the data for training and testing, and encode the class labels:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
-      "  FutureWarning)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# split\n",
-    "df_train, df_test = train_test_split(df, train_size = TRAIN_SIZE, random_state=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_train = df_train.sample(frac=TRAIN_DATA_FRACTION).reset_index(drop=True)\n",
-    "df_test = df_test.sample(frac=TEST_DATA_FRACTION).reset_index(drop=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# encode labels\n",
-    "label_encoder = LabelEncoder()\n",
-    "labels_train = label_encoder.fit_transform(df_train[LABEL_COL])\n",
-    "labels_test = label_encoder.transform(df_test[LABEL_COL])\n",
-    "\n",
-    "num_labels = len(np.unique(labels_train))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of unique labels: 5\n",
-      "Number of training examples: 78540\n",
-      "Number of testing examples: 52360\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Number of unique labels: {}\".format(num_labels))\n",
-    "print(\"Number of training examples: {}\".format(df_train.shape[0]))\n",
-    "print(\"Number of testing examples: {}\".format(df_test.shape[0]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Tokenize and Preprocess"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 78540/78540 [00:27<00:00, 2841.38it/s]\n",
-      "100%|██████████| 52360/52360 [00:18<00:00, 2834.92it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)\n",
-    "\n",
-    "tokens_train = tokenizer.tokenize(list(df_train[TEXT_COL]))\n",
-    "tokens_test = tokenizer.tokenize(list(df_test[TEXT_COL]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In addition, we perform the following preprocessing steps in the cell below:\n",
-    "- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
-    "- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
-    "- Pad or truncate the token lists to the specified max length\n",
-    "- Return mask lists that indicate paddings' positions\n",
-    "- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence classification)\n",
-    "\n",
-    "*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(\n",
-    "    tokens_train, MAX_LEN\n",
-    ")\n",
-    "tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(\n",
-    "    tokens_test, MAX_LEN\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Create Model\n",
-    "Next, we create a sequence classifier that loads a pre-trained BERT model, given the language and number of labels."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "classifier = BERTSequenceClassifier(\n",
-    "    language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Train\n",
-    "We train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "t_total value of -1 results in schedule not being applied\n",
-      "Iteration:   0%|          | 0/2455 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Warning: Only 1 CUDA device is available. Data parallelism is not possible.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\r",
-      "Iteration:   0%|          | 1/2455 [00:01<1:21:44,  2.00s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:1->246/2455; average training loss:1.653734\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  10%|█         | 247/2455 [07:39<1:09:04,  1.88s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:247->492/2455; average training loss:0.376494\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  20%|██        | 493/2455 [15:23<1:01:48,  1.89s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:493->738/2455; average training loss:0.314981\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  30%|███       | 739/2455 [23:06<53:42,  1.88s/it]  "
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:739->984/2455; average training loss:0.286209\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  40%|████      | 985/2455 [30:50<46:17,  1.89s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:985->1230/2455; average training loss:0.265873\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  50%|█████     | 1231/2455 [38:33<38:29,  1.89s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:1231->1476/2455; average training loss:0.252521\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  60%|██████    | 1477/2455 [46:16<30:38,  1.88s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:1477->1722/2455; average training loss:0.243316\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  70%|███████   | 1723/2455 [54:00<23:04,  1.89s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:1723->1968/2455; average training loss:0.235114\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  80%|████████  | 1969/2455 [1:01:44<15:14,  1.88s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:1969->2214/2455; average training loss:0.229056\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration:  90%|█████████ | 2215/2455 [1:09:26<07:30,  1.88s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch:1/1; batch:2215->2455/2455; average training loss:0.223192\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration: 100%|██████████| 2455/2455 [1:16:56<00:00,  1.57s/it]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[Training time: 1.283 hrs]\n"
-     ]
-    }
-   ],
-   "source": [
-    "with Timer() as t:\n",
-    "    classifier.fit(\n",
-    "        token_ids=tokens_train,\n",
-    "        input_mask=mask_train,\n",
-    "        labels=labels_train,    \n",
-    "        num_epochs=NUM_EPOCHS,\n",
-    "        batch_size=BATCH_SIZE,    \n",
-    "        verbose=True,\n",
-    "    )    \n",
-    "print(\"[Training time: {:.3f} hrs]\".format(t.interval / 3600))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Score\n",
-    "We score the test set using the trained classifier:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Warning: Only 1 CUDA device is available. Data parallelism is not possible.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration: 100%|██████████| 103/103 [18:00<00:00,  8.24s/it]\n"
-     ]
-    }
-   ],
-   "source": [
-    "preds = classifier.predict(token_ids=tokens_test, \n",
-    "                           input_mask=mask_test, \n",
-    "                           batch_size=BATCH_SIZE_PRED)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Evaluate Results\n",
-    "Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "accuracy: 0.9421504965622612\n",
-      "{\n",
-      "    \"fiction\": {\n",
-      "        \"f1-score\": 0.924482109227872,\n",
-      "        \"precision\": 0.8953944368445053,\n",
-      "        \"recall\": 0.9555231143552312,\n",
-      "        \"support\": 10275\n",
-      "    },\n",
-      "    \"government\": {\n",
-      "        \"f1-score\": 0.948873653281097,\n",
-      "        \"precision\": 0.9565560821484992,\n",
-      "        \"recall\": 0.9413136416634279,\n",
-      "        \"support\": 10292\n",
-      "    },\n",
-      "    \"macro avg\": {\n",
-      "        \"f1-score\": 0.9408187527049234,\n",
-      "        \"precision\": 0.9413336757882582,\n",
-      "        \"recall\": 0.9411302847360989,\n",
-      "        \"support\": 52360\n",
-      "    },\n",
-      "    \"micro avg\": {\n",
-      "        \"f1-score\": 0.9421504965622612,\n",
-      "        \"precision\": 0.9421504965622612,\n",
-      "        \"recall\": 0.9421504965622612,\n",
-      "        \"support\": 52360\n",
-      "    },\n",
-      "    \"slate\": {\n",
-      "        \"f1-score\": 0.8725352112676057,\n",
-      "        \"precision\": 0.9031552639800062,\n",
-      "        \"recall\": 0.8439233239272161,\n",
-      "        \"support\": 10277\n",
-      "    },\n",
-      "    \"telephone\": {\n",
-      "        \"f1-score\": 0.9935128410201723,\n",
-      "        \"precision\": 0.9892929829218653,\n",
-      "        \"recall\": 0.99776885319054,\n",
-      "        \"support\": 11205\n",
-      "    },\n",
-      "    \"travel\": {\n",
-      "        \"f1-score\": 0.9646899487278707,\n",
-      "        \"precision\": 0.9622696130464151,\n",
-      "        \"recall\": 0.9671224905440792,\n",
-      "        \"support\": 10311\n",
-      "    },\n",
-      "    \"weighted avg\": {\n",
-      "        \"f1-score\": 0.9417711062461178,\n",
-      "        \"precision\": 0.942203390713011,\n",
-      "        \"recall\": 0.9421504965622612,\n",
-      "        \"support\": 52360\n",
-      "    }\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "report = classification_report(labels_test, preds, target_names=label_encoder.classes_, output_dict=True) \n",
-    "accuracy = accuracy_score(labels_test, preds )\n",
-    "print(\"accuracy: {}\".format(accuracy))\n",
-    "print(json.dumps(report, indent=4, sort_keys=True))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/scrapbook.scrap.json+json": {
-       "data": 0.9421504965622612,
-       "encoder": "json",
-       "name": "accuracy",
-       "version": 1
-      }
-     },
-     "metadata": {
-      "scrapbook": {
-       "data": true,
-       "display": false,
-       "name": "accuracy"
-      }
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/scrapbook.scrap.json+json": {
-       "data": 0.9413336757882582,
-       "encoder": "json",
-       "name": "precision",
-       "version": 1
-      }
-     },
-     "metadata": {
-      "scrapbook": {
-       "data": true,
-       "display": false,
-       "name": "precision"
-      }
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/scrapbook.scrap.json+json": {
-       "data": 0.9411302847360989,
-       "encoder": "json",
-       "name": "recall",
-       "version": 1
-      }
-     },
-     "metadata": {
-      "scrapbook": {
-       "data": true,
-       "display": false,
-       "name": "recall"
-      }
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/scrapbook.scrap.json+json": {
-       "data": 0.9408187527049234,
-       "encoder": "json",
-       "name": "f1",
-       "version": 1
-      }
-     },
-     "metadata": {
-      "scrapbook": {
-       "data": true,
-       "display": false,
-       "name": "f1"
-      }
-     },
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# for testing\n",
-    "sb.glue(\"accuracy\", accuracy)\n",
-    "sb.glue(\"precision\", report[\"macro avg\"][\"precision\"])\n",
-    "sb.glue(\"recall\", report[\"macro avg\"][\"recall\"])\n",
-    "sb.glue(\"f1\", report[\"macro avg\"][\"f1-score\"])\n"
-   ]
-  }
- ],
- "metadata": {
-  "celltoolbar": "Tags",
-  "kernelspec": {
-   "display_name": "nlp_gpu",
-   "language": "python",
-   "name": "nlp_gpu"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb
index 00ddedf87..bb6bcbffe 100644
--- a/examples/text_classification/tc_mnli_transformers.ipynb
+++ b/examples/text_classification/tc_mnli_transformers.ipynb
@@ -13,28 +13,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import sys\n",
-    "import os\n",
     "import json\n",
-    "import pandas as pd\n",
+    "import os\n",
+    "import sys\n",
+    "from tempfile import TemporaryDirectory\n",
+    "\n",
     "import numpy as np\n",
+    "import pandas as pd\n",
     "import scrapbook as sb\n",
-    "from sklearn.metrics import classification_report, accuracy_score\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "from sklearn.model_selection import train_test_split\n",
     "import torch\n",
     "import torch.nn as nn\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
     "from tqdm import tqdm\n",
+    "from utils_nlp.common.timer import Timer\n",
     "from utils_nlp.dataset.multinli import load_pandas_df\n",
     "from utils_nlp.models.transformers.sequence_classification import (\n",
-    "    SequenceClassifier,\n",
-    "    Processor,\n",
-    ")\n",
-    "from utils_nlp.common.timer import Timer"
+    "    Processor, SequenceClassifier)"
    ]
   },
   {
@@ -49,7 +49,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 2,
    "metadata": {
     "tags": [
      "parameters"
@@ -58,18 +58,18 @@
    "outputs": [],
    "source": [
     "# notebook parameters\n",
-    "DATA_FOLDER = \"./temp\"\n",
-    "CACHE_DIR = \"./temp\"\n",
-    "DEVICE = \"cuda\"\n",
+    "DATA_FOLDER = TemporaryDirectory().name\n",
+    "CACHE_DIR = TemporaryDirectory().name\n",
     "NUM_EPOCHS = 1\n",
     "BATCH_SIZE = 16\n",
     "NUM_GPUS = 2\n",
-    "MAX_LEN = 150\n",
-    "TRAIN_DATA_FRACTION = 0.15\n",
-    "TEST_DATA_FRACTION = 0.15\n",
+    "MAX_LEN = 100\n",
+    "TRAIN_DATA_FRACTION = 0.05\n",
+    "TEST_DATA_FRACTION = 0.05\n",
     "TRAIN_SIZE = 0.75\n",
     "LABEL_COL = \"genre\"\n",
-    "TEXT_COL = \"sentence1\""
+    "TEXT_COL = \"sentence1\"\n",
+    "MODEL_NAMES = [\"distilbert-base-uncased\", \"roberta-base\", \"xlnet-base-cased\"]"
    ]
   },
   {
@@ -86,9 +86,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 222k/222k [02:38<00:00, 1.40kKB/s] \n"
+     ]
+    }
+   ],
    "source": [
     "df = load_pandas_df(DATA_FOLDER, \"train\")\n",
     "df = df[df[\"gold_label\"]==\"neutral\"]  # get unique sentences"
@@ -96,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -163,7 +171,7 @@
        "13      travel  Thebes held onto power until the 12th Dynasty,..."
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -181,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -200,7 +208,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -218,21 +226,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "telephone     3146\n",
-       "fiction       2960\n",
-       "slate         2901\n",
-       "government    2893\n",
-       "travel        2826\n",
+       "slate         1055\n",
+       "fiction       1019\n",
+       "telephone      968\n",
+       "government     939\n",
+       "travel         928\n",
        "Name: genre, dtype: int64"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -243,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -257,7 +265,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -265,8 +273,8 @@
      "output_type": "stream",
      "text": [
       "Number of unique labels: 5\n",
-      "Number of training examples: 14726\n",
-      "Number of testing examples: 4909\n"
+      "Number of training examples: 4909\n",
+      "Number of testing examples: 1636\n"
      ]
     }
    ],
@@ -287,7 +295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -369,30 +377,38 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
-       "      <td>roberta-base</td>\n",
+       "      <td>bert-base-german-dbmdz-cased</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
-       "      <td>roberta-large</td>\n",
+       "      <td>bert-base-german-dbmdz-uncased</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
-       "      <td>roberta-large-mnli</td>\n",
+       "      <td>roberta-base</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
-       "      <td>xlnet-base-cased</td>\n",
+       "      <td>roberta-large</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>17</th>\n",
-       "      <td>xlnet-large-cased</td>\n",
+       "      <td>roberta-large-mnli</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>18</th>\n",
-       "      <td>distilbert-base-uncased</td>\n",
+       "      <td>xlnet-base-cased</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>19</th>\n",
+       "      <td>xlnet-large-cased</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>distilbert-base-uncased</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
        "      <td>distilbert-base-uncased-distilled-squad</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -414,16 +430,18 @@
        "10  bert-large-uncased-whole-word-masking-finetune...\n",
        "11  bert-large-cased-whole-word-masking-finetuned-...\n",
        "12                     bert-base-cased-finetuned-mrpc\n",
-       "13                                       roberta-base\n",
-       "14                                      roberta-large\n",
-       "15                                 roberta-large-mnli\n",
-       "16                                   xlnet-base-cased\n",
-       "17                                  xlnet-large-cased\n",
-       "18                            distilbert-base-uncased\n",
-       "19            distilbert-base-uncased-distilled-squad"
+       "13                       bert-base-german-dbmdz-cased\n",
+       "14                     bert-base-german-dbmdz-uncased\n",
+       "15                                       roberta-base\n",
+       "16                                      roberta-large\n",
+       "17                                 roberta-large-mnli\n",
+       "18                                   xlnet-base-cased\n",
+       "19                                  xlnet-large-cased\n",
+       "20                            distilbert-base-uncased\n",
+       "21            distilbert-base-uncased-distilled-squad"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -443,11 +461,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 11,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['distilbert-base-uncased', 'roberta-base', 'xlnet-base-cased']\n"
+     ]
+    }
+   ],
    "source": [
-    "model_names = [\"distilbert-base-uncased\", \"roberta-base\", \"xlnet-base-cased\"]"
+    "print(MODEL_NAMES)"
    ]
   },
   {
@@ -459,5803 +485,180 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {}\n",
+    "\n",
+    "for model_name in tqdm(MODEL_NAMES):\n",
+    "    \n",
+    "    # preprocess\n",
+    "    processor = Processor(model_name=model_name, cache_dir=CACHE_DIR)\n",
+    "    ds_train = processor.preprocess(\n",
+    "        df_train[TEXT_COL], labels_train, max_len=MAX_LEN\n",
+    "    )\n",
+    "    ds_test = processor.preprocess(df_test[TEXT_COL], None, max_len=MAX_LEN)\n",
+    "\n",
+    "    # fine-tune\n",
+    "    classifier = SequenceClassifier(\n",
+    "        model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR\n",
+    "    )\n",
+    "    with Timer() as t:\n",
+    "        classifier.fit(\n",
+    "            ds_train,\n",
+    "            num_epochs=NUM_EPOCHS,\n",
+    "            batch_size=BATCH_SIZE,\n",
+    "            num_gpus=NUM_GPUS,\n",
+    "            verbose=False,\n",
+    "        )\n",
+    "    train_time = t.interval / 3600\n",
+    "\n",
+    "    # predict\n",
+    "    preds = classifier.predict(\n",
+    "        ds_test, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, verbose=False\n",
+    "    )\n",
+    "\n",
+    "    # eval\n",
+    "    accuracy = accuracy_score(labels_test, preds)\n",
+    "    class_report = classification_report(\n",
+    "        labels_test, preds, target_names=label_encoder.classes_, output_dict=True\n",
+    "    )\n",
+    "\n",
+    "    # save results\n",
+    "    results[model_name] = {\n",
+    "        \"accuracy\": accuracy,\n",
+    "        \"f1-score\": class_report[\"macro avg\"][\"f1-score\"],\n",
+    "        \"time(hrs)\": train_time,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluate\n",
+    "\n",
+    "Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "  0%|          | 0/3 [00:00<?, ?it/s]\u001b[AI1002 17:19:07.367456 140305852307264 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ./temp/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n",
-      "I1002 17:19:16.283660 140305852307264 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at ./temp/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.1ccd1a11c9ff276830e114ea477ea2407100f4a3be7bdc45d37be9e37fa71c7e\n",
-      "I1002 17:19:16.285341 140305852307264 configuration_utils.py:168] Model config {\n",
-      "  \"activation\": \"gelu\",\n",
-      "  \"attention_dropout\": 0.1,\n",
-      "  \"dim\": 768,\n",
-      "  \"dropout\": 0.1,\n",
-      "  \"finetuning_task\": null,\n",
-      "  \"hidden_dim\": 3072,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"max_position_embeddings\": 512,\n",
-      "  \"n_heads\": 12,\n",
-      "  \"n_layers\": 6,\n",
-      "  \"num_labels\": 5,\n",
-      "  \"output_attentions\": false,\n",
-      "  \"output_hidden_states\": false,\n",
-      "  \"pruned_heads\": {},\n",
-      "  \"qa_dropout\": 0.1,\n",
-      "  \"seq_classif_dropout\": 0.2,\n",
-      "  \"sinusoidal_pos_embds\": false,\n",
-      "  \"tie_weights_\": true,\n",
-      "  \"torchscript\": false,\n",
-      "  \"use_bfloat16\": false,\n",
-      "  \"vocab_size\": 30522\n",
-      "}\n",
-      "\n",
-      "I1002 17:19:16.311881 140305852307264 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin from cache at ./temp/7b8a8f0b21c4e7f6962451c9370a5d9af90372a5f64637a251f2de154d0fc72c.c2015533705b9dff680ae707e205a35e2860e8d148b45d35085419d74fe57ac5\n",
-      "I1002 17:19:17.992355 140305852307264 modeling_utils.py:405] Weights of DistilBertForSequenceClassification not initialized from pretrained model: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
-      "I1002 17:19:17.993372 140305852307264 modeling_utils.py:408] Weights from pretrained model not used in DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
-      "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
-      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
-      "                                     \n",
-      "  0%|          | 0/3 [01:09<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:23<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.613472\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:14<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:28<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.296651\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:18<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:33<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.851296\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:23<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:37<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.720305\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:28<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:42<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.880179\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:33<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:47<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.748330\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:37<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:51<?, ?it/s]\u001b[A"
-     ]
-    },
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>distilbert-base-uncased</th>\n",
+       "      <th>roberta-base</th>\n",
+       "      <th>xlnet-base-cased</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>accuracy</th>\n",
+       "      <td>0.870416</td>\n",
+       "      <td>0.899144</td>\n",
+       "      <td>0.911369</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>f1-score</th>\n",
+       "      <td>0.870305</td>\n",
+       "      <td>0.897614</td>\n",
+       "      <td>0.910810</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>time(hrs)</th>\n",
+       "      <td>0.021828</td>\n",
+       "      <td>0.035325</td>\n",
+       "      <td>0.046363</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           distilbert-base-uncased  roberta-base  xlnet-base-cased\n",
+       "accuracy                  0.870416      0.899144          0.911369\n",
+       "f1-score                  0.870305      0.897614          0.910810\n",
+       "time(hrs)                 0.021828      0.035325          0.046363"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_results = pd.DataFrame(results)\n",
+    "df_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.402193\n"
-     ]
+     "data": {
+      "application/scrapbook.scrap.json+json": {
+       "data": 0.8936430317848411,
+       "encoder": "json",
+       "name": "accuracy",
+       "version": 1
+      }
+     },
+     "metadata": {
+      "scrapbook": {
+       "data": true,
+       "display": false,
+       "name": "accuracy"
+      }
+     },
+     "output_type": "display_data"
     },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:42<?, ?it/s]\n",
-      "  0%|          | 0/3 [00:56<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.389777\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:47<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:01<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.482890\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:52<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:06<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.904992\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [01:56<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:11<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.446059\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:01<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:15<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.387229\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:06<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:20<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.532320\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:11<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:25<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.083408\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:15<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:29<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.549527\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:20<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:34<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.460988\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:25<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:39<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.446642\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:30<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:44<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.402221\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:34<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:49<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.483969\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:39<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:53<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.156701\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:44<?, ?it/s]\n",
-      "  0%|          | 0/3 [01:58<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.200881\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:49<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:03<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.451065\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:53<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:08<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.536547\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [02:58<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:12<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.345483\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:03<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:17<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.219984\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:07<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:22<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.216656\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:12<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:27<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.457885\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:17<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:31<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.232123\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:22<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:36<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.282065\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:27<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:41<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.602874\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:31<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:46<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.287213\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:36<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:50<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.324319\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:41<?, ?it/s]\n",
-      "  0%|          | 0/3 [02:55<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.110210\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:45<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:00<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.311971\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:50<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:04<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.109896\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [03:55<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:09<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.062447\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:00<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:14<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.260447\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:04<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:19<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.404324\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:09<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:23<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.487277\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:14<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:28<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.618984\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:19<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:33<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.208204\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:23<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:38<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.383544\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:28<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:42<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.490614\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:32<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:46<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.175832\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:36<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:50<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.306752\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:40<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:54<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.318104\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:44<?, ?it/s]\n",
-      "  0%|          | 0/3 [03:58<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.534412\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:47<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:02<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.203843\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:51<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:06<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.061628\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:55<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:09<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.350042\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [04:59<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:13<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.429678\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:03<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:17<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.124946\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:07<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:21<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.420080\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:10<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:25<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.088511\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:14<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:28<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.703935\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:18<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:32<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.348099\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:22<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:36<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.640956\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:26<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:40<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.437338\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:29<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:44<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.216858\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:33<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:47<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.246830\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:37<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:51<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.060938\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:41<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:55<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.276095\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:45<?, ?it/s]\n",
-      "  0%|          | 0/3 [04:59<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.278681\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:48<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:03<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.311547\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:52<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:07<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.332097\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [05:56<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:10<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.397385\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:00<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:14<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.307721\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:04<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:18<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.329896\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:08<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:22<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.607863\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:11<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:26<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.746738\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:15<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:29<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.063720\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:19<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:33<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.252187\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:23<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:37<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.297432\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:27<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:41<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.331586\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:30<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:45<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.202993\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:34<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:48<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.500522\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:38<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:52<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.276734\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:42<?, ?it/s]\n",
-      "  0%|          | 0/3 [05:56<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.314735\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:46<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:00<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.197529\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:49<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:04<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.347036\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:53<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:07<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.189967\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [06:57<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:11<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.419700\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:01<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:15<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.278979\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:05<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:19<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.278433\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:08<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:23<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.190359\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:12<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:26<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.598480\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:16<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:30<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.140968\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:20<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:34<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.255686\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:23<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:38<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.373986\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:27<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:41<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.196667\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:31<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:45<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.074657\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:35<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:49<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.316425\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [07:39<?, ?it/s]\n",
-      "  0%|          | 0/3 [06:53<?, ?it/s]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.059075\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Evaluating:   0%|          | 0/154 [00:00<?, ?it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|          | 1/154 [00:00<00:42,  3.56it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|▏         | 2/154 [00:00<00:39,  3.90it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   2%|▏         | 3/154 [00:00<00:36,  4.15it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 4/154 [00:00<00:34,  4.35it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 5/154 [00:01<00:33,  4.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   4%|▍         | 6/154 [00:01<00:32,  4.60it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▍         | 7/154 [00:01<00:31,  4.67it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▌         | 8/154 [00:01<00:30,  4.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▌         | 9/154 [00:01<00:30,  4.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▋         | 10/154 [00:02<00:30,  4.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   7%|▋         | 11/154 [00:02<00:29,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 12/154 [00:02<00:29,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 13/154 [00:02<00:29,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   9%|▉         | 14/154 [00:02<00:28,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|▉         | 15/154 [00:03<00:28,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|█         | 16/154 [00:03<00:28,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  11%|█         | 17/154 [00:03<00:28,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 18/154 [00:03<00:28,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 19/154 [00:03<00:27,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  13%|█▎        | 20/154 [00:04<00:27,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▎        | 21/154 [00:04<00:27,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▍        | 22/154 [00:04<00:27,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  15%|█▍        | 23/154 [00:04<00:27,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 24/154 [00:05<00:26,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 25/154 [00:05<00:26,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  17%|█▋        | 26/154 [00:05<00:26,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 27/154 [00:05<00:26,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 28/154 [00:05<00:26,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 29/154 [00:06<00:25,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 30/154 [00:06<00:25,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  20%|██        | 31/154 [00:06<00:25,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██        | 32/154 [00:06<00:25,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██▏       | 33/154 [00:06<00:24,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  22%|██▏       | 34/154 [00:07<00:24,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 35/154 [00:07<00:24,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 36/154 [00:07<00:24,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  24%|██▍       | 37/154 [00:07<00:24,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▍       | 38/154 [00:07<00:23,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▌       | 39/154 [00:08<00:23,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  26%|██▌       | 40/154 [00:08<00:23,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 41/154 [00:08<00:23,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 42/154 [00:08<00:23,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  28%|██▊       | 43/154 [00:08<00:22,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▊       | 44/154 [00:09<00:22,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▉       | 45/154 [00:09<00:22,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  30%|██▉       | 46/154 [00:09<00:22,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 47/154 [00:09<00:22,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 48/154 [00:09<00:21,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 49/154 [00:10<00:21,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 50/154 [00:10<00:21,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  33%|███▎      | 51/154 [00:10<00:21,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 52/154 [00:10<00:20,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 53/154 [00:10<00:20,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  35%|███▌      | 54/154 [00:11<00:20,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▌      | 55/154 [00:11<00:20,  4.88it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▋      | 56/154 [00:11<00:20,  4.88it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  37%|███▋      | 57/154 [00:11<00:19,  4.88it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 58/154 [00:12<00:19,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 59/154 [00:12<00:19,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  39%|███▉      | 60/154 [00:12<00:19,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|███▉      | 61/154 [00:12<00:19,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|████      | 62/154 [00:12<00:19,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  41%|████      | 63/154 [00:13<00:18,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 64/154 [00:13<00:18,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 65/154 [00:13<00:18,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  43%|████▎     | 66/154 [00:13<00:18,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▎     | 67/154 [00:13<00:17,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▍     | 68/154 [00:14<00:17,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▍     | 69/154 [00:14<00:17,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▌     | 70/154 [00:14<00:17,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  46%|████▌     | 71/154 [00:14<00:17,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 72/154 [00:14<00:16,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 73/154 [00:15<00:16,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  48%|████▊     | 74/154 [00:15<00:16,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▊     | 75/154 [00:15<00:16,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▉     | 76/154 [00:15<00:16,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  50%|█████     | 77/154 [00:15<00:15,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████     | 78/154 [00:16<00:15,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████▏    | 79/154 [00:16<00:15,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  52%|█████▏    | 80/154 [00:16<00:15,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 81/154 [00:16<00:14,  4.88it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 82/154 [00:16<00:14,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  54%|█████▍    | 83/154 [00:17<00:14,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▍    | 84/154 [00:17<00:14,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▌    | 85/154 [00:17<00:14,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▌    | 86/154 [00:17<00:14,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▋    | 87/154 [00:17<00:13,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  57%|█████▋    | 88/154 [00:18<00:13,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 89/154 [00:18<00:13,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 90/154 [00:18<00:13,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  59%|█████▉    | 91/154 [00:18<00:13,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|█████▉    | 92/154 [00:19<00:12,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|██████    | 93/154 [00:19<00:12,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  61%|██████    | 94/154 [00:19<00:12,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 95/154 [00:19<00:12,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 96/154 [00:19<00:11,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  63%|██████▎   | 97/154 [00:20<00:11,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▎   | 98/154 [00:20<00:11,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▍   | 99/154 [00:20<00:11,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  65%|██████▍   | 100/154 [00:20<00:11,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 101/154 [00:20<00:11,  4.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 102/154 [00:21<00:10,  4.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  67%|██████▋   | 103/154 [00:21<00:10,  4.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 104/154 [00:21<00:10,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 105/154 [00:21<00:10,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 106/154 [00:21<00:09,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 107/154 [00:22<00:09,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  70%|███████   | 108/154 [00:22<00:09,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████   | 109/154 [00:22<00:09,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████▏  | 110/154 [00:22<00:09,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  72%|███████▏  | 111/154 [00:22<00:08,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 112/154 [00:23<00:08,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 113/154 [00:23<00:08,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  74%|███████▍  | 114/154 [00:23<00:08,  4.88it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▍  | 115/154 [00:23<00:08,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▌  | 116/154 [00:23<00:07,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  76%|███████▌  | 117/154 [00:24<00:07,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 118/154 [00:24<00:07,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 119/154 [00:24<00:07,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  78%|███████▊  | 120/154 [00:24<00:06,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▊  | 121/154 [00:25<00:06,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▉  | 122/154 [00:25<00:06,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  80%|███████▉  | 123/154 [00:25<00:06,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 124/154 [00:25<00:06,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 125/154 [00:25<00:06,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 126/154 [00:26<00:05,  4.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 127/154 [00:26<00:05,  4.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  83%|████████▎ | 128/154 [00:26<00:05,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 129/154 [00:26<00:05,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 130/154 [00:26<00:04,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  85%|████████▌ | 131/154 [00:27<00:04,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▌ | 132/154 [00:27<00:04,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▋ | 133/154 [00:27<00:04,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  87%|████████▋ | 134/154 [00:27<00:04,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 135/154 [00:27<00:03,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 136/154 [00:28<00:03,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  89%|████████▉ | 137/154 [00:28<00:03,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|████████▉ | 138/154 [00:28<00:03,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|█████████ | 139/154 [00:28<00:03,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  91%|█████████ | 140/154 [00:28<00:02,  4.87it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 141/154 [00:29<00:02,  4.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 142/154 [00:29<00:02,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  93%|█████████▎| 143/154 [00:29<00:02,  4.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▎| 144/154 [00:29<00:02,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▍| 145/154 [00:29<00:01,  4.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▍| 146/154 [00:30<00:01,  4.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▌| 147/154 [00:30<00:01,  4.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  96%|█████████▌| 148/154 [00:30<00:01,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 149/154 [00:30<00:01,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 150/154 [00:31<00:00,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  98%|█████████▊| 151/154 [00:31<00:00,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▊| 152/154 [00:31<00:00,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▉| 153/154 [00:31<00:00,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating: 100%|██████████| 154/154 [00:31<00:00,  4.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 33%|███▎      | 1/3 [07:25<14:50, 445.45s/it]\u001b[AI1002 17:26:32.851838 140305852307264 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at ./temp/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b\n",
-      "I1002 17:26:32.852628 140305852307264 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at ./temp/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda\n",
-      "I1002 17:26:39.075830 140305852307264 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at ./temp/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.9dad9043216064080cf9dd3711c53c0f11fe2b09313eaa66931057b4bdcaf068\n",
-      "I1002 17:26:39.077136 140305852307264 configuration_utils.py:168] Model config {\n",
-      "  \"attention_probs_dropout_prob\": 0.1,\n",
-      "  \"finetuning_task\": null,\n",
-      "  \"hidden_act\": \"gelu\",\n",
-      "  \"hidden_dropout_prob\": 0.1,\n",
-      "  \"hidden_size\": 768,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"layer_norm_eps\": 1e-05,\n",
-      "  \"max_position_embeddings\": 514,\n",
-      "  \"num_attention_heads\": 12,\n",
-      "  \"num_hidden_layers\": 12,\n",
-      "  \"num_labels\": 5,\n",
-      "  \"output_attentions\": false,\n",
-      "  \"output_hidden_states\": false,\n",
-      "  \"pruned_heads\": {},\n",
-      "  \"torchscript\": false,\n",
-      "  \"type_vocab_size\": 1,\n",
-      "  \"use_bfloat16\": false,\n",
-      "  \"vocab_size\": 50265\n",
-      "}\n",
-      "\n",
-      "I1002 17:26:39.110901 140305852307264 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin from cache at ./temp/228756ed15b6d200d7cb45aaef08c087e2706f54cb912863d2efe07c89584eb7.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e\n",
-      "I1002 17:26:43.032809 140305852307264 modeling_utils.py:405] Weights of RobertaForSequenceClassification not initialized from pretrained model: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n",
-      "I1002 17:26:43.033986 140305852307264 modeling_utils.py:408] Weights from pretrained model not used in RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']\n",
-      "                                     \n",
-      "  0%|          | 0/3 [08:21<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [07:36<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.559738\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [08:29<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [07:43<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.471496\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [08:36<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [07:50<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.184053\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [08:43<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [07:58<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.148488\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [08:51<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:05<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.857149\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [08:58<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:12<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.837435\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:05<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.819125\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:12<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:27<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.380328\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:20<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.433528\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:41<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.638522\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:34<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:49<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.553224\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [08:56<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.496607\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:49<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:03<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.599746\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [09:56<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:10<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.237169\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:04<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:18<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.637425\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:11<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:25<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.194495\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:18<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:32<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.805220\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:25<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:40<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.253925\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:33<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:47<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.689832\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:40<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [09:54<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.641453\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:47<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:02<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.147626\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [10:55<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:09<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.484902\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:02<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:16<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.276827\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:09<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:24<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.534456\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:17<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:31<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.223130\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:24<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:38<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.450985\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:31<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:45<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.533025\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:39<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [10:53<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.724428\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:46<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:00<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.477905\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [11:53<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:07<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.363852\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:01<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:15<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.228255\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:08<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:22<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.269107\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:15<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:30<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.163860\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:23<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:37<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.105093\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:30<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:44<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.288066\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:37<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:52<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.157583\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:45<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [11:59<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.537946\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:52<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:06<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.677483\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [12:59<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:14<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.022764\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:07<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:21<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.311463\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:14<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:28<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.695526\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:21<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:36<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.289729\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:29<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:43<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.240375\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:36<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:50<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.375174\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:44<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [12:58<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.186912\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:51<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:05<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.096166\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [13:58<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:12<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.535375\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:05<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:20<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.372227\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:13<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:27<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.930988\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:20<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.142896\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:42<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.651106\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:35<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:49<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.048819\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [13:56<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.576708\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:50<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:04<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.265014\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [14:57<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:11<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.765183\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:04<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.264336\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:12<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:26<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.360902\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:19<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.332452\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:41<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.356505\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:34<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:48<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.498224\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [14:56<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.320431\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:50<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:04<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.077777\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [15:57<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:11<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.281813\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:04<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.240123\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:12<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:26<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.078818\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:19<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.386153\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:41<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.044126\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:35<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:49<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.070761\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [15:56<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.445788\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:50<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:04<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.318622\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [16:57<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:12<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.582086\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:05<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.263850\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:12<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:27<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.122566\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:20<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.594169\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:41<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.586406\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:35<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:49<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.270444\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [16:57<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.403090\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:50<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:04<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.080049\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [17:57<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:12<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.108087\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:05<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.437748\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:12<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:27<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.036126\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:20<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.066607\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:41<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.705766\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:35<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:49<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.025862\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:42<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [17:57<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.535521\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:50<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:04<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.281819\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [18:57<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:12<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.186776\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:05<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:19<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.256166\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:13<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:27<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.078913\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:20<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:34<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.206851\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:27<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:42<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.165107\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:35<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:50<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.362343\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [19:43<?, ?it/s]         \n",
-      " 33%|███▎      | 1/3 [18:57<14:50, 445.45s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.030030\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Evaluating:   0%|          | 0/154 [00:00<?, ?it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|          | 1/154 [00:00<01:00,  2.51it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|▏         | 2/154 [00:00<01:00,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   2%|▏         | 3/154 [00:01<01:00,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 4/154 [00:01<01:00,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 5/154 [00:02<01:00,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   4%|▍         | 6/154 [00:02<00:59,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▍         | 7/154 [00:02<00:59,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▌         | 8/154 [00:03<00:58,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▌         | 9/154 [00:03<00:58,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▋         | 10/154 [00:04<00:58,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   7%|▋         | 11/154 [00:04<00:57,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 12/154 [00:04<00:57,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 13/154 [00:05<00:56,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   9%|▉         | 14/154 [00:05<00:56,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|▉         | 15/154 [00:06<00:55,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|█         | 16/154 [00:06<00:55,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  11%|█         | 17/154 [00:06<00:55,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 18/154 [00:07<00:54,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 19/154 [00:07<00:54,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  13%|█▎        | 20/154 [00:08<00:53,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▎        | 21/154 [00:08<00:53,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▍        | 22/154 [00:08<00:53,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  15%|█▍        | 23/154 [00:09<00:52,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 24/154 [00:09<00:52,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 25/154 [00:10<00:51,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  17%|█▋        | 26/154 [00:10<00:51,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 27/154 [00:10<00:51,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 28/154 [00:11<00:50,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 29/154 [00:11<00:50,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 30/154 [00:12<00:50,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  20%|██        | 31/154 [00:12<00:49,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██        | 32/154 [00:12<00:49,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██▏       | 33/154 [00:13<00:48,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  22%|██▏       | 34/154 [00:13<00:48,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 35/154 [00:14<00:48,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 36/154 [00:14<00:47,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  24%|██▍       | 37/154 [00:14<00:47,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▍       | 38/154 [00:15<00:46,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▌       | 39/154 [00:15<00:46,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  26%|██▌       | 40/154 [00:16<00:46,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 41/154 [00:16<00:45,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 42/154 [00:16<00:45,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  28%|██▊       | 43/154 [00:17<00:44,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▊       | 44/154 [00:17<00:44,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▉       | 45/154 [00:18<00:43,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  30%|██▉       | 46/154 [00:18<00:43,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 47/154 [00:18<00:43,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 48/154 [00:19<00:42,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 49/154 [00:19<00:42,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 50/154 [00:20<00:42,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  33%|███▎      | 51/154 [00:20<00:41,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 52/154 [00:20<00:41,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 53/154 [00:21<00:40,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  35%|███▌      | 54/154 [00:21<00:40,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▌      | 55/154 [00:22<00:40,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▋      | 56/154 [00:22<00:39,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  37%|███▋      | 57/154 [00:23<00:39,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 58/154 [00:23<00:38,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 59/154 [00:23<00:38,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  39%|███▉      | 60/154 [00:24<00:37,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|███▉      | 61/154 [00:24<00:37,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|████      | 62/154 [00:25<00:37,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  41%|████      | 63/154 [00:25<00:36,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 64/154 [00:25<00:36,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 65/154 [00:26<00:35,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  43%|████▎     | 66/154 [00:26<00:35,  2.49it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▎     | 67/154 [00:27<00:35,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▍     | 68/154 [00:27<00:34,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▍     | 69/154 [00:27<00:34,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▌     | 70/154 [00:28<00:33,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  46%|████▌     | 71/154 [00:28<00:33,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 72/154 [00:29<00:33,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 73/154 [00:29<00:32,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  48%|████▊     | 74/154 [00:29<00:32,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▊     | 75/154 [00:30<00:32,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▉     | 76/154 [00:30<00:31,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  50%|█████     | 77/154 [00:31<00:31,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████     | 78/154 [00:31<00:30,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████▏    | 79/154 [00:31<00:30,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  52%|█████▏    | 80/154 [00:32<00:29,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 81/154 [00:32<00:29,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 82/154 [00:33<00:29,  2.45it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  54%|█████▍    | 83/154 [00:33<00:28,  2.45it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▍    | 84/154 [00:33<00:28,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▌    | 85/154 [00:34<00:27,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▌    | 86/154 [00:34<00:27,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▋    | 87/154 [00:35<00:27,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  57%|█████▋    | 88/154 [00:35<00:26,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 89/154 [00:35<00:26,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 90/154 [00:36<00:25,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  59%|█████▉    | 91/154 [00:36<00:25,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|█████▉    | 92/154 [00:37<00:25,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|██████    | 93/154 [00:37<00:24,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  61%|██████    | 94/154 [00:37<00:24,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 95/154 [00:38<00:23,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 96/154 [00:38<00:23,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  63%|██████▎   | 97/154 [00:39<00:23,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▎   | 98/154 [00:39<00:22,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▍   | 99/154 [00:40<00:22,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  65%|██████▍   | 100/154 [00:40<00:21,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 101/154 [00:40<00:21,  2.45it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 102/154 [00:41<00:21,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  67%|██████▋   | 103/154 [00:41<00:20,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 104/154 [00:42<00:20,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 105/154 [00:42<00:19,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 106/154 [00:42<00:19,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 107/154 [00:43<00:18,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  70%|███████   | 108/154 [00:43<00:18,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████   | 109/154 [00:44<00:18,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████▏  | 110/154 [00:44<00:17,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  72%|███████▏  | 111/154 [00:44<00:17,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 112/154 [00:45<00:17,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 113/154 [00:45<00:16,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  74%|███████▍  | 114/154 [00:46<00:16,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▍  | 115/154 [00:46<00:15,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▌  | 116/154 [00:46<00:15,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  76%|███████▌  | 117/154 [00:47<00:14,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 118/154 [00:47<00:14,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 119/154 [00:48<00:14,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  78%|███████▊  | 120/154 [00:48<00:13,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▊  | 121/154 [00:48<00:13,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▉  | 122/154 [00:49<00:12,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  80%|███████▉  | 123/154 [00:49<00:12,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 124/154 [00:50<00:12,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 125/154 [00:50<00:11,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 126/154 [00:50<00:11,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 127/154 [00:51<00:10,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  83%|████████▎ | 128/154 [00:51<00:10,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 129/154 [00:52<00:10,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 130/154 [00:52<00:09,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  85%|████████▌ | 131/154 [00:52<00:09,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▌ | 132/154 [00:53<00:08,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▋ | 133/154 [00:53<00:08,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  87%|████████▋ | 134/154 [00:54<00:08,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 135/154 [00:54<00:07,  2.46it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 136/154 [00:54<00:07,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  89%|████████▉ | 137/154 [00:55<00:06,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|████████▉ | 138/154 [00:55<00:06,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|█████████ | 139/154 [00:56<00:06,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  91%|█████████ | 140/154 [00:56<00:05,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 141/154 [00:56<00:05,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 142/154 [00:57<00:04,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  93%|█████████▎| 143/154 [00:57<00:04,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▎| 144/154 [00:58<00:04,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▍| 145/154 [00:58<00:03,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▍| 146/154 [00:59<00:03,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▌| 147/154 [00:59<00:02,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  96%|█████████▌| 148/154 [00:59<00:02,  2.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 149/154 [01:00<00:02,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 150/154 [01:00<00:01,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  98%|█████████▊| 151/154 [01:01<00:01,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▊| 152/154 [01:01<00:00,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▉| 153/154 [01:01<00:00,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating: 100%|██████████| 154/154 [01:02<00:00,  2.48it/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 67%|██████▋   | 2/3 [20:00<08:58, 538.28s/it]\u001b[AI1002 17:39:07.674497 140305852307264 file_utils.py:296] https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model not found in cache or force_download set to True, downloading to /tmp/tmpb352wzvg\n",
-      "\n",
-      "\n",
-      "100%|██████████| 798011/798011 [00:00<00:00, 32344450.10B/s]\n",
-      "I1002 17:39:07.763530 140305852307264 file_utils.py:309] copying /tmp/tmpb352wzvg to cache at ./temp/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8\n",
-      "I1002 17:39:07.765353 140305852307264 file_utils.py:313] creating metadata file for ./temp/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8\n",
-      "I1002 17:39:07.766156 140305852307264 file_utils.py:322] removing temp file /tmp/tmpb352wzvg\n",
-      "I1002 17:39:07.766944 140305852307264 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at ./temp/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8\n",
-      "I1002 17:39:13.274621 140305852307264 file_utils.py:296] https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json not found in cache or force_download set to True, downloading to /tmp/tmpp8xzpnn5\n",
-      "\n",
-      "\n",
-      "100%|██████████| 641/641 [00:00<00:00, 633195.68B/s]\n",
-      "I1002 17:39:13.326435 140305852307264 file_utils.py:309] copying /tmp/tmpp8xzpnn5 to cache at ./temp/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f\n",
-      "I1002 17:39:13.327156 140305852307264 file_utils.py:313] creating metadata file for ./temp/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f\n",
-      "I1002 17:39:13.327944 140305852307264 file_utils.py:322] removing temp file /tmp/tmpp8xzpnn5\n",
-      "I1002 17:39:13.328718 140305852307264 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at ./temp/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f\n",
-      "I1002 17:39:13.329597 140305852307264 configuration_utils.py:168] Model config {\n",
-      "  \"attn_type\": \"bi\",\n",
-      "  \"bi_data\": false,\n",
-      "  \"clamp_len\": -1,\n",
-      "  \"d_head\": 64,\n",
-      "  \"d_inner\": 3072,\n",
-      "  \"d_model\": 768,\n",
-      "  \"dropout\": 0.1,\n",
-      "  \"end_n_top\": 5,\n",
-      "  \"ff_activation\": \"gelu\",\n",
-      "  \"finetuning_task\": null,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"layer_norm_eps\": 1e-12,\n",
-      "  \"mem_len\": null,\n",
-      "  \"n_head\": 12,\n",
-      "  \"n_layer\": 12,\n",
-      "  \"n_token\": 32000,\n",
-      "  \"num_labels\": 5,\n",
-      "  \"output_attentions\": false,\n",
-      "  \"output_hidden_states\": false,\n",
-      "  \"pruned_heads\": {},\n",
-      "  \"reuse_len\": null,\n",
-      "  \"same_length\": false,\n",
-      "  \"start_n_top\": 5,\n",
-      "  \"summary_activation\": \"tanh\",\n",
-      "  \"summary_last_dropout\": 0.1,\n",
-      "  \"summary_type\": \"last\",\n",
-      "  \"summary_use_proj\": true,\n",
-      "  \"torchscript\": false,\n",
-      "  \"untie_r\": true,\n",
-      "  \"use_bfloat16\": false\n",
-      "}\n",
-      "\n",
-      "I1002 17:39:13.363325 140305852307264 file_utils.py:296] https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin not found in cache or force_download set to True, downloading to /tmp/tmp9p3qlkni\n",
-      "\n",
-      "\n",
-      "  0%|          | 0/467042463 [00:00<?, ?B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  1%|▏         | 6108160/467042463 [00:00<00:07, 61080930.11B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  3%|▎         | 11719680/467042463 [00:00<00:07, 59499531.10B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  4%|▍         | 18286592/467042463 [00:00<00:07, 61225008.14B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  5%|▌         | 25005056/467042463 [00:00<00:07, 62898610.07B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  7%|▋         | 31676416/467042463 [00:00<00:06, 63995194.38B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "  8%|▊         | 38431744/467042463 [00:00<00:06, 65022202.95B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 10%|▉         | 45115392/467042463 [00:00<00:06, 65554973.58B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 11%|█         | 51746816/467042463 [00:00<00:06, 65776133.46B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 13%|█▎        | 58499072/467042463 [00:00<00:06, 66289961.54B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 14%|█▍        | 65259520/467042463 [00:01<00:06, 66677572.71B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 15%|█▌        | 72017920/467042463 [00:01<00:05, 66944869.63B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 17%|█▋        | 78795776/467042463 [00:01<00:05, 67191881.49B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 18%|█▊        | 85434368/467042463 [00:01<00:05, 66817741.76B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 20%|█▉        | 92127232/467042463 [00:01<00:05, 66850609.65B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 21%|██        | 98847744/467042463 [00:01<00:05, 66956254.51B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 23%|██▎       | 105695232/467042463 [00:01<00:05, 67404313.88B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 24%|██▍       | 112539648/467042463 [00:01<00:05, 67711420.88B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 26%|██▌       | 119299072/467042463 [00:01<00:05, 67021676.71B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 27%|██▋       | 126150656/467042463 [00:01<00:05, 67461156.56B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 28%|██▊       | 132977664/467042463 [00:02<00:04, 67701544.68B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 30%|██▉       | 139745280/467042463 [00:02<00:04, 67658513.56B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 31%|███▏      | 146509824/467042463 [00:02<00:04, 67181999.84B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 33%|███▎      | 153228288/467042463 [00:02<00:04, 66751974.35B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 34%|███▍      | 159973376/467042463 [00:02<00:04, 66955718.12B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 36%|███▌      | 166794240/467042463 [00:02<00:04, 67326658.73B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 37%|███▋      | 173569024/467042463 [00:02<00:04, 67445805.16B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 39%|███▊      | 180418560/467042463 [00:02<00:04, 67752360.28B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 40%|████      | 187195392/467042463 [00:02<00:04, 67133948.47B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 42%|████▏     | 193975296/467042463 [00:02<00:04, 67331546.96B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 43%|████▎     | 200794112/467042463 [00:03<00:03, 67585706.67B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 44%|████▍     | 207618048/467042463 [00:03<00:03, 67777132.64B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 46%|████▌     | 214456320/467042463 [00:03<00:03, 67957312.50B/s]\u001b[A\u001b[A\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      " 47%|████▋     | 221253632/467042463 [00:03<00:03, 67609840.27B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 49%|████▉     | 228016128/467042463 [00:03<00:03, 67575830.31B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 50%|█████     | 234821632/467042463 [00:03<00:03, 67718613.20B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 52%|█████▏    | 241704960/467042463 [00:03<00:03, 68048309.12B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 53%|█████▎    | 248511488/467042463 [00:03<00:03, 67755089.10B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 55%|█████▍    | 255288320/467042463 [00:03<00:03, 67445026.53B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 56%|█████▌    | 262109184/467042463 [00:03<00:03, 67670119.51B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 58%|█████▊    | 268877824/467042463 [00:04<00:03, 65621534.12B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 59%|█████▉    | 275616768/467042463 [00:04<00:02, 66141225.28B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 60%|██████    | 282419200/467042463 [00:04<00:02, 66694338.51B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 62%|██████▏   | 289139712/467042463 [00:04<00:02, 66846537.09B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 63%|██████▎   | 295904256/467042463 [00:04<00:02, 67083857.89B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 65%|██████▍   | 302617600/467042463 [00:04<00:02, 66051839.59B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 66%|██████▌   | 309370880/467042463 [00:04<00:02, 66488625.66B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 68%|██████▊   | 316243968/467042463 [00:04<00:02, 67144915.83B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 69%|██████▉   | 322988032/467042463 [00:04<00:02, 67230119.73B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 71%|███████   | 329772032/467042463 [00:04<00:02, 67409888.30B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 72%|███████▏  | 336516096/467042463 [00:05<00:01, 67070862.65B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 73%|███████▎  | 343226368/467042463 [00:05<00:01, 67058571.53B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 75%|███████▍  | 350028800/467042463 [00:05<00:01, 67344851.30B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 76%|███████▋  | 356783104/467042463 [00:05<00:01, 67402521.04B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 78%|███████▊  | 363606016/467042463 [00:05<00:01, 67647257.89B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 79%|███████▉  | 370372608/467042463 [00:05<00:01, 67201475.63B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 81%|████████  | 377210880/467042463 [00:05<00:01, 67550940.22B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 82%|████████▏ | 383968256/467042463 [00:05<00:01, 65218002.24B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 84%|████████▎ | 390651904/467042463 [00:05<00:01, 65693934.06B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 85%|████████▌ | 397539328/467042463 [00:05<00:01, 66610735.32B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 87%|████████▋ | 404311040/467042463 [00:06<00:00, 66933081.36B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 88%|████████▊ | 411158528/467042463 [00:06<00:00, 67385928.42B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 89%|████████▉ | 417927168/467042463 [00:06<00:00, 67475086.98B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 91%|█████████ | 424688640/467042463 [00:06<00:00, 67511197.65B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 92%|█████████▏| 431537152/467042463 [00:06<00:00, 67799224.34B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 94%|█████████▍| 438320128/467042463 [00:06<00:00, 62589113.01B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 95%|█████████▌| 444660736/467042463 [00:06<00:00, 62510472.00B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 97%|█████████▋| 451505152/467042463 [00:06<00:00, 64178435.74B/s]\u001b[A\u001b[A\n",
-      "\n",
-      " 98%|█████████▊| 458164224/467042463 [00:06<00:00, 64879878.65B/s]\u001b[A\u001b[A\n",
-      "\n",
-      "100%|██████████| 467042463/467042463 [00:07<00:00, 66613804.27B/s]\u001b[A\u001b[A\n",
-      "I1002 17:39:20.478424 140305852307264 file_utils.py:309] copying /tmp/tmp9p3qlkni to cache at ./temp/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac\n",
-      "I1002 17:39:21.012291 140305852307264 file_utils.py:313] creating metadata file for ./temp/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac\n",
-      "I1002 17:39:21.013607 140305852307264 file_utils.py:322] removing temp file /tmp/tmp9p3qlkni\n",
-      "I1002 17:39:21.087461 140305852307264 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin from cache at ./temp/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac\n",
-      "I1002 17:39:24.382914 140305852307264 modeling_utils.py:405] Weights of XLNetForSequenceClassification not initialized from pretrained model: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']\n",
-      "I1002 17:39:24.384073 140305852307264 modeling_utils.py:408] Weights from pretrained model not used in XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']\n",
-      "                                     \n",
-      "  0%|          | 0/3 [21:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [20:17<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.819770\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [21:13<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [20:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:1.304597\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [21:23<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [20:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.942012\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [21:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [20:47<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.797735\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [21:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [20:57<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.515332\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [21:52<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:07<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.404242\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:02<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:16<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.763274\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:12<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:26<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.802602\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:22<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:36<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.425111\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:47<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.593007\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [21:57<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.205651\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [22:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:07<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.052174\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:17<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.191218\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:13<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.248251\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:23<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.664998\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:47<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.791746\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [22:57<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.187446\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [23:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:07<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.260010\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:17<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.162322\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:13<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.341495\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:23<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.298152\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:47<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.233707\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [23:58<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.475194\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [24:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:08<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.208173\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:18<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.039098\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:13<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.052284\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:23<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.290496\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:48<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.234923\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [24:58<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.346255\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [25:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:08<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.188442\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:17<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.455602\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:13<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.609619\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:23<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.024805\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:33<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:47<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.795632\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [25:57<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.525080\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [26:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [26:07<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.258776\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [27:04<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [26:18<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.321544\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [27:15<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [26:29<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.084718\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [27:26<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [26:40<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.383003\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [27:37<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [26:51<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.203176\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [27:49<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:03<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.356263\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:00<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:14<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.088029\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:11<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:25<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.609321\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:22<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:37<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.233717\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:34<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:48<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.392966\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:45<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [27:59<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.239043\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [28:56<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [28:10<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.309102\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [29:07<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [28:22<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.033599\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [29:19<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [28:33<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.207570\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [29:30<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [28:44<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.566612\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [29:41<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [28:55<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.155714\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [29:52<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [29:07<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.355020\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:03<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [29:18<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.153492\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:14<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [29:29<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.195237\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:25<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [29:40<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.147319\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:36<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [29:50<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.551497\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:47<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:01<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.446988\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [30:58<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:12<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.392400\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:08<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:22<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.676942\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:18<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:33<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.447995\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:29<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:43<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.124699\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:39<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [30:53<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.044253\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:49<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:03<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.137005\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [31:59<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:13<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.384042\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:09<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:24<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.318679\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:19<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:34<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.152525\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:30<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:44<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.244761\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:39<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [31:54<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.476746\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:49<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:04<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.074933\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [32:59<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:14<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.304292\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [33:09<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:24<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.016483\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [33:20<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:34<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.385908\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [33:31<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:45<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.385652\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [33:42<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [32:57<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.106855\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [33:53<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [33:08<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.165334\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [34:05<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [33:19<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.280620\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [34:16<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [33:30<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.546574\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [34:27<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [33:42<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.524420\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [34:39<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [33:53<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.130612\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [34:50<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [34:04<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.045096\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:01<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [34:15<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.080000\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:12<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [34:27<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.660747\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:24<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [34:38<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.104844\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:35<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [34:49<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.044222\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:46<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:01<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.029447\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [35:58<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:12<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.209993\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [36:09<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:23<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.317221\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [36:20<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:35<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.413533\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [36:32<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:46<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.068395\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [36:43<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [35:58<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.511159\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [36:55<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [36:09<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.197926\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [37:06<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [36:20<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.332851\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                     \n",
-      "  0%|          | 0/3 [37:17<?, ?it/s]         \n",
-      " 67%|██████▋   | 2/3 [36:31<08:58, 538.28s/it]\u001b[A"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loss:0.042949\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Evaluating:   0%|          | 0/154 [00:00<?, ?it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|          | 1/154 [00:00<01:53,  1.35it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   1%|▏         | 2/154 [00:01<01:43,  1.47it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   2%|▏         | 3/154 [00:01<01:37,  1.56it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 4/154 [00:02<01:31,  1.64it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   3%|▎         | 5/154 [00:02<01:28,  1.69it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   4%|▍         | 6/154 [00:03<01:25,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▍         | 7/154 [00:04<01:24,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   5%|▌         | 8/154 [00:04<01:22,  1.76it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▌         | 9/154 [00:05<01:20,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   6%|▋         | 10/154 [00:05<01:19,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   7%|▋         | 11/154 [00:06<01:19,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 12/154 [00:06<01:27,  1.62it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   8%|▊         | 13/154 [00:07<01:24,  1.67it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:   9%|▉         | 14/154 [00:08<01:23,  1.68it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|▉         | 15/154 [00:08<01:21,  1.71it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  10%|█         | 16/154 [00:09<01:18,  1.75it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  11%|█         | 17/154 [00:09<01:16,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 18/154 [00:10<01:16,  1.77it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  12%|█▏        | 19/154 [00:10<01:15,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  13%|█▎        | 20/154 [00:11<01:14,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▎        | 21/154 [00:11<01:13,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  14%|█▍        | 22/154 [00:12<01:12,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  15%|█▍        | 23/154 [00:13<01:11,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 24/154 [00:13<01:11,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  16%|█▌        | 25/154 [00:14<01:10,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  17%|█▋        | 26/154 [00:14<01:09,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 27/154 [00:15<01:09,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  18%|█▊        | 28/154 [00:15<01:08,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 29/154 [00:16<01:08,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  19%|█▉        | 30/154 [00:16<01:08,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  20%|██        | 31/154 [00:17<01:14,  1.64it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██        | 32/154 [00:18<01:12,  1.69it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  21%|██▏       | 33/154 [00:18<01:09,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  22%|██▏       | 34/154 [00:19<01:07,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 35/154 [00:19<01:07,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  23%|██▎       | 36/154 [00:20<01:06,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  24%|██▍       | 37/154 [00:20<01:04,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▍       | 38/154 [00:21<01:04,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  25%|██▌       | 39/154 [00:22<01:03,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  26%|██▌       | 40/154 [00:22<01:02,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 41/154 [00:23<01:01,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  27%|██▋       | 42/154 [00:23<01:00,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  28%|██▊       | 43/154 [00:24<01:00,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▊       | 44/154 [00:24<01:03,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  29%|██▉       | 45/154 [00:25<01:02,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  30%|██▉       | 46/154 [00:25<01:01,  1.76it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 47/154 [00:26<01:00,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  31%|███       | 48/154 [00:27<00:59,  1.77it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 49/154 [00:27<00:59,  1.77it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  32%|███▏      | 50/154 [00:28<00:58,  1.77it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  33%|███▎      | 51/154 [00:28<00:57,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 52/154 [00:29<00:56,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  34%|███▍      | 53/154 [00:29<00:55,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  35%|███▌      | 54/154 [00:30<00:54,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▌      | 55/154 [00:30<00:53,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  36%|███▋      | 56/154 [00:31<00:52,  1.86it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  37%|███▋      | 57/154 [00:31<00:52,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 58/154 [00:32<00:52,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  38%|███▊      | 59/154 [00:33<00:52,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  39%|███▉      | 60/154 [00:33<00:52,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|███▉      | 61/154 [00:34<00:50,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  40%|████      | 62/154 [00:34<00:50,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  41%|████      | 63/154 [00:35<00:50,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 64/154 [00:35<00:49,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  42%|████▏     | 65/154 [00:36<00:48,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  43%|████▎     | 66/154 [00:36<00:47,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▎     | 67/154 [00:37<00:47,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  44%|████▍     | 68/154 [00:38<00:47,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▍     | 69/154 [00:38<00:46,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  45%|████▌     | 70/154 [00:39<00:45,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  46%|████▌     | 71/154 [00:39<00:45,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 72/154 [00:40<00:44,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  47%|████▋     | 73/154 [00:40<00:44,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  48%|████▊     | 74/154 [00:41<00:43,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▊     | 75/154 [00:41<00:43,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  49%|████▉     | 76/154 [00:42<00:42,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  50%|█████     | 77/154 [00:42<00:42,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████     | 78/154 [00:43<00:42,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  51%|█████▏    | 79/154 [00:44<00:41,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  52%|█████▏    | 80/154 [00:44<00:40,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 81/154 [00:45<00:39,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  53%|█████▎    | 82/154 [00:45<00:39,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  54%|█████▍    | 83/154 [00:46<00:39,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▍    | 84/154 [00:46<00:38,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  55%|█████▌    | 85/154 [00:47<00:38,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▌    | 86/154 [00:47<00:37,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  56%|█████▋    | 87/154 [00:48<00:36,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  57%|█████▋    | 88/154 [00:49<00:36,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 89/154 [00:49<00:35,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  58%|█████▊    | 90/154 [00:50<00:37,  1.72it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  59%|█████▉    | 91/154 [00:50<00:35,  1.76it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|█████▉    | 92/154 [00:51<00:34,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  60%|██████    | 93/154 [00:51<00:34,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  61%|██████    | 94/154 [00:52<00:33,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 95/154 [00:52<00:32,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  62%|██████▏   | 96/154 [00:53<00:32,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  63%|██████▎   | 97/154 [00:54<00:31,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▎   | 98/154 [00:54<00:31,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  64%|██████▍   | 99/154 [00:55<00:30,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  65%|██████▍   | 100/154 [00:55<00:29,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 101/154 [00:56<00:28,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  66%|██████▌   | 102/154 [00:56<00:28,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  67%|██████▋   | 103/154 [00:57<00:27,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 104/154 [00:57<00:27,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  68%|██████▊   | 105/154 [00:58<00:26,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 106/154 [00:58<00:26,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  69%|██████▉   | 107/154 [00:59<00:25,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  70%|███████   | 108/154 [01:00<00:24,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████   | 109/154 [01:00<00:24,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  71%|███████▏  | 110/154 [01:01<00:23,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  72%|███████▏  | 111/154 [01:01<00:23,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 112/154 [01:02<00:23,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  73%|███████▎  | 113/154 [01:02<00:22,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  74%|███████▍  | 114/154 [01:03<00:21,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▍  | 115/154 [01:03<00:21,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  75%|███████▌  | 116/154 [01:04<00:21,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  76%|███████▌  | 117/154 [01:05<00:20,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 118/154 [01:05<00:19,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  77%|███████▋  | 119/154 [01:06<00:18,  1.85it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  78%|███████▊  | 120/154 [01:06<00:18,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▊  | 121/154 [01:07<00:17,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  79%|███████▉  | 122/154 [01:07<00:17,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  80%|███████▉  | 123/154 [01:08<00:17,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 124/154 [01:08<00:16,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  81%|████████  | 125/154 [01:09<00:15,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 126/154 [01:09<00:15,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  82%|████████▏ | 127/154 [01:10<00:14,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  83%|████████▎ | 128/154 [01:11<00:14,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 129/154 [01:11<00:13,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  84%|████████▍ | 130/154 [01:12<00:13,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  85%|████████▌ | 131/154 [01:12<00:12,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▌ | 132/154 [01:13<00:12,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  86%|████████▋ | 133/154 [01:13<00:11,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  87%|████████▋ | 134/154 [01:14<00:11,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 135/154 [01:14<00:10,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  88%|████████▊ | 136/154 [01:15<00:09,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  89%|████████▉ | 137/154 [01:16<00:09,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|████████▉ | 138/154 [01:16<00:08,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  90%|█████████ | 139/154 [01:17<00:08,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  91%|█████████ | 140/154 [01:17<00:07,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 141/154 [01:18<00:07,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  92%|█████████▏| 142/154 [01:18<00:06,  1.83it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  93%|█████████▎| 143/154 [01:19<00:05,  1.84it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▎| 144/154 [01:19<00:05,  1.74it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  94%|█████████▍| 145/154 [01:20<00:05,  1.77it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▍| 146/154 [01:21<00:04,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  95%|█████████▌| 147/154 [01:21<00:03,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  96%|█████████▌| 148/154 [01:22<00:03,  1.78it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 149/154 [01:22<00:02,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  97%|█████████▋| 150/154 [01:23<00:02,  1.79it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  98%|█████████▊| 151/154 [01:23<00:01,  1.80it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▊| 152/154 [01:24<00:01,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating:  99%|█████████▉| 153/154 [01:24<00:00,  1.82it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "Evaluating: 100%|██████████| 154/154 [01:25<00:00,  1.81it/s]\u001b[A\u001b[A\n",
-      "\n",
-      "100%|██████████| 3/3 [37:57<00:00, 759.27s/it]\u001b[A\n"
-     ]
-    }
-   ],
-   "source": [
-    "results = {}\n",
-    "\n",
-    "for model_name in tqdm(model_names):\n",
-    "    \n",
-    "    # preprocess\n",
-    "    processor = Processor(model_name=model_name, cache_dir=CACHE_DIR)\n",
-    "    ds_train = processor.preprocess(\n",
-    "        df_train[TEXT_COL], labels_train, max_len=MAX_LEN\n",
-    "    )\n",
-    "    ds_test = processor.preprocess(df_test[TEXT_COL], None, max_len=MAX_LEN)\n",
-    "\n",
-    "    # fine-tune\n",
-    "    classifier = SequenceClassifier(\n",
-    "        model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR\n",
-    "    )\n",
-    "    with Timer() as t:\n",
-    "        classifier.fit(\n",
-    "            ds_train,\n",
-    "            device=DEVICE,\n",
-    "            num_epochs=NUM_EPOCHS,\n",
-    "            batch_size=BATCH_SIZE,\n",
-    "            num_gpus=NUM_GPUS,\n",
-    "            verbose=False,\n",
-    "        )\n",
-    "    train_time = t.interval / 3600\n",
-    "\n",
-    "    # predict\n",
-    "    preds = classifier.predict(\n",
-    "        ds_test, device=\"cuda\", batch_size=BATCH_SIZE, num_gpus=NUM_GPUS\n",
-    "    )\n",
-    "\n",
-    "    # eval\n",
-    "    accuracy = accuracy_score(labels_test, preds)\n",
-    "    class_report = classification_report(\n",
-    "        labels_test, preds, target_names=label_encoder.classes_, output_dict=True\n",
-    "    )\n",
-    "\n",
-    "    # save results\n",
-    "    results[model_name] = {\n",
-    "        \"accuracy\": accuracy,\n",
-    "        \"f1-score\": class_report[\"macro avg\"][\"f1-score\"],\n",
-    "        \"time(hrs)\": train_time,\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Evaluate\n",
-    "\n",
-    "Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
-   "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>distilbert-base-uncased</th>\n",
-       "      <th>roberta-base</th>\n",
-       "      <th>xlnet-base-cased</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>accuracy</th>\n",
-       "      <td>0.901406</td>\n",
-       "      <td>0.919536</td>\n",
-       "      <td>0.925647</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>f1-score</th>\n",
-       "      <td>0.897829</td>\n",
-       "      <td>0.916793</td>\n",
-       "      <td>0.923171</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>time</th>\n",
-       "      <td>0.111936</td>\n",
-       "      <td>0.189581</td>\n",
-       "      <td>0.270957</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          distilbert-base-uncased  roberta-base  xlnet-base-cased\n",
-       "accuracy                 0.901406      0.919536          0.925647\n",
-       "f1-score                 0.897829      0.916793          0.923171\n",
-       "time                     0.111936      0.189581          0.270957"
-      ]
+      "application/scrapbook.scrap.json+json": {
+       "data": 0.8929098953149991,
+       "encoder": "json",
+       "name": "f1",
+       "version": 1
+      }
      },
-     "execution_count": 35,
-     "metadata": {},
-     "output_type": "execute_result"
+     "metadata": {
+      "scrapbook": {
+       "data": true,
+       "display": false,
+       "name": "f1"
+      }
+     },
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "pd.DataFrame(results)"
+    "# for testing\n",
+    "sb.glue(\"accuracy\", df_results.iloc[0, :].mean())\n",
+    "sb.glue(\"f1\", df_results.iloc[1, :].mean())"
    ]
   }
  ],
diff --git a/tests/conftest.py b/tests/conftest.py
index 580e0c826..111d2ae34 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -69,7 +69,9 @@ def notebooks():
         "tc_bert_azureml": os.path.join(
             folder_notebooks, "text_classification", "tc_bert_azureml.ipynb"
         ),
-        "tc_mnli_bert": os.path.join(folder_notebooks, "text_classification", "tc_mnli_bert.ipynb"),
+        "tc_mnli_transformers": os.path.join(
+            folder_notebooks, "text_classification", "tc_mnli_transformers.ipynb"
+        ),
         "tc_dac_bert_ar": os.path.join(
             folder_notebooks, "text_classification", "tc_dac_bert_ar.ipynb"
         ),
diff --git a/tests/integration/test_notebooks_text_classification.py b/tests/integration/test_notebooks_text_classification.py
index 7411445fa..a631eead5 100644
--- a/tests/integration/test_notebooks_text_classification.py
+++ b/tests/integration/test_notebooks_text_classification.py
@@ -15,8 +15,8 @@
 
 @pytest.mark.gpu
 @pytest.mark.integration
-def test_tc_mnli_bert(notebooks, tmp):
-    notebook_path = notebooks["tc_mnli_bert"]
+def test_tc_mnli_transformers(notebooks, tmp):
+    notebook_path = notebooks["tc_mnli_transformers"]
     pm.execute_notebook(
         notebook_path,
         OUTPUT_NOTEBOOK,
@@ -24,17 +24,17 @@ def test_tc_mnli_bert(notebooks, tmp):
         parameters=dict(
             NUM_GPUS=1,
             DATA_FOLDER=tmp,
-            BERT_CACHE_DIR=tmp,
-            BATCH_SIZE=32,
-            BATCH_SIZE_PRED=512,
+            CACHE_DIR=tmp,
+            BATCH_SIZE=16,
             NUM_EPOCHS=1,
+            TRAIN_DATA_FRACTION=0.05,
+            TEST_DATA_FRACTION=0.05,
+            MODEL_NAMES=["distilbert-base-uncased"],
         ),
     )
     result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
-    assert pytest.approx(result["accuracy"], 0.93, abs=ABS_TOL)
-    assert pytest.approx(result["precision"], 0.93, abs=ABS_TOL)
-    assert pytest.approx(result["recall"], 0.93, abs=ABS_TOL)
-    assert pytest.approx(result["f1"], 0.93, abs=ABS_TOL)
+    assert result["accuracy"] == pytest.approx(0.87, abs=ABS_TOL)
+    assert result["f1"] == pytest.approx(0.87, abs=ABS_TOL)
 
 
 @pytest.mark.gpu
diff --git a/utils_nlp/dataset/squad.py b/utils_nlp/dataset/squad.py
index cdb07ff05..f807f878c 100644
--- a/utils_nlp/dataset/squad.py
+++ b/utils_nlp/dataset/squad.py
@@ -23,9 +23,7 @@
 }
 
 
-def load_pandas_df(
-    local_cache_path=".", squad_version="v1.1", file_split="train"
-):
+def load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="train"):
     """Loads the SQuAD dataset in pandas data frame.
 
     Args:
@@ -34,8 +32,12 @@ def load_pandas_df(
         squad_version (str, optional): Version of the SQuAD dataset, accepted values are: 
             "v1.1" and "v2.0". Defaults to "v1.1".
         file_split (str, optional): Dataset split to load, accepted values are: "train" and "dev".
-            Defaults to "train". 
+            Defaults to "train".
     """
+
+    if file_split not in ["train", "dev"]:
+        raise ValueError("file_split should be either train or dev")
+
     URL = URL_DICT[squad_version][file_split]
     file_name = URL.split("/")[-1]
     maybe_download(URL, file_name, local_cache_path)
diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index acd5571c1..36024d842 100644
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -34,9 +34,9 @@
 logger = logging.getLogger(__name__)
 
 
-def get_device(device, num_gpus, local_rank):
+def get_device(num_gpus=None, local_rank=-1):
     if local_rank == -1:
-        device = torch.device("cuda" if torch.cuda.is_available() and device == "cuda" else "cpu")
+        device = torch.device("cuda" if torch.cuda.is_available() and (num_gpus is None or num_gpus > 0) else "cpu")
         num_gpus = (
             min(num_gpus, torch.cuda.device_count()) if num_gpus else torch.cuda.device_count()
         )
@@ -45,7 +45,6 @@ def get_device(device, num_gpus, local_rank):
         device = torch.device("cuda", local_rank)
         torch.distributed.init_process_group(backend="nccl")
         num_gpus = 1
-
     return device, num_gpus
 
 
@@ -58,35 +57,31 @@ def __init__(
         cache_dir=".",
         load_model_from_dir=None,
     ):
-        self.model_name = model_name
+
+        if model_name not in self.list_supported_models():
+            raise ValueError(
+                "Model name {0} is not supported by {1}. "
+                "Call '{2}.list_supported_models()' to get all supported model "
+                "names.".format(model_name, self.__class__.__name__, self.__class__.__name__)
+            )
+        self._model_name = model_name
+        self._model_type = model_name.split("-")[0]
         self.cache_dir = cache_dir
         self.load_model_from_dir = load_model_from_dir
         if load_model_from_dir is None:
             self.model = model_class[model_name].from_pretrained(
-                model_name, cache_dir=cache_dir, num_labels=num_labels
+                model_name, cache_dir=cache_dir, num_labels=num_labels, output_loading_info=False
             )
         else:
             logger.info("Loading cached model from {}".format(load_model_from_dir))
             self.model = model_class[model_name].from_pretrained(
-                load_model_from_dir, num_labels=num_labels
+                load_model_from_dir, num_labels=num_labels, output_loading_info=False
             )
 
     @property
     def model_name(self):
         return self._model_name
 
-    @model_name.setter
-    def model_name(self, value):
-        if value not in self.list_supported_models():
-            raise ValueError(
-                "Model name {0} is not supported by {1}. "
-                "Call '{2}.list_supported_models()' to get all supported model "
-                "names.".format(value, self.__class__.__name__, self.__class__.__name__)
-            )
-
-        self._model_name = value
-        self._model_type = value.split("-")[0]
-
     @property
     def model_type(self):
         return self._model_type
@@ -263,10 +258,8 @@ def predict(
     def save_model(self):
         output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
 
-        if not os.path.exists(self.cache_dir):
-            os.makedirs(self.cache_dir)
-        if not os.path.exists(output_model_dir):
-            os.makedirs(output_model_dir)
+        os.makedirs(self.cache_dir, exist_ok=True)
+        os.makedirs(output_model_dir, exist_ok=True)
 
         logger.info("Saving model checkpoint to %s", output_model_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py
index 1a6370053..a5d140709 100644
--- a/utils_nlp/models/transformers/named_entity_recognition.py
+++ b/utils_nlp/models/transformers/named_entity_recognition.py
@@ -1,19 +1,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 
-import torch
 import logging
-import numpy as np
-
-from torch.utils.data import TensorDataset
-from cached_property import cached_property
 from collections import Iterable
 
-from transformers.modeling_bert import (
-    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    BertForTokenClassification
-)
-
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset
+from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification
 from utils_nlp.models.transformers.common import (
     MAX_SEQ_LEN,
     TOKENIZER_CLASS,
@@ -21,10 +15,11 @@
     get_device,
 )
 
+
 TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
 
 
-class TokenClassificationProcessor():
+class TokenClassificationProcessor:
     """
     Process raw dataset for training and testing.
 
@@ -42,7 +37,7 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."):
         self.to_lower = to_lower
         self.cache_dir = cache_dir
         self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
-            model_name, do_lower_case=to_lower, cache_dir=cache_dir
+            model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
         )
 
     @staticmethod
@@ -55,20 +50,19 @@ def get_inputs(batch, model_name, train_mode=True):
             train_mode (bool, optional): Whether it's for model training. Set it to False if
                 it's for testing and it won't have the 'labels' data field.
                 Defaults to True, for model training.
-        
+
         Returns:
             dict: A dictionary object contains all needed information for training or testing.
         """
 
         if model_name.split("-")[0] not in ["bert"]:
             raise ValueError("Model not supported: {}".format(model_name))
-    
+
         if train_mode:
             return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
         else:
             return {"input_ids": batch[0], "attention_mask": batch[1]}
 
-
     @staticmethod
     def create_label_map(label_lists, trailing_piece_tag="X"):
         """
@@ -79,9 +73,9 @@ def create_label_map(label_lists, trailing_piece_tag="X"):
                 which presents class of each token.
             trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
                 Defaults to "X".
-            
+
         Returns:
-            dict: A dictionary object to map a label (str) to an ID (int). 
+            dict: A dictionary object to map a label (str) to an ID (int).
         """
 
         label_set = set()
@@ -94,14 +88,8 @@ def create_label_map(label_lists, trailing_piece_tag="X"):
             label_map[trailing_piece_tag] = len(label_set)
         return label_map
 
-
     def preprocess_for_bert(
-        self,
-        text,
-        max_len=MAX_SEQ_LEN,
-        labels=None,
-        label_map=None,
-        trailing_piece_tag="X"
+        self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"
     ):
         """
         Tokenize and preprocesses input word lists, involving the following steps
@@ -157,7 +145,9 @@ def _is_iterable_but_not_string(obj):
             return isinstance(obj, Iterable) and not isinstance(obj, str)
 
         if max_len > MAX_SEQ_LEN:
-            logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
+            logging.warning(
+                "Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)
+            )
             max_len = MAX_SEQ_LEN
 
         if not _is_iterable_but_not_string(text):
@@ -168,7 +158,7 @@ def _is_iterable_but_not_string(obj):
             # list of lists for later iteration
             if not _is_iterable_but_not_string(text[0]):
                 text = [text]
-        
+
         if labels is not None:
             if not _is_iterable_but_not_string(labels):
                 raise ValueError("labels must be an iterable and not a string.")
@@ -206,7 +196,11 @@ def _is_iterable_but_not_string(obj):
                     new_tokens.append(sub_word)
 
             if len(new_tokens) > max_len:
-                logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens)))
+                logging.warning(
+                    "Text after tokenization with length {} has been truncated".format(
+                        len(new_tokens)
+                    )
+                )
                 new_tokens = new_tokens[:max_len]
                 new_labels = new_labels[:max_len]
             input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
@@ -241,13 +235,13 @@ def _is_iterable_but_not_string(obj):
                 torch.tensor(input_ids_all, dtype=torch.long),
                 torch.tensor(input_mask_all, dtype=torch.long),
                 torch.tensor(trailing_token_mask_all, dtype=torch.bool),
-                torch.tensor(label_ids_all, dtype=torch.long)
+                torch.tensor(label_ids_all, dtype=torch.long),
             )
         else:
             td = TensorDataset(
                 torch.tensor(input_ids_all, dtype=torch.long),
                 torch.tensor(input_mask_all, dtype=torch.long),
-                torch.tensor(trailing_token_mask_all, dtype=torch.bool)
+                torch.tensor(trailing_token_mask_all, dtype=torch.bool),
             )
         return td
 
@@ -280,7 +274,6 @@ def list_supported_models():
     def fit(
         self,
         train_dataset,
-        device="cuda",
         num_epochs=1,
         batch_size=32,
         num_gpus=None,
@@ -297,14 +290,13 @@ def fit(
 
         Args:
             train_dataset (Dataset): Dataset for training.
-            device (torch.device, optional): A PyTorch device.
-                Defaults to 'cuda'.
             num_epochs (int, optional): Number of training epochs.
                 Defaults to 1.
             batch_size (int, optional): Training batch size.
                 Defaults to 32.
-            num_gpus (int, optional): The number of GPUs to be used.
-                Defaults to None, all gpus are used.
+            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
+                be used. If set to 0 or GPUs are not available, CPU device will
+                be used. Defaults to None.
             local_rank (int, optional): Whether need to do distributed training.
                 Defaults to -1, no distributed training.
             weight_decay (float, optional): Weight decay rate.
@@ -321,7 +313,7 @@ def fit(
                 Defaults to None, use the default seed.
         """
 
-        device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank)
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
         self.model.to(device)
 
         super().fine_tune(
@@ -339,32 +331,22 @@ def fit(
             seed=seed,
         )
 
-
-    def predict(
-        self,
-        eval_dataset,
-        device="cuda",
-        batch_size=32,
-        num_gpus=None,
-        local_rank=-1,
-        verbose=False,
-    ):
+    def predict(self, eval_dataset, batch_size=32, num_gpus=None, local_rank=-1, verbose=False):
         """
         Test on an evaluation dataset and get the token label predictions.
 
         Args:
             eval_dataset (TensorDataset): A TensorDataset for evaluation.
-            device (torch.device, optional): A PyTorch device.
-                Defaults to 'cuda'.
             batch_size (int, optional): The batch size for evaluation.
                 Defaults to 32.
-            num_gpus (int, optional): The number of GPUs to be used.
-                Defaults to None, all gpus are used.
+            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
+                be used. If set to 0 or GPUs are not available, CPU device will
+                be used. Defaults to None.
             local_rank (int, optional): Whether need to do distributed training.
                 Defaults to -1, no distributed training.
             verbose (bool, optional): Verbose model.
                 Defaults to False.
-        
+
         Returns:
             ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is
             [number_of_examples, sequence_length, number_of_labels]. Each
@@ -372,6 +354,7 @@ def predict(
             to get the probability for each class label.
         """
 
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
         preds = list(
             super().predict(
                 eval_dataset=eval_dataset,
@@ -386,13 +369,7 @@ def predict(
         preds_np = np.concatenate(preds)
         return preds_np
 
-
-    def get_predicted_token_labels(
-        self,
-        predictions,
-        label_map,
-        dataset
-    ):
+    def get_predicted_token_labels(self, predictions, label_map, dataset):
         """
         Post-process the raw prediction values and get the class label for each token.
 
@@ -409,9 +386,9 @@ def get_predicted_token_labels(
 
         num_samples = len(dataset.tensors[0])
         if num_samples != predictions.shape[0]:
-            raise ValueError("Predictions have {0} samples, but got {1} samples in dataset".format(
-                    predictions.shape[0],
-                    num_samples
+            raise ValueError(
+                "Predictions have {0} samples, but got {1} samples in dataset".format(
+                    predictions.shape[0], num_samples
                 )
             )
 
@@ -430,7 +407,7 @@ def get_predicted_token_labels(
             for sid in range(seq_len):
                 if attention_mask[sid] == 0:
                     break
-        
+
                 if not trailing_mask[sid]:
                     continue
 
diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py
index 4c11ef2a4..f33add222 100644
--- a/utils_nlp/models/transformers/question_answering.py
+++ b/utils_nlp/models/transformers/question_answering.py
@@ -96,7 +96,7 @@ def __init__(
         self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."
     ):
         self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
-            model_name, do_lower_case=to_lower, cache_dir=cache_dir
+            model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
         )
         self.custom_tokenize = custom_tokenize
         self.model_name = model_name
@@ -371,7 +371,6 @@ def list_supported_models():
     def fit(
         self,
         train_dataset,
-        device="cuda",
         num_gpus=None,
         per_gpu_batch_size=8,
         num_epochs=1,
@@ -395,9 +394,8 @@ def fit(
         Args:
             train_dataset (QADataset): Training dataset of type
                 :class:`utils_nlp.dataset.pytorch.QADataset`.
-            device (str, optional): Device to use. Accepted values are "cuda" and "cpu".
-                Defaults to "cuda".
             num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
+                be used. If set to 0 or GPUs are not available, CPU device will
                 be used. Defaults to None.
             per_gpu_batch_size (int, optional): Training batch size on each GPU. Defaults to 8.
             num_epochs (int, optional): Number of training epochs. Defaults to 1.
@@ -428,7 +426,7 @@ def fit(
 
         """
 
-        device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank)
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
 
         self.model.to(device)
         super().fine_tune(
@@ -458,7 +456,6 @@ def predict(
         self,
         test_dataset,
         per_gpu_batch_size=16,
-        device="cuda",
         num_gpus=None,
         local_rank=-1,
         verbose=True,
@@ -471,9 +468,8 @@ def predict(
             test_dataset (QADataset): Testing dataset of type
             :class:`utils_nlp.dataset.pytorch.QADataset`.
             per_gpu_batch_size (int, optional): Testing batch size on each GPU. Defaults to 8.
-            device (str, optional): Device to use. Accepted values are "cuda" and "cpu".
-                Defaults to "cuda".
             num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
+                be used. If set to 0 or GPUs are not available, CPU device will
                 be used. Defaults to None.
             local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
                 -1, which means non-distributed.
@@ -485,7 +481,7 @@ def predict(
         def _to_list(tensor):
             return tensor.detach().cpu().tolist()
 
-        device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank)
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
         batch_size = per_gpu_batch_size * max(1, num_gpus)
 
         self.model.to(device)
diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py
index 49576ee3f..3f7305362 100644
--- a/utils_nlp/models/transformers/sequence_classification.py
+++ b/utils_nlp/models/transformers/sequence_classification.py
@@ -42,7 +42,7 @@
 class Processor:
     def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."):
         self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
-            model_name, do_lower_case=to_lower, cache_dir=cache_dir
+            model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
         )
 
     @staticmethod
@@ -106,7 +106,6 @@ def list_supported_models():
     def fit(
         self,
         train_dataset,
-        device="cuda",
         num_epochs=1,
         batch_size=32,
         num_gpus=None,
@@ -118,7 +117,11 @@ def fit(
         verbose=True,
         seed=None,
     ):
-        device, num_gpus = get_device(device=device, num_gpus=num_gpus, local_rank=local_rank)
+        """
+        Fine-tunes a pre-trained sequence classification model.
+        """
+
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
         self.model.to(device)
         super().fine_tune(
             train_dataset=train_dataset,
@@ -135,7 +138,8 @@ def fit(
             seed=seed,
         )
 
-    def predict(self, eval_dataset, device="cuda", batch_size=16, num_gpus=1, verbose=True):
+    def predict(self, eval_dataset, batch_size=16, num_gpus=1, verbose=True):
+        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
         preds = list(
             super().predict(
                 eval_dataset=eval_dataset,
@@ -143,7 +147,7 @@ def predict(self, eval_dataset, device="cuda", batch_size=16, num_gpus=1, verbos
                 device=device,
                 per_gpu_eval_batch_size=batch_size,
                 n_gpu=num_gpus,
-                verbose=True,
+                verbose=verbose,
             )
         )
         preds = np.concatenate(preds)

From b2415e755f90b5cf247a12bd705f1e700ced41ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?=
 <3491412+miguelgfierro@users.noreply.github.com>
Date: Thu, 24 Oct 2019 12:52:21 +0100
Subject: [PATCH 2/8] allow passing optimizer and scheduler into fine_tune

---
 utils_nlp/models/transformers/common.py | 43 ++++++++++++++-----------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index 36024d842..8e83c810c 100644
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -105,6 +105,8 @@ def fine_tune(
         gradient_accumulation_steps=1,
         per_gpu_train_batch_size=8,
         n_gpu=1,
+        optimizer=None,
+        scheduler=None,
         weight_decay=0.0,
         learning_rate=5e-5,
         adam_epsilon=1e-8,
@@ -134,25 +136,28 @@ def fine_tune(
         else:
             t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
 
-        no_decay = ["bias", "LayerNorm.weight"]
-        optimizer_grouped_parameters = [
-            {
-                "params": [
-                    p
-                    for n, p in self.model.named_parameters()
-                    if not any(nd in n for nd in no_decay)
-                ],
-                "weight_decay": weight_decay,
-            },
-            {
-                "params": [
-                    p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
-                ],
-                "weight_decay": 0.0,
-            },
-        ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
-        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
+        if optimizer is None:
+            no_decay = ["bias", "LayerNorm.weight"]
+            optimizer_grouped_parameters = [
+                {
+                    "params": [
+                        p
+                        for n, p in self.model.named_parameters()
+                        if not any(nd in n for nd in no_decay)
+                    ],
+                    "weight_decay": weight_decay,
+                },
+                {
+                    "params": [
+                        p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
+                    ],
+                    "weight_decay": 0.0,
+                },
+            ]
+            optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
+        
+        if scheduler is None:
+            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
 
         if fp16:
             try:

From 6ff9b2dd3c084e505d91cf19524d49437ae0faa0 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 25 Oct 2019 14:24:24 +0100
Subject: [PATCH 3/8] removed repeated get_device

---
 tests/unit/test_common_pytorch_utils.py | 82 +++++++++++++------------
 utils_nlp/common/pytorch_utils.py       | 37 ++++++-----
 utils_nlp/models/transformers/common.py | 20 ++----
 3 files changed, 69 insertions(+), 70 deletions(-)

diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py
index efbbd5e97..416d2c01d 100644
--- a/tests/unit/test_common_pytorch_utils.py
+++ b/tests/unit/test_common_pytorch_utils.py
@@ -8,81 +8,91 @@
 from torch.nn.modules.container import Sequential
 
 from utils_nlp.common.pytorch_utils import get_device, move_to_device
-        
+
 
 @pytest.fixture
 def model():
-    return nn.Sequential(
-        nn.Linear(24, 8), nn.ReLU(), nn.Linear(8, 2), nn.Sigmoid()
-    )
+    return nn.Sequential(nn.Linear(24, 8), nn.ReLU(), nn.Linear(8, 2), nn.Sigmoid())
+
 
-        
 def test_get_device_cpu():
-    device = get_device("cpu")
+    device, gpus = get_device(num_gpus=0)
     assert isinstance(device, torch.device)
     assert device.type == "cpu"
-
-
-def test_get_device_exception():
-    with pytest.raises(ValueError):
-        get_device("abc")
+    assert gpus == 0
 
 
 @pytest.mark.gpu
 def test_machine_is_gpu_machine():
     assert torch.cuda.is_available() is True
-    
-    
+
+
 @pytest.mark.gpu
 def test_get_device_gpu():
-    device = get_device()
+    device, gpus = get_device(num_gpus=1)
     assert isinstance(device, torch.device)
     assert device.type == "cuda"
-    
+    assert gpus == 1
+
+
+@pytest.mark.gpu
+def test_get_device_all_gpus():
+    device, gpus = get_device()
+    assert isinstance(device, torch.device)
+    assert device.type == "cuda"
+    assert gpus == torch.cuda.device_count()
+
+
+@pytest.mark.gpu
+def test_get_device_local_rank():
+    device, gpus = get_device(local_rank=1)
+    assert isinstance(device, torch.device)
+    assert device.type == "cuda"
+    assert device.index == 1
+    assert gpus == 1
+
 
 def test_move_to_device_cpu(model):
     # test when device.type="cpu"
     model_cpu = move_to_device(model, torch.device("cpu"))
     assert isinstance(model_cpu, nn.modules.container.Sequential)
-    
+
 
 def test_move_to_device_cpu_parallelized(model):
     # test when input model is parallelized
     model_parallelized = nn.DataParallel(model)
-    model_parallelized_output = move_to_device(
-        model_parallelized, torch.device("cpu")
-    )
-    assert isinstance(
-        model_parallelized_output, nn.modules.container.Sequential
-    )
-    
-    
+    model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu"))
+    assert isinstance(model_parallelized_output, nn.modules.container.Sequential)
+
+
 def test_move_to_device_exception_not_torch_device(model):
     # test when device is not torch.device
     with pytest.raises(ValueError):
         move_to_device(model, "abc")
-        
-        
+
+
 def test_move_to_device_exception_wrong_type(model):
     # test when device.type is not "cuda" or "cpu"
     with pytest.raises(Exception):
         move_to_device(model, torch.device("opengl"))
 
 
-@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine")
+@pytest.mark.skipif(
+    torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine"
+)
 def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
     # test when the model is moved to a gpu but it is a cpu machine
     with pytest.raises(Exception):
         move_to_device(model, torch.device("cuda"))
-        
-        
-@pytest.mark.gpu        
+
+
+@pytest.mark.gpu
 def test_move_to_device_exception_cuda_zero_gpus(model):
     # test when device.type is cuda, but num_gpus is 0
     with pytest.raises(ValueError):
         move_to_device(model, torch.device("cuda"), num_gpus=0)
-    
-    
+
+
 @pytest.mark.gpu
 def test_move_to_device_gpu(model):
     # test when device.type="cuda"
@@ -94,9 +104,7 @@ def test_move_to_device_gpu(model):
     else:
         assert isinstance(model_cuda, Sequential)
 
-    model_cuda_1_gpu = move_to_device(
-        model, torch.device("cuda"), num_gpus=1
-    )
+    model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
     assert isinstance(model_cuda_1_gpu, Sequential)
 
     model_cuda_1_more_gpu = move_to_device(
@@ -107,9 +115,7 @@ def test_move_to_device_gpu(model):
     else:
         assert isinstance(model_cuda_1_more_gpu, Sequential)
 
-    model_cuda_same_gpu = move_to_device(
-        model, torch.device("cuda"), num_gpus=num_cuda_devices
-    )
+    model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
     if num_cuda_devices > 1:
         assert isinstance(model_cuda_same_gpu, DataParallel)
     else:
diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py
index 07c3b504e..ea09f8768 100644
--- a/utils_nlp/common/pytorch_utils.py
+++ b/utils_nlp/common/pytorch_utils.py
@@ -8,23 +8,28 @@
 import warnings
 
 
-def get_device(device="gpu"):
-    """Gets a PyTorch device.
-
-    Args:
-        device (str, optional): Device string: "cpu" or "gpu". Defaults to "gpu".
-
-    Returns:
-        torch.device: A PyTorch device (cpu or gpu).
-    """
-    if device == "gpu":
-        if torch.cuda.is_available():
-            return torch.device("cuda:0")
-        raise Exception("CUDA device not available")
-    elif device == "cpu":
-        return torch.device("cpu")
+def get_device(
+    num_gpus=None,
+    local_rank=-1,
+    #    backend="nccl",
+    #    rank=0,
+    #    world_size=1,
+    #    init_method="file:///distributed",
+):
+    if local_rank == -1:
+        num_gpus = (
+            min(num_gpus, torch.cuda.device_count())
+            if num_gpus is not None
+            else torch.cuda.device_count()
+        )
+        device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu")
     else:
-        raise ValueError("Only 'cpu' and 'gpu' devices are supported.")
+        torch.cuda.set_device(local_rank)
+        device = torch.device("cuda", local_rank)
+        # torch.distributed.init_process_group(backend="nccl")
+        # torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size, init_method=init_method)
+        num_gpus = 1
+    return device, num_gpus
 
 
 def move_to_device(model, device, num_gpus=None):
diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index 8e83c810c..1c82ae9e0 100644
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -34,20 +34,6 @@
 logger = logging.getLogger(__name__)
 
 
-def get_device(num_gpus=None, local_rank=-1):
-    if local_rank == -1:
-        device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu")
-        num_gpus = (
-            min(num_gpus, torch.cuda.device_count()) if num_gpus else torch.cuda.device_count()
-        )
-    else:
-        torch.cuda.set_device(local_rank)
-        device = torch.device("cuda", local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        num_gpus = 1
-    return device, num_gpus
-
-
 class Transformer:
     def __init__(
         self,
@@ -149,13 +135,15 @@ def fine_tune(
                 },
                 {
                     "params": [
-                        p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
+                        p
+                        for n, p in self.model.named_parameters()
+                        if any(nd in n for nd in no_decay)
                     ],
                     "weight_decay": 0.0,
                 },
             ]
             optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
-        
+
         if scheduler is None:
             scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
 

From d5c10c1e679668ed03429cbfaf44b76754269549 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 25 Oct 2019 14:37:05 +0100
Subject: [PATCH 4/8] refactored get_device

---
 .../models/bert/sequence_classification.py    | 75 +++++--------------
 .../sequence_classification_distributed.py    | 11 ++-
 utils_nlp/models/bert/sequence_encoding.py    |  6 +-
 utils_nlp/models/bert/token_classification.py | 69 +++++------------
 .../models/xlnet/sequence_classification.py   |  4 +-
 5 files changed, 46 insertions(+), 119 deletions(-)

diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py
index ced02acc6..03a324604 100644
--- a/utils_nlp/models/bert/sequence_classification.py
+++ b/utils_nlp/models/bert/sequence_classification.py
@@ -7,12 +7,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from torch.utils.data import (
-    DataLoader,
-    RandomSampler,
-    SequentialSampler,
-    TensorDataset,
-)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
 from tqdm import tqdm
@@ -22,6 +17,7 @@
 
 from cached_property import cached_property
 
+
 class BERTSequenceClassifier:
     """BERT-based sequence classifier"""
 
@@ -55,7 +51,7 @@ def cuda(self):
 
         self.has_cuda = torch.cuda.is_available()
         return self.has_cuda
-        
+
     def fit(
         self,
         token_ids,
@@ -93,9 +89,8 @@ def fit(
                 loss values. Defaults to True.
         """
 
-        device = get_device(
-            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
-        )
+        device, num_gpus = get_device(num_gpus)
+
         self.model = move_to_device(self.model, device, num_gpus)
 
         token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
@@ -103,42 +98,25 @@ def fit(
         labels_tensor = torch.tensor(labels, dtype=torch.long)
 
         if token_type_ids:
-            token_type_ids_tensor = torch.tensor(
-                token_type_ids, dtype=torch.long
-            )
+            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
             train_dataset = TensorDataset(
-                token_ids_tensor,
-                input_mask_tensor,
-                token_type_ids_tensor,
-                labels_tensor,
+                token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
             )
         else:
-            train_dataset = TensorDataset(
-                token_ids_tensor, input_mask_tensor, labels_tensor
-            )
+            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)
         train_sampler = RandomSampler(train_dataset)
 
-        train_dataloader = DataLoader(
-            train_dataset, sampler=train_sampler, batch_size=batch_size
-        )
+        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
         # define optimizer and model parameters
         param_optimizer = list(self.model.named_parameters())
         no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
-                "params": [
-                    p
-                    for n, p in param_optimizer
-                    if not any(nd in n for nd in no_decay)
-                ],
+                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 "weight_decay": 0.01,
             },
             {
-                "params": [
-                    p
-                    for n, p in param_optimizer
-                    if any(nd in n for nd in no_decay)
-                ],
+                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                 "weight_decay": 0.0,
             },
         ]
@@ -164,18 +142,14 @@ def fit(
 
         for epoch in range(num_epochs):
             training_loss = 0
-            for i, batch in enumerate(
-                tqdm(train_dataloader, desc="Iteration")
-            ):
+            for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 if token_type_ids:
                     x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                         t.to(device) for t in batch
                     )
                 else:
                     token_type_ids_batch = None
-                    x_batch, mask_batch, y_batch = tuple(
-                        t.to(device) for t in batch
-                    )
+                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)
 
                 opt.zero_grad()
 
@@ -236,9 +210,7 @@ def predict(
             1darray, namedtuple(1darray, ndarray): Predicted classes or
                 (classes, probabilities) if probabilities is True.
         """
-        device = get_device(
-            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
-        )
+        device, num_gpus = get_device(num_gpus)
         self.model = move_to_device(self.model, device, num_gpus)
 
         # score
@@ -248,26 +220,18 @@ def predict(
         input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
 
         if token_type_ids:
-            token_type_ids_tensor = torch.tensor(
-                token_type_ids, dtype=torch.long
-            )
-            test_dataset = TensorDataset(
-                token_ids_tensor, input_mask_tensor, token_type_ids_tensor
-            )
+            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
+            test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor)
         else:
             test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)
 
         test_sampler = SequentialSampler(test_dataset)
-        test_dataloader = DataLoader(
-            test_dataset, sampler=test_sampler, batch_size=batch_size
-        )
+        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)
 
         preds = []
         for i, batch in enumerate(tqdm(test_dataloader, desc="Iteration")):
             if token_type_ids:
-                x_batch, mask_batch, token_type_ids_batch = tuple(
-                    t.to(device) for t in batch
-                )
+                x_batch, mask_batch, token_type_ids_batch = tuple(t.to(device) for t in batch)
             else:
                 token_type_ids_batch = None
                 x_batch, mask_batch = tuple(t.to(device) for t in batch)
@@ -285,8 +249,7 @@ def predict(
 
         if probabilities:
             return namedtuple("Predictions", "classes probabilities")(
-                preds.argmax(axis=1),
-                nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
+                preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()
             )
         else:
             return preds.argmax(axis=1)
diff --git a/utils_nlp/models/bert/sequence_classification_distributed.py b/utils_nlp/models/bert/sequence_classification_distributed.py
index 5c3faa07a..ee5061158 100644
--- a/utils_nlp/models/bert/sequence_classification_distributed.py
+++ b/utils_nlp/models/bert/sequence_classification_distributed.py
@@ -167,7 +167,7 @@ def fit(
         epoch,
         bert_optimizer=None,
         num_epochs=1,
-        num_gpus=0,
+        num_gpus=None,
         lr=2e-5,
         warmup_proportion=None,
         fp16_allreduce=False,
@@ -181,7 +181,7 @@ def fit(
             epoch(int): Current epoch number of training.
             bert_optimizer(optimizer): optimizer can be BERTAdam for local and Dsitributed if Horovod
             num_epochs(int): the number of epochs to run
-            num_gpus(int): the number of gpus
+            num_gpus(int): the number of gpus. If None is specified, all available GPUs will be used.
             lr (float): learning rate of the adam optimizer. defaults to 2e-5.
             warmup_proportion (float, optional): proportion of training to
                 perform linear learning rate warmup for. e.g., 0.1 = 10% of
@@ -190,10 +190,9 @@ def fit(
             num_train_optimization_steps: number of steps the optimizer should take.
         """
 
-        device = get_device("cpu" if num_gpus == 0 else "gpu")
+        device, num_gpus = get_device(num_gpus)
 
-        if device:
-            self.model.cuda()
+        self.model = move_to_device(self.model, device, num_gpus)
 
         if bert_optimizer is None:
             bert_optimizer = self.create_optimizer(
@@ -277,7 +276,7 @@ def predict(self, test_loader, num_gpus=None, probabilities=False):
             1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
                 a dictionary with classes, target labels, probabilities) if probabilities is True.
         """
-        device = get_device("cpu" if num_gpus == 0 else "gpu")
+        device, num_gpus = get_device(num_gpus)
         self.model = move_to_device(self.model, device, num_gpus)
 
         # score
diff --git a/utils_nlp/models/bert/sequence_encoding.py b/utils_nlp/models/bert/sequence_encoding.py
index 7a747c963..088a6310d 100644
--- a/utils_nlp/models/bert/sequence_encoding.py
+++ b/utils_nlp/models/bert/sequence_encoding.py
@@ -18,6 +18,7 @@
 from utils_nlp.models.bert.common import Language, Tokenizer
 from cached_property import cached_property
 
+
 class PoolingStrategy(str, Enum):
     """Enumerate pooling strategies"""
 
@@ -79,12 +80,11 @@ def layer_index(self, layer_index):
             self._layer_index = [layer_index]
         else:
             self.layer_index = layer_index
-        
 
     @cached_property
     def cuda(self):
         """ cache the output of torch.cuda.is_available() """
-        
+
         self.has_cuda = torch.cuda.is_available()
         return self.has_cuda
 
@@ -106,7 +106,7 @@ def get_hidden_states(self, text, batch_size=32):
         Returns:
             pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]). 
         """
-        device = get_device("cpu" if self.num_gpus == 0 or self.cuda else "gpu")
+        device, num_gpus = get_device(self.num_gpus)
         self.model = move_to_device(self.model, device, self.num_gpus)
 
         self.model.eval()
diff --git a/utils_nlp/models/bert/token_classification.py b/utils_nlp/models/bert/token_classification.py
index bce7de8b1..ce98357bc 100644
--- a/utils_nlp/models/bert/token_classification.py
+++ b/utils_nlp/models/bert/token_classification.py
@@ -20,6 +20,7 @@
 
 from cached_property import cached_property
 
+
 class BERTTokenClassifier:
     """BERT-based token classifier."""
 
@@ -64,9 +65,7 @@ def cuda(self):
         self.has_cuda = torch.cuda.is_available()
         return self.has_cuda
 
-    def _get_optimizer(
-        self, learning_rate, num_train_optimization_steps, warmup_proportion
-    ):
+    def _get_optimizer(self, learning_rate, num_train_optimization_steps, warmup_proportion):
         """
         Initializes the optimizer and configure parameters to apply weight
         decay on.
@@ -77,26 +76,18 @@ def _get_optimizer(
         optimizer_grouped_parameters = [
             {
                 "params": [
-                    p
-                    for n, p in param_optimizer
-                    if not any(nd in n for nd in no_decay_params)
+                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay_params)
                 ],
                 "weight_decay": params_weight_decay,
             },
             {
-                "params": [
-                    p
-                    for n, p in param_optimizer
-                    if any(nd in n for nd in no_decay_params)
-                ],
+                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay_params)],
                 "weight_decay": 0.0,
             },
         ]
 
         if warmup_proportion is None:
-            optimizer = BertAdam(
-                optimizer_grouped_parameters, lr=learning_rate
-            )
+            optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate)
         else:
             optimizer = BertAdam(
                 optimizer_grouped_parameters,
@@ -151,9 +142,8 @@ def fit(
             batch_size=batch_size,
         )
 
-        device = get_device(
-            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
-        )
+        device, num_gpus = get_device(num_gpus)
+
         self.model = move_to_device(self.model, device, num_gpus)
 
         if num_gpus is None:
@@ -161,9 +151,7 @@ def fit(
         else:
             num_gpus_used = min(num_gpus, torch.cuda.device_count())
 
-        num_train_optimization_steps = max(
-            (int(len(token_ids) / batch_size) * num_epochs), 1
-        )
+        num_train_optimization_steps = max((int(len(token_ids) / batch_size) * num_epochs), 1)
         optimizer = self._get_optimizer(
             learning_rate=learning_rate,
             num_train_optimization_steps=num_train_optimization_steps,
@@ -174,16 +162,12 @@ def fit(
         for _ in trange(int(num_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_steps = 0
-            for step, batch in enumerate(
-                tqdm(train_dataloader, desc="Iteration", mininterval=30)
-            ):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=30)):
                 batch = tuple(t.to(device) for t in batch)
                 b_token_ids, b_input_mask, b_label_ids = batch
 
                 loss = self.model(
-                    input_ids=b_token_ids,
-                    attention_mask=b_input_mask,
-                    labels=b_label_ids,
+                    input_ids=b_token_ids, attention_mask=b_input_mask, labels=b_label_ids
                 )
 
                 if num_gpus_used > 1:
@@ -206,13 +190,7 @@ def fit(
             torch.cuda.empty_cache()
 
     def predict(
-        self,
-        token_ids,
-        input_mask,
-        labels=None,
-        batch_size=32,
-        num_gpus=None,
-        probabilities=False,
+        self, token_ids, input_mask, labels=None, batch_size=32, num_gpus=None, probabilities=False
     ):
         """
         Predict token labels on the testing data.
@@ -248,18 +226,14 @@ def predict(
             batch_size=batch_size,
             sample_method="sequential",
         )
-        device = get_device(
-            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
-        )
+        device, num_gpus = get_device(num_gpus)
 
         self.model = move_to_device(self.model, device, num_gpus)
 
         self.model.eval()
         eval_loss = 0
         nb_eval_steps = 0
-        for step, batch in enumerate(
-            tqdm(test_dataloader, desc="Iteration", mininterval=10)
-        ):
+        for step, batch in enumerate(tqdm(test_dataloader, desc="Iteration", mininterval=10)):
             batch = tuple(t.to(device) for t in batch)
             true_label_available = False
             if labels:
@@ -272,9 +246,7 @@ def predict(
                 logits = self.model(b_input_ids, attention_mask=b_input_mask)
                 if true_label_available:
                     active_loss = b_input_mask.view(-1) == 1
-                    active_logits = logits.view(-1, self.num_labels)[
-                        active_loss
-                    ]
+                    active_logits = logits.view(-1, self.num_labels)[active_loss]
                     active_labels = b_labels.view(-1)[active_loss]
                     loss_fct = nn.CrossEntropyLoss()
                     tmp_eval_loss = loss_fct(active_logits, active_labels)
@@ -298,8 +270,7 @@ def predict(
 
         if probabilities:
             return namedtuple("Predictions", "classes probabilities")(
-                predictions,
-                np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2),
+                predictions, np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2)
             )
         else:
             return predictions
@@ -315,11 +286,7 @@ def create_label_map(label_list, trailing_piece_tag="X"):
 
 
 def postprocess_token_labels(
-    labels,
-    input_mask,
-    label_map=None,
-    remove_trailing_word_pieces=False,
-    trailing_token_mask=None,
+    labels, input_mask, label_map=None, remove_trailing_word_pieces=False, trailing_token_mask=None
 ):
     """
     Postprocesses token classification output:
@@ -372,9 +339,7 @@ def postprocess_token_labels(
 
         labels_no_trailing_pieces = [
             [label for label, mask in zip(label_list, mask_list) if mask]
-            for label_list, mask_list in zip(
-                labels_org_no_padding, token_mask_no_padding
-            )
+            for label_list, mask_list in zip(labels_org_no_padding, token_mask_no_padding)
         ]
         return labels_no_trailing_pieces
     else:
diff --git a/utils_nlp/models/xlnet/sequence_classification.py b/utils_nlp/models/xlnet/sequence_classification.py
index 055fad50f..90c514747 100644
--- a/utils_nlp/models/xlnet/sequence_classification.py
+++ b/utils_nlp/models/xlnet/sequence_classification.py
@@ -113,7 +113,7 @@ def fit(
                 loss values. Defaults to True.
         """
 
-        device = get_device("cpu" if self.num_gpus == 0 or not torch.cuda.is_available() else "gpu")
+        device, num_gpus = get_device(self.num_gpus)
         self.model = move_to_device(self.model, device, self.num_gpus)
 
         token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
@@ -329,7 +329,7 @@ def predict(
                 (classes, probabilities) if probabilities is True.
         """
 
-        device = get_device("cpu" if num_gpus == 0 or not torch.cuda.is_available() else "gpu")
+        device, num_gpus = get_device(num_gpus)
         self.model = move_to_device(self.model, device, num_gpus)
 
         self.model.eval()

From fc800398353b97ff5021c1af47e22e358a5ca9bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?=
 <3491412+miguelgfierro@users.noreply.github.com>
Date: Fri, 8 Nov 2019 20:36:12 +0000
Subject: [PATCH 5/8] trigger tests

---
 tests/unit/test_common_pytorch_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py
index 416d2c01d..e2fce1e10 100644
--- a/tests/unit/test_common_pytorch_utils.py
+++ b/tests/unit/test_common_pytorch_utils.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 
+
 import pytest
 import torch
 import torch.nn as nn

From e50811f75cd58b907e0edac9c4967d3fae987ad8 Mon Sep 17 00:00:00 2001
From: miguelgfierro <miguelgfierro@users.noreply.github.com>
Date: Fri, 8 Nov 2019 22:07:52 +0000
Subject: [PATCH 6/8] :bug: in imports

---
 .../models/transformers/named_entity_recognition.py    |  8 ++------
 utils_nlp/models/transformers/question_answering.py    |  5 +----
 .../models/transformers/sequence_classification.py     | 10 ++++------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py
index a5d140709..d3d82e3b6 100644
--- a/utils_nlp/models/transformers/named_entity_recognition.py
+++ b/utils_nlp/models/transformers/named_entity_recognition.py
@@ -8,12 +8,8 @@
 import torch
 from torch.utils.data import TensorDataset
 from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification
-from utils_nlp.models.transformers.common import (
-    MAX_SEQ_LEN,
-    TOKENIZER_CLASS,
-    Transformer,
-    get_device,
-)
+from utils_nlp.common.pytorch_utils import get_device
+from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
 
 
 TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py
index 2a24b0818..41ace6b42 100644
--- a/utils_nlp/models/transformers/question_answering.py
+++ b/utils_nlp/models/transformers/question_answering.py
@@ -29,24 +29,21 @@
 from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
 
 from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
-
 from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
-
 from transformers.modeling_xlnet import (
     XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
     XLNetForQuestionAnswering,
 )
-
 from transformers.modeling_distilbert import (
     DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     DistilBertForQuestionAnswering,
 )
 
+from utils_nlp.common.pytorch_utils import get_device
 from utils_nlp.models.transformers.common import (
     MAX_SEQ_LEN,
     TOKENIZER_CLASS,
     Transformer,
-    get_device,
 )
 
 MODEL_CLASS = {}
diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py
index 3f7305362..76f530203 100644
--- a/utils_nlp/models/transformers/sequence_classification.py
+++ b/utils_nlp/models/transformers/sequence_classification.py
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 from torch.utils.data import TensorDataset
+
 from transformers.modeling_bert import (
     BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     BertForSequenceClassification,
@@ -21,12 +22,9 @@
     XLNetForSequenceClassification,
 )
 
-from utils_nlp.models.transformers.common import (
-    MAX_SEQ_LEN,
-    TOKENIZER_CLASS,
-    Transformer,
-    get_device,
-)
+from utils_nlp.common.pytorch_utils import get_device
+from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
+
 
 MODEL_CLASS = {}
 MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})

From 3cbe8465d7462496e6a1ec3450a2ebd69c675131 Mon Sep 17 00:00:00 2001
From: saidbleik <saidbleik@outlook.com>
Date: Wed, 13 Nov 2019 05:40:54 +0000
Subject: [PATCH 7/8] updates to finetuning

---
 utils_nlp/models/transformers/common.py       |  39 +---
 utils_nlp/models/transformers/datasets.py     | 215 ++++++++++++++++++
 .../transformers/sequence_classification.py   |  93 ++++----
 3 files changed, 272 insertions(+), 75 deletions(-)
 create mode 100644 utils_nlp/models/transformers/datasets.py

diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index 1c82ae9e0..26d387b38 100644
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -10,8 +10,6 @@
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 from transformers import AdamW, WarmupLinearSchedule
 from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -47,8 +45,8 @@ def __init__(
         if model_name not in self.list_supported_models():
             raise ValueError(
                 "Model name {0} is not supported by {1}. "
-                "Call '{2}.list_supported_models()' to get all supported model "
-                "names.".format(value, self.__class__.__name__, self.__class__.__name__)
+                "Call '{1}.list_supported_models()' to get all supported model "
+                "names.".format(value, self.__class__.__name__)
             )
         self._model_name = model_name
         self._model_type = model_name.split("-")[0]
@@ -82,14 +80,13 @@ def set_seed(seed, cuda=True):
 
     def fine_tune(
         self,
-        train_dataset,
+        train_dataloader,
         get_inputs,
         device,
         max_steps=-1,
         num_train_epochs=1,
         max_grad_norm=1.0,
         gradient_accumulation_steps=1,
-        per_gpu_train_batch_size=8,
         n_gpu=1,
         optimizer=None,
         scheduler=None,
@@ -106,14 +103,6 @@ def fine_tune(
         if seed is not None:
             Transformer.set_seed(seed, n_gpu > 0)
 
-        train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
-        train_sampler = (
-            RandomSampler(train_dataset) if local_rank == -1 else DistributedSampler(train_dataset)
-        )
-        train_dataloader = DataLoader(
-            train_dataset, sampler=train_sampler, batch_size=train_batch_size
-        )
-
         if max_steps > 0:
             t_total = max_steps
             num_train_epochs = (
@@ -191,7 +180,7 @@ def fine_tune(
                     loss = loss / gradient_accumulation_steps
 
                 if step % 10 == 0 and verbose:
-                    tqdm.write("Loss:{:.6f}".format(loss / train_batch_size))
+                    tqdm.write("Loss:{:.6f}".format(loss))
 
                 if fp16:
                     with amp.scale_loss(loss, optimizer) as scaled_loss:
@@ -220,24 +209,7 @@ def fine_tune(
             torch.cuda.empty_cache()
         return global_step, tr_loss / global_step
 
-    def predict(
-        self,
-        eval_dataset,
-        get_inputs,
-        device,
-        per_gpu_eval_batch_size=16,
-        n_gpu=1,
-        local_rank=-1,
-        verbose=True,
-    ):
-        eval_batch_size = per_gpu_eval_batch_size * max(1, n_gpu)
-        eval_sampler = (
-            SequentialSampler(eval_dataset)
-            if local_rank == -1
-            else DistributedSampler(eval_dataset)
-        )
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)
-
+    def predict(self, eval_dataloader, get_inputs, device, verbose=True):
         for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose):
             self.model.eval()
             batch = tuple(t.to(device) for t in batch)
@@ -245,7 +217,6 @@ def predict(
                 inputs = get_inputs(batch, self.model_name, train_mode=False)
                 outputs = self.model(**inputs)
                 logits = outputs[0]
-
             yield logits.detach().cpu().numpy()
 
     def save_model(self):
diff --git a/utils_nlp/models/transformers/datasets.py b/utils_nlp/models/transformers/datasets.py
new file mode 100644
index 000000000..a2170c88d
--- /dev/null
+++ b/utils_nlp/models/transformers/datasets.py
@@ -0,0 +1,215 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import collections
+import torch
+from torch.utils.data import Dataset
+
+
+class SCDataSet(Dataset):
+    """Dataset for single sequence classification tasks.
+
+    Each item is a tuple of long tensors (input_ids, attention_mask), followed by
+    a label tensor when label_col is not None. Columns may be given by position
+    (int) or by name (str); transform is called as transform(text, max_len=max_len)
+    and must return the pair (input_ids, attention_mask).
+    """
+
+    def __init__(self, df, text_col, label_col, max_len, transform):
+        # Columns are stored as positional indices so rows can be read with iloc.
+        column_names = list(df.columns)
+        self.df = df
+        self.transform = transform
+        self.max_len = max_len
+
+        if isinstance(text_col, int):
+            self.text_col = text_col
+        elif isinstance(text_col, str):
+            self.text_col = column_names.index(text_col)
+        else:
+            raise TypeError("text_col must be of type int or str")
+
+        if label_col is None or isinstance(label_col, int):
+            self.label_col = label_col
+        elif isinstance(label_col, str):
+            self.label_col = column_names.index(label_col)
+        else:
+            raise TypeError("label_col must be of type int or str")
+
+    def __getitem__(self, idx):
+        """Transform the text at row idx and return it as a tuple of tensors."""
+        text = self.df.iloc[idx, self.text_col]
+        input_ids, attention_mask = self.transform(text, max_len=self.max_len)
+        item = [
+            torch.tensor(input_ids, dtype=torch.long),
+            torch.tensor(attention_mask, dtype=torch.long),
+        ]
+        if self.label_col is not None:
+            # Labelled data: append the label as a third tensor.
+            label = self.df.iloc[idx, self.label_col]
+            item.append(torch.tensor(label, dtype=torch.long))
+        return tuple(item)
+
+    def __len__(self):
+        """Return the number of rows in the data frame."""
+        return self.df.shape[0]
+
+
+class SPCDataSet(Dataset):
+    """Dataset for sequence pair classification tasks.
+
+    Both texts of a row are passed separately through transform (called as
+    transform(text, max_len=max_len), returning (input_ids, attention_mask)) and the
+    results are concatenated. Each item is a tuple of long tensors
+    (input_ids, attention_mask, token_type_ids), followed by a label tensor when
+    label_col is not None. token_type_ids is 0 for the first text and 1 for the second.
+    """
+
+    def __init__(self, df, text1_col, text2_col, label_col, max_len, transform):
+        self.df = df
+        cols = list(df.columns)
+        self.transform = transform
+        self.max_len = max_len
+
+        if isinstance(text1_col, int):
+            self.text1_col = text1_col
+        elif isinstance(text1_col, str):
+            self.text1_col = cols.index(text1_col)
+        else:
+            raise TypeError("text1_col must be of type int or str")
+
+        if isinstance(text2_col, int):
+            self.text2_col = text2_col
+        elif isinstance(text2_col, str):
+            self.text2_col = cols.index(text2_col)
+        else:
+            raise TypeError("text2_col must be of type int or str")
+
+        if label_col is None:
+            self.label_col = None
+        elif isinstance(label_col, int):
+            self.label_col = label_col
+        elif isinstance(label_col, str):
+            self.label_col = cols.index(label_col)
+        else:
+            raise TypeError("label_col must be of type int or str")
+
+    def __getitem__(self, idx):
+        """Return the tensors for row idx; labels are included only if available."""
+        input1_ids, attention1_mask = self.transform(
+            self.df.iloc[idx, self.text1_col], max_len=self.max_len
+        )
+        # Must go through self.transform: no module-level `transform` exists.
+        input2_ids, attention2_mask = self.transform(
+            self.df.iloc[idx, self.text2_col], max_len=self.max_len
+        )
+        item = [
+            torch.tensor(input1_ids + input2_ids, dtype=torch.long),
+            torch.tensor(attention1_mask + attention2_mask, dtype=torch.long),
+            torch.tensor([0] * len(input1_ids) + [1] * len(input2_ids), dtype=torch.long),
+        ]
+        if self.label_col is not None:
+            labels = self.df.iloc[idx, self.label_col]
+            item.append(torch.tensor(labels, dtype=torch.long))
+        return tuple(item)
+
+    def __len__(self):
+        """Return the number of rows in the data frame."""
+        return self.df.shape[0]
+
+
+# QAInput is a data structure representing a unique document-question-answer triplet.
+# Args:
+#    doc_text (str): Input document text.
+#    question_text (str): Input question text.
+#    qa_id (int or str): A unique id identifying a document-question-answer sample.
+#    is_impossible (bool): If the question is impossible to answer based on the input document.
+#    answer_start (int or list): Index of the answer start word in doc_text. For testing data,
+#        this can be a list of integers for multiple ground truth answers.
+#    answer_text (str or list): Text of the answer. For testing data, this can be a list of strings
+#        for multiple ground truth answers.
+QAInput = collections.namedtuple(
+    "QAInput",
+    ["doc_text", "question_text", "qa_id", "is_impossible", "answer_start", "answer_text"],
+)
+
+
+class QADataset(Dataset):
+    """Question answering dataset backed by a pandas data frame.
+
+    Indexing yields one QAInput per row. The input frame is copied, so the columns
+    added for missing ids or is_impossible flags are written to the copy only.
+    """
+
+    def __init__(
+        self,
+        df,
+        doc_text_col,
+        question_text_col,
+        qa_id_col=None,
+        answer_start_col=None,
+        answer_text_col=None,
+        is_impossible_col=None,
+    ):
+        """
+        A standard dataset structure for question answering that can be processed by
+        :meth:`utils_nlp.models.transformers.question_answering.QAProcessor.preprocess`
+
+        Args:
+            df (pandas.DataFrame): Input data frame.
+            doc_text_col (str): Name of the column containing the document texts.
+            question_text_col (str): Name of the column containing the question texts.
+            qa_id_col (str, optional): Name of the column containing the unique ids identifying
+                document-question-answer samples. If not provided, a "qa_id" column is
+                automatically created. Defaults to None.
+            answer_start_col (str, optional): Name of the column containing answer start indices.
+                For testing data, each value in the column can be a list of integers for multiple
+                ground truth answers. Defaults to None.
+            answer_text_col (str, optional): Name of the column containing answer texts. For
+                testing data, each value in the column can be a list of strings for multiple
+                ground truth answers. Defaults to None.
+            is_impossible_col (str, optional): Name of the column containing boolean values
+                indicating if the question is impossible to answer. If not provided,
+                an "is_impossible" column is automatically created and populated with False.
+                Defaults to None.
+        """
+        self.df = df.copy()
+        self.doc_text_col = doc_text_col
+        self.question_text_col = question_text_col
+        self.answer_start_col = answer_start_col
+        self.answer_text_col = answer_text_col
+        # Ground-truth answers are usable only when both answer columns are named.
+        self.actual_answer_available = not (answer_start_col is None or answer_text_col is None)
+
+        if qa_id_col is None:
+            # No id column supplied: number the samples 0..n-1 in a "qa_id" column.
+            qa_id_col = "qa_id"
+            self.df[qa_id_col] = list(range(self.df.shape[0]))
+        self.qa_id_col = qa_id_col
+
+        if is_impossible_col is None:
+            # No flag column supplied: mark every question as answerable.
+            is_impossible_col = "is_impossible"
+            self.df[is_impossible_col] = False
+        self.is_impossible_col = is_impossible_col
+
+    def __getitem__(self, idx):
+        """Return row idx as a QAInput.
+
+        When answer columns were not provided, answer_start is -1 and
+        answer_text is an empty string.
+        """
+        row = self.df.iloc[idx,]
+        has_answer = self.actual_answer_available
+        return QAInput(
+            doc_text=row[self.doc_text_col],
+            question_text=row[self.question_text_col],
+            qa_id=row[self.qa_id_col],
+            is_impossible=row[self.is_impossible_col],
+            answer_start=row[self.answer_start_col] if has_answer else -1,
+            answer_text=row[self.answer_text_col] if has_answer else "",
+        )
+
+    def __len__(self):
+        """Return the number of rows in the data frame."""
+        return self.df.shape[0]
diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py
index 3f7305362..cc26f0c14 100644
--- a/utils_nlp/models/transformers/sequence_classification.py
+++ b/utils_nlp/models/transformers/sequence_classification.py
@@ -2,8 +2,9 @@
 # Licensed under the MIT License.
 
 import numpy as np
-import torch
-from torch.utils.data import TensorDataset
+
+from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
+from torch.utils.data.distributed import DistributedSampler
 from transformers.modeling_bert import (
     BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     BertForSequenceClassification,
@@ -20,13 +21,9 @@
     XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
     XLNetForSequenceClassification,
 )
-
-from utils_nlp.models.transformers.common import (
-    MAX_SEQ_LEN,
-    TOKENIZER_CLASS,
-    Transformer,
-    get_device,
-)
+from utils_nlp.common.pytorch_utils import get_device
+from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet
+from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
 
 MODEL_CLASS = {}
 MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@@ -55,39 +52,57 @@ def get_inputs(batch, model_name, train_mode=True):
         else:
             raise ValueError("Model not supported: {}".format(model_name))
 
-    def preprocess(self, text, labels=None, max_len=MAX_SEQ_LEN):
-        """preprocess data or batches"""
+    def text_transform(self, text, max_len=MAX_SEQ_LEN):
+        """preprocess text"""
         if max_len > MAX_SEQ_LEN:
             print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
             max_len = MAX_SEQ_LEN
-
-        tokens = [self.tokenizer.tokenize(x) for x in text]
-
         # truncate and add CLS & SEP markers
-        tokens = [
-            [self.tokenizer.cls_token] + x[0 : max_len - 2] + [self.tokenizer.sep_token]
-            for x in tokens
-        ]
+        tokens = (
+            [self.tokenizer.cls_token]
+            + self.tokenizer.tokenize(text)[0 : max_len - 2]
+            + [self.tokenizer.sep_token]
+        )
         # get input ids
-        input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]
+        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
         # pad sequence
-        input_ids = [x + [0] * (max_len - len(x)) for x in input_ids]
+        input_ids = input_ids + [0] * (max_len - len(input_ids))
         # create input mask
-        input_mask = [[min(1, x) for x in y] for y in input_ids]
-        # create segment ids
-        # segment_ids = None
-        if labels is None:
-            td = TensorDataset(
-                torch.tensor(input_ids, dtype=torch.long),
-                torch.tensor(input_mask, dtype=torch.long),
-            )
+        attention_mask = [min(1, x) for x in input_ids]
+        return input_ids, attention_mask
+
+    def create_dataloader_from_df(
+        self,
+        df,
+        text_col,
+        label_col,
+        max_len=MAX_SEQ_LEN,
+        text2_col=None,
+        batch_size=32,
+        num_gpus=None,
+        shuffle=True,
+        distributed=False,
+    ):
+        """Build a DataLoader over df for single- or paired-text classification.
+
+        A non-None text2_col selects the paired-text dataset. When num_gpus is not
+        None the batch size is multiplied by max(1, num_gpus). distributed=True uses
+        a DistributedSampler; otherwise shuffle picks a random or sequential sampler.
+        """
+        if text2_col is None:
+            ds = SCDataSet(df, text_col, label_col, max_len=max_len, transform=self.text_transform)
         else:
-            td = TensorDataset(
-                torch.tensor(input_ids, dtype=torch.long),
-                torch.tensor(input_mask, dtype=torch.long),
-                torch.tensor(labels, dtype=torch.long),
+            ds = SPCDataSet(
+                df, text_col, text2_col, label_col, max_len=max_len, transform=self.text_transform
             )
-        return td
+        if num_gpus is not None:
+            batch_size = batch_size * max(1, num_gpus)
+        if distributed:
+            sampler = DistributedSampler(ds)
+        else:
+            sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
+
+        return DataLoader(ds, sampler=sampler, batch_size=batch_size)
 
 
 class SequenceClassifier(Transformer):
@@ -105,9 +120,8 @@ def list_supported_models():
 
     def fit(
         self,
-        train_dataset,
+        train_dataloader,
         num_epochs=1,
-        batch_size=32,
         num_gpus=None,
         local_rank=-1,
         weight_decay=0.0,
@@ -124,10 +138,9 @@ def fit(
         device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
         self.model.to(device)
         super().fine_tune(
-            train_dataset=train_dataset,
+            train_dataloader=train_dataloader,
             get_inputs=Processor.get_inputs,
             device=device,
-            per_gpu_train_batch_size=batch_size,
             n_gpu=num_gpus,
             num_train_epochs=num_epochs,
             weight_decay=weight_decay,
@@ -138,15 +151,13 @@ def fit(
             seed=seed,
         )
 
-    def predict(self, eval_dataset, batch_size=16, num_gpus=1, verbose=True):
+    def predict(self, eval_dataloader, num_gpus=1, verbose=True):
         device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
         preds = list(
             super().predict(
-                eval_dataset=eval_dataset,
+                eval_dataloader=eval_dataloader,
                 get_inputs=Processor.get_inputs,
                 device=device,
-                per_gpu_eval_batch_size=batch_size,
-                n_gpu=num_gpus,
                 verbose=verbose,
             )
         )

From 304551544e0d63c302d7846cf26e9255922e62b5 Mon Sep 17 00:00:00 2001
From: saidbleik <saidbleik@outlook.com>
Date: Wed, 13 Nov 2019 06:02:06 +0000
Subject: [PATCH 8/8] update text classification notebook

---
 examples/text_classification/README.md        |  3 +-
 .../tc_mnli_transformers.ipynb                | 98 ++++++++++++-------
 2 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/examples/text_classification/README.md b/examples/text_classification/README.md
index 2283c73b3..f1b09cc29 100644
--- a/examples/text_classification/README.md
+++ b/examples/text_classification/README.md
@@ -18,10 +18,9 @@ The following summarizes each notebook for Text Classification. Each notebook pr
 
 |Notebook|Environment|Description|Dataset|
 |---|---|---|---|
-|[BERT for text classification with MNLI](tc_mnli_bert.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on a subset of the MultiNLI dataset|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
 |[BERT for text classification on AzureML](tc_bert_azureml.ipynb) |Azure ML|A notebook which walks through fine-tuning and evaluating pre-trained BERT model on a distributed setup with AzureML. |[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
 |[XLNet for text classification with MNLI](tc_mnli_xlnet.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained XLNet model on a subset of the MultiNLI dataset|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
 |[BERT for text classification of Hindi BBC News](tc_bbc_bert_hi.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on Hindi BBC news data|[BBC Hindi News](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1)|
 |[BERT for text classification of Arabic News](tc_dac_bert_ar.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a pre-trained BERT model on Arabic news articles|[DAC](https://data.mendeley.com/datasets/v524p5dhpj/2)|
-|[Text Classification of MultiNLI Sentences using Different Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
+|[Text Classification of MultiNLI Sentences using Multiple Transformer Models](tc_mnli_transformers.ipynb)|Local| A notebook which walks through fine-tuning and evaluating a number of pre-trained transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
 |[Text Classification Pipelines with Azure Machine Learning](tc_transformers_azureml_pipelines/tc_transformers_azureml_pipelines.ipynb)|Azure ML| A notebook which walks through building Azure ML pipelines for fine-tuning multiple transformer models|[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/)|
diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb
index bb6bcbffe..952f2bafa 100644
--- a/examples/text_classification/tc_mnli_transformers.ipynb
+++ b/examples/text_classification/tc_mnli_transformers.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "*Licensed under the MIT License.*\n",
     "\n",
-    "# Text Classification of MultiNLI Sentences using Different Transformer Models"
+    "# Text Classification of MultiNLI Sentences using Multiple Transformer Models"
    ]
   },
   {
@@ -93,7 +93,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 222k/222k [02:38<00:00, 1.40kKB/s] \n"
+      "100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n"
      ]
     }
    ],
@@ -232,11 +232,11 @@
     {
      "data": {
       "text/plain": [
-       "slate         1055\n",
-       "fiction       1019\n",
-       "telephone      968\n",
-       "government     939\n",
-       "travel         928\n",
+       "telephone     1055\n",
+       "slate         1003\n",
+       "travel         961\n",
+       "fiction        952\n",
+       "government     938\n",
        "Name: genre, dtype: int64"
       ]
      },
@@ -257,10 +257,10 @@
    "source": [
     "# encode labels\n",
     "label_encoder = LabelEncoder()\n",
-    "labels_train = label_encoder.fit_transform(df_train[LABEL_COL])\n",
-    "labels_test = label_encoder.transform(df_test[LABEL_COL])\n",
+    "df_train[LABEL_COL] = label_encoder.fit_transform(df_train[LABEL_COL])\n",
+    "df_test[LABEL_COL] = label_encoder.transform(df_test[LABEL_COL])\n",
     "\n",
-    "num_labels = len(np.unique(labels_train))"
+    "num_labels = len(np.unique(df_train[LABEL_COL]))"
    ]
   },
   {
@@ -485,20 +485,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n",
+      "100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n",
+      "100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n",
+      "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n",
+      "100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n",
+      "100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n",
+      "100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n",
+      "100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n",
+      "100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n",
+      "100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n"
+     ]
+    }
+   ],
    "source": [
     "results = {}\n",
     "\n",
-    "for model_name in tqdm(MODEL_NAMES):\n",
-    "    \n",
+    "for model_name in tqdm(MODEL_NAMES, disable=True):\n",
+    "\n",
     "    # preprocess\n",
-    "    processor = Processor(model_name=model_name, cache_dir=CACHE_DIR)\n",
-    "    ds_train = processor.preprocess(\n",
-    "        df_train[TEXT_COL], labels_train, max_len=MAX_LEN\n",
+    "    processor = Processor(\n",
+    "        model_name=model_name,\n",
+    "        to_lower=model_name.endswith(\"uncased\"),\n",
+    "        cache_dir=CACHE_DIR,\n",
+    "    )\n",
+    "    train_dataloader = processor.create_dataloader_from_df(\n",
+    "        df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
+    "    )\n",
+    "    test_dataloader = processor.create_dataloader_from_df(\n",
+    "        df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
     "    )\n",
-    "    ds_test = processor.preprocess(df_test[TEXT_COL], None, max_len=MAX_LEN)\n",
     "\n",
     "    # fine-tune\n",
     "    classifier = SequenceClassifier(\n",
@@ -506,9 +531,8 @@
     "    )\n",
     "    with Timer() as t:\n",
     "        classifier.fit(\n",
-    "            ds_train,\n",
+    "            train_dataloader,\n",
     "            num_epochs=NUM_EPOCHS,\n",
-    "            batch_size=BATCH_SIZE,\n",
     "            num_gpus=NUM_GPUS,\n",
     "            verbose=False,\n",
     "        )\n",
@@ -516,13 +540,13 @@
     "\n",
     "    # predict\n",
     "    preds = classifier.predict(\n",
-    "        ds_test, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, verbose=False\n",
+    "        test_dataloader, num_gpus=NUM_GPUS, verbose=False\n",
     "    )\n",
     "\n",
     "    # eval\n",
-    "    accuracy = accuracy_score(labels_test, preds)\n",
+    "    accuracy = accuracy_score(df_test[LABEL_COL], preds)\n",
     "    class_report = classification_report(\n",
-    "        labels_test, preds, target_names=label_encoder.classes_, output_dict=True\n",
+    "        df_test[LABEL_COL], preds, target_names=label_encoder.classes_, output_dict=True\n",
     "    )\n",
     "\n",
     "    # save results\n",
@@ -576,21 +600,21 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>accuracy</th>\n",
-       "      <td>0.870416</td>\n",
-       "      <td>0.899144</td>\n",
-       "      <td>0.911369</td>\n",
+       "      <td>0.895477</td>\n",
+       "      <td>0.879584</td>\n",
+       "      <td>0.894866</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>f1-score</th>\n",
-       "      <td>0.870305</td>\n",
-       "      <td>0.897614</td>\n",
-       "      <td>0.910810</td>\n",
+       "      <td>0.896656</td>\n",
+       "      <td>0.881218</td>\n",
+       "      <td>0.896108</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>time(hrs)</th>\n",
-       "      <td>0.021828</td>\n",
-       "      <td>0.035325</td>\n",
-       "      <td>0.046363</td>\n",
+       "      <td>0.021865</td>\n",
+       "      <td>0.035351</td>\n",
+       "      <td>0.046295</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -598,9 +622,9 @@
       ],
       "text/plain": [
        "           distilbert-base-uncased  roberta-base  xlnet-base-cased\n",
-       "accuracy                  0.870416      0.899144          0.911369\n",
-       "f1-score                  0.870305      0.897614          0.910810\n",
-       "time(hrs)                 0.021828      0.035325          0.046363"
+       "accuracy                  0.895477      0.879584          0.894866\n",
+       "f1-score                  0.896656      0.881218          0.896108\n",
+       "time(hrs)                 0.021865      0.035351          0.046295"
       ]
      },
      "execution_count": 13,
@@ -621,7 +645,7 @@
     {
      "data": {
       "application/scrapbook.scrap.json+json": {
-       "data": 0.8936430317848411,
+       "data": 0.8899755501222494,
        "encoder": "json",
        "name": "accuracy",
        "version": 1
@@ -639,7 +663,7 @@
     {
      "data": {
       "application/scrapbook.scrap.json+json": {
-       "data": 0.8929098953149991,
+       "data": 0.8913273009038569,
        "encoder": "json",
        "name": "f1",
        "version": 1