forked from SKTBrain/KoBERT
Merge pull request SKTBrain#13 from seujung/master
add NSMC pytorch example
Showing 2 changed files with 331 additions and 0 deletions.
scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb (327 additions, 0 deletions)
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install mxnet-cu101\n",
"!pip install gluonnlp pandas tqdm\n",
"!pip install sentencepiece==0.1.85\n",
"!pip install transformers==2.1.1\n",
"!pip install torch==1.3.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install git+https://github.com/SKTBrain/KoBERT.git@master"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import gluonnlp as nlp\n",
"import numpy as np\n",
"from tqdm import tqdm, tqdm_notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from kobert.utils import get_tokenizer\n",
"from kobert.pytorch_kobert import get_pytorch_kobert_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AdamW\n",
"from transformers.optimization import WarmupLinearSchedule"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## When using a GPU\n",
"device = torch.device(\"cuda:0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bertmodel, vocab = get_pytorch_kobert_model()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget -O ratings_train.txt https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1\n",
"!wget -O ratings_test.txt https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_train = nlp.data.TSVDataset(\"ratings_train.txt\", field_indices=[1,2], num_discard_samples=1)\n",
"dataset_test = nlp.data.TSVDataset(\"ratings_test.txt\", field_indices=[1,2], num_discard_samples=1)"
]
},
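{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (not in the original notebook): each parsed record is a `[document, label]` pair of strings."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at one record; the label is still a string at this stage\n",
"dataset_train[0]"
]
},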
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = get_tokenizer()\n",
"tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)"
]
},
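{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (not in the original notebook): `BERTSPTokenizer` can be called directly on a string, which is a quick way to inspect the SentencePiece subword output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize a sample sentence into subword tokens\n",
"tok(\"한국어 자연어 처리 예제입니다.\")"
]
},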
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTDataset(Dataset):\n",
"    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,\n",
"                 pad, pair):\n",
"        # Convert every sentence to (token_ids, valid_length, segment_ids) once, up front\n",
"        transform = nlp.data.BERTSentenceTransform(\n",
"            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)\n",
"\n",
"        self.sentences = [transform([i[sent_idx]]) for i in dataset]\n",
"        self.labels = [np.int32(i[label_idx]) for i in dataset]\n",
"\n",
"    def __getitem__(self, i):\n",
"        return (self.sentences[i] + (self.labels[i], ))\n",
"\n",
"    def __len__(self):\n",
"        return len(self.labels)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Setting parameters\n",
"max_len = 64\n",
"batch_size = 64\n",
"warmup_ratio = 0.1\n",
"num_epochs = 5\n",
"max_grad_norm = 1\n",
"log_interval = 200\n",
"learning_rate = 5e-5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)\n",
"data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)"
]
},
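{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (not in the original notebook): one item from `BERTDataset` is a `(token_ids, valid_length, segment_ids, label)` tuple, which is exactly what the training loop below unpacks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# token_ids and segment_ids are padded to max_len; valid_length counts the real tokens\n",
"token_ids, valid_length, segment_ids, label = data_train[0]\n",
"token_ids.shape, valid_length, label"
]
},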
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)\n",
"test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTClassifier(nn.Module):\n",
"    def __init__(self,\n",
"                 bert,\n",
"                 hidden_size=768,\n",
"                 num_classes=2,\n",
"                 dr_rate=None,\n",
"                 params=None):\n",
"        super(BERTClassifier, self).__init__()\n",
"        self.bert = bert\n",
"        self.dr_rate = dr_rate\n",
"\n",
"        self.classifier = nn.Linear(hidden_size, num_classes)\n",
"        if dr_rate:\n",
"            self.dropout = nn.Dropout(p=dr_rate)\n",
"\n",
"    def gen_attention_mask(self, token_ids, valid_length):\n",
"        # 1 for real tokens, 0 for padding, based on each sequence's valid length\n",
"        attention_mask = torch.zeros_like(token_ids)\n",
"        for i, v in enumerate(valid_length):\n",
"            attention_mask[i][:v] = 1\n",
"        return attention_mask.float()\n",
"\n",
"    def forward(self, token_ids, valid_length, segment_ids):\n",
"        attention_mask = self.gen_attention_mask(token_ids, valid_length)\n",
"\n",
"        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.to(token_ids.device))\n",
"        if self.dr_rate:\n",
"            pooler = self.dropout(pooler)\n",
"        return self.classifier(pooler)"
]
},
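{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal illustration (not in the original notebook) of the attention-mask logic above, using hypothetical token ids: positions beyond `valid_length` are padding and receive mask 0, so BERT ignores them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy batch of one sequence: 3 real tokens followed by 2 pads (ids are made up)\n",
"toy_ids = torch.tensor([[2, 4617, 3, 1, 1]])\n",
"toy_len = torch.tensor([3])\n",
"toy_mask = torch.zeros_like(toy_ids)\n",
"for i, v in enumerate(toy_len):\n",
"    toy_mask[i][:v] = 1\n",
"toy_mask  # tensor([[1, 1, 1, 0, 0]])"
]
},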
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare optimizer and schedule (linear warmup and decay)\n",
"no_decay = ['bias', 'LayerNorm.weight']\n",
"optimizer_grouped_parameters = [\n",
"    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n",
"    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n",
"]"
]
},
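{
"cell_type": "markdown",
"metadata": {},
"source": [
"Biases and LayerNorm weights are excluded from weight decay, following the usual BERT fine-tuning recipe. A quick check (not in the original notebook) of how many parameter tensors land in each group:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[len(group['params']) for group in optimizer_grouped_parameters]"
]
},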
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)\n",
"loss_fn = nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Total optimization steps = batches per epoch * number of epochs\n",
"t_total = len(train_dataloader) * num_epochs\n",
"warmup_step = int(t_total * warmup_ratio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def calc_accuracy(X, Y):\n",
"    # Fraction of rows whose argmax over the logits matches the label\n",
"    max_vals, max_indices = torch.max(X, 1)\n",
"    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]\n",
"    return train_acc"
]
},
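{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check on toy logits (not in the original notebook): the first row predicts class 1 and the second class 0, so against labels `[1, 1]` the accuracy is 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"calc_accuracy(torch.tensor([[0.2, 0.8], [0.9, 0.1]]), torch.tensor([1, 1]))  # 0.5"
]
},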
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for e in range(num_epochs):\n",
"    train_acc = 0.0\n",
"    test_acc = 0.0\n",
"    model.train()\n",
"    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):\n",
"        optimizer.zero_grad()\n",
"        token_ids = token_ids.long().to(device)\n",
"        segment_ids = segment_ids.long().to(device)\n",
"        label = label.long().to(device)\n",
"        out = model(token_ids, valid_length, segment_ids)\n",
"        loss = loss_fn(out, label)\n",
"        loss.backward()\n",
"        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)\n",
"        optimizer.step()\n",
"        scheduler.step()  # Update learning rate schedule\n",
"        train_acc += calc_accuracy(out, label)\n",
"        if batch_id % log_interval == 0:\n",
"            print(\"epoch {} batch id {} loss {} train acc {}\".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))\n",
"    print(\"epoch {} train acc {}\".format(e+1, train_acc / (batch_id+1)))\n",
"    model.eval()\n",
"    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):\n",
"        token_ids = token_ids.long().to(device)\n",
"        segment_ids = segment_ids.long().to(device)\n",
"        label = label.long().to(device)\n",
"        out = model(token_ids, valid_length, segment_ids)\n",
"        test_acc += calc_accuracy(out, label)\n",
"    print(\"epoch {} test acc {}\".format(e+1, test_acc / (batch_id+1)))"
]
},
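{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal inference sketch (not part of the original example; it assumes the trained `model`, `tok`, and `max_len` defined above): wrap a new review in the same `BERTDataset` pipeline and take the argmax of the logits. Label 1 is positive, 0 is negative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example review; the label field is a dummy, only the text is used\n",
"sample = [[\"정말 재미있게 봤어요\", \"0\"]]\n",
"sample_data = BERTDataset(sample, 0, 1, tok, max_len, True, False)\n",
"token_ids, valid_length, segment_ids, _ = sample_data[0]\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    out = model(torch.tensor([token_ids]).long().to(device),\n",
"                torch.tensor([valid_length]),\n",
"                torch.tensor([segment_ids]).long().to(device))\n",
"out.argmax(dim=-1)  # 1 = positive, 0 = negative"
]
}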
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}