
Commit

Merge pull request SKTBrain#13 from seujung/master
add NSMC pytorch example
haven-jeon authored Dec 24, 2019
2 parents 2cb8198 + 8600b1c commit 80274ac
Showing 2 changed files with 331 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -173,6 +173,10 @@ Vocab(size=8002, unk="[UNK]", reserved="['[MASK]', '[SEP]', '[CLS]']")
- Naver Sentiment Analysis Fine-Tuning with MXNet
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SKTBrain/KoBERT/blob/master/scripts/NSMC/naver_review_classifications_gluon_kobert.ipynb)

- Naver Sentiment Analysis Fine-Tuning with PyTorch
- In Colab, enabling the GPU hardware accelerator via [Runtime] - [Change runtime type] is recommended.
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SKTBrain/KoBERT/blob/master/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb)

#### Tokenizer

* Pretrained [Sentencepiece](https://github.com/google/sentencepiece) tokenizer
327 changes: 327 additions & 0 deletions scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb
@@ -0,0 +1,327 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install mxnet-cu101\n",
"!pip install gluonnlp pandas tqdm\n",
"!pip install sentencepiece==0.1.85\n",
"!pip install transformers==2.1.1\n",
"!pip install torch==1.3.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install git+https://[email protected]/SKTBrain/KoBERT.git@master"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import gluonnlp as nlp\n",
"import numpy as np\n",
"from tqdm import tqdm, tqdm_notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from kobert.utils import get_tokenizer\n",
"from kobert.pytorch_kobert import get_pytorch_kobert_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AdamW\n",
"from transformers.optimization import WarmupLinearSchedule"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"##GPU 사용 시\n",
"device = torch.device(\"cuda:0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bertmodel, vocab = get_pytorch_kobert_model()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1\n",
"!wget https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_train = nlp.data.TSVDataset(\"ratings_train.txt?dl=1\", field_indices=[1,2], num_discard_samples=1)\n",
"dataset_test = nlp.data.TSVDataset(\"ratings_test.txt?dl=1\", field_indices=[1,2], num_discard_samples=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = get_tokenizer()\n",
"tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTDataset(Dataset):\n",
" def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,\n",
" pad, pair):\n",
" transform = nlp.data.BERTSentenceTransform(\n",
" bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)\n",
"\n",
" self.sentences = [transform([i[sent_idx]]) for i in dataset]\n",
" self.labels = [np.int32(i[label_idx]) for i in dataset]\n",
"\n",
" def __getitem__(self, i):\n",
" return (self.sentences[i] + (self.labels[i], ))\n",
"\n",
" def __len__(self):\n",
" return (len(self.labels))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Setting parameters\n",
"max_len = 64\n",
"batch_size = 64\n",
"warmup_ratio = 0.1\n",
"num_epochs = 5\n",
"max_grad_norm = 1\n",
"log_interval = 200\n",
"learning_rate = 5e-5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)\n",
"data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)\n",
"test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTClassifier(nn.Module):\n",
" def __init__(self,\n",
" bert,\n",
" hidden_size = 768,\n",
" num_classes=2,\n",
" dr_rate=None,\n",
" params=None):\n",
" super(BERTClassifier, self).__init__()\n",
" self.bert = bert\n",
" self.dr_rate = dr_rate\n",
" \n",
" self.classifier = nn.Linear(hidden_size , num_classes)\n",
" if dr_rate:\n",
" self.dropout = nn.Dropout(p=dr_rate)\n",
" \n",
" def gen_attention_mask(self, token_ids, valid_length):\n",
" attention_mask = torch.zeros_like(token_ids)\n",
" for i, v in enumerate(valid_length):\n",
" attention_mask[i][:v] = 1\n",
" return attention_mask.float()\n",
"\n",
" def forward(self, token_ids, valid_length, segment_ids):\n",
" attention_mask = self.gen_attention_mask(token_ids, valid_length)\n",
" \n",
" _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))\n",
" if self.dr_rate:\n",
" out = self.dropout(pooler)\n",
" return self.classifier(out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare optimizer and schedule (linear warmup and decay)\n",
"no_decay = ['bias', 'LayerNorm.weight']\n",
"optimizer_grouped_parameters = [\n",
" {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n",
" {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)\n",
"loss_fn = nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t_total = len(train_dataloader) * num_epochs\n",
"warmup_step = int(t_total * warmup_ratio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def calc_accuracy(X,Y):\n",
" max_vals, max_indices = torch.max(X, 1)\n",
" train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]\n",
" return train_acc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for e in range(num_epochs):\n",
" train_acc = 0.0\n",
" test_acc = 0.0\n",
" model.train()\n",
" for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):\n",
" optimizer.zero_grad()\n",
" token_ids = token_ids.long().to(device)\n",
" segment_ids = segment_ids.long().to(device)\n",
" valid_length= valid_length\n",
" label = label.long().to(device)\n",
" out = model(token_ids, valid_length, segment_ids)\n",
" loss = loss_fn(out, label)\n",
" loss.backward()\n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)\n",
" optimizer.step()\n",
" scheduler.step() # Update learning rate schedule\n",
" train_acc += calc_accuracy(out, label)\n",
" if batch_id % log_interval == 0:\n",
" print(\"epoch {} batch id {} loss {} train acc {}\".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))\n",
" print(\"epoch {} train acc {}\".format(e+1, train_acc / (batch_id+1)))\n",
" model.eval()\n",
" for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):\n",
" token_ids = token_ids.long().to(device)\n",
" segment_ids = segment_ids.long().to(device)\n",
" valid_length= valid_length\n",
" label = label.long().to(device)\n",
" out = model(token_ids, valid_length, segment_ids)\n",
" test_acc += calc_accuracy(out, label)\n",
" print(\"epoch {} test acc {}\".format(e+1, test_acc / (batch_id+1)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
