forked from SKTBrain/KoBERT
Merge pull request SKTBrain#13 from seujung/master
add NSMC pytorch example
Showing 2 changed files with 331 additions and 0 deletions.
scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb (327 additions, 0 deletions)
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install mxnet-cu101\n",
"!pip install gluonnlp pandas tqdm\n",
"!pip install sentencepiece==0.1.85\n",
"!pip install transformers==2.1.1\n",
"!pip install torch==1.3.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install git+https://github.com/SKTBrain/KoBERT.git@master"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import gluonnlp as nlp\n",
"import numpy as np\n",
"from tqdm import tqdm, tqdm_notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from kobert.utils import get_tokenizer\n",
"from kobert.pytorch_kobert import get_pytorch_kobert_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AdamW\n",
"from transformers.optimization import WarmupLinearSchedule"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## When using a GPU\n",
"device = torch.device(\"cuda:0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bertmodel, vocab = get_pytorch_kobert_model()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget -O ratings_train.txt https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1\n",
"!wget -O ratings_test.txt https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_train = nlp.data.TSVDataset(\"ratings_train.txt\", field_indices=[1,2], num_discard_samples=1)\n",
"dataset_test = nlp.data.TSVDataset(\"ratings_test.txt\", field_indices=[1,2], num_discard_samples=1)"
]
},
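{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (not in the original notebook): each parsed record is a `[document, label]` pair of strings."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at one record; the label is still a string at this stage\n",
"dataset_train[0]"
]
},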
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = get_tokenizer()\n",
"tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)"
]
},
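{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (not in the original notebook): `BERTSPTokenizer` can be called directly on a string, which is a quick way to inspect the SentencePiece subword output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize a sample sentence into subword tokens\n",
"tok(\"한국어 자연어 처리 예제입니다.\")"
]
},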
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTDataset(Dataset):\n",
"    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,\n",
"                 pad, pair):\n",
"        # Convert every sentence to (token_ids, valid_length, segment_ids) once, up front\n",
"        transform = nlp.data.BERTSentenceTransform(\n",
"            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)\n",
"\n",
"        self.sentences = [transform([i[sent_idx]]) for i in dataset]\n",
"        self.labels = [np.int32(i[label_idx]) for i in dataset]\n",
"\n",
"    def __getitem__(self, i):\n",
"        return (self.sentences[i] + (self.labels[i], ))\n",
"\n",
"    def __len__(self):\n",
"        return len(self.labels)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Setting parameters\n",
"max_len = 64\n",
"batch_size = 64\n",
"warmup_ratio = 0.1\n",
"num_epochs = 5\n",
"max_grad_norm = 1\n",
"log_interval = 200\n",
"learning_rate = 5e-5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)\n",
"data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)"
]
},
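{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional (not in the original notebook): one item from `BERTDataset` is a `(token_ids, valid_length, segment_ids, label)` tuple, which is exactly what the training loop below unpacks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# token_ids and segment_ids are padded to max_len; valid_length counts the real tokens\n",
"token_ids, valid_length, segment_ids, label = data_train[0]\n",
"token_ids.shape, valid_length, label"
]
},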
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)\n",
"test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class BERTClassifier(nn.Module):\n",
"    def __init__(self,\n",
"                 bert,\n",
"                 hidden_size=768,\n",
"                 num_classes=2,\n",
"                 dr_rate=None,\n",
"                 params=None):\n",
"        super(BERTClassifier, self).__init__()\n",
"        self.bert = bert\n",
"        self.dr_rate = dr_rate\n",
"\n",
"        self.classifier = nn.Linear(hidden_size, num_classes)\n",
"        if dr_rate:\n",
"            self.dropout = nn.Dropout(p=dr_rate)\n",
"\n",
"    def gen_attention_mask(self, token_ids, valid_length):\n",
"        # 1 for real tokens, 0 for padding, based on each sequence's valid length\n",
"        attention_mask = torch.zeros_like(token_ids)\n",
"        for i, v in enumerate(valid_length):\n",
"            attention_mask[i][:v] = 1\n",
"        return attention_mask.float()\n",
"\n",
"    def forward(self, token_ids, valid_length, segment_ids):\n",
"        attention_mask = self.gen_attention_mask(token_ids, valid_length)\n",
"\n",
"        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.to(token_ids.device))\n",
"        if self.dr_rate:\n",
"            pooler = self.dropout(pooler)\n",
"        return self.classifier(pooler)"
]
},
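{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal illustration (not in the original notebook) of the attention-mask logic above, using hypothetical token ids: positions beyond `valid_length` are padding and receive mask 0, so BERT ignores them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy batch of one sequence: 3 real tokens followed by 2 pads (ids are made up)\n",
"toy_ids = torch.tensor([[2, 4617, 3, 1, 1]])\n",
"toy_len = torch.tensor([3])\n",
"toy_mask = torch.zeros_like(toy_ids)\n",
"for i, v in enumerate(toy_len):\n",
"    toy_mask[i][:v] = 1\n",
"toy_mask  # tensor([[1, 1, 1, 0, 0]])"
]
},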
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare optimizer and schedule (linear warmup and decay)\n",
"no_decay = ['bias', 'LayerNorm.weight']\n",
"optimizer_grouped_parameters = [\n",
"    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n",
"    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n",
"]"
]
},
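{
"cell_type": "markdown",
"metadata": {},
"source": [
"Biases and LayerNorm weights are excluded from weight decay, following the usual BERT fine-tuning recipe. A quick check (not in the original notebook) of how many parameter tensors land in each group:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[len(group['params']) for group in optimizer_grouped_parameters]"
]
},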
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)\n",
"loss_fn = nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Total optimization steps = batches per epoch * number of epochs\n",
"t_total = len(train_dataloader) * num_epochs\n",
"warmup_step = int(t_total * warmup_ratio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def calc_accuracy(X, Y):\n",
"    # Fraction of rows whose argmax over the logits matches the label\n",
"    max_vals, max_indices = torch.max(X, 1)\n",
"    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]\n",
"    return train_acc"
]
},
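{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check on toy logits (not in the original notebook): the first row predicts class 1 and the second class 0, so against labels `[1, 1]` the accuracy is 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"calc_accuracy(torch.tensor([[0.2, 0.8], [0.9, 0.1]]), torch.tensor([1, 1]))  # 0.5"
]
},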
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for e in range(num_epochs):\n",
"    train_acc = 0.0\n",
"    test_acc = 0.0\n",
"    model.train()\n",
"    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):\n",
"        optimizer.zero_grad()\n",
"        token_ids = token_ids.long().to(device)\n",
"        segment_ids = segment_ids.long().to(device)\n",
"        label = label.long().to(device)\n",
"        out = model(token_ids, valid_length, segment_ids)\n",
"        loss = loss_fn(out, label)\n",
"        loss.backward()\n",
"        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)\n",
"        optimizer.step()\n",
"        scheduler.step()  # Update learning rate schedule\n",
"        train_acc += calc_accuracy(out, label)\n",
"        if batch_id % log_interval == 0:\n",
"            print(\"epoch {} batch id {} loss {} train acc {}\".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))\n",
"    print(\"epoch {} train acc {}\".format(e+1, train_acc / (batch_id+1)))\n",
"    model.eval()\n",
"    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):\n",
"        token_ids = token_ids.long().to(device)\n",
"        segment_ids = segment_ids.long().to(device)\n",
"        label = label.long().to(device)\n",
"        out = model(token_ids, valid_length, segment_ids)\n",
"        test_acc += calc_accuracy(out, label)\n",
"    print(\"epoch {} test acc {}\".format(e+1, test_acc / (batch_id+1)))"
]
},
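{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal inference sketch (not part of the original example; it assumes the trained `model`, `tok`, and `max_len` defined above): wrap a new review in the same `BERTDataset` pipeline and take the argmax of the logits. Label 1 is positive, 0 is negative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example review; the label field is a dummy, only the text is used\n",
"sample = [[\"정말 재미있게 봤어요\", \"0\"]]\n",
"sample_data = BERTDataset(sample, 0, 1, tok, max_len, True, False)\n",
"token_ids, valid_length, segment_ids, _ = sample_data[0]\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    out = model(torch.tensor([token_ids]).long().to(device),\n",
"                torch.tensor([valid_length]),\n",
"                torch.tensor([segment_ids]).long().to(device))\n",
"out.argmax(dim=-1)  # 1 = positive, 0 = negative"
]
}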
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}