# bert_base_imdb_hugging.py

# -*- coding: utf-8 -*-
"""Copy of BERT_base_imdb_hugging.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1DP9JHqsDPTweMtCpi-i1fnSYjR657PGQ
"""

#from google.colab import drive
#drive.mount('/content/gdrive')

#cd gdrive/'My Drive'

#!ls


#!pip install pytorch_pretrained_bert pytorch-nlp

#!pip install transformers

import sys
import numpy as np
import random as rn
import torch
from torch import nn

from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertConfig


from torchnlp.datasets import imdb_dataset
#from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

# Ref: https://pytorch.org/docs/stable/notes/randomness.html#:~:text=Reproducibility-,Reproducibility,even%20when%20using%20identical%20seeds
# Seed every random number generator this script relies on (Python's
# `random`, NumPy, and PyTorch on CPU and CUDA) with the same value.
_SEED = 321
for _seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(_SEED)

# cuDNN settings from the PyTorch reproducibility notes referenced above:
# disable the autotuner and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

"""## Prepare the Data"""

# Load the IMDB reviews, shuffle them with the seeded `random` module and
# keep a small subset (1000 train / 100 test) so fine-tuning stays quick.
train_data, test_data = imdb_dataset(train=True, test=True)
rn.shuffle(train_data)
rn.shuffle(test_data)
train_data = train_data[:1000]
test_data = test_data[:100]

# Every record exposes the review under 'text' and its label under
# 'sentiment'; split them into parallel lists.
train_texts = [d['text'] for d in train_data]
train_labels = [d['sentiment'] for d in train_data]
test_texts = [d['text'] for d in test_data]
test_labels = [d['sentiment'] for d in test_data]

# Notebook cells displayed these values implicitly; a script has to print
# them or they are silently discarded.
print('train/test sizes:', len(train_texts), len(test_texts))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# The tokenizer call handles padding and truncation and returns PyTorch
# tensors. It supersedes an earlier manual pipeline (explicit
# '[CLS]'/'[SEP]' tokens, keras pad_sequences to 512 and hand-built
# attention masks) that used to live here as commented-out code.
batch_train = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
batch_test = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

train_tokens_ids = batch_train['input_ids']
test_tokens_ids = batch_test['input_ids']

train_masks = batch_train['attention_mask']
test_masks = batch_test['attention_mask']

# Boolean targets: True for a 'pos' review, False otherwise.
train_y = np.array(train_labels) == 'pos'
test_y = np.array(test_labels) == 'pos'
print('label shapes:', train_y.shape, test_y.shape)
print('positive rate train/test:', np.mean(train_y), np.mean(test_y))

"""# Baseline"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Classical baseline: word 1- to 3-gram counts fed into logistic regression,
# trained on the raw review texts and string labels.
baseline_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 3)),
    LogisticRegression(),
)
baseline_model.fit(train_texts, train_labels)

# Score the held-out reviews and report per-class metrics.
baseline_predicted = baseline_model.predict(test_texts)
baseline_report = classification_report(test_labels, baseline_predicted)
print(baseline_report)

"""# Bert Model"""

#dropout 0.1, 0.2 gave same results

class BertBinaryClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear + sigmoid head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        pretrained_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to the checkpoint the
            script has always used.
    """

    def __init__(self, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than a
        # hard-coded 768, so other BERT checkpoints also fit.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the positive-class probability for each input sequence.

        Args:
            tokens: Token-id tensor passed to BERT as its first argument.
            masks: Optional attention mask forwarded as ``attention_mask``.

        Returns:
            Tensor of sigmoid outputs whose last dimension has size 1.
        """
        bert_outputs = self.bert(tokens, attention_mask=masks)
        # Index instead of tuple-unpacking: newer transformers releases
        # return a dict-like output whose iteration yields key names, while
        # integer indexing works for both that object and a plain tuple.
        pooled_output = bert_outputs[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return self.sigmoid(linear_output)

# Prefer the GPU but fall back to CPU; everything below moves data with
# .to(device) so the fallback is actually honoured.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)


def _report_gpu_memory(stage):
    """Print the CUDA memory currently allocated on `device`, in MB.

    Silently does nothing when `device` is not a CUDA device, because
    torch.cuda.memory_allocated only accepts CUDA devices.
    """
    if device.type == "cuda":
        print(stage + ': ' + str(torch.cuda.memory_allocated(device) / 1000000) + 'M')


_report_gpu_memory('before model load')

bert_clf = BertBinaryClassifier()
# .to(device) instead of .cuda(): the latter fails on CPU-only machines.
bert_clf = bert_clf.to(device)

_report_gpu_memory('after model load')

# Smoke test on three reviews. train_tokens_ids is already a tensor, so it
# is sliced and moved directly instead of being re-wrapped in torch.tensor.
x = train_tokens_ids[:3].to(device)
# Index the encoder output rather than tuple-unpacking it: newer
# transformers releases return a dict-like object whose iteration yields
# key names, whereas integer indexing also works for plain tuples.
bert_outputs = bert_clf.bert(x)
y, pooled = bert_outputs[0], bert_outputs[1]
print(x.shape, y.shape, pooled.shape)

y = bert_clf(x)
print(y.cpu().detach().numpy())

_report_gpu_memory('after smoke-test forward pass')

# Drop the references before emptying the cache so the memory can be freed.
y, x, pooled, bert_outputs = None, None, None, None
torch.cuda.empty_cache()
_report_gpu_memory('after cleanup')

"""# Fine-tune BERT"""

BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 3e-6

# The token ids are already tensors; torch.as_tensor reuses them as-is
# (torch.tensor would copy and emit a copy-construct warning) while still
# converting if they ever arrive as arrays.
train_tokens_tensor = torch.as_tensor(train_tokens_ids)
test_tokens_tensor = torch.as_tensor(test_tokens_ids)

# Labels become float column vectors to match the model's single output
# column and the float targets expected by the binary cross-entropy loss.
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = train_masks
test_masks_tensor = test_masks

# Training batches are drawn in random order, evaluation batches in order.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

# All model parameters are optimised. The previous parameter-group list was
# built from the sigmoid layer (which has no parameters) and never handed
# to the optimiser, so it has been removed.
optimizer = Adam(bert_clf.parameters(), lr=LEARNING_RATE)

torch.cuda.empty_cache()

# Fine-tuning loop with checkpointing on the best validation loss and early
# stopping once the loss has risen epoch-over-epoch more than five times
# without an intervening drop.
# NOTE(review): the held-out test split doubles as the validation set here.

# Built once and reused for every batch instead of once per batch.
loss_func = nn.BCELoss()
num_train_batches = len(train_dataloader)
num_val_batches = len(test_dataloader)

val_losses = []
min_loss = float('inf')
streak = 0
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        # The model ends in a sigmoid, so these are probabilities.
        probas = bert_clf(token_ids, masks)

        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()

        bert_clf.zero_grad()
        batch_loss.backward()

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

    print('Epoch: ', epoch_num + 1)
    # Report "batches done / batches total" as integers; the old output
    # showed the last zero-based index over a float division (e.g. 12/12.5).
    print("{0}/{1} train loss: {2} ".format(num_train_batches, num_train_batches, train_loss / num_train_batches))

    bert_clf.eval()
    # Sum of the per-batch mean losses over the validation batches.
    test_loss = 0
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(test_dataloader):
            token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
            test_loss += loss_func(bert_clf(token_ids, masks), labels).item()

    print("{0}/{1} val loss: {2} ".format(num_val_batches, num_val_batches, test_loss / num_val_batches))

    ## Ref: early stopping: https://medium.com/analytics-vidhya/early-stopping-with-pytorch-to-restrain-your-model-from-overfitting-dce6de4081c5
    # save and load: https://pytorch.org/tutorials/beginner/saving_loading_models.html
    # Compare with the previous epoch: a drop resets the streak, a rise
    # extends it, an exact tie leaves it untouched.
    if val_losses:
        if val_losses[-1] > test_loss:
            streak = 0
        elif val_losses[-1] < test_loss:
            streak = streak + 1

    # Always checkpoint the first epoch, afterwards only on a new best loss.
    if not val_losses or test_loss < min_loss:
        torch.save(bert_clf.state_dict(), "/content/gdrive/My Drive/model.pt")
        min_loss = test_loss

    # Record the epoch before the stop check so the final epoch is kept too.
    val_losses.append(test_loss)

    if streak > 5:
        break

# Restore the best checkpoint written during training and score the test
# set. map_location lets a checkpoint saved from the GPU load on whichever
# device this run is using, including a CPU-only host.
bert_clf.load_state_dict(
    torch.load("/content/gdrive/My Drive/model.pt", map_location=device)
)
bert_clf.eval()

bert_predicted = []
# Kept under its historical name; the values are sigmoid outputs
# (probabilities), not raw logits.
all_logits = []
with torch.no_grad():
    for token_ids, masks, _ in test_dataloader:
        # Labels are not needed here: no loss is computed during the final
        # scoring pass, only predictions.
        probas = bert_clf(token_ids.to(device), masks.to(device))
        batch_probas = probas.cpu().numpy()[:, 0]

        bert_predicted += list(batch_probas > 0.5)
        all_logits += list(batch_probas)

# Fraction of reviews predicted positive (the notebook displayed this value
# implicitly; a script has to print it).
print('predicted positive rate:', np.mean(bert_predicted))

print(classification_report(test_y, bert_predicted))

#dropout = 0.3 min_loss lr=3e-6

# Scratch demo of IPython's clear_output combined with the progress-line
# format used by the training loop; it is unrelated to the model itself.
_PROGRESS_TEMPLATE = "{0}/{1} loss: {2} "
for step in range(5):
    clear_output(wait=True)
    print('a', step + 1)
    print(_PROGRESS_TEMPLATE.format(step, step / 5, step / (step + 1)))