diff --git a/NLP_using_BERT.py b/NLP_using_BERT.py
index 70772d5..5e4fa51 100644
--- a/NLP_using_BERT.py
+++ b/NLP_using_BERT.py
@@ -40,3 +40,241 @@ import pickle
 from apex import amp
 import shutil
+
+device = torch.device('cuda')
+
+MAX_SEQUENCE_LENGTH = 220
+SEED = 1234
+EPOCHS = 1
+Data_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
+Input_dir = "../input"
+WORK_DIR = "../working/"
+num_to_load = 1000000  # Train size to match time limit
+valid_size = 100000    # Validation size
+TOXICITY_COLUMN = 'target'
+
+# Add the BERT PyTorch repo to the PATH
+# using files from: https://github.com/huggingface/pytorch-pretrained-BERT
+package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
+sys.path.insert(0, package_dir_a)
+
+from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
+from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
+
+# Convert the TensorFlow checkpoint to a PyTorch model
+BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
+convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
+    BERT_MODEL_PATH + 'bert_model.ckpt',
+    BERT_MODEL_PATH + 'bert_config.json',
+    WORK_DIR + 'pytorch_model.bin')
+
+shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')
+
+os.listdir("../working")
+
+# This is the BERT configuration file
+from pytorch_pretrained_bert import BertConfig
+
+bert_config = BertConfig('../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/' + 'bert_config.json')
+
+# Converting the lines to BERT format
+# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
+def convert_lines(example, max_seq_length, tokenizer):
+    max_seq_length -= 2
+    all_tokens = []
+    longer = 0
+    for text in tqdm_notebook(example):
+        tokens_a = tokenizer.tokenize(text)
+        if len(tokens_a) > max_seq_length:
+            tokens_a = tokens_a[:max_seq_length]
+            longer += 1
+        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + [0] * (max_seq_length - len(tokens_a))
+        all_tokens.append(one_token)
+    print(longer)
+    return np.array(all_tokens)
+
+BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
+
+tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
+train_df = pd.read_csv(os.path.join(Data_dir, "train.csv")).sample(num_to_load + valid_size, random_state=SEED)
+print('loaded %d records' % len(train_df))
+
+# Make sure all comment_text values are strings
+train_df['comment_text'] = train_df['comment_text'].astype(str)
+
+sequences = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
+train_df = train_df.fillna(0)
+# List all identity columns
+identity_columns = [
+    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
+    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
+y_columns = ['target']
+
+train_df = train_df.drop(['comment_text'], axis=1)
+# Convert target to a 0/1 label
+train_df['target'] = (train_df['target'] >= 0.5).astype(float)
+
+
+X = sequences[:num_to_load]
+y = train_df[y_columns].values[:num_to_load]
+X_val = sequences[num_to_load:]
+y_val = train_df[y_columns].values[num_to_load:]
+
+test_df = train_df.tail(valid_size).copy()
+train_df = train_df.head(num_to_load)
+
+
+train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.float))
+
+output_model_file = "bert_pytorch.bin"
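+
+# Optional sanity check (illustrative sketch, not part of the original kernel):
+# confirm the padded token matrix has the expected width and show how the
+# attention mask used during training below is derived -- padding ids are 0,
+# so (ids > 0) marks the real [CLS] / word-piece / [SEP] positions.
+assert X.shape[1] == MAX_SEQUENCE_LENGTH
+example_ids = X[0]
+print('non-padding tokens in first example:', int((example_ids > 0).sum()))
+print('first tokens:', tokenizer.convert_ids_to_tokens([int(i) for i in example_ids[:10]]))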
+
+lr = 2e-5
+batch_size = 32
+accumulation_steps = 1
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+torch.cuda.manual_seed(SEED)
+torch.backends.cudnn.deterministic = True
+
+model = BertForSequenceClassification.from_pretrained("../working", cache_dir=None, num_labels=len(y_columns))
+model.zero_grad()
+model = model.to(device)
+param_optimizer = list(model.named_parameters())
+no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+optimizer_grouped_parameters = [
+    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+train = train_dataset
+
+num_train_optimization_steps = int(EPOCHS * len(train) / batch_size / accumulation_steps)
+
+optimizer = BertAdam(optimizer_grouped_parameters,
+                     lr=lr,
+                     warmup=0.05,
+                     t_total=num_train_optimization_steps)
+
+model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
+model = model.train()
+
+tq = tqdm_notebook(range(EPOCHS))
+for epoch in tq:
+    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
+    avg_loss = 0.
+    avg_accuracy = 0.
+    lossf = None
+    tk0 = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
+    for i, (x_batch, y_batch) in tk0:
+        y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
+        loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
+            optimizer.step()                   # Now we can do an optimizer step
+            optimizer.zero_grad()
+        if lossf:
+            lossf = 0.98 * lossf + 0.02 * loss.item()
+        else:
+            lossf = loss.item()
+        tk0.set_postfix(loss=lossf)
+        avg_loss += loss.item() / len(train_loader)
+        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:, 0]) > 0.5) == (y_batch[:, 0] > 0.5).to(device)).to(torch.float)).item() / len(train_loader)
+    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
+
+
+torch.save(model.state_dict(), output_model_file)
+
+# Run validation
+# The following lines are not strictly needed, but they show how to reload the trained model for prediction
+model = BertForSequenceClassification(bert_config, num_labels=len(y_columns))
+model.load_state_dict(torch.load(output_model_file))
+model.to(device)
+for param in model.parameters():
+    param.requires_grad = False
+model.eval()
+valid_preds = np.zeros((len(X_val)))
+valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long))
+valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)
+
+tk0 = tqdm_notebook(valid_loader)
+for i, (x_batch,) in enumerate(tk0):
+    pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
+    valid_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()
+
+# From baseline kernel
+
+def calculate_overall_auc(df, model_name):
+    true_labels = df[TOXICITY_COLUMN] > 0.5
+    predicted_labels = df[model_name]
+    return metrics.roc_auc_score(true_labels, predicted_labels)
+
+def power_mean(series, p):
+    total = sum(np.power(series, p))
+    return np.power(total / len(series), 1 / p)
+
+def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
+    bias_score = np.average([
+        power_mean(bias_df[SUBGROUP_AUC], POWER),
+        power_mean(bias_df[BPSN_AUC], POWER),
+        power_mean(bias_df[BNSP_AUC], POWER)
+    ])
+    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
+
+
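+# Illustrative check (not part of the original kernel): with a negative
+# exponent (p = -5 above), power_mean is dominated by the smallest value,
+# so a single weak per-subgroup AUC drags the whole bias score down.
+# The numbers below are made up purely for illustration.
+print(power_mean([0.95, 0.95, 0.95], -5))  # 0.95 exactly
+print(power_mean([0.95, 0.95, 0.60], -5))  # ~0.72, well below the arithmetic mean of ~0.83
+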
+SUBGROUP_AUC = 'subgroup_auc' +BPSN_AUC = 'bpsn_auc' # stands for background positive, subgroup negative +BNSP_AUC = 'bnsp_auc' # stands for background negative, subgroup positive + +def compute_auc(y_true, y_pred): + try: + return metrics.roc_auc_score(y_true, y_pred) + except ValueError: + return np.nan + +def compute_subgroup_auc(df, subgroup, label, model_name): + subgroup_examples = df[df[subgroup]>0.5] + return compute_auc((subgroup_examples[label]>0.5), subgroup_examples[model_name]) + +def compute_bpsn_auc(df, subgroup, label, model_name): + """Computes the AUC of the within-subgroup negative examples and the background positive examples.""" + subgroup_negative_examples = df[(df[subgroup]>0.5) & (df[label]<=0.5)] + non_subgroup_positive_examples = df[(df[subgroup]<=0.5) & (df[label]>0.5)] + examples = subgroup_negative_examples.append(non_subgroup_positive_examples) + return compute_auc(examples[label]>0.5, examples[model_name]) + +def compute_bnsp_auc(df, subgroup, label, model_name): + """Computes the AUC of the within-subgroup positive examples and the background negative examples.""" + subgroup_positive_examples = df[(df[subgroup]>0.5) & (df[label]>0.5)] + non_subgroup_negative_examples = df[(df[subgroup]<=0.5) & (df[label]<=0.5)] + examples = subgroup_positive_examples.append(non_subgroup_negative_examples) + return compute_auc(examples[label]>0.5, examples[model_name]) + +def compute_bias_metrics_for_model(dataset, + subgroups, + model, + label_col, + include_asegs=False): + """Computes per-subgroup metrics for all subgroups and one model.""" + records = [] + for subgroup in subgroups: + record = { + 'subgroup': subgroup, + 'subgroup_size': len(dataset[dataset[subgroup]>0.5]) + } + record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model) + record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model) + record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model) + records.append(record) + return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True) + + +MODEL_NAME = 'model1' +test_df[MODEL_NAME]=torch.sigmoid(torch.tensor(valid_preds)).numpy() +TOXICITY_COLUMN = 'target' +bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, MODEL_NAME, 'target') +bias_metrics_df +get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, MODEL_NAME))
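+
+# Illustrative check on synthetic rows (not part of the original kernel).
+# A minimal sketch of what the BPSN/BNSP slices contain; the column names
+# ('male', 'target', 'model1') mirror the real data, but the rows are made up.
+toy = pd.DataFrame({
+    'male':   [1.0, 1.0, 0.0, 0.0],   # subgroup membership
+    'target': [0.0, 1.0, 1.0, 0.0],   # toxicity label
+    'model1': [0.1, 0.9, 0.8, 0.2],   # model score
+})
+# BPSN: subgroup negatives (row 0) vs background positives (row 2)
+print(compute_bpsn_auc(toy, 'male', 'target', 'model1'))
+# BNSP: subgroup positives (row 1) vs background negatives (row 3)
+print(compute_bnsp_auc(toy, 'male', 'target', 'model1'))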