Skip to content

Commit

Permalink
Updated Parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
ashneg authored May 26, 2019
1 parent 6c3af01 commit af01e96
Showing 1 changed file with 238 additions and 0 deletions.
238 changes: 238 additions & 0 deletions NLP_using_BERT.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,241 @@
import pickle
from apex import amp
import shutil

device=torch.device('cuda')

MAX_SEQUENCE_LENGTH = 220
SEED = 1234
EPOCHS = 1
Data_dir="../input/jigsaw-unintended-bias-in-toxicity-classification"
Input_dir = "../input"
WORK_DIR = "../working/"
num_to_load=1000000 #Train size to match time limit
valid_size= 100000 #Validation Size
TOXICITY_COLUMN = 'target'

# Add the Bart Pytorch repo to the PATH
# using files from: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

# Translate model from tensorflow to pytorch
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
BERT_MODEL_PATH + 'bert_model.ckpt',
BERT_MODEL_PATH + 'bert_config.json',
WORK_DIR + 'pytorch_model.bin')

shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')

os.listdir("../working")

# This is the Bert configuration file
from pytorch_pretrained_bert import BertConfig

bert_config = BertConfig('../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'+'bert_config.json')

# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(example, max_seq_length,tokenizer):
max_seq_length -=2
all_tokens = []
longer = 0
for text in tqdm_notebook(example):
tokens_a = tokenizer.tokenize(text)
if len(tokens_a)>max_seq_length:
tokens_a = tokens_a[:max_seq_length]
longer += 1
one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
all_tokens.append(one_token)
print(longer)
return np.array(all_tokens)

BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

%%time
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)
train_df = pd.read_csv(os.path.join(Data_dir,"train.csv")).sample(num_to_load+valid_size,random_state=SEED)
print('loaded %d records' % len(train_df))

# Make sure all comment_text values are strings
train_df['comment_text'] = train_df['comment_text'].astype(str)

sequences = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)
train_df=train_df.fillna(0)
# List all identities
identity_columns = [
'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
y_columns=['target']

train_df = train_df.drop(['comment_text'],axis=1)
# convert target to 0,1
train_df['target']=(train_df['target']>=0.5).astype(float)


X = sequences[:num_to_load]
y = train_df[y_columns].values[:num_to_load]
X_val = sequences[num_to_load:]
y_val = train_df[y_columns].values[num_to_load:]

test_df=train_df.tail(valid_size).copy()
train_df=train_df.head(num_to_load)



train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.float))

output_model_file = "bert_pytorch.bin"

lr=2e-5
batch_size = 32
accumulation_steps=1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained("../working",cache_dir=None,num_labels=len(y_columns))
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
train = train_dataset

num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
lr=lr,
warmup=0.05,
t_total=num_train_optimization_steps)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
model=model.train()

tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
avg_loss = 0.
avg_accuracy = 0.
lossf=None
tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
for i,(x_batch, y_batch) in tk0:
optimizer.zero_grad()
y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
loss = F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if (i+1) % accumulation_steps == 0: # Wait for several backward steps
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad()
if lossf:
lossf = 0.98*lossf+0.02*loss.item()
else:
lossf = loss.item()
tk0.set_postfix(loss = lossf)
avg_loss += loss.item() / len(train_loader)
avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)


torch.save(model.state_dict(), output_model_file)

# Run validation
# The following 2 lines are not needed but show how to download the model for prediction
model = BertForSequenceClassification(bert_config,num_labels=len(y_columns))
model.load_state_dict(torch.load(output_model_file ))
model.to(device)
for param in model.parameters():
param.requires_grad=False
model.eval()
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
for i,(x_batch,) in enumerate(tk0):
pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
valid_preds[i*32:(i+1)*32]=pred[:,0].detach().cpu().squeeze().numpy()

# From baseline kernel

def calculate_overall_auc(df, model_name):
true_labels = df[TOXICITY_COLUMN]>0.5
predicted_labels = df[model_name]
return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
total = sum(np.power(series, p))
return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
bias_score = np.average([
power_mean(bias_df[SUBGROUP_AUC], POWER),
power_mean(bias_df[BPSN_AUC], POWER),
power_mean(bias_df[BNSP_AUC], POWER)
])
return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)



SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc' # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc' # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
try:
return metrics.roc_auc_score(y_true, y_pred)
except ValueError:
return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
subgroup_examples = df[df[subgroup]>0.5]
return compute_auc((subgroup_examples[label]>0.5), subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
"""Computes the AUC of the within-subgroup negative examples and the background positive examples."""
subgroup_negative_examples = df[(df[subgroup]>0.5) & (df[label]<=0.5)]
non_subgroup_positive_examples = df[(df[subgroup]<=0.5) & (df[label]>0.5)]
examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
"""Computes the AUC of the within-subgroup positive examples and the background negative examples."""
subgroup_positive_examples = df[(df[subgroup]>0.5) & (df[label]>0.5)]
non_subgroup_negative_examples = df[(df[subgroup]<=0.5) & (df[label]<=0.5)]
examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bias_metrics_for_model(dataset,
subgroups,
model,
label_col,
include_asegs=False):
"""Computes per-subgroup metrics for all subgroups and one model."""
records = []
for subgroup in subgroups:
record = {
'subgroup': subgroup,
'subgroup_size': len(dataset[dataset[subgroup]>0.5])
}
record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
records.append(record)
return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)


MODEL_NAME = 'model1'
test_df[MODEL_NAME]=torch.sigmoid(torch.tensor(valid_preds)).numpy()
TOXICITY_COLUMN = 'target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, MODEL_NAME, 'target')
bias_metrics_df
get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, MODEL_NAME))

0 comments on commit af01e96

Please sign in to comment.