Commit 9e021a3

added fastai to the mix

jrzaurin committed Nov 3, 2021
1 parent cec12f7 commit 9e021a3
Showing 11 changed files with 525 additions and 32 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -12,4 +12,7 @@ datasets/
 sftp*-config.json
 
 # misc
-.DS_store
+.DS_store
+
+# processed_data
+processed_data
15 changes: 11 additions & 4 deletions amazon_reviews_classification_HAN_vs_BERT/models/bert.py
@@ -1,5 +1,5 @@
 from torch import nn
-from transformers import BertModel
+from transformers import BertModel, DistilBertModel
 
 
 from typing import List
@@ -17,7 +17,11 @@ def __init__(
     ):
         super(BertClassifier, self).__init__()
 
-        self.bert = BertModel.from_pretrained(model_name)
+        self.bert = (
+            DistilBertModel.from_pretrained(model_name)
+            if "distil" in model_name
+            else BertModel.from_pretrained(model_name)
+        )
 
         classifier_dims = [768] + head_hidden_dim + [num_class]
         self.classifier = MLP(classifier_dims, head_dropout)
@@ -50,14 +54,17 @@ def __init__(
                     d_hidden[i - 1],
                     d_hidden[i],
                     dropout,
+                    activation=(i != len(d_hidden) - 1),
                 ),
             )
 
     def forward(self, X: Tensor) -> Tensor:
         return self.mlp(X)
 
     @staticmethod
-    def _dense_layer(inp: int, out: int, p: float):
+    def _dense_layer(inp: int, out: int, p: float, activation: bool):
         layers: List = [nn.Dropout(p)] if p > 0 else []
-        layers += [nn.Linear(inp, out), nn.ReLU(inplace=True)]
+        layers += [nn.Linear(inp, out)]
+        if activation:
+            layers += [nn.ReLU(inplace=True)]
         return nn.Sequential(*layers)
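
Note on the change above: with the new activation flag, the head's final nn.Linear is no longer followed by a ReLU, so the classifier emits raw, unbounded logits, which is what nn.CrossEntropyLoss expects. A minimal, self-contained sketch of the pattern (hypothetical dimensions, not the repo's exact MLP class):

import torch
from torch import nn
from typing import List


def dense_layer(inp: int, out: int, p: float, activation: bool) -> nn.Sequential:
    # dropout -> linear, with a ReLU only on hidden layers
    layers: List[nn.Module] = [nn.Dropout(p)] if p > 0 else []
    layers += [nn.Linear(inp, out)]
    if activation:  # skipped for the output layer
        layers += [nn.ReLU(inplace=True)]
    return nn.Sequential(*layers)


d_hidden = [768, 256, 64, 4]  # e.g. BERT hidden size -> head dims -> num_class
mlp = nn.Sequential(
    *[
        dense_layer(d_hidden[i - 1], d_hidden[i], 0.2, activation=(i != len(d_hidden) - 1))
        for i in range(1, len(d_hidden))
    ]
)
logits = mlp(torch.randn(2, 768))  # shape (2, 4); unbounded logits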
amazon_reviews_classification_HAN_vs_BERT/parsers/bert_parser.py
@@ -18,7 +18,7 @@ def parse_args():
         default="[256,64]",
         help="head hidden dimensions.",
     )
-    parser.add_argument("--head_dropout", type=float, default=0.1, help="head dropout.")
+    parser.add_argument("--head_dropout", type=float, default=0.2, help="head dropout.")
     parser.add_argument(
         "--model_name",
         type=str,
@@ -33,9 +33,9 @@ def parse_args():
     # Train/Test parameters
     parser.add_argument("--n_epochs", type=int, default=5, help="Number of epoch.")
     parser.add_argument("--weight_decay", type=float, default=0.0, help="l2 reg.")
-    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate.")
+    parser.add_argument("--lr", type=float, default=0.0005, help="Learning rate.")
     parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
-    parser.add_argument("--lr_scheduler", action="store_true", help="use lr scheduler.")
+    parser.add_argument("--with_scheduler", action="store_true", help="use lr scheduler.")
 
     parser.add_argument(
         "--save_results", action="store_true", help="Save model and results"
     )
137 changes: 137 additions & 0 deletions amazon_reviews_classification_HAN_vs_BERT/prepare_data.py
@@ -0,0 +1,137 @@
import numpy as np
import pandas as pd
import os
import pickle

from pathlib import Path
from sklearn.model_selection import train_test_split
from utils.tokenizers import HANTokenizer, BertFamilyTokenizer

RAW_DATA = Path("../datasets/amazon_reviews/")
PROCESSED_DATA = Path("processed_data/")
if not os.path.exists(PROCESSED_DATA):
    os.makedirs(PROCESSED_DATA)


def preprocess_bert(
    df,
    out_path,
    max_length=120,
    text_col="reviewText",
    pretrained_tokenizer="bert-base-uncased",
):

    train_dir = out_path / "train"
    valid_dir = out_path / "valid"
    test_dir = out_path / "test"
    paths = [train_dir, valid_dir, test_dir]
    for p in paths:
        if not os.path.exists(p):
            os.makedirs(p)
    tr_fname = "_".join([pretrained_tokenizer, "train.npz"])
    val_fname = "_".join([pretrained_tokenizer, "valid.npz"])
    te_fname = "_".join([pretrained_tokenizer, "test.npz"])

    texts = df[text_col].tolist()

    tok = BertFamilyTokenizer(
        pretrained_tokenizer=pretrained_tokenizer,
        do_lower_case=True,
        max_length=max_length,
    )

    bert_texts, bert_masks = tok.fit_transform(texts)

    X_train, X_valid, mask_train, mask_valid, y_train, y_valid = train_test_split(
        bert_texts,
        bert_masks,
        df.overall,
        train_size=0.8,
        random_state=1,
        stratify=df.overall,
    )

    X_valid, X_test, mask_valid, mask_test, y_valid, y_test = train_test_split(
        X_valid, mask_valid, y_valid, train_size=0.5, random_state=1, stratify=y_valid
    )

    np.savez(
        train_dir / tr_fname, X_train=X_train, mask_train=mask_train, y_train=y_train
    )
    np.savez(
        valid_dir / val_fname, X_valid=X_valid, mask_valid=mask_valid, y_valid=y_valid
    )
    np.savez(test_dir / te_fname, X_test=X_test, mask_test=mask_test, y_test=y_test)


def preprocess_han(df, out_path, text_col="reviewText"):

    train_dir = out_path / "train"
    valid_dir = out_path / "valid"
    test_dir = out_path / "test"
    paths = [train_dir, valid_dir, test_dir]
    for p in paths:
        if not os.path.exists(p):
            os.makedirs(p)
    tr_fname = "han_train.npz"
    val_fname = "han_valid.npz"
    te_fname = "han_test.npz"
    tok_name = "HANTokenizer.p"

    texts = df[text_col].tolist()

    tok = HANTokenizer()

    han_texts = tok.fit_transform(texts)
    with open(out_path / tok_name, "wb") as f:
        pickle.dump(tok, f)

    X_train, X_valid, y_train, y_valid = train_test_split(
        han_texts, df.overall, train_size=0.8, random_state=1, stratify=df.overall
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_valid, y_valid, train_size=0.5, random_state=1, stratify=y_valid
    )

    np.savez(train_dir / tr_fname, X_train=X_train, y_train=y_train)
    np.savez(valid_dir / val_fname, X_valid=X_valid, y_valid=y_valid)
    np.savez(test_dir / te_fname, X_test=X_test, y_test=y_test)


def write_or_read_csv():

    inp_fname = RAW_DATA / "reviews_Clothing_Shoes_and_Jewelry_5.json.gz"
    out_fname = PROCESSED_DATA / "reviews_Clothing_Shoes_and_Jewelry.csv"

    if out_fname.exists():
        return pd.read_csv(out_fname)
    else:
        df_org = pd.read_json(inp_fname, lines=True)

        # classes from [0,num_class)
        df = df_org.copy()
        df["overall"] = (df["overall"] - 1).astype("int64")

        # group reviews with 1 and 2 scores into one class
        df.loc[df.overall == 0, "overall"] = 1

        # and back again to [0,num_class)
        df["overall"] = (df["overall"] - 1).astype("int64")

        # aggressive preprocessing: drop short reviews
        df["reviewLength"] = df.reviewText.apply(lambda x: len(x.split(" ")))
        df = df[df.reviewLength >= 5]
        df = df.drop("reviewLength", axis=1).reset_index()
        df.to_csv(out_fname, index=False)

        return df


if __name__ == "__main__":

    df = write_or_read_csv()

    # prepare the arrays
    preprocess_han(df, PROCESSED_DATA)
    preprocess_bert(df, PROCESSED_DATA)
    preprocess_bert(df, PROCESSED_DATA, pretrained_tokenizer="distilbert-base-uncased")
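
For reference, the arrays saved above can be read back with np.load; a minimal sketch (the file names follow the "<pretrained_tokenizer>_train.npz" convention used in preprocess_bert):

import numpy as np
from pathlib import Path

data_dir = Path("processed_data/")  # same location as PROCESSED_DATA above
arrs = np.load(data_dir / "train" / "bert-base-uncased_train.npz")
X_train = arrs["X_train"]        # token ids
mask_train = arrs["mask_train"]  # attention masks
y_train = arrs["y_train"]        # labels in [0, num_class)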
14 changes: 7 additions & 7 deletions amazon_reviews_classification_HAN_vs_BERT/run_bert.py
@@ -12,7 +12,7 @@
 from transformers import AdamW, get_linear_schedule_with_warmup
 
 from models.bert import BertClassifier
-from utils.bert_parser import parse_args
+from parsers.bert_parser import parse_args
 from utils.metrics import CategoricalAccuracy
 
 n_cpus = os.cpu_count()
@@ -40,7 +40,8 @@ def train_step(model, optimizer, train_loader, epoch, metric, scheduler=None):
         loss.backward()
         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
         optimizer.step()
-        scheduler.step()
+        if scheduler is not None:
+            scheduler.step()
 
         running_loss += loss.item()
         avg_loss = running_loss / (batch_idx + 1)
@@ -151,9 +152,9 @@ def load_arrays_and_return_loaders(
         train_dir=data_dir / "train",
         valid_dir=data_dir / "valid",
         test_dir=data_dir / "test",
-        ftrain="bert_train.npz",
-        fvalid="bert_valid.npz",
-        ftest="bert_test.npz",
+        ftrain="_".join([args.model_name, "train.npz"]),
+        fvalid="_".join([args.model_name, "valid.npz"]),
+        ftest="_".join([args.model_name, "test.npz"]),
         batch_size=args.batch_size,
     )
 
@@ -164,7 +165,6 @@ def load_arrays_and_return_loaders(
         head_dropout=args.head_dropout,
         num_class=args.num_class,
     )
-    # Tell PyTorch to run the model on GPU
     model.to(device)
 
     optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
@@ -211,5 +211,5 @@ def load_arrays_and_return_loaders(
     )
     results_d["best_epoch"] = best_epoch
 
-    with open(args.log_dir / filename, "wb") as f:
+    with open(log_dir / filename, "wb") as f:
         pickle.dump(results_d, f)
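
For context, the scheduler-is-None guard in train_step pairs with the renamed --with_scheduler flag. A sketch of plausible wiring using the already-imported get_linear_schedule_with_warmup (the 10% warmup fraction is an assumption, not taken from the repo):

scheduler = None
if args.with_scheduler:
    # one scheduler step per batch, over all epochs
    num_training_steps = args.n_epochs * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),  # assumed warmup fraction
        num_training_steps=num_training_steps,
    )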
6 changes: 6 additions & 0 deletions amazon_reviews_classification_HAN_vs_BERT/run_bert.sh
@@ -0,0 +1,6 @@
python run_bert.py --model_name "distilbert-base-uncased" --freeze_bert --lr 0.001 --save_results
python run_bert.py --model_name "distilbert-base-uncased" --freeze_bert --head_hidden_dim "[256]" --lr 0.001 --save_results

python run_bert.py --model_name "distilbert-base-uncased" --batch_size 32 --lr 5e-5 --save_results
python run_bert.py --model_name "distilbert-base-uncased" --with_scheduler --batch_size 32 --lr 5e-5 --save_results
python run_bert.py --model_name "distilbert-base-uncased" --with_scheduler --batch_size 32 --lr 1e-4 --save_results
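
The --freeze_bert runs above train only the classification head, at a higher learning rate than the full fine-tuning runs. The flag's implementation is not part of this diff; a common pattern it likely corresponds to (hypothetical sketch):

if args.freeze_bert:
    # freeze the pretrained encoder; only the MLP head receives gradients
    for param in model.bert.parameters():
        param.requires_grad = False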