multiclass & lit utils code for downscale experimentation
dtrizna committed Feb 20, 2024
1 parent 27ae622 commit 1f660f2
Showing 3 changed files with 379 additions and 36 deletions.
321 changes: 321 additions & 0 deletions evaluation/paper_sota/run_family_downscale_speakeasy.py
import os
REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")

import sys
sys.path.extend([REPO_ROOT, "."])

import time
import json
import logging
import numpy as np
from collections import defaultdict
from sklearn.utils import shuffle

from nebula.preprocessing.wrappers import (
    preprocess_nebula_speakeasy,
    preprocess_quovadis_speakeasy,
    preprocess_neurlux,
    preprocess_dmds_speakeasy
)

from nebula.models.neurlux import NeurLuxModel
from nebula.models.quovadis import QuoVadisModel
from nebula.models import TransformerEncoderChunks
from nebula.models.dmds import DMDSGatedCNN
from nebula.misc import set_random_seed, clear_cuda_cache
from nebula.constants import SPEAKEASY_LABELMAP

from torch import cuda
from torch.nn import CrossEntropyLoss
from nebula.lit_utils import LitTrainerWrapper
from nebula.misc import fix_random_seed
from sklearn.metrics import f1_score

LIMIT = None
INFOLDER = None # "out_family_downscale_speakeasy_1708346406" # if data is processed already

NEBULA_VOCAB = 50000
NEURLUX_VOCAB = 10000
QUO_VADIS_TOP_API = 600

RANDOM_SEED = 1763
SEQ_LEN = 512
SPEAKEASY_TRAINSET_PATH = os.path.join(REPO_ROOT, "data", "data_raw", "windows_emulation_trainset")
SPEAKEASY_TESTSET_PATH = os.path.join(REPO_ROOT, "data", "data_raw", "windows_emulation_testset")
SPEAKEASY_LABEL_IDX_TO_FAMILY = {v: k for k, v in SPEAKEASY_LABELMAP.items()}
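# SPEAKEASY_LABELMAP maps family names to integer class indices, so the
# inversion above recovers family names from label indices for logging.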

if __name__ == "__main__":

    fix_random_seed(RANDOM_SEED)

    if INFOLDER:
        out_folder_root = INFOLDER
    else:
        out_folder_root = f"out_family_downscale_speakeasy_{int(time.time())}"
    os.makedirs(out_folder_root, exist_ok=True)

    # =========== set up logging to both file and stdout
    logging.basicConfig(
        level=logging.WARNING,
        format="%(asctime)s %(levelname)s %(message)s",
        handlers=[
            logging.FileHandler(os.path.join(out_folder_root, f"log_{int(time.time())}.txt")),
            logging.StreamHandler()
        ]
    )
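    # NOTE: the root logger level is WARNING, so progress messages throughout
    # this script are emitted via logging.warning() to pass the threshold on
    # both the file and stdout handlers.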

    datafolders = {}
    datafolders['nebula'] = os.path.join(out_folder_root, f"data_nebula_speakeasy_vocab_{NEBULA_VOCAB}_seqlen_{SEQ_LEN}")
    datafolders['nebulawht'] = os.path.join(out_folder_root, f"data_nebulawht_speakeasy_vocab_{NEBULA_VOCAB}_seqlen_{SEQ_LEN}")
    datafolders['neurlux'] = os.path.join(out_folder_root, f"data_neurlux_speakeasy_vocab_{NEURLUX_VOCAB}_seqlen_{SEQ_LEN}")
    datafolders['quovadis'] = os.path.join(out_folder_root, f"data_quovadis_speakeasy_vocab_{QUO_VADIS_TOP_API}_seqlen_{SEQ_LEN}")
    datafolders['dmds'] = os.path.join(out_folder_root, f"data_dmds_speakeasy_vocab_seqlen_{SEQ_LEN}")

    # =========== 'nebula' & 'speakeasy' preprocessing
    logging.warning("Preprocessing 'nebula' & 'speakeasy'...")
    _, y_train, y_paths_train = preprocess_nebula_speakeasy(
        folder=SPEAKEASY_TRAINSET_PATH,
        limit=LIMIT,
        vocab_size=NEBULA_VOCAB,
        seq_len=SEQ_LEN,
        outfolder=datafolders['nebula'],
        multiclass=True
    )
    _, y_test, y_paths_test = preprocess_nebula_speakeasy(
        folder=SPEAKEASY_TESTSET_PATH,
        limit=LIMIT,
        vocab_size=NEBULA_VOCAB,
        seq_len=SEQ_LEN,
        outfolder=datafolders['nebula'],
        tokenizer_model=os.path.join(datafolders['nebula'], f"tokenizer_{NEBULA_VOCAB}.model"),
        multiclass=True
    )
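    # The first call fits a tokenizer on the train set and saves it to the
    # output folder; the second call reuses the saved model (tokenizer_model=...)
    # so the test set is encoded with the train-set vocabulary.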

    # =========== 'nebula' (whitespace tokenizer) & 'speakeasy' preprocessing
    logging.warning("Preprocessing 'nebula wht' & 'speakeasy'...")
    _, y_train, y_paths_train = preprocess_nebula_speakeasy(
        folder=SPEAKEASY_TRAINSET_PATH,
        limit=LIMIT,
        vocab_size=NEBULA_VOCAB,
        seq_len=SEQ_LEN,
        outfolder=datafolders['nebulawht'],
        multiclass=True,
        tokenizer_type="whitespace"
    )
    _, y_test, y_paths_test = preprocess_nebula_speakeasy(
        folder=SPEAKEASY_TESTSET_PATH,
        limit=LIMIT,
        vocab_size=NEBULA_VOCAB,
        seq_len=SEQ_LEN,
        outfolder=datafolders['nebulawht'],
        tokenizer_model=os.path.join(datafolders['nebulawht'], f"tokenizer_{NEBULA_VOCAB}_vocab.json"),
        multiclass=True,
        tokenizer_type="whitespace"
    )
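    # Unlike the default tokenizer above, the whitespace tokenizer persists its
    # vocabulary as a JSON file rather than a .model file, hence the different
    # tokenizer_model path.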

    # ============= 'dmds' & 'speakeasy' preprocessing
    preprocess_dmds_speakeasy(
        y_paths_train,
        y=y_train,
        seq_len=SEQ_LEN,
        outfolder=datafolders['dmds'],
        suffix="train",
        limit=LIMIT,
    )
    preprocess_dmds_speakeasy(
        y_paths_test,
        y=y_test,
        seq_len=SEQ_LEN,
        outfolder=datafolders['dmds'],
        suffix="test",
        limit=LIMIT,
    )
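    # 'dmds', 'neurlux', and 'quovadis' all reuse the raw report paths
    # (y_paths_train / y_paths_test) and labels produced by the nebula
    # preprocessing step above, so every model sees the same sample split.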

    # =========== 'neurlux' & 'speakeasy' preprocessing
    _, neurlux_vocab_file = preprocess_neurlux(
        y_paths_train,
        y=y_train,
        outfolder=datafolders['neurlux'],
        limit=LIMIT,
        vocab_size=NEURLUX_VOCAB,
        seq_len=SEQ_LEN
    )
    _ = preprocess_neurlux(
        y_paths_test,
        y=y_test,
        vocab_file=neurlux_vocab_file,
        outfolder=datafolders['neurlux'],
        limit=LIMIT,
        vocab_size=NEURLUX_VOCAB,
        seq_len=SEQ_LEN
    )

    # ============ 'quo vadis' & 'speakeasy' preprocessing
    _, quovadis_vocab_file = preprocess_quovadis_speakeasy(
        y_paths_train,
        y=y_train,
        seq_len=SEQ_LEN,
        top_api=QUO_VADIS_TOP_API,
        outfolder=datafolders['quovadis'],
        limit=LIMIT,
    )
    _ = preprocess_quovadis_speakeasy(
        y_paths_test,
        y=y_test,
        vocab_file=quovadis_vocab_file,
        seq_len=SEQ_LEN,
        top_api=QUO_VADIS_TOP_API,
        outfolder=datafolders['quovadis'],
        limit=LIMIT,
    )
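    # Each preprocessing wrapper is expected to write x_{train,test}_<suffix>.npy
    # and y_{train,test}_<suffix>.npy into its data folder; the training loop
    # below loads those arrays per model.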

    # ============= DEFINE MODELS =============
    models = defaultdict(dict)
    # count unique elements in y
    n_unique_y = len(set(y_train))
    logging.warning(f" [!] Multiclass classification with {n_unique_y} classes")

    with open(neurlux_vocab_file) as f:
        neurlux_vocab = json.load(f)
    models['neurlux']['class'] = NeurLuxModel
    models['neurlux']['config'] = {
        "embedding_dim": 256,
        "vocab_size": len(neurlux_vocab),
        "seq_len": SEQ_LEN,
        "num_classes": n_unique_y
    }

    models['quovadis']['class'] = QuoVadisModel
    models['quovadis']['config'] = {
        "vocab": quovadis_vocab_file,
        "seq_len": SEQ_LEN,
        "num_classes": n_unique_y
    }

    with open(os.path.join(datafolders['nebula'], f"tokenizer_{NEBULA_VOCAB}_vocab.json")) as f:
        nebula_vocab = json.load(f)
    models['nebula']['class'] = TransformerEncoderChunks
    models['nebula']['config'] = {
        "vocab_size": len(nebula_vocab),
        "maxlen": SEQ_LEN,
        "chunk_size": 64,
        "dModel": 64,  # embedding & transformer dimension
        "nHeads": 8,  # number of heads in nn.MultiheadAttention
        "dHidden": 256,  # dimension of the feedforward network model in nn.TransformerEncoder
        "nLayers": 2,  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        "numClasses": n_unique_y,
        "classifier_head": [64],
        "layerNorm": False,
        "dropout": 0.3,
        "pooling": "mean",
        "norm_first": True
    }

    with open(os.path.join(datafolders['nebulawht'], f"tokenizer_{NEBULA_VOCAB}_vocab.json")) as f:
        nebulawht_vocab = json.load(f)
    models['nebulawht']['class'] = TransformerEncoderChunks
    models['nebulawht']['config'] = {
        "vocab_size": len(nebulawht_vocab),
        "maxlen": SEQ_LEN,
        "chunk_size": 64,
        "dModel": 64,  # embedding & transformer dimension
        "nHeads": 8,  # number of heads in nn.MultiheadAttention
        "dHidden": 256,  # dimension of the feedforward network model in nn.TransformerEncoder
        "nLayers": 2,  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        "numClasses": n_unique_y,
        "classifier_head": [64],
        "layerNorm": False,
        "dropout": 0.3,
        "pooling": "mean",
        "norm_first": True
    }

    models['dmds']['class'] = DMDSGatedCNN
    models['dmds']['config'] = {
        "ndim": 98,
        "seq_len": SEQ_LEN,
        "num_classes": n_unique_y,
    }
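    # 'nebula' and 'nebulawht' share the same TransformerEncoderChunks
    # architecture; the only difference between the two runs is the tokenizer
    # (and therefore the vocabulary) used during preprocessing.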

    device = "cuda" if cuda.is_available() else "cpu"

    # ============= TRAINING LOOP =============
    for nr_of_families in range(3, n_unique_y + 1):
        set_random_seed(RANDOM_SEED)

        # subsample nr_of_families random labels from the full label set
        subsampled_labels = np.random.choice(np.unique(y_train), size=nr_of_families, replace=False)
        subsampled_families = [SPEAKEASY_LABEL_IDX_TO_FAMILY[x] for x in subsampled_labels]
        logging.warning(f" [!] Sampled labels: {subsampled_labels} | Families: {subsampled_families}")

        for run_name in models.keys():
            set_random_seed(RANDOM_SEED)

            out_folder = os.path.join(out_folder_root, f"training_{run_name}")
            family_folder = os.path.join(out_folder, f"{run_name}_nr_families_{nr_of_families}_csv")
            if os.path.exists(family_folder):
                logging.warning(f" [!] Skipping... Output folder for run {run_name} already exists: {family_folder}")
                continue

            # =============== DATA ===============
            # load into run-local names so the module-level y_train, which is
            # used above to draw subsampled_labels, is not clobbered
            suffix = LIMIT if LIMIT else "full"
            x_train_run = np.load(os.path.join(datafolders[run_name], f"x_train_{suffix}.npy"))
            y_train_run = np.load(os.path.join(datafolders[run_name], f"y_train_{suffix}.npy"))
            x_test_run = np.load(os.path.join(datafolders[run_name], f"x_test_{suffix}.npy"))
            y_test_run = np.load(os.path.join(datafolders[run_name], f"y_test_{suffix}.npy"))

            if LIMIT:
                x_train_run, y_train_run = shuffle(x_train_run, y_train_run, random_state=RANDOM_SEED)
                x_train_run = x_train_run[:LIMIT]
                y_train_run = y_train_run[:LIMIT]
                x_test_run, y_test_run = shuffle(x_test_run, y_test_run, random_state=RANDOM_SEED)
                x_test_run = x_test_run[:LIMIT]
                y_test_run = y_test_run[:LIMIT]

            # keep only the samples whose labels were subsampled above
            train_mask = np.isin(y_train_run, subsampled_labels)
            x_train_subsample = x_train_run[train_mask]
            y_train_subsample = y_train_run[train_mask]

            test_mask = np.isin(y_test_run, subsampled_labels)
            x_test_subsample = x_test_run[test_mask]
            y_test_subsample = y_test_run[test_mask]
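            # np.isin builds a boolean mask, e.g.
            # np.isin([0, 2, 5, 2], [2, 5]) -> [False, True, True, True],
            # so indexing with it keeps rows of the selected families only.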

logging.warning(f" [!] Sizes of subsampled train set: {x_train_subsample.shape}, {x_test_subsample.shape}")
logging.warning(f" [!] Sizes of subsampled test set: {y_train_subsample.shape}, {y_test_subsample.shape}")

            # ============= TRAINING =============
            logging.warning(f" [!!!] Starting training: {run_name}!")
            model_class = models[run_name]['class']
            model_config = models[run_name]['config']
            model = model_class(**model_config)

            lit_trainer = LitTrainerWrapper(
                pytorch_model=model,
                name=f"{run_name}_nr_families_{nr_of_families}",
                log_folder=out_folder,
                epochs=10,
                device=device,
                log_every_n_steps=1,
                scheduler="onecycle",
                batch_size=128,
                dataloader_workers=4,
                gradient_clip_val=1.0,
                loss=CrossEntropyLoss()
            )
            train_loader = lit_trainer.create_dataloader(x_train_subsample, y_train_subsample)
            test_loader = lit_trainer.create_dataloader(x_test_subsample, y_test_subsample, shuffle=False)
            lit_trainer.train_lit_model(train_loader=train_loader, val_loader=test_loader)
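            # NOTE: the test loader doubles as the validation loader here, so
            # reported validation metrics are computed on the test split.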

            # get f1 scores
            y_train_pred = lit_trainer.predict_lit_model(train_loader)
            f1_train = f1_score(y_train_subsample, y_train_pred, average='macro')
            logging.warning(f"[!] Train F1: {f1_train*100:.2f}%")

            y_test_pred = lit_trainer.predict_lit_model(test_loader)
            f1_test = f1_score(y_test_subsample, y_test_pred, average='macro')
            logging.warning(f"[!] Test F1: {f1_test*100:.2f}%")
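            # macro averaging weights every family equally in the F1 score,
            # regardless of its sample count, a common choice for imbalanced
            # multiclass data.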

            del x_train_subsample, y_train_subsample, x_test_subsample, y_test_subsample
            del x_train_run, y_train_run, x_test_run, y_test_run
            clear_cuda_cache()