Commit

add test code

rubycheen committed Mar 21, 2022
1 parent b3c3fb5 commit 5c8f56e
Showing 8 changed files with 357 additions and 0 deletions.
Binary file added hw1/r10946029/ckpt/intent/best.pt
Binary file added hw1/r10946029/ckpt/slot/slot-best.pt
1 change: 1 addition & 0 deletions hw1/r10946029/intent_cls.sh
@@ -0,0 +1 @@
python3.8 test_intent.py --test_file "${1}" --ckpt_path ckpt/intent/best.pt --pred_file "${2}"
93 changes: 93 additions & 0 deletions hw1/r10946029/model.py
@@ -0,0 +1,93 @@
from typing import Dict

import torch
from torch.nn import Embedding

class SeqClassifier(torch.nn.Module):
    def __init__(
        self,
        embeddings: torch.Tensor,
        hidden_size: int,
        num_layers: int,
        dropout_rate: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SeqClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.embed = Embedding.from_pretrained(embeddings, freeze=False)
        # Model architecture: pretrained embeddings -> GRU encoder -> linear classifier
        self.dim_embeddings = 300  # dimension of the pretrained word vectors
        self.gru = torch.nn.GRU(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        # self.lstm = torch.nn.LSTM(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, bias=False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(hidden_size * 2, num_class) if bidirectional else torch.nn.Linear(hidden_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # output dimension of the RNN encoder
        if self.bidirectional:
            return 2 * self.hidden_size
        return self.hidden_size

    def forward(self, batch) -> torch.Tensor:
        # Forward pass for intent classification.
        # batch: (batch_size, seq_len) LongTensor of token ids
        # Embed the tokens, encode with the GRU, and classify from the final hidden state.
        context = self.embed(batch)
        # context_outs, (context_h_n, _) = self.lstm(context)
        _, context_h_n = self.gru(context)
        # context_h_n: (num_layers * num_directions, batch_size, hidden_size)
        context_h_n = self.dropout(context_h_n)
        out = torch.cat((context_h_n[-1], context_h_n[-2]), dim=-1) if self.bidirectional else context_h_n[-1]
        # concatenation of the last layer's forward and backward hidden states
        out = self.classifier(out)
        return out


class SlotClassifier(torch.nn.Module):
    def __init__(
        self,
        embeddings: torch.Tensor,
        hidden_size: int,
        num_layers: int,
        dropout_rate: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SlotClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_class = num_class
        self.embed = Embedding.from_pretrained(embeddings, freeze=False)
        # Model architecture: pretrained embeddings -> GRU encoder -> per-token linear classifier
        self.dim_embeddings = 300  # dimension of the pretrained word vectors
        # self.lstm = torch.nn.LSTM(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.gru = torch.nn.GRU(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(hidden_size * 2, num_class) if bidirectional else torch.nn.Linear(hidden_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # output dimension of the RNN encoder
        if self.bidirectional:
            return 2 * self.hidden_size
        return self.hidden_size

    def forward(self, batch) -> torch.Tensor:
        # Forward pass for slot tagging.
        # batch: (batch_size, seq_len) LongTensor of token ids
        # Embed the tokens, encode with the GRU, and classify every time step.
        context = self.embed(batch)
        # context_outs, _ = self.lstm(context)
        context_outs, _ = self.gru(context)
        # context_outs: (batch_size, seq_len, hidden_size * num_directions)
        out = self.dropout(context_outs)
        # per-token logits for each time step
        out = self.classifier(out)
        out = out.view(-1, self.num_class)
        # flattened to (batch_size * seq_len, num_class) for prediction
        return out
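
As a quick sanity check of the classifiers above, the following sketch instantiates SeqClassifier with randomly initialized embeddings and runs a dummy batch through it. The vocabulary size, hyperparameters, and class count here are made-up values for illustration, not the settings of the submitted checkpoint.

import torch

from model import SeqClassifier

# Hypothetical sizes, for illustration only; embed_dim must be 300 to match dim_embeddings.
vocab_size, embed_dim, num_intents = 1000, 300, 150
dummy_embeddings = torch.randn(vocab_size, embed_dim)

model = SeqClassifier(
    embeddings=dummy_embeddings,
    hidden_size=512,
    num_layers=2,
    dropout_rate=0.4,
    bidirectional=True,
    num_class=num_intents,
)

# A fake batch of 8 sequences, each 16 token ids long.
batch = torch.randint(0, vocab_size, (8, 16))
logits = model(batch)
print(logits.shape)  # torch.Size([8, 150]) -- one logit vector per sequence
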
1 change: 1 addition & 0 deletions hw1/r10946029/slot_tag.sh
@@ -0,0 +1 @@
python3.8 test_slot.py --test_file "${1}" --ckpt_path ckpt/slot/slot-best.pt --pred_file "${2}"
106 changes: 106 additions & 0 deletions hw1/r10946029/test_intent.py
@@ -0,0 +1,106 @@
import json
import pickle
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Dict

import torch

from dataset import SeqClsDataset
from model import SeqClassifier
from utils import Vocab
from torch.utils.data import DataLoader

import pandas as pd


def main(args):
    with open(args.cache_dir / "vocab.pkl", "rb") as f:
        vocab: Vocab = pickle.load(f)

    intent_idx_path = args.cache_dir / "intent2idx.json"
    intent2idx: Dict[str, int] = json.loads(intent_idx_path.read_text())

    data = json.loads(args.test_file.read_text())
    dataset = SeqClsDataset(data, vocab, intent2idx, args.max_len)
    # Create DataLoader for the test dataset
    test_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    embeddings = torch.load(args.cache_dir / "embeddings.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SeqClassifier(
        embeddings,
        args.hidden_size,
        args.num_layers,
        args.dropout,
        args.bidirectional,
        dataset.num_classes,
    ).to(device)


    model.eval()

    # load weights into model
    model.load_state_dict(torch.load(args.ckpt_path, map_location=torch.device(device)))

    # Predict on the test dataset
    preds = []
    with torch.no_grad():
        for idx, data in enumerate(test_loader):
            text = torch.Tensor(vocab.encode_batch([t.split(' ') for t in data['text']])).type(torch.LongTensor)
            text = text.to(device)
            outputs = model(text)
            preds += [dataset.idx2label(p.detach().cpu().item()) for p in outputs.argmax(dim=1)]

    # Write predictions to args.pred_file as a CSV with columns (id, intent);
    # ids are generated as test-0, test-1, ...

    df = pd.DataFrame({'id': [f'test-{i}' for i in range(len(preds))], 'intent': preds})
    df.to_csv(args.pred_file, index=False)

def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument(
        "--test_file",
        type=Path,
        help="Path to the test file.",
        required=True
    )
    parser.add_argument(
        "--cache_dir",
        type=Path,
        help="Directory to the preprocessed caches.",
        default="./cache/intent/",
    )
    parser.add_argument(
        "--ckpt_path",
        type=Path,
        help="Path to model checkpoint.",
        required=True
    )
    parser.add_argument("--pred_file", type=Path, default="pred.intent.csv")

    # data
    parser.add_argument("--max_len", type=int, default=None)

    # model
    parser.add_argument("--hidden_size", type=int, default=512)
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--dropout", type=float, default=0.4)
    parser.add_argument("--bidirectional", type=bool, default=True)

    # data loader
    parser.add_argument("--batch_size", type=int, default=128)

    parser.add_argument(
        "--device", type=torch.device, help="cpu, cuda, cuda:0, cuda:1", default="cpu"
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)
112 changes: 112 additions & 0 deletions hw1/r10946029/test_slot.py
@@ -0,0 +1,112 @@
import json
import pickle
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Dict

import torch

from dataset import SlotClsDataset
from model import SlotClassifier
from utils import Vocab
from torch.utils.data import DataLoader

import pandas as pd


def main(args):
    with open(args.cache_dir / "vocab.pkl", "rb") as f:
        vocab: Vocab = pickle.load(f)

    slot_idx_path = args.cache_dir / "tag2idx.json"
    tag2idx: Dict[str, int] = json.loads(slot_idx_path.read_text())

    data = json.loads(args.test_file.read_text())
    dataset = SlotClsDataset(data, vocab, tag2idx, args.max_len)
    # Create DataLoader for the test dataset
    test_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    embeddings = torch.load(args.cache_dir / "embeddings.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SlotClassifier(
        embeddings,
        args.hidden_size,
        args.num_layers,
        args.dropout,
        args.bidirectional,
        dataset.num_classes,
    ).to(device)


    model.eval()

    # load weights into model
    model.load_state_dict(torch.load(args.ckpt_path, map_location=torch.device(device)))

    # Predict on the test dataset
    preds = []
    with torch.no_grad():
        for idx, data in enumerate(test_loader):
            text = torch.Tensor(vocab.encode_batch([t.split(' ') for t in data['tokens']])).type(torch.LongTensor)
            bz, batch_seq_len = text.shape[0], text.shape[1]
            # seq_str_tags = [t.split(' ') for t in data['tags']]
            text = text.to(device)
            # reshape the flattened logits back to (batch_size, seq_len, num_class)
            outputs = model(text).view(bz, -1, dataset.num_classes)

            # keep only the predictions for the real (unpadded) tokens of each sequence
            for i, length in enumerate(data['length']):
                preds += [[dataset.idx2label(p.detach().cpu().item()) for p in outputs[i, :length].argmax(dim=1)]]

    # Write predictions to args.pred_file as a CSV with columns (id, tags)
    sep = ' '
    preds = [sep.join(p) for p in preds]
    print('preds', len(preds), preds[0])

    df = pd.DataFrame({'id': [f'test-{i}' for i in range(len(preds))], 'tags': preds})
    df.to_csv(args.pred_file, index=False)

def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument(
        "--test_file",
        type=Path,
        help="Path to the test file.",
        required=True
    )
    parser.add_argument(
        "--cache_dir",
        type=Path,
        help="Directory to the preprocessed caches.",
        default="./cache/slot/",
    )
    parser.add_argument(
        "--ckpt_path",
        type=Path,
        help="Path to model checkpoint.",
        required=True
    )
    parser.add_argument("--pred_file", type=Path, default="pred.slot.csv")

    # data
    parser.add_argument("--max_len", type=int, default=None)

    # model
    parser.add_argument("--hidden_size", type=int, default=128)
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--dropout", type=float, default=0.4)
    parser.add_argument("--bidirectional", type=bool, default=True)

    # data loader
    parser.add_argument("--batch_size", type=int, default=128)

    parser.add_argument(
        "--device", type=torch.device, help="cpu, cuda, cuda:0, cuda:1", default="cpu"
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)
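
To make the slot post-processing above easier to follow, here is a minimal sketch of the reshape and length-based slicing on dummy tensors; the 9 slot classes and the sequence lengths are assumed values for illustration.

import torch

num_class = 9                 # assumed number of slot tags
bz = 2                        # dummy batch of 2 padded sequences
flat_logits = torch.randn(bz * 5, num_class)   # what SlotClassifier returns for seq_len = 5

# Undo the flattening done inside the model, then keep only the real tokens.
outputs = flat_logits.view(bz, -1, num_class)
lengths = [3, 5]              # hypothetical unpadded lengths per example
preds = []
for i, length in enumerate(lengths):
    preds.append(outputs[i, :length].argmax(dim=1).tolist())

print(preds)  # e.g. [[4, 0, 7], [1, 1, 8, 0, 2]] -- one tag index per real token
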
44 changes: 44 additions & 0 deletions hw1/r10946029/utils.py
@@ -0,0 +1,44 @@
from typing import Iterable, List


class Vocab:
    PAD = "[PAD]"
    UNK = "[UNK]"

    def __init__(self, vocab: Iterable[str]) -> None:
        self.token2idx = {
            Vocab.PAD: 0,
            Vocab.UNK: 1,
            **{token: i for i, token in enumerate(vocab, 2)},
        }

    @property
    def pad_id(self) -> int:
        return self.token2idx[Vocab.PAD]

    @property
    def unk_id(self) -> int:
        return self.token2idx[Vocab.UNK]

    @property
    def tokens(self) -> List[str]:
        return list(self.token2idx.keys())

    def token_to_id(self, token: str) -> int:
        return self.token2idx.get(token, self.unk_id)

    def encode(self, tokens: List[str]) -> List[int]:
        return [self.token_to_id(token) for token in tokens]

    def encode_batch(
        self, batch_tokens: List[List[str]], to_len: int = None
    ) -> List[List[int]]:
        batch_ids = [self.encode(tokens) for tokens in batch_tokens]
        to_len = max(len(ids) for ids in batch_ids) if to_len is None else to_len
        padded_ids = pad_to_len(batch_ids, to_len, self.pad_id)
        return padded_ids


def pad_to_len(seqs: List[List[int]], to_len: int, padding: int) -> List[List[int]]:
    paddeds = [seq[:to_len] + [padding] * max(0, to_len - len(seq)) for seq in seqs]
    return paddeds
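
For reference, a small usage example of Vocab and its padding behaviour; the toy vocabulary and token lists are made up.

from utils import Vocab

vocab = Vocab(["hello", "world"])          # ids: [PAD]=0, [UNK]=1, hello=2, world=3
batch = [["hello", "world"], ["hello", "there"]]

print(vocab.encode_batch(batch))
# [[2, 3], [2, 1]] -- "there" falls back to [UNK]; rows already have the batch max length

print(vocab.encode_batch(batch, to_len=4))
# [[2, 3, 0, 0], [2, 1, 0, 0]] -- padded with pad_id up to to_len
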
