Showing 8 changed files with 357 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
python3.8 test_intent.py --test_file "${1}" --ckpt_path ckpt/intent/best.pt --pred_file "${2}"
@@ -0,0 +1,93 @@
from typing import Dict

import torch
from torch.nn import Embedding


class SeqClassifier(torch.nn.Module):
    def __init__(
        self,
        embeddings: torch.Tensor,
        hidden_size: int,
        num_layers: int,
        dropout_rate: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SeqClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.embed = Embedding.from_pretrained(embeddings, freeze=False)
        # Model architecture: pretrained 300-d embeddings -> multi-layer GRU -> dropout -> linear classifier.
        self.dim_embeddings = 300
        self.gru = torch.nn.GRU(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        # self.lstm = torch.nn.LSTM(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, bias=False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(hidden_size * 2, num_class) if bidirectional else torch.nn.Linear(hidden_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # Output dimension of the RNN encoder (doubled when bidirectional).
        if self.bidirectional:
            return 2 * self.hidden_size
        return self.hidden_size

    def forward(self, batch) -> torch.Tensor:
        # batch: (batch_size, seq_len) token ids.
        context = self.embed(batch)
        # context_outs, (context_h_n, _) = self.lstm(context)
        _, context_h_n = self.gru(context)
        context_h_n = self.dropout(context_h_n)
        # Concatenate the final forward and backward hidden states of the last layer.
        out = torch.cat((context_h_n[-1], context_h_n[-2]), dim=-1) if self.bidirectional else context_h_n[-1]
        out = self.classifier(out)
        return out


class SlotClassifier(torch.nn.Module):
    def __init__(
        self,
        embeddings: torch.Tensor,
        hidden_size: int,
        num_layers: int,
        dropout_rate: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SlotClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_class = num_class
        self.embed = Embedding.from_pretrained(embeddings, freeze=False)
        # Model architecture: pretrained 300-d embeddings -> multi-layer GRU -> dropout -> per-token linear classifier.
        self.dim_embeddings = 300
        # self.lstm = torch.nn.LSTM(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.gru = torch.nn.GRU(self.dim_embeddings, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(hidden_size * 2, num_class) if bidirectional else torch.nn.Linear(hidden_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # Output dimension of the RNN encoder (doubled when bidirectional).
        if self.bidirectional:
            return 2 * self.hidden_size
        return self.hidden_size

    def forward(self, batch) -> torch.Tensor:
        # batch: (batch_size, seq_len) token ids.
        context = self.embed(batch)
        # context_outs, _ = self.lstm(context)
        context_outs, _ = self.gru(context)
        out = self.dropout(context_outs)
        out = self.classifier(out)
        # Flatten to (batch_size * seq_len, num_class) for per-token classification.
        out = out.view(-1, self.num_class)
        return out
@@ -0,0 +1 @@
python3.8 test_slot.py --test_file "${1}" --ckpt_path ckpt/slot/slot-best.pt --pred_file "${2}"
@@ -0,0 +1,106 @@
import json
import pickle
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Dict

import torch

from dataset import SeqClsDataset
from model import SeqClassifier
from utils import Vocab
from torch.utils.data import DataLoader

import pandas as pd


def main(args):
    with open(args.cache_dir / "vocab.pkl", "rb") as f:
        vocab: Vocab = pickle.load(f)

    intent_idx_path = args.cache_dir / "intent2idx.json"
    intent2idx: Dict[str, int] = json.loads(intent_idx_path.read_text())

    data = json.loads(args.test_file.read_text())
    dataset = SeqClsDataset(data, vocab, intent2idx, args.max_len)
    # Create DataLoader for the test dataset (no shuffling, so ids stay aligned with predictions).
    test_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    embeddings = torch.load(args.cache_dir / "embeddings.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SeqClassifier(
        embeddings,
        args.hidden_size,
        args.num_layers,
        args.dropout,
        args.bidirectional,
        dataset.num_classes,
    ).to(device)

    model.eval()

    # Load trained weights into the model.
    model.load_state_dict(torch.load(args.ckpt_path, map_location=torch.device(device)))

    # Predict over the test dataset.
    preds = []
    for idx, data in enumerate(test_loader):
        text = torch.Tensor(vocab.encode_batch([t.split(' ') for t in data['text']])).type(torch.LongTensor)
        text = text.to(device)
        outputs = model(text)
        preds += [dataset.idx2label(p.detach().cpu().item()) for p in outputs.argmax(dim=1)]

    # Write predictions to args.pred_file as a two-column CSV (id, intent).
    df = pd.DataFrame({'id': [f'test-{i}' for i in range(len(preds))], 'intent': preds})
    df.to_csv(args.pred_file, index=False)


def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument(
        "--test_file",
        type=Path,
        help="Path to the test file.",
        required=True,
    )
    parser.add_argument(
        "--cache_dir",
        type=Path,
        help="Directory to the preprocessed caches.",
        default="./cache/intent/",
    )
    parser.add_argument(
        "--ckpt_path",
        type=Path,
        help="Path to model checkpoint.",
        required=True,
    )
    parser.add_argument("--pred_file", type=Path, default="pred.intent.csv")

    # data
    parser.add_argument("--max_len", type=int, default=None)

    # model
    parser.add_argument("--hidden_size", type=int, default=512)
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--dropout", type=float, default=0.4)
    parser.add_argument("--bidirectional", type=bool, default=True)

    # data loader
    parser.add_argument("--batch_size", type=int, default=128)

    parser.add_argument(
        "--device", type=torch.device, help="cpu, cuda, cuda:0, cuda:1", default="cpu"
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)
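One caveat about the argument definitions above (it applies to both test scripts): argparse's type=bool converts any non-empty string to True, so passing --bidirectional False on the command line still yields True. A minimal demonstration:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--bidirectional", type=bool, default=True)

# bool("False") is True, because any non-empty string is truthy.
print(parser.parse_args(["--bidirectional", "False"]).bidirectional)  # True
print(parser.parse_args([]).bidirectional)                            # True (the default)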
@@ -0,0 +1,112 @@
import json
import pickle
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Dict

import torch

from dataset import SlotClsDataset
from model import SlotClassifier
from utils import Vocab
from torch.utils.data import DataLoader

import pandas as pd


def main(args):
    with open(args.cache_dir / "vocab.pkl", "rb") as f:
        vocab: Vocab = pickle.load(f)

    slot_idx_path = args.cache_dir / "tag2idx.json"
    tag2idx: Dict[str, int] = json.loads(slot_idx_path.read_text())

    data = json.loads(args.test_file.read_text())
    dataset = SlotClsDataset(data, vocab, tag2idx, args.max_len)
    # Create DataLoader for the test dataset (no shuffling, so ids stay aligned with predictions).
    test_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    embeddings = torch.load(args.cache_dir / "embeddings.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SlotClassifier(
        embeddings,
        args.hidden_size,
        args.num_layers,
        args.dropout,
        args.bidirectional,
        dataset.num_classes,
    ).to(device)

    model.eval()

    # Load trained weights into the model.
    model.load_state_dict(torch.load(args.ckpt_path, map_location=torch.device(device)))

    # Predict over the test dataset.
    preds = []
    with torch.no_grad():
        for idx, data in enumerate(test_loader):
            text = torch.Tensor(vocab.encode_batch([t.split(' ') for t in data['tokens']])).type(torch.LongTensor)
            bz, batch_seq_len = text.shape[0], text.shape[1]
            text = text.to(device)
            # Reshape the flattened per-token logits back to (batch, seq_len, 9 slot classes).
            outputs = model(text).view(bz, -1, 9)

            # Keep only the predictions for each sequence's true (unpadded) length.
            for i, length in enumerate(data['length']):
                preds += [[dataset.idx2label(p.detach().cpu().item()) for p in outputs[i, :length].argmax(dim=1)]]

    # Write predictions to args.pred_file as a two-column CSV (id, space-joined tags).
    sep = ' '
    preds = [sep.join(p) for p in preds]
    print('preds', len(preds), preds[0])

    df = pd.DataFrame({'id': [f'test-{i}' for i in range(len(preds))], 'tags': preds})
    df.to_csv(args.pred_file, index=False)


def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument(
        "--test_file",
        type=Path,
        help="Path to the test file.",
        required=True,
    )
    parser.add_argument(
        "--cache_dir",
        type=Path,
        help="Directory to the preprocessed caches.",
        default="./cache/slot/",
    )
    parser.add_argument(
        "--ckpt_path",
        type=Path,
        help="Path to model checkpoint.",
        required=True,
    )
    parser.add_argument("--pred_file", type=Path, default="pred.slot.csv")

    # data
    parser.add_argument("--max_len", type=int, default=None)

    # model
    parser.add_argument("--hidden_size", type=int, default=128)
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--dropout", type=float, default=0.4)
    parser.add_argument("--bidirectional", type=bool, default=True)

    # data loader
    parser.add_argument("--batch_size", type=int, default=128)

    parser.add_argument(
        "--device", type=torch.device, help="cpu, cuda, cuda:0, cuda:1", default="cpu"
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)
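The core of the slot prediction loop above is the reshape-and-trim step: SlotClassifier returns logits flattened to (batch * seq_len, num_class), which are reshaped back and truncated to each sequence's true length before argmax. A minimal sketch with random logits, assuming the hard-coded 9 slot classes from .view(bz, -1, 9):

import torch

num_class = 9                                          # hard-coded slot tag count used above
bz, seq_len = 2, 5
flat_logits = torch.randn(bz * seq_len, num_class)     # shape of SlotClassifier's output
logits = flat_logits.view(bz, -1, num_class)           # back to (batch, seq_len, num_class)

lengths = [3, 5]                                       # true (unpadded) token counts per sequence
preds = [logits[i, :lengths[i]].argmax(dim=1).tolist() for i in range(bz)]
print(preds)  # e.g. [[4, 1, 0], [2, 2, 7, 0, 3]] -- tag indices, trimmed to each length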
@@ -0,0 +1,44 @@
from typing import Iterable, List


class Vocab:
    PAD = "[PAD]"
    UNK = "[UNK]"

    def __init__(self, vocab: Iterable[str]) -> None:
        self.token2idx = {
            Vocab.PAD: 0,
            Vocab.UNK: 1,
            **{token: i for i, token in enumerate(vocab, 2)},
        }

    @property
    def pad_id(self) -> int:
        return self.token2idx[Vocab.PAD]

    @property
    def unk_id(self) -> int:
        return self.token2idx[Vocab.UNK]

    @property
    def tokens(self) -> List[str]:
        return list(self.token2idx.keys())

    def token_to_id(self, token: str) -> int:
        return self.token2idx.get(token, self.unk_id)

    def encode(self, tokens: List[str]) -> List[int]:
        return [self.token_to_id(token) for token in tokens]

    def encode_batch(
        self, batch_tokens: List[List[str]], to_len: int = None
    ) -> List[List[int]]:
        batch_ids = [self.encode(tokens) for tokens in batch_tokens]
        to_len = max(len(ids) for ids in batch_ids) if to_len is None else to_len
        padded_ids = pad_to_len(batch_ids, to_len, self.pad_id)
        return padded_ids


def pad_to_len(seqs: List[List[int]], to_len: int, padding: int) -> List[List[int]]:
    paddeds = [seq[:to_len] + [padding] * max(0, to_len - len(seq)) for seq in seqs]
    return paddeds
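As a quick usage sketch of the Vocab class above (the token list is illustrative): encode_batch looks up ids, maps out-of-vocabulary tokens to [UNK], and right-pads every sequence to the batch maximum (or to to_len when given):

from utils import Vocab

vocab = Vocab(["i", "want", "to", "book", "a", "flight"])  # illustrative vocabulary

print(vocab.encode(["book", "a", "table"]))
# [5, 6, 1] -- "table" is out of vocabulary, so it maps to unk_id (1)

print(vocab.encode_batch([["i", "want", "to"], ["book", "a", "flight", "now"]]))
# [[2, 3, 4, 0], [5, 6, 7, 1]] -- padded with pad_id (0) to the longest sequence in the batch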