Add files via upload

MWXGOD · Aug 12, 2023 · 383647b · 383647b
1 parent c66ed63
commit 383647b
Show file tree

Hide file tree

Showing 15 changed files with 2,213 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 mwxgod
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/config.py b/config.py
@@ -0,0 +1,50 @@
+import json
+
+class Config:
+    def __init__(self, args):
+        with open(args.config, "r", encoding="utf-8") as f:
+            config = json.load(f)
+
+        self.dataset = config["dataset"]
+        self.dev_name = config["dev_name"]
+        self.test_name = config["test_name"]
+        self.train_name = config["train_name"]
+
+        self.save_path = config["save_path"]
+        self.predict_path = config["predict_path"]
+
+        self.conv_num = config["conv_num"]
+        self.att_redim = config["att_redim"]
+        self.att_hidden_dim = config["att_hidden_dim"]
+        self.dist_emb_size = config["dist_emb_size"]
+        self.type_emb_size = config["type_emb_size"]
+        self.lstm_hid_size = config["lstm_hid_size"]
+        self.conv_hid_size = config["conv_hid_size"]
+        self.bert_hid_size = config["bert_hid_size"]
+        self.ffnn_hid_size = config["ffnn_hid_size"]
+
+
+        self.emb_dropout = config["emb_dropout"]
+        self.conv_dropout = config["conv_dropout"]
+        self.out_dropout = config["out_dropout"]
+
+        self.epochs = config["epochs"]
+        self.batch_size = config["batch_size"]
+
+        self.learning_rate = config["learning_rate"]
+        self.weight_decay = config["weight_decay"]
+        self.clip_grad_norm = config["clip_grad_norm"]
+        self.bert_name = config["bert_name"]
+        self.bert_learning_rate = config["bert_learning_rate"]
+        self.warm_factor = config["warm_factor"]
+
+        self.use_bert_last_4_layers = config["use_bert_last_4_layers"]
+
+        self.seed = config["seed"]
+
+        for k, v in args.__dict__.items():
+            if v is not None:
+                self.__dict__[k] = v
+
+    def __repr__(self):
+        return "{}".format(self.__dict__.items())
diff --git a/data_loader.py b/data_loader.py
@@ -0,0 +1,213 @@
+import json
+import torch
+from torch.utils.data import Dataset
+from torch.nn.utils.rnn import pad_sequence
+import numpy as np
+import prettytable as pt
+from gensim.models import KeyedVectors
+from transformers import AutoTokenizer
+import os
+import utils
+import requests
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+dis2idx = np.zeros((1000), dtype='int64')
+dis2idx[1] = 1
+dis2idx[2:] = 2
+dis2idx[4:] = 3
+dis2idx[8:] = 4
+dis2idx[16:] = 5
+dis2idx[32:] = 6
+dis2idx[64:] = 7
+dis2idx[128:] = 8
+dis2idx[256:] = 9
+
+
+class Vocabulary(object):
+    PAD = '<pad>'
+    UNK = '<unk>'
+    SUC = '<suc>'
+
+    def __init__(self):
+        self.label2id = {self.PAD: 0, self.SUC: 1}
+        self.id2label = {0: self.PAD, 1: self.SUC}
+
+    def add_label(self, label):
+        label = label.lower()
+        if label not in self.label2id:
+            self.label2id[label] = len(self.label2id)
+            self.id2label[self.label2id[label]] = label
+
+        assert label == self.id2label[self.label2id[label]]
+
+    def __len__(self):
+        return len(self.token2id)
+
+    def label_to_id(self, label):
+        label = label.lower()
+        return self.label2id[label]
+
+    def id_to_label(self, i):
+        return self.id2label[i]
+
+def collate_fn(data):
+    bert_inputs, grid_labels, grid_mask2d, pieces2word, dist_inputs, sent_length, entity_text = map(list, zip(*data))
+
+    max_tok = np.max(sent_length)
+    sent_length = torch.LongTensor(sent_length)
+    max_pie = np.max([x.shape[0] for x in bert_inputs])
+    bert_inputs = pad_sequence(bert_inputs, True)
+    batch_size = bert_inputs.size(0)
+
+    def fill(data, new_data):
+        for j, x in enumerate(data):
+            new_data[j, :x.shape[0], :x.shape[1]] = x
+        return new_data
+
+    dis_mat = torch.zeros((batch_size, max_tok, max_tok), dtype=torch.long)
+    dist_inputs = fill(dist_inputs, dis_mat)
+    labels_mat = torch.zeros((batch_size, max_tok, max_tok), dtype=torch.long)
+    grid_labels = fill(grid_labels, labels_mat)
+    mask2d_mat = torch.zeros((batch_size, max_tok, max_tok), dtype=torch.bool)
+    grid_mask2d = fill(grid_mask2d, mask2d_mat)
+    sub_mat = torch.zeros((batch_size, max_tok, max_pie), dtype=torch.bool)
+    pieces2word = fill(pieces2word, sub_mat)
+
+    return bert_inputs, grid_labels, grid_mask2d, pieces2word, dist_inputs, sent_length, entity_text
+
+
+class RelationDataset(Dataset):
+    def __init__(self, bert_inputs, grid_labels, grid_mask2d, pieces2word, dist_inputs, sent_length, entity_text):
+        self.bert_inputs = bert_inputs
+        self.grid_labels = grid_labels
+        self.grid_mask2d = grid_mask2d
+        self.pieces2word = pieces2word
+        self.dist_inputs = dist_inputs
+        self.sent_length = sent_length
+        self.entity_text = entity_text
+
+    def __getitem__(self, item):
+        return torch.LongTensor(self.bert_inputs[item]), \
+               torch.LongTensor(self.grid_labels[item]), \
+               torch.LongTensor(self.grid_mask2d[item]), \
+               torch.LongTensor(self.pieces2word[item]), \
+               torch.LongTensor(self.dist_inputs[item]), \
+               self.sent_length[item], \
+               self.entity_text[item]
+
+    def __len__(self):
+        return len(self.bert_inputs)
+
+
+def process_bert(data, tokenizer, vocab):
+
+    bert_inputs = []
+    grid_labels = []
+    grid_mask2d = []
+    dist_inputs = []
+    entity_text = []
+    pieces2word = []
+    sent_length = []
+
+    for index, instance in enumerate(data):
+        if len(instance['sentence']) == 0:
+            continue
+
+        tokens = [tokenizer.tokenize(word) for word in instance['sentence']]
+        pieces = [piece for pieces in tokens for piece in pieces]
+        _bert_inputs = tokenizer.convert_tokens_to_ids(pieces)
+        _bert_inputs = np.array([tokenizer.cls_token_id] + _bert_inputs + [tokenizer.sep_token_id])
+
+        length = len(instance['sentence'])
+        if len(_bert_inputs)>512:
+            print('index:', index)
+            print('sentence:', ' '.join(instance['sentence']))
+            continue
+        _grid_labels = np.zeros((length, length), dtype=np.int)
+        _pieces2word = np.zeros((length, len(_bert_inputs)), dtype=np.bool)
+        _dist_inputs = np.zeros((length, length), dtype=np.int)
+        _grid_mask2d = np.ones((length, length), dtype=np.bool)
+
+        if tokenizer is not None:
+            start = 0
+            for i, pieces in enumerate(tokens):
+                if len(pieces) == 0:
+                    continue
+                pieces = list(range(start, start + len(pieces)))
+                _pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
+                start += len(pieces)
+
+        for k in range(length):
+            _dist_inputs[k, :] += k
+            _dist_inputs[:, k] -= k
+
+        for i in range(length):
+            for j in range(length):
+                if _dist_inputs[i, j] < 0:
+                    _dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
+                else:
+                    _dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
+        _dist_inputs[_dist_inputs == 0] = 19
+
+        for entity in instance["ner"]:
+            index = entity["index"]
+            for i in range(len(index)):
+                if i + 1 >= len(index):
+                    break
+                _grid_labels[index[i], index[i + 1]] = 1
+            _grid_labels[index[-1], index[0]] = vocab.label_to_id(entity["type"])
+
+        _entity_text = set([utils.convert_index_to_text(e["index"], vocab.label_to_id(e["type"]))
+                            for e in instance["ner"]])
+
+        sent_length.append(length)
+        bert_inputs.append(_bert_inputs)
+        grid_labels.append(_grid_labels)
+        grid_mask2d.append(_grid_mask2d)
+        dist_inputs.append(_dist_inputs)
+        pieces2word.append(_pieces2word)
+        entity_text.append(_entity_text)
+
+    return bert_inputs, grid_labels, grid_mask2d, pieces2word, dist_inputs, sent_length, entity_text
+
+
+def fill_vocab(vocab, dataset):
+    entity_num = 0
+    for instance in dataset:
+        for entity in instance["ner"]:
+            vocab.add_label(entity["type"])
+        entity_num += len(instance["ner"])
+    return entity_num
+
+
+def load_data_bert(config):
+    with open('./data/{}/{}.json'.format(config.dataset, config.train_name), 'r', encoding='utf-8') as f:
+        train_data = json.load(f)
+    with open('./data/{}/{}.json'.format(config.dataset, config.dev_name), 'r', encoding='utf-8') as f: 
+        dev_data = json.load(f)
+    with open('./data/{}/test.json'.format(config.dataset, config.test_name), 'r', encoding='utf-8') as f:
+        test_data = json.load(f)
+
+    tokenizer = AutoTokenizer.from_pretrained(config.bert_name, cache_dir="./cache/")
+
+    vocab = Vocabulary()
+    train_ent_num = fill_vocab(vocab, train_data)
+    dev_ent_num = fill_vocab(vocab, dev_data)
+    test_ent_num = fill_vocab(vocab, test_data)
+
+    table = pt.PrettyTable([config.dataset, 'sentences', 'entities'])
+    table.add_row(['train', len(train_data), train_ent_num])
+    table.add_row(['dev', len(dev_data), dev_ent_num])
+    table.add_row(['test', len(test_data), test_ent_num])
+    config.logger.info("\n{}".format(table))
+
+    config.label_num = len(vocab.label2id)
+    config.vocab = vocab
+
+    train_dataset = RelationDataset(*process_bert(train_data, tokenizer, vocab))
+    dev_dataset = RelationDataset(*process_bert(dev_data, tokenizer, vocab))
+    test_dataset = RelationDataset(*process_bert(test_data, tokenizer, vocab))
+
+
+    return (train_dataset, dev_dataset, test_dataset), (train_data, dev_data, test_data)
+
diff --git a/figures/1_aug_exam.jpg b/figures/1_aug_exam.jpg
diff --git a/figures/2_model.jpg b/figures/2_model.jpg
diff --git a/figures/3_decode.jpg b/figures/3_decode.jpg
diff --git a/figures/4_data_distribution.png b/figures/4_data_distribution.png
diff --git a/log/placeholder b/log/placeholder