From a485d4784fa5b0e92aec8e44ec6804e8d9d4b5b9 Mon Sep 17 00:00:00 2001
From: Ruifeng-paper <73287728+Ruifeng-paper@users.noreply.github.com>
Date: Sat, 2 Jan 2021 13:37:46 +0800
Subject: [PATCH] Add files via upload

---
 make_data.py | 530 +++++++++++++++++++++++++++++++++++++++++++++++++++
 utility.py   | 213 +++++++++++++++++++++
 2 files changed, 743 insertions(+)
 create mode 100644 make_data.py
 create mode 100644 utility.py

diff --git a/make_data.py b/make_data.py
new file mode 100644
index 0000000..100f98b
--- /dev/null
+++ b/make_data.py
@@ -0,0 +1,530 @@
+import queue
+from threading import Thread
+import numpy as np
+from transformers import *
+from openie import StanfordOpenIE
+from utility.utility import *
+from bert_serving.client import BertClient
+from rouge import Rouge
+from stanfordcorenlp import StanfordCoreNLP
+import pickle
+from data.raw_data_loader import *
+'''
+nlp = StanfordCoreNLP('/home/ziqiang/stanfordnlp_resources/stanford-corenlp-full-2018-10-05')
+bc = BertClient(ip='localhost')
+client = StanfordOpenIE()
+rougex = Rouge()
+'''
+g_b=0
+
+
+import threading
+
+class threadsafe_generator:
+    def __init__(self, it):
+        self.it = it
+        self.lock = threading.Lock()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        with self.lock:
+            return self.it.__next__()
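+
+# Usage sketch (illustrative only): wrap a raw example generator so that several
+# worker threads can safely pull items from it, e.g.
+#   gen = threadsafe_generator(example_generator_DMCNN('data/DMCNN/train_*'))
+#   article, abstract = gen.__next__()   # may be called concurrently from worker threads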
+
+
+
+class Example(object):
+    """Class representing a train/val/test example for text summarization."""
+
+    def __init__(self, article, abstract, tokenizer, rougex, nlp):
+
+        """
+        Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.
+        Args:
+            article: source text; a list of sentence strings, tokens separated by single spaces.
+            abstract: reference summary; a list of sentence strings, tokens separated by single spaces.
+            tokenizer: BertTokenizer used to convert text into token ids.
+            rougex: Rouge scorer object.
+            nlp: StanfordCoreNLP instance used for dependency parsing.
+        """
+        self.rougex=rougex
+        self.nlp=nlp
+        self.tokenizer=tokenizer
+        article=article[:60]
+        self.article=article
+        self.abstract=abstract
+        # Process the article
+        self.article_fact=[]
+        self.article_sent=[]
+        self.article_fact_tag=[]
+        for count,sent in enumerate(article):
+
+            self.article_sent.append(self.tokenizer.encode(sent))
+            sent=sent.strip(',')
+            sent=sent.strip(':')
+
+            sentfact=sent_split(sent,self.nlp)
+
+            sentfact_file=[]
+            for i in sentfact:
+                if word_len(i) >50:
+                    ii=i.split(' ')
+                    ii=ii[0:50]
+                    sentfact_file.append(' '.join(ii))
+                    continue
+                if len(i) >= 20:
+                    sentfact_file.append(i)
+
+            self.article_fact_tag.append(len(sentfact_file))
+            self.article_fact+=sentfact_file
+        self.article_id=[]
+        for fact in self.article_fact:
+            self.article_id.append(self.tokenizer.encode(fact,add_special_tokens=False))
+        self.article_len = len(self.article_id) # store the number of facts in the article
+        # Process the abstract
+        self.original_abstract=[]
+        self.abstract_fact=[]
+        self.abstract_fact_all=[]
+        for sent in abstract:
+
+            self.original_abstract.append(self.tokenizer.encode(sent))
+
+            if word_len(sent) > 20:
+                sent=sent.strip(',')
+                sent=sent.strip(':')
+                sentfact=sent_split(sent,self.nlp)
+            else:
+                sentfact=[sent]
+
+            self.abstract_fact_all+=sentfact
+
+        for i in self.abstract_fact_all:
+            if word_len(i) >50:
+                ii=i.split(' ')
+                ii=ii[0:50]
+                self.abstract_fact.append(' '.join(ii))
+            elif len(i) < 15:
+                continue
+            else:
+                self.abstract_fact.append(i)
+        self.abstract_id=[]
+        for fact in self.abstract_fact:
+            self.abstract_id.append(self.tokenizer.encode(fact,add_special_tokens=False))
+
+        self.abstract_len = len(self.abstract_id) # store the number of facts in the abstract
+
+
+
+        self.enc_fact=[]
+        self.enc_sent=[]
+        self.dec_fact=[]
+
+        self.dec_label_bert=[]
+        self.dec_label_rouge=[]
+        self.dec_label_sent=[]
+
+        self.grap_sim_bert=np.zeros((self.article_len, self.article_len), dtype=np.float16)
+        self.grap_sim_rouge=np.zeros((self.article_len, self.article_len), dtype=np.float16)
+        self.grap_entity=np.zeros((self.article_len, self.article_len), dtype=np.float16)
+        self.grap_cosent=np.zeros((self.article_len, self.article_len), dtype=np.float16)
+        self.grap_sent=np.zeros((len(self.article), len(self.article)), dtype=np.float16)
+
+    def get_enc_fact(self, max_len):
+
+        """Truncate the encoder fact sequences to max_len tokens and the encoder sentence sequences to 2*max_len tokens."""
+
+        for i in self.article_id:
+            if len(i) > max_len:
+                self.enc_fact.append(i[0:max_len])
+            else:
+                self.enc_fact.append(i)
+
+
+        for i in self.article_sent:
+            if len(i) > max_len*2:
+                self.enc_sent.append(i[0:max_len*2])
+            else:
+                self.enc_sent.append(i)
+
+    def get_dec_fact(self, max_len):
+
+        """Truncate the decoder fact sequences to max_len tokens."""
+
+        for i in self.abstract_id:
+            if len(i) > max_len:
+                self.dec_fact.append(i[0:max_len])
+            else:
+                self.dec_fact.append(i)
+
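+    # Note on the graphs built in get_grap below (illustrative): grap_sim_rouge and
+    # grap_sent hold the pairwise average of ROUGE-1 and ROUGE-2 F1 between facts and
+    # between sentences respectively, while grap_cosent marks facts that come from the
+    # same source sentence. For example, if article_fact_tag == [2, 3], facts 0-1 form
+    # one fully connected block and facts 2-4 another, so grap_cosent[0][1] == 1 but
+    # grap_cosent[1][2] == 0. grap_sim_bert and grap_entity are left as zero
+    # placeholders here.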
+    def get_grap(self):
+
+        """Get the sim bert graph """
+
+        """Get the sim rouge graph """
+        for i,facti in enumerate(self.article_fact):
+            for j,factj in enumerate(self.article_fact):
+                scores = self.rougex.get_scores(facti, factj)
+                self.grap_sim_rouge[i][j]=(scores[0]['rouge-1']['f']+scores[0]['rouge-2']['f'])/2
+
+        """Get the sim sent graph """
+        for i,facti in enumerate(self.article):
+            for j,factj in enumerate(self.article):
+                scores = self.rougex.get_scores(facti, factj)
+                self.grap_sent[i][j]=(scores[0]['rouge-1']['f']+scores[0]['rouge-2']['f'])/2
+
+        """Get the entity graph"""
+
+        """Get the co-sent graph"""
+        now=0
+        for i in self.article_fact_tag:
+            for x in range(now, now+i):
+                for y in range(now, now+i):
+                    self.grap_cosent[x][y]=1
+            now=now+i
+
+
+
+    def get_dec_label_bert(self):
+        self.dec_label_bert=[]
+        self.oral_score_bert=0
+
+
+    def get_dec_label_rouge(self):
+        rouge=[]
+        score_rouge=[]
+        index_rouge=[]
+        for j in self.abstract_fact:
+            score=[]
+            for k in self.article_fact:
+                scores = self.rougex.get_scores(j, k)
+                score.append((scores[0]['rouge-1']['f']+scores[0]['rouge-2']['f'])/2)
+            choose=score.index(max(score))
+            index_rouge.append(choose)
+            rouge.append(self.article_fact[choose])
+            score_rouge.append(max(score))
+        for i in range(len(self.article_fact)):
+            if i in index_rouge:
+                self.dec_label_rouge.append(1)
+            else:
+                self.dec_label_rouge.append(0)
+
+        self.oral_score_rouge = self.rougex.get_scores(' . '.join(rouge), ' . '.join(self.abstract))
+
+    def get_dec_label_rouge_sent(self):
+        get_dec_label_sent=self.greedy_selection(self.article, self.abstract, 3, self.rougex)
+        for i in range(len(self.article)):
+            if i in get_dec_label_sent:
+                self.dec_label_sent.append(1)
+            else:
+                self.dec_label_sent.append(0)
+
+    def greedy_selection(self, doc_sent_list, abstract_sent_list, summary_size, rougex):
+        selected = []
+        max_rouge = 0.0
+        reference=''
+        for i in abstract_sent_list:
+            reference+=i
+            reference+=' . '
+        for s in range(summary_size):
+            cur_max_rouge = max_rouge
+            cur_id = -1
+            for i in range(len(doc_sent_list)):
+                if (i in selected):
+                    continue
+                c = selected + [i]
+                candidates = ''
+                for j in c:
+                    candidates+=doc_sent_list[j]
+                    candidates+=' . '
+                scores = rougex.get_scores(candidates, reference)
+                rouge_score = (scores[0]['rouge-1']['f']+scores[0]['rouge-2']['f'])/2
+                if rouge_score > cur_max_rouge:
+                    cur_max_rouge = rouge_score
+                    cur_id = i
+            if (cur_id == -1):
+                return selected
+            selected.append(cur_id)
+            max_rouge = cur_max_rouge
+        return sorted(selected)
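+
+# Note (illustrative): greedy_selection above follows the usual extractive-oracle
+# recipe: at each step it adds the sentence that most increases the average of
+# ROUGE-1 and ROUGE-2 F1 against the concatenated abstract, and stops after
+# summary_size picks or when no remaining sentence improves the score.
+# get_dec_label_rouge_sent then turns the selected indices into the 0/1 labels
+# stored in dec_label_sent.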
+
+
+class Batch(object):
+
+    """Class representing a minibatch of train/val/test examples for text summarization."""
+
+    def __init__(self, example_list, max_len):
+
+        """
+        Turns the example_list into a Batch object.
+        Args:
+            example_list: List of Example objects
+            max_len: maximum fact length in tokens used when truncating encoder/decoder inputs
+        """
+
+        self.init_encoder(example_list, max_len) # initialize the input to the encoder
+        self.init_decoder(example_list, max_len) # initialize the input and targets for the decoder
+        self.init_result(example_list)
+
+
+    def init_result(self, example_list):
+        self.original_article=[]
+        self.original_abstract=[]
+        self.original_sent=[]
+        self.sent_to_fact=[]
+        for ex in example_list:
+            self.original_sent.append(ex.article)
+            self.original_article.append(ex.article_fact)
+            self.original_abstract.append(ex.abstract)
+            self.sent_to_fact.append(ex.article_fact_tag)
+
+    def init_encoder(self, example_list, max_len):
+        self.enc_fact=[]
+        self.enc_sent=[]
+        self.grap_sim_bert=[]
+        self.grap_sim_rouge=[]
+        self.grap_entity=[]
+        self.grap_cosent=[]
+        self.grap_sent=[]
+
+        for ex in example_list:
+            ex.get_enc_fact(max_len)
+            ex.get_grap()
+
+        # Collect the per-example inputs and graphs
+        for ex in example_list:
+            self.enc_fact.append(ex.enc_fact)
+            self.enc_sent.append(ex.enc_sent)
+            self.grap_sim_bert.append(ex.grap_sim_bert)
+            self.grap_sim_rouge.append(ex.grap_sim_rouge)
+            self.grap_entity.append(ex.grap_entity)
+            self.grap_cosent.append(ex.grap_cosent)
+            self.grap_sent.append(ex.grap_sent)
+
+    def init_decoder(self, example_list, max_len):
+        self.dec_fact=[]
+        self.dec_label_sent=[]
+        self.dec_label_bert=[]
+        self.dec_label_rouge=[]
+        self.dec_score_bert=[]
+        self.dec_score_rouge=[]
+        # Truncate the decoder inputs and build the labels
+        for ex in example_list:
+            ex.get_dec_fact(max_len)
+            ex.get_dec_label_bert()
+            ex.get_dec_label_rouge()
+            ex.get_dec_label_rouge_sent()
+        # Collect the per-example labels and scores
+        for ex in example_list:
+            self.dec_fact.append(ex.dec_fact)
+            self.dec_label_sent.append(ex.dec_label_sent)
+            self.dec_label_bert.append(ex.dec_label_bert)
+            self.dec_label_rouge.append(ex.dec_label_rouge)
+            self.dec_score_bert.append(ex.oral_score_bert)
+            self.dec_score_rouge.append(ex.oral_score_rouge)
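+
+'''
+# Illustrative sketch only: loading one of the pickled Batch objects written at the
+# bottom of this file and inspecting it (the path below is just an example).
+import pickle
+
+with open('data_file/DMCNN/train_file/0_train_batch_of 4 examples.pkl', 'rb') as f:
+    batch = pickle.load(f)
+
+print(len(batch.enc_fact))          # number of examples in the batch
+print(len(batch.enc_fact[0]))       # number of facts in the first example
+print(batch.grap_cosent[0].shape)   # (article_len, article_len) co-sentence matrix
+print(batch.dec_label_sent[0])      # 0/1 extractive sentence labels
+'''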
+
+
+class Batcher(object):
+
+    """A class to generate minibatches of data. Buckets examples together based on length of the encoder sequence."""
+
+    BATCH_QUEUE_MAX = 100 # max number of batches the batch_queue can hold
+
+    def __init__(self, data_path, dataset):
+
+        """Initialize the batcher. Start threads that process the data into batches.
+        Args:
+            data_path: file pattern for the raw data files to read.
+            dataset: which corpus to load ('TLDR', 'MUTIL' or 'DMCNN'); selects the matching example generator.
+        """
+        self._dataset=dataset
+        self._data_path = data_path
+        self._max_len=50
+        self._batch_size=4
+
+        # Initialize a queue of Batches waiting to be used, and a queue of Examples waiting to be batched
+        self._batch_queue = queue.Queue(self.BATCH_QUEUE_MAX)
+        self._example_queue = queue.Queue(self.BATCH_QUEUE_MAX * self._batch_size)
+        # Initialize the tools
+        self.tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
+        self.rougex=Rouge()
+        self.nlp=StanfordCoreNLP('/home/ziqiang/stanfordnlp_resources/stanford-corenlp-full-2018-10-05')
+        # Threading and bucketing settings
+        self._num_example_q_threads = 1 # just one thread, so we read through the dataset just once
+        self._num_batch_q_threads = 1 # just one thread to batch examples
+        self._bucketing_cache_size = 50 # load 50 batches' worth of examples before sorting and batching
+        self._finished_reading = False # this will tell us when we're finished reading the dataset
+        # Prepare the data loader
+        if self._dataset == 'TLDR':
+            self.input_gen = threadsafe_generator(example_generator_TLDR(self._data_path))
+        if self._dataset == 'MUTIL':
+            self.input_gen = threadsafe_generator(example_generator_MUTIL(self._data_path))
+        if self._dataset == 'DMCNN':
+            self.input_gen = threadsafe_generator(example_generator_DMCNN(self._data_path))
+
+        print('finish preparing')
+        # Start the threads that load the queues
+
+
+        self._example_q_threads = []
+        for _ in range(self._num_example_q_threads):
+            self._example_q_threads.append(Thread(target=self.fill_example_queue))
+            self._example_q_threads[-1].daemon = True
+            self._example_q_threads[-1].start()
+
+        self._batch_q_threads = []
+        for _ in range(self._num_batch_q_threads):
+            self._batch_q_threads.append(Thread(target=self.fill_batch_queue))
+#            self._batch_q_threads[-1].daemon = True
+            self._batch_q_threads[-1].start()
+
+        print('threads started')
+
+    def next_batch(self):
+
+        """
+        Return a Batch from the batch queue, or None once the dataset has been exhausted and the example queue is empty.
+        """
+        # If the batch queue is empty, print a warning
+        if self._batch_queue.qsize() == 0:
+#            tf.logging.warning('Bucket input queue is empty when calling next_batch. Bucket queue size: %i, Input queue size: %i', self._batch_queue.qsize(), self._example_queue.qsize())
+            pass
+        if self._finished_reading and self._example_queue.qsize() == 0:
+            print("Finished reading dataset in single_pass mode.")
+            return None
+        batch = self._batch_queue.get() # get the next Batch
+        return batch
+
+    def fill_example_queue(self):
+
+        """Reads data from file and processes into Examples which are then placed into the example queue."""
+        global g_b
+        while True:
+            g_b+=1
+            if g_b%100==0:
+                print('--------'+str(g_b)+'--------')
+                print(self._example_queue.qsize())
+                print(self._batch_queue.qsize())
+            try:
+                article, abstract = self.input_gen.__next__() # read the next example from file. article and abstract are both lists of sentence strings.
+            except StopIteration: # if there are no more examples:
+                print("The example generator for this example queue filling thread has exhausted data.")
+                self._finished_reading = True
+                break
+            example = Example(article, abstract, self.tokenizer, self.rougex, self.nlp) # Process into an Example.
+            self._example_queue.put(example) # place the Example in the example queue.
+
+
+    def fill_batch_queue(self):
+
+        """
+        Takes Examples out of the example queue, sorts them by encoder sequence length, processes them into Batches and places them in the batch queue.
+        """
+        while True:
+            # Get bucketing_cache_size-many batches of Examples into a list, then sort
+            inputs = []
+            for _ in range(self._batch_size * self._bucketing_cache_size):
+                if self._finished_reading and self._example_queue.qsize() == 0:
+                    break
+                inputs.append(self._example_queue.get())
+            # Group the sorted Examples into batches and place them in the batch queue.
+            inputs.sort(key=self.get_sort)
+            '''
+            splits = []
+            len_pre=-1
+            for indexi,i in enumerate(inputs):
+                len_now = i.article_len
+                if len_pre != len_now:
+                    splits.append(indexi)
+                    len_pre=len_now
+            batches=[]
+            for indexi,i in enumerate(splits):
+                if indexi+1 == len(splits):
+                    batches.append(inputs[i:])
+                else:
+                    batches.append(inputs[i:splits[indexi+1]])
+            batches_max=[]
+            for i in batches:
+                if len(i) <= self._batch_size:
+                    batches_max.append(i)
+                else:
+                    batches_max+=[i[j:j+self._batch_size] for j in range(0, len(i), self._batch_size)]
+            '''
+            batches_max=[]
+            for indexi,i in enumerate(inputs):
+                if indexi % self._batch_size ==0:
+                    batches_max.append(inputs[indexi:indexi+self._batch_size])
+            for b in batches_max: # each b is a list of Example objects
+                self._batch_queue.put(Batch(b, self._max_len))
+
+    def get_sort(self, x):
+        return x.article_len
+
+'''
+train_data_loader=Batcher('data/DMCNN/train_*', 'DMCNN')
+
+count=0
+countx=0
+while True:
+    batch = train_data_loader.next_batch()
+    each_batch_size=len(batch.enc_fact)
+    if train_data_loader._finished_reading == True:
+        break
+    f=open('data_file/DMCNN/train_file/'+str(count)+'_train_batch_of '+str(each_batch_size)+' examples.pkl','wb')
+    pickle.dump(batch,f)
+    f.close()
+    count+=1
+    countx+=each_batch_size
+print('Total train data:')
+print(countx)
+
+
+train_data_loader=Batcher('data/DMCNN/val_*', 'DMCNN')
+
+count=0
+countx=0
+while True:
+    batch = train_data_loader.next_batch()
+    each_batch_size=len(batch.enc_fact)
+    if train_data_loader._finished_reading == True:
+        break
+    f=open('data_file/DMCNN/val_file/'+str(count)+'_val_batch_of '+str(each_batch_size)+' examples.pkl','wb')
+    pickle.dump(batch,f)
+    f.close()
+    count+=1
+    countx+=each_batch_size
+print('Total val data:')
+print(countx)
+'''
+
+
+train_data_loader=Batcher('data/DMCNN/test*', 'DMCNN')
+
+count=0
+countx=0
+while True:
+    batch = train_data_loader.next_batch()
+    if batch is None: # guard: next_batch returns None once the dataset is exhausted
+        break
+    each_batch_size=len(batch.enc_fact)
+    if train_data_loader._finished_reading == True and train_data_loader._batch_queue.qsize() == 0 and train_data_loader._example_queue.qsize() == 0:
+        break
+    f=open('data_file/DMCNN/test_file_y/'+str(count)+'_test_batch_of '+str(each_batch_size)+' examples.pkl','wb')
+    pickle.dump(batch,f)
+    f.close()
+    count+=1
+    countx+=each_batch_size
+print("test*")
+print('Total test data:')
+print(countx)
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/utility.py b/utility.py
new file mode 100644
index 0000000..907484b
--- /dev/null
+++ b/utility.py
@@ -0,0 +1,213 @@
+from copy import deepcopy as cp
+from stanfordcorenlp import StanfordCoreNLP
+import numpy as np
+
+
+
+
+def cos_sim(vector_a, vector_b):
+    vector_a = np.mat(vector_a)
+    vector_b = np.mat(vector_b)
+    num = float(vector_a * vector_b.T)
+    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
+    cos = num / denom
+    sim = 0.5 + 0.5 * cos
+    return sim
+
+
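+# Example (illustrative): cos_sim rescales cosine similarity from [-1, 1] to [0, 1],
+# so cos_sim([1, 0], [1, 0]) == 1.0, cos_sim([1, 0], [0, 1]) == 0.5 and
+# cos_sim([1, 0], [-1, 0]) == 0.0.
+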
+def fact_merge(rlist): # redundancy removal for OpenIE triples
+    merged={}
+    for i in rlist:
+        forward = i['subject']+' '+i['relation']
+        backward = i['object']
+        if forward not in merged.keys():
+            merged[forward]=backward
+        else:
+            ex=merged[forward]
+            if len(ex) < len(backward):
+                merged[forward]=backward
+    merged2={}
+    for i in merged.keys():
+        forward = i
+        backward = merged[i]
+        if backward not in merged2.keys():
+            merged2[backward]=forward
+        else:
+            ex=merged2[backward]
+            if len(ex) < len(forward):
+                merged2[backward]=forward
+
+    return_list=[]
+    for i in merged2.keys():
+        x=merged2[i]+' '+i
+        return_list.append(x)
+    return return_list
+
+def merge(ilist): # redundancy removal: union overlapping index groups
+    rlist=[]
+    root=ilist[0]
+    ilist.remove(root)
+    while(True):
+        if ilist == []:
+            rlist.append(root)
+            break
+        record=[]
+        for i in ilist:
+            if list(set(root) & set(i)) != []:
+                root=list(set(root+i))
+                record=record+i
+                break
+        if record == []:
+            rlist.append(root)
+            root=ilist[0]
+            ilist.remove(root)
+        else:
+            ilist.remove(record)
+
+    return rlist
+
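+# Example (illustrative): merge() unions index groups that share a token position, so
+# merge([[1, 2], [2, 5], [7, 8]]) returns [[1, 2, 5], [7, 8]] (element order within
+# each group may vary because sets are used).
+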
+def fact_parse(token, parsing): # redundancy removal over dependency tuples
+    predicate=['nsubj','nsubjpass', 'csubj', 'csubjpass','dobj']
+    modify=['amod','nummod','compound','ccomp']
+    predicate_tuple=[]
+    modify_tuple=[]
+    for i in parsing:
+        if i[0] in predicate:
+            predicate_tuple.append([i[1],i[2]])
+        if i[0] in modify:
+            modify_tuple.append([i[1],i[2]])
+    tuple_merge=predicate_tuple+modify_tuple
+    tuple_merge=sorted(tuple_merge, key=lambda x: x[0])
+    print(tuple_merge)
+    if tuple_merge == []:
+        return []
+    tuple_merge=merge(tuple_merge)
+    print(tuple_merge)
+    result=[]
+    for i in tuple_merge:
+        one=''
+        if len(i) <=3:
+            continue
+        for j in i:
+            one+=token[j]
+            one+=' '
+        result.append(one.strip().replace(',','').replace('.',''))
+    return result
+
+def word_len(sent):
+    return len((sent.strip()).split(' '))
+
+def list_in(listk, sent):
+    for i in listk:
+        if i in sent:
+            return True
+    return False
+
+def sent_split(sent,nlp): # sent is a sentence text string and nlp is a StanfordCoreNLP instance
+    keyword1=['punct','cc','mark'] # split
+    keyword2=['acl:relcl','advcl','appos','ccomp'] # merge
+
+    min_length=4
+    max_length=8
+    conj_min_length=4
+
+    token=nlp.word_tokenize(sent)
+    token=['ROOT']+token
+    parsing=nlp.dependency_parse(sent)
+
+
+
+
+    split_pos=[]
+
+    for i in parsing:
+        if i[0] in keyword1:
+            if i[0]=='cc' and (i[2]-i[1]) > conj_min_length:
+                x=i[2]
+                split_pos.append([x,0])
+                break
+
+            elif i[0] == 'punct':
+                x=i[2]
+                tag=0
+                for j in parsing:
+                    if j[0] in keyword2:
+                        if j[1] < x and j[2] > x:
+                            tag=1
+                            break
+                if tag==0:
+                    split_pos.append([x,tag])
+                else:
+                    split_pos.append([x,tag])
+
+            elif i[0] == 'mark':
+                x=i[1]
+                tag=0
+                for j in parsing:
+                    if j[0] in keyword2:
+                        if j[1] < x and j[2] > x:
+                            tag=1
+                            break
+                if tag==0:
+                    split_pos.append([x,tag])
+                else:
+                    split_pos.append([x,tag])
+
+            else:
+                pass
+
+
+    if len(split_pos) == 0:
+        return [sent]
+    else:
+        tag_list=[]
+        raw_split_sent=[]
+
+        pointer=1
+        for i in split_pos:
+            pos=i[0]
+            tag_list.append(i[1])
+
+            subsent=' '.join(token[pointer:pos])
+            raw_split_sent.append(subsent)
+
+            pointer=pos+1
+
+        raw_split_sent.append(' '.join(token[pointer:]))
+
+        sent_result=[]
+        for i,subsent in enumerate(raw_split_sent):
+            itoken=subsent.strip()
+            if i == 0:
+                sent_result.append(itoken)
+            else:
+                tag_=tag_list[i-1]
+                if word_len(itoken) <=min_length:
+                    sent_result[-1]+=' , '+itoken
+                elif tag_== 1 and word_len(itoken) > max_length:
+                    sent_result.append(itoken)
+                elif tag_== 1 and word_len(itoken) <= max_length:
+                    sent_result[-1]+=' , '+itoken
+                else:
+                    sent_result.append(itoken)
+        if (word_len(sent_result[0]) <=min_length) and len(sent_result)>=2:
+            sent_result[1]=sent_result[0]+' , '+sent_result[1]
+            sent_result.remove(sent_result[0])
+
+        return sent_result
+
+
+
+
+
+'''
+#test code
+from stanfordcorenlp import StanfordCoreNLP
+
+x='Ahmadinejad essentially called Yukiya Amano, the director general of the IAEA, a U.S. puppet and said the U.N.A has no jurisdiction in Iran and Irap'
+
+nlp=StanfordCoreNLP('')
+
+print(sent_split(x,nlp))
+'''
\ No newline at end of file