Skip to content

Commit

Permalink
sorry
Browse files Browse the repository at this point in the history
  • Loading branch information
jessicaar committed Dec 7, 2017
1 parent 5411261 commit 84f29a0
Show file tree
Hide file tree
Showing 12 changed files with 776 additions and 0 deletions.
103 changes: 103 additions & 0 deletions G12/1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import nltk
import nltk.data
import sys
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter

exec(open('functions.py').read())

def build_graph_alist(documents, cosine_matrix, t):
    """Build an adjacency list over document indices.

    Node ids are stringified indices; an edge i -> j is added when the
    cosine similarity of i and j reaches the threshold t (no self-loops).
    """
    graph = {}
    for node in range(len(documents)):
        neighbours = [
            str(other)
            for other in range(len(cosine_matrix[node]))
            if cos_sim(node, other, cosine_matrix) >= t and other != node
        ]
        graph[str(node)] = neighbours
    return graph

def prestige(uid,ranks,links):
sum = 0
for vid in links[uid]:
sum += ranks[vid] / len(links[vid])
return sum

def rank(links, itermax, damping):
    """Run power iteration (PageRank-style) over an adjacency-list graph.

    Args:
        links: dict mapping node id (str) -> list of neighbour ids (str).
        itermax: number of iterations to perform.
        damping: random-jump probability in [0, 1].

    Returns:
        dict mapping node id -> final rank score.
    """
    ndocs = len(links)
    pr = {doc: 1 for doc in links}

    for _ in range(itermax):
        # BUG FIX: the original did `aux = pr`, which merely aliases the
        # same dict — so `pr = aux` was a no-op and updates made during a
        # sweep leaked into the prestige() reads of that same sweep.
        # Copying keeps each sweep based only on the previous iteration
        # (the intended synchronous update).
        aux = pr.copy()
        for doc in links:
            aux[doc] = (damping / ndocs) + ((1 - damping) * prestige(doc, pr, links))
        pr = aux
    return pr

def build_summary(sentences, t):
    """Rank the sentences and return the top five joined by newlines."""
    sim_matrix = get_cosine_similarities_matrix(sentences)
    adjacency = build_graph_alist(sentences, sim_matrix, t)
    scores = rank(adjacency, 50, 0.15)
    # get_top_n yields (str_index, score) pairs, best first.
    chosen = get_top_n(scores, 5)
    return "".join(sentences[int(idx)] + '\n' for idx, _score in chosen)


def get_top_n(dictionary, n):
    """Return the n highest-valued (key, value) pairs, best first."""
    return Counter(dictionary).most_common(n)
enc = 'utf-8'

# Source and target filenames come from argv when supplied, otherwise
# fall back to the defaults in the working directory.
if len(sys.argv) > 1:
    filename = sys.argv[1]
    target_filename = sys.argv[2]
else:
    filename = 'text.txt'
    target_filename = 'textsum.txt'

with open(filename, 'r', encoding=enc) as file:
    text = file.read()
    sentences = text_to_sentences(text)

with open(target_filename, 'r', encoding=enc) as target_file:
    target_text = target_file.read()

# Sweep the similarity threshold and score each summary against the
# reference text with average precision.
tvals = np.arange(0.0, 1.05, 0.05)
summaries = [build_summary(sentences, thresh) for thresh in tvals]
results = [AP(summary, target_text) for summary in summaries]

best = np.argmax(results)
print("Best summary with AP =", results[best], ' for threshold =', tvals[best])
print("#-----------------------------------#")
print(summaries[best])












119 changes: 119 additions & 0 deletions G12/2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import numpy as np
import nltk
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


exec(open('functions.py').read())
exec(open('priors.py').read())
exec(open('weights.py').read())

def build_graph_matrix(sentences, weight_func, t):
    """Build the dense edge-weight matrix of the sentence graph.

    Args:
        sentences: list of sentence strings.
        weight_func: callable (i, j, sentences, cos_matrix, t) -> weight.
        t: similarity threshold forwarded to weight_func.

    Returns:
        (n x n) numpy array where entry [i][j] is weight_func's value.

    Raises:
        TypeError: if weight_func is not callable.
    """
    # BUG FIX: the original returned the string 'Not functions!' here,
    # which callers then tried to use as a matrix and failed obscurely.
    # Fail fast with a clear error instead.
    if not callable(weight_func):
        raise TypeError('weight_func must be callable')
    nsents = len(sentences)
    weights = np.zeros([nsents, nsents])

    cos_matrix = get_cosine_similarities_matrix(sentences)
    for i in range(nsents):
        for j in range(nsents):
            weights[i][j] = weight_func(i, j, sentences, cos_matrix, t)
    return weights

def build_priors(sentences, graph, prior_func):
    """Compute one prior score per sentence node.

    Args:
        sentences: list of sentence strings.
        graph: (n x n) weight matrix; its first dimension fixes n.
        prior_func: callable (i, graph, sentences) -> prior value.

    Returns:
        1-D numpy array of length n with prior_func evaluated per index.

    Raises:
        TypeError: if prior_func is not callable.
    """
    # BUG FIX: the original returned the string 'Not functions!' on bad
    # input instead of signalling the error; raise a clear TypeError.
    if not callable(prior_func):
        raise TypeError('prior_func must be callable')
    nsents = graph.shape[0]
    priors = np.zeros(nsents)

    for i in range(nsents):
        priors[i] = prior_func(i, graph, sentences)
    return priors



def prestige(sentence_index, ranks, weights):
    """Weighted prestige of a sentence node.

    Sums, over every neighbour j with a non-zero edge to this sentence,
    ranks[j] * weights[j][sentence_index] normalised by j's total
    outgoing weight np.sum(weights[j]).

    Args:
        sentence_index: index of the sentence being scored.
        ranks: 1-D array of current rank scores.
        weights: square (n x n) edge-weight matrix.

    Returns:
        The accumulated contribution (0.0 when there are no neighbours).
    """
    # Idiom fix: the original shadowed the builtin `sum`; use a guard
    # clause instead of an empty-branch `if/continue/else`.
    total = 0.0
    for j in range(len(weights[sentence_index])):
        if weights[sentence_index, j] == 0:
            continue
        total += (ranks[j] * weights[j][sentence_index]) / np.sum(weights[j])
    return total

def rank(weights, priors, itermax, damping):
    """Run biased power iteration over the weighted sentence graph.

    Args:
        weights: (n x n) edge-weight matrix.
        priors: 1-D array of per-node prior scores (the bias term).
        itermax: number of iterations to perform.
        damping: probability mass given to the prior jump, in [0, 1].

    Returns:
        1-D numpy array of final rank scores.
    """
    nsents = len(priors)
    pr = np.ones(nsents)

    for _ in range(itermax):
        # BUG FIX: the original did `aux = pr`, aliasing the same ndarray,
        # so every in-place write to aux was immediately visible to the
        # prestige() reads of the same sweep and `pr = aux` was a no-op.
        # Copy so each sweep reads only the previous iteration's values.
        aux = pr.copy()
        for j in range(nsents):
            random_jump = damping * priors[j]
            # Avoid dividing 0 by a possibly-zero prior total.
            if random_jump != 0:
                random_jump /= np.sum(priors)
            aux[j] = random_jump + (1 - damping) * prestige(j, pr, weights)
        pr = aux

    return pr

def get_top_n(array, n):
    """Return the top-n scores of `array` keyed by stringified index.

    Args:
        array: sequence of scores.
        n: number of entries requested; effectively clamped to len(array).

    Returns:
        dict mapping str(index) -> score, inserted best-first.
    """
    order = sorted(range(len(array)), key=lambda i: array[i], reverse=True)[:n]
    # BUG FIX: the original looped `range(n)` over a list holding at most
    # len(array) indexes, raising IndexError whenever n > len(array);
    # iterating the actual index list handles short inputs safely.
    return {str(i): array[i] for i in order}

def build_summary(sentences, prior_func, weight_func, t):
    """Rank the sentences with priors/weights and join the top five."""
    weight_matrix = build_graph_matrix(sentences, weight_func, t)
    prior_vector = build_priors(sentences, weight_matrix, prior_func)
    scores = rank(weight_matrix, prior_vector, 50, 0.15)
    # Keys of the top-n dict are stringified sentence indexes,
    # inserted best-first.
    selected = get_top_n(scores, 5)
    return "".join(sentences[int(key)] + '\n' for key in selected)

# -d <source_dir> <sums_dir>: corpus locations (TeMario defaults).
if '-d' in sys.argv:
    d_pos = sys.argv.index('-d')
    source_path = sys.argv[d_pos + 1]
    sums_path = sys.argv[d_pos + 2]
else:
    source_path = '../TeMario/source/'
    sums_path = '../TeMario/sums/'

# -p / -w: names of the prior / weight functions, resolved via globals()
# (they were brought in by the exec'd priors.py / weights.py).
if '-p' in sys.argv:
    priorf = globals()[sys.argv[sys.argv.index('-p') + 1]]
else:
    priorf = sentence_position_prior

if '-w' in sys.argv:
    weightf = globals()[sys.argv[sys.argv.index('-w') + 1]]
else:
    weightf = cos_sim_weight

results = []
summaries = []
tvals = np.arange(0.0, 1.05, 0.05)

source_texts = os.listdir(source_path)

# For each threshold, average the AP of every document's generated
# summary against its reference extract ('Ext-' prefixed file).
for thresh in tvals:
    MAP = 0
    for text_file in source_texts:
        with open(source_path + text_file, 'r', encoding='Latin-1') as file:
            text = file.read()
        sentences = text_to_sentences(text)
        summary = build_summary(sentences, priorf, weightf, thresh)
        with open(sums_path + 'Ext-' + text_file, 'r', encoding='Latin-1') as summary_file:
            MAP += AP(summary, summary_file.read())
    MAP /= len(source_texts)
    results.append(MAP)

best = np.argmax(results)
print("Best MAP =", results[best], ' for threshold =', tvals[best])
Loading

0 comments on commit 84f29a0

Please sign in to comment.