Commit
Showing 12 changed files with 776 additions and 0 deletions.
@@ -0,0 +1,103 @@
import nltk
import nltk.data
import sys
import numpy as np

# sklearn is imported here so the helpers exec'd below can use it.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter

# functions.py is executed into this namespace; it is expected to define
# text_to_sentences, get_cosine_similarities_matrix, cos_sim and AP.
exec(open('functions.py').read())

def build_graph_alist(documents, cosine_matrix, t):
    # Adjacency list: link i -> j when the cosine similarity of the two
    # sentences reaches the threshold t (self-loops excluded).
    graph = {}
    for i in range(len(documents)):
        sid = str(i)
        graph[sid] = []
        for j in range(len(cosine_matrix[i])):
            if cos_sim(i, j, cosine_matrix) >= t and i != j:
                graph[sid].append(str(j))

    return graph

def prestige(uid, ranks, links):
    # Prestige of u: the rank of each neighbour v, divided by v's out-degree.
    total = 0
    for vid in links[uid]:
        total += ranks[vid] / len(links[vid])
    return total

def rank(links, itermax, damping):
    # Damped PageRank over the adjacency list:
    # PR(u) = d/N + (1 - d) * prestige(u).
    pr = {}
    i = 0
    ndocs = len(links)

    for doc in links:
        pr[doc] = 1

    while i < itermax:
        # Work on a copy so every update reads the previous iteration's ranks.
        aux = pr.copy()
        for doc in links:
            aux[doc] = (damping / ndocs) + ((1 - damping) * prestige(doc, pr, links))
        pr = aux
        i += 1
    return pr

def build_summary(sentences, t):
    S = get_cosine_similarities_matrix(sentences)
    graph = build_graph_alist(sentences, S, t)

    ranks = rank(graph, 50, 0.15)

    top = get_top_n(ranks, 5)
    summary = ""
    for i in top:
        # most_common returns (sentence_id, rank) pairs.
        summary += sentences[int(i[0])] + '\n'
    return summary


def get_top_n(dictionary, n):
    d = Counter(dictionary)
    return d.most_common(n)

enc = 'utf-8'
# Both a source file and a reference summary are required together.
if len(sys.argv) > 2:
    filename = sys.argv[1]
    target_filename = sys.argv[2]
else:
    filename = 'text.txt'
    target_filename = 'textsum.txt'

with open(filename, 'r', encoding=enc) as file:
    text = file.read()
    sentences = text_to_sentences(text)

with open(target_filename, 'r', encoding=enc) as target_file:
    target_text = target_file.read()

# Sweep the similarity threshold and keep the summary with the best
# average precision (AP) against the reference summary.
results = []
summaries = []
tvals = np.arange(0.0, 1.05, 0.05)
for thresh in tvals:
    summary = build_summary(sentences, thresh)
    ap = AP(summary, target_text)
    results.append(ap)
    summaries.append(summary)

best = np.argmax(results)
print("Best summary with AP =", results[best], ' for threshold =', tvals[best])
print("#-----------------------------------#")
print(summaries[best])
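
functions.py itself is not among the files shown in this commit. The sketch below is a minimal, hypothetical reconstruction of the four helpers the script calls, assuming an NLTK sentence tokenizer and TF-IDF cosine similarities; names and signatures are inferred from the call sites, so the real implementations may differ.

# Hypothetical reconstruction of the helpers exec'd from functions.py.
# Requires the NLTK punkt data: nltk.download('punkt').
def text_to_sentences(text):
    # Split raw text into sentences.
    return nltk.tokenize.sent_tokenize(text)

def get_cosine_similarities_matrix(sentences):
    # Pairwise TF-IDF cosine similarities between all sentences.
    tfidf = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf)

def cos_sim(i, j, cosine_matrix):
    # Similarity of sentences i and j, read from the precomputed matrix.
    return cosine_matrix[i][j]

def AP(summary, target):
    # Average precision of the summary's sentences against the reference.
    relevant = set(text_to_sentences(target))
    hits, total = 0, 0.0
    for k, sent in enumerate(text_to_sentences(summary), start=1):
        if sent in relevant:
            hits += 1
            total += hits / k
    return total / len(relevant) if relevant else 0.0
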
@@ -0,0 +1,119 @@
import os
import numpy as np
import nltk
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Helper modules are executed into this namespace: functions.py for the
# shared utilities, priors.py and weights.py for the prior and weight
# functions that can be selected from the command line.
exec(open('functions.py').read())
exec(open('priors.py').read())
exec(open('weights.py').read())

def build_graph_matrix(sentences, weight_func, t):
    if not callable(weight_func):
        raise TypeError('weight_func must be callable')
    nsents = len(sentences)
    weights = np.zeros([nsents, nsents])

    cos_matrix = get_cosine_similarities_matrix(sentences)
    # Fill the weighted adjacency matrix edge by edge.
    for i in range(nsents):
        for j in range(nsents):
            weights[i][j] = weight_func(i, j, sentences, cos_matrix, t)
    return weights

def build_priors(sentences, graph, prior_func):
    if not callable(prior_func):
        raise TypeError('prior_func must be callable')
    nsents = graph.shape[0]
    priors = np.zeros(nsents)

    for i in range(nsents):
        priors[i] = prior_func(i, graph, sentences)
    return priors

def prestige(sentence_index, ranks, weights):
    # Weighted prestige: each neighbour j contributes its rank scaled by
    # the weight of the edge into this sentence, normalised by j's total
    # outgoing weight.
    total = 0
    for j in range(len(weights[sentence_index])):
        if weights[sentence_index, j] == 0:
            continue
        total += (ranks[j] * weights[j][sentence_index]) / np.sum(weights[j])
    return total

def rank(weights, priors, itermax, damping):
    # Damped PageRank with a non-uniform teleport distribution:
    # PR(u) = d * prior(u) / sum(priors) + (1 - d) * prestige(u).
    i = 0
    nsents = len(priors)
    pr = np.ones(nsents)

    while i < itermax:
        # Work on a copy so every update reads the previous iteration's ranks.
        aux = pr.copy()
        for j in range(nsents):
            random = damping * priors[j]
            if random != 0:
                random /= np.sum(priors)
            not_random = (1 - damping) * prestige(j, pr, weights)
            aux[j] = random + not_random
        pr = aux
        i += 1

    return pr

def get_top_n(array, n):
    # Indices of the n highest-ranked sentences, keyed by index as strings
    # (guard against texts with fewer than n sentences).
    indexes = sorted(range(len(array)), key=lambda i: array[i], reverse=True)[:n]
    top = {}
    for i in range(len(indexes)):
        top[str(indexes[i])] = array[indexes[i]]
    return top

def build_summary(sentences, prior_func, weight_func, t):
    weights = build_graph_matrix(sentences, weight_func, t)
    priors = build_priors(sentences, weights, prior_func)
    ranks = rank(weights, priors, 50, 0.15)
    top = get_top_n(ranks, 5)
    summary = ""
    for i in top:
        summary += sentences[int(i)] + '\n'
    return summary

if '-d' in sys.argv:
    source_path = sys.argv[sys.argv.index('-d') + 1]
    sums_path = sys.argv[sys.argv.index('-d') + 2]
else:
    source_path = '../TeMario/source/'
    sums_path = '../TeMario/sums/'

if '-p' in sys.argv:
    priorf = globals()[sys.argv[sys.argv.index('-p') + 1]]
else:
    priorf = sentence_position_prior

if '-w' in sys.argv:
    weightf = globals()[sys.argv[sys.argv.index('-w') + 1]]
else:
    weightf = cos_sim_weight

results = []
tvals = np.arange(0.0, 1.05, 0.05)

source_texts = os.listdir(source_path)

# For every threshold, average the AP over the whole corpus (MAP),
# comparing each generated summary against its reference extract.
for thresh in tvals:
    MAP = 0

    for text_file in source_texts:
        with open(source_path + text_file, 'r', encoding='Latin-1') as file:
            text = file.read()
            sentences = text_to_sentences(text)
        summary = build_summary(sentences, priorf, weightf, thresh)
        with open(sums_path + 'Ext-' + text_file, 'r', encoding='Latin-1') as summary_file:
            MAP += AP(summary, summary_file.read())
    MAP /= len(source_texts)
    results.append(MAP)

best = np.argmax(results)
print("Best MAP =", results[best], ' for threshold =', tvals[best])