Commit

mergeroni pepperoni
GoncaloKLopes committed Dec 7, 2017
2 parents fcae58d + 20cbd07 commit 3689a0f
Showing 3 changed files with 68 additions and 29 deletions.
5 changes: 3 additions & 2 deletions ex2/priors.py
@@ -6,7 +6,7 @@ def uniform_prior(sent_index,graph,sentences):
#receives graph matrix for convenience...
def degree_centrality_prior(sent_index,graph,sentences):
total_links = 0
degree = 0
degree = 0
for i in range(len(graph)):
nonzeros = len(np.nonzero(graph[i])[0])
if i == sent_index:
@@ -21,4 +21,5 @@ def sentence_position_prior(sent_index,graph,sentences):
total = 0
for i in range(len(sentences)):
total += (i + 1)
return (len(sentences) - (sent_index)) / total
return (len(sentences) - (sent_index)) / total
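
The priors.py hunks above are partly collapsed, so as a reading aid here is a minimal, self-contained sketch of the sentence-position prior from the second hunk, applied to a toy four-sentence document. The toy sentences are illustrative only; graph is unused by this prior, so None is passed.

# Sketch of sentence_position_prior from the hunk above (illustrative toy input).
def sentence_position_prior(sent_index, graph, sentences):
    total = sum(i + 1 for i in range(len(sentences)))    # 1 + 2 + ... + n
    return (len(sentences) - sent_index) / total

toy_sentences = ["s0", "s1", "s2", "s3"]                 # n = 4, so total = 10
print(sentence_position_prior(0, None, toy_sentences))   # 0.4: earlier sentences get more mass
print(sentence_position_prior(3, None, toy_sentences))   # 0.1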

80 changes: 58 additions & 22 deletions ex3/3.py
@@ -8,6 +8,9 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
import functions
import numpy as np
debug = False

def cosine_similarity_matrix(doc, sentences):
@@ -21,27 +24,28 @@ def files_to_features(path):
features = []
number_of_docs = 0
for file in onlyfiles:
fp = open(path + '\\' + file, 'r')
fp = open(path + '\\' + file, 'r', encoding='latin-1')
document = fp.read()
fp.close()

doc_lower = re.sub('(\.)?(\n)+', '. ', document).lower()
doc_lower = re.sub('(\w)(\\n)+', r'\1. ', document).lower()

tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
doc = '\n-----\n'.join(tokenizer.tokenize(doc_lower))
sentences = doc.split('\n-----\n')

matrix = cosine_similarity_matrix(doc_lower, sentences)

for i in range(0, len(matrix)):
features.append([i, matrix[i][0]])
for i in range(0, len(sentences)):
graph_cent = functions.degree_centrality(i,sentences,t=0.2)
features.append([i, matrix[i][0], graph_cent])

number_of_docs += 1
if(debug):
print("\n++++++++++++++++++++++\n")
print("\n++++++++++++++++++++++\n")
if(number_of_docs == 1):
break

return features

def files_to_class(source_path, sums_path):
@@ -56,14 +60,14 @@ def files_to_class(source_path, sums_path):
if(summary != "Sum-" + source):
continue
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
fsrc = open(source_path + '\\' + source, 'r')
src_doc = re.sub('(\.)?(\n)+','. ', fsrc.read()).lower()
fsrc = open(source_path + '\\' + source, 'r', encoding='latin-1')
src_doc = re.sub('(\w)(\\n)+', r'\1. ', fsrc.read()).lower()
src_doc = '\n-----\n'.join(tokenizer.tokenize(src_doc))
src_sentences = src_doc.split('\n-----\n')
fsrc.close()
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
fsum = open(sums_path + '\\' + summary, 'r')
sum_doc = re.sub('(\.)?(\n)+','. ', fsum.read()).lower()
fsum = open(sums_path + '\\' + summary, 'r', encoding='latin-1')
sum_doc = re.sub('(\w)(\\n)+', r'\1. ', fsum.read()).lower()
sum_doc = '\n-----\n'.join(tokenizer.tokenize(sum_doc))
sum_sentences = sum_doc.split('\n-----\n')
fsum.close()
@@ -91,28 +95,53 @@ def files_to_sentences(path):
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
sentences = []
for file in onlyfiles:
fp = open(path + '\\' + file, 'r')
fp = open(path + '\\' + file, 'r', encoding='latin-1')
document = fp.read()
fp.close()
document = re.sub('(\.)?(\n)+', '. ', document)
document = re.sub('(\w)(\\n)+', r'\1. ', document)
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
sentences.append(tokenizer.tokenize(document))
return sentences

def train():
train_ft = files_to_features('.\\train\\source')
cl = files_to_class('.\\train\\source', '.\\train\\sums')
def train_perceptron(part_ft, cl):
#naive-bayes features
modelNB = train_naive_bayes(part_ft, cl)
train_ft = get_features_with_NB(modelNB, part_ft)
#target class
if(len(train_ft) != len(cl)):
print("Mismatch summaries and source files\n")
return
#train model
model = Perceptron()
model.fit(train_ft, cl)
return model

def train_naive_bayes(train_ft, cl):
if(len(train_ft) != len(cl)):
print("Mismatch summaries and source files\n")
return
model = GaussianNB()
model.fit(train_ft, cl)
return model

def get_features_with_NB(model, features):
probs = model.predict_proba(features)
for i in range(0, len(features)):
features[i].append(probs[i][1])
return features


def summary():
model = train()
features = files_to_features('.\\source')
train_ft3 = files_to_features('.\\train\\source')
cl = files_to_class('.\\train\\source', '.\\train\\sums')
#naive-bayes
part_ft = files_to_features('.\\source')
modelNB = train_naive_bayes(train_ft3, cl)
features = get_features_with_NB(modelNB, part_ft)
#perceptron
model = train_perceptron(train_ft3, cl)
labels = model.predict(features)
#summary
corpus = files_to_sentences('.\\source')
candidates = []
shift = 0
@@ -127,10 +156,17 @@ def summary():
shift = len(doc)
return candidates

f = open('output.txt', 'a')
summaries = summary()
for s in summaries:
for sentence in s:
f.write(sentence)
f.write("\n")
f.close()
sum_text = ""
for doc in summaries:
for sentence in doc:
sum_text += sentence + "\n"

golden = files_to_sentences('.\\sums')
golden_text = ""
for doc in golden:
for sentence in doc:
golden_text += sentence + "\n"

MAP = functions.AP(golden_text, sum_text)
print(MAP)
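
The main change in ex3/3.py stacks a Gaussian Naive Bayes posterior on top of the base sentence features (position, cosine similarity to the document, degree centrality) before training the Perceptron. Below is a minimal, self-contained sketch of that wiring with synthetic data standing in for the real features; only the flow mirrors the new train_naive_bayes / get_features_with_NB / train_perceptron functions.

# Stacking sketch: NB posterior appended as an extra feature, then a Perceptron.
# Data is synthetic; column meanings mimic [position, cosine-to-doc, degree centrality].
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron

rng = np.random.default_rng(0)
base_features = rng.normal(size=(100, 3))           # synthetic stand-in for the sentence features
labels = (base_features[:, 1] > 0).astype(int)      # toy "selected for summary" target

nb = GaussianNB().fit(base_features, labels)
nb_prob = nb.predict_proba(base_features)[:, 1]     # P(class == 1), the appended feature
stacked = np.column_stack([base_features, nb_prob])

perceptron = Perceptron().fit(stacked, labels)
print(perceptron.predict(stacked[:5]))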
12 changes: 7 additions & 5 deletions functions.py
@@ -6,6 +6,9 @@
from nltk.corpus import stopwords
from nltk.stem.porter import *
from wordsegment import load, segment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import priors

cachedStopWords = nltk.corpus.stopwords.words('portuguese')

@@ -37,6 +40,7 @@ def AP(systemSummaries, targetSummaries):
def get_cosine_similarities_matrix(sentences):
vec = TfidfVectorizer(stop_words=cachedStopWords)


X = vec.fit_transform(sentences)
return cosine_similarity(X)

@@ -48,7 +52,7 @@ def cos_sim(sent1_index,sent2_index,cosine_matrix):

def degree_centrality(sent_index,sentences,t=0.2):
graph = build_graph_uniform(sentences,t)
return degree_centrality_prior(sent_index,graph,sentences)
return priors.degree_centrality_prior(sent_index,graph,sentences)



@@ -57,15 +61,13 @@ def degree_centrality(sent_index,sentences,t=0.2):

#--------------------------graph building stuff------------------------#
def build_graph_uniform(sentences,t=0.2):
if not callable(weight_func):
return 'Not functions!'
nsents = len(sentences)
weights = np.zeros([nsents,nsents])
cos_matrix = get_cosine_similarities_matrix(sentences)
#create weights
for i in range(len(sentences)):
for j in range(len(sentences)):
weights[i][j] = (cos_sim(sent1_index,sent2_index,cosine_matrix) >= t) and (sent1_index != sent2_index)
weights[i][j] = (cos_sim(i,j,cos_matrix) >= t) and (i != j)
return weights


@@ -91,4 +93,4 @@ def stem_text(text):
return stemmed

def remove_stopwords(text):
return ' '.join([word for word in text.split() if word not in cachedStopWords])
return ' '.join([word for word in text.split() if word not in cachedStopWords])
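
The fix to build_graph_uniform thresholds pairwise TF-IDF cosine similarities into an adjacency matrix. Here is a rough, self-contained sketch of that construction together with a degree count; the stop-word filtering used by get_cosine_similarities_matrix is omitted for brevity, and normalising a node's degree by the total number of links is an assumption suggested by the total_links / degree names in the collapsed priors.py hunk.

# Sketch of the thresholded similarity graph (no stop-word filtering here).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_graph_uniform(sentences, t=0.2):
    sims = cosine_similarity(TfidfVectorizer().fit_transform(sentences))
    n = len(sentences)
    graph = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            graph[i][j] = (sims[i][j] >= t) and (i != j)
    return graph

sentences = ["o gato dorme no sofá", "o cão dorme no tapete", "hoje está sol"]
graph = build_graph_uniform(sentences)
degree = np.count_nonzero(graph[0])      # edges touching sentence 0
total_links = np.count_nonzero(graph)    # edges in the whole graph
print(degree / max(total_links, 1))      # assumed normalisation, see note above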
