Commit 8602053 (0 parents)
Diego Marcheggiani committed on May 13, 2016
Showing 22 changed files with 2,952 additions and 0 deletions.
@@ -0,0 +1 @@
# relation-autoencoder
@@ -0,0 +1,13 @@
__author__ = 'diego'


class OieExample(object):
    # A plain container for one Open IE instance: an argument pair, its
    # feature strings, the trigger word, and (optionally) a gold relation.

    def __init__(self, arg1, arg2, features, trigger, relation=''):
        self.features = features
        self.arg1 = arg1
        self.arg2 = arg2
        self.relation = relation
        self.trigger = trigger

    def setFeatures(self, features):
        self.features = features
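A minimal usage sketch of the class above (all values are hypothetical; the feature-string format follows the extractors in the next file):

example = OieExample(arg1='Obama', arg2='Hawaii',
                     features=['trigger#born', 'bow_clean#born'],
                     trigger='born',
                     relation='place_of_birth')
example.setFeatures(example.features + ['entityTypes#PERSON-LOCATION'])
assert example.trigger == 'born' and len(example.features) == 3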
@@ -0,0 +1,234 @@
__author__ = 'diego'


import nltk
import re, string
import settings

# Indices into the `info` tuple handed to every feature extractor below.
parsing = 0
entities = 1
trig = 2
sentence = 3
pos = 4
docPath = 5

# ======= Relation features =======
stopwords_list = nltk.corpus.stopwords.words('english')
_digits = re.compile(r'\d')  # raw string, so \d is not an invalid escape


def bow(info, arg1, arg2):
    # Bag of words spanning from the first occurrence of arg1 to the last
    # occurrence of arg2 in the sentence.
    return info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()


def bow_clean(info, arg1, arg2):
    # Like bow(), but punctuation-stripped, lowercased, and filtered of
    # stopwords and digit-bearing tokens.
    bow = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
    result = []
    tmp = []
    for word in bow:
        for pun in string.punctuation:
            word = word.strip(pun)
        if word != '':
            tmp.append(word.lower())
    for word in tmp:
        if word not in stopwords_list and not _digits.search(word) and not word[0].isupper():
            result.append(word)
    return result


def before_arg1(info, arg1, arg2):
    # Up to two non-punctuation tokens immediately preceding arg1.
    before = info[sentence][:info[sentence].find(arg1)]
    beforeSplit = before.lower().strip().split(' ')
    beforeSplit = [word for word in beforeSplit if word not in string.punctuation]
    if len(beforeSplit) > 1:
        return [beforeSplit[-2], beforeSplit[-1]]
    elif len(beforeSplit) == 1:
        if beforeSplit[0] != '':
            return [beforeSplit[-1]]
        else:
            return []
    else:
        return []


def after_arg2(info, arg1, arg2):
    # Up to two non-punctuation tokens immediately following arg2.
    after = info[sentence][info[sentence].rfind(arg2)+len(arg2):]
    afterSplit = after.lower().strip().split(' ')
    afterSplit = [word for word in afterSplit if word not in string.punctuation]
    if len(afterSplit) > 1:
        return [a for a in afterSplit[0: 2]]
    elif len(afterSplit) == 1:
        if afterSplit[0] != '':
            return [afterSplit[0]]
        else:
            return []
    else:
        return []


def bigrams(info, arg1, arg2):
    # Adjacent-token bigrams over the cleaned tokens between arg1 and arg2.
    between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
    tmp = []
    for word in between:
        for pun in string.punctuation:
            word = word.strip(pun)
        if word != '':
            tmp.append(word.lower())
    return [x[0]+'_'+x[1] for x in zip(tmp, tmp[1:])]


def trigrams(info, arg1, arg2):
    between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
    tmp = []
    for word in between:
        for pun in string.punctuation:
            word = word.strip(pun)
        if word != '':
            tmp.append(word.lower())
    return [x[0]+'_'+x[1]+'_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]


def skiptrigrams(info, arg1, arg2):
    # Trigrams with the middle token wildcarded as 'X'.
    between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
    tmp = []
    for word in between:
        for pun in string.punctuation:
            word = word.strip(pun)
        if word != '':
            tmp.append(word.lower())
    return [x[0]+'_X_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]


def skipfourgrams(info, arg1, arg2):
    # Four-grams with one interior position wildcarded, in both patterns.
    between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
    tmp = []
    for word in between:
        for pun in string.punctuation:
            word = word.strip(pun)
        if word != '':
            tmp.append(word.lower())
    return [x[0]+'_X_'+x[2] + '_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] +\
           [x[0]+'_'+x[1]+'_X_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])]


def trigger(info, arg1, arg2):
    return info[trig].replace('TRIGGER:', '')


def entityTypes(info, arg1, arg2):
    return info[entities]


def entity1Type(info, arg1, arg2):
    return info[entities].split('-')[0]


def entity2Type(info, arg1, arg2):
    return info[entities].split('-')[1]


def arg1(info, arg1, arg2):
    return arg1


def arg1_lower(info, arg1, arg2):
    return arg1.lower()


def arg1unigrams(info, arg1, arg2):
    return arg1.lower().split()


def arg2(info, arg1, arg2):
    return arg2


def arg2_lower(info, arg1, arg2):
    return arg2.lower()


def arg2unigrams(info, arg1, arg2):
    return arg2.lower().split()


def lexicalPattern(info, arg1, arg2):
    # Keep every other element of the dependency path, i.e. the relation
    # labels, and join them with underscores.
    p = info[parsing].replace('->', ' ').replace('<-', ' ').split()
    result = []
    for num, x in enumerate(p):
        if num % 2 != 0:
            result.append(x)
    return '_'.join(result)


def dependencyParsing(info, arg1, arg2):
    return info[parsing]


def rightDep(info, arg1, arg2):
    p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
    return ''.join(p[:3])


def leftDep(info, arg1, arg2):
    p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
    return ''.join(p[-3:])


def posPatternPath(info, arg1, arg2):
    # POS-tag sequence strictly between the last token of arg1 and the
    # first token of arg2.
    words = info[sentence].split()
    postags = info[pos].split()
    assert len(postags) == len(words), 'error'
    a = []
    for w in xrange(len(words)):
        a.append((words[w], postags[w]))
    if a:
        beginList = [a.index(item) for item in a if item[0] == arg1.split()[-1]]
        endList = [a.index(item) for item in a if item[0] == arg2.split()[0]]
        if len(beginList) > 0 and len(endList) > 0:
            posPattern = []
            for num, item in enumerate(a):
                if beginList[0] < num < endList[0]:
                    posPattern.append(item[1])
            return '_'.join(posPattern)
        else:
            return ''
    else:
        return ''


import pickle

# Note: these two LDA topic dictionaries are unpickled at import time, so
# settings.lda_pairs_path must be a prefix (e.g. a directory path ending
# in '/') under which both pickle files exist before this module is imported.
docLDADic = pickle.load(open(settings.lda_pairs_path + "doc-topicPairs.p", "rb"))
sentLDADic = pickle.load(open(settings.lda_pairs_path + "sent-topicPairs.p", "rb"))


def sentence_theme(info, arg1, arg2):
    return str(sentLDADic[info[sentence]])


def doc_theme(info, arg1, arg2):
    return str(docLDADic[info[docPath]])


def getBasicCleanFeatures():
    features = [trigger, entityTypes, arg1_lower, arg2_lower, bow_clean, entity1Type, entity2Type, lexicalPattern,
                posPatternPath]
    return features


if __name__ == '__main__':
    pass

# examples = loadExamples('/Users/admin/isti/amsterdam/data/candidate-100.txt')
#
# ex = examples[0]
#
# print ex

#pathUp, pathDown = utils.getPathBetween(sent.tokens[3], sent.tokens[11])
#print pathUp
#print pathDown
# pred = sent.tokens[41]
# arg = sent.tokens[42]
#
# print pred.wordForm + ":" + stringPathBetween(pred, arg, includeLemmas = False, includePos = False) + ":" + arg.wordForm
# print pred.wordForm + ":" + stringPathToRoot(pred, includeLemmas = True, maxLen=1) + ":" + sent.tokens[0].wordForm
# print pred.wordForm + ":" + stringPosTagsBetween(sent, pred, arg, maxLen = 2, includeEnds = True) + ":" + arg.wordForm
#
# processing.Sentences.saveSentences(sents, "/Users/titovian/SRL-Parsing/data/conll08st/train/t", saveOnlyRoles=True)
#
# #features = getJohanssonPredDisFeatures()
# #features = getJohanssonArgLabFeatures()
# features = getBasicFeatures()
#
# print "pred = " + pred.wordForm + ", arg = " + arg.wordForm
#
# s = []
# for f in features:
#     res = f(sent, pred, arg)
#     if res is not None:
#         s.append(f.__name__ + "#" + res)
#
# print s
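A hedged sketch of how the extractors appear to be applied, mirroring the commented-out experiment code above: the `info` tuple follows the index constants at the top of the file, and every sample value below is hypothetical (the dependency-path format in particular is assumed from how lexicalPattern parses it). Importing the module also requires settings.lda_pairs_path to point at the two LDA pickles.

info = (
    'Obama<-nsubjpass<-born->prep->in->pobj->Hawaii',  # parsing (format assumed)
    'PERSON-LOCATION',                                 # entities
    'TRIGGER:born',                                    # trig
    'Obama was born in Hawaii .',                      # sentence
    'NNP VBD VBN IN NNP .',                            # pos, one tag per token
    '/path/to/document',                               # docPath
)
for f in getBasicCleanFeatures():
    res = f(info, 'Obama', 'Hawaii')
    if res is not None:
        print(f.__name__ + '#' + str(res))
# e.g. trigger#born, entityTypes#PERSON-LOCATION,
#      lexicalPattern#nsubjpass_prep_pobj, posPatternPath#VBD_VBN_IN, ...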
@@ -0,0 +1 @@
__author__ = 'admin'
@@ -0,0 +1,16 @@
__author__ = 'diego'

models_path = ''
clusters_path = ''

lda_pairs_path = ''
relations2IdDictionary = ''

external_embeddings_path = ''
debug = True

elems_to_visualize = 5

low = -1.e-3
high = 1.e-3
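The settings module is a flat namespace of paths and hyperparameters, all left empty in the commit. A hedged sketch of how the values might be consumed; treating `relations2IdDictionary` as a pickle path and `low`/`high` as uniform initialization bounds are assumptions from the names and values alone:

import pickle
import numpy as np
import settings

# Assumption: relations2IdDictionary is the path to a pickled {relation: id} map.
with open(settings.relations2IdDictionary, 'rb') as f:
    rel2id = pickle.load(f)

# Assumption: low/high bound a uniform random initialization of model weights.
W = np.random.uniform(low=settings.low, high=settings.high, size=(100, 50))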