Commit

first commit
Diego Marcheggiani committed May 13, 2016
0 parents commit 8602053
Showing 22 changed files with 2,952 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# relation-autoencoder
13 changes: 13 additions & 0 deletions definitions/OieExample.py
@@ -0,0 +1,13 @@
__author__ = 'diego'

class OieExample (object):

def __init__(self, arg1, arg2, features, trigger, relation=''):
self.features = features
self.arg1 = arg1
self.arg2 = arg2
self.relation = relation
self.trigger = trigger

def setFeatures(self, features):
self.features = features
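
A minimal sketch of how an example object might be built; the argument strings, trigger, and feature strings below are hypothetical (in the pipeline the features would come from OieFeatures):

# Hypothetical instantiation of an Open IE example.
ex = OieExample('Barack Obama', 'Hawaii',
                ['trigger#born', 'arg1_lower#barack obama'],
                'born', relation='')
# Features can be replaced later, e.g. after extraction:
ex.setFeatures(ex.features + ['entityTypes#PERSON-LOCATION'])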
234 changes: 234 additions & 0 deletions definitions/OieFeatures.py
@@ -0,0 +1,234 @@
__author__ = 'diego'


import nltk
import re, string
import settings
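# Indices of the fields in the `info` list passed to every feature function below
# (dependency path between the arguments, entity-type pair, trigger, raw sentence,
# POS tags, and source-document path).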
parsing = 0
entities = 1
trig = 2
sentence = 3
pos = 4
docPath = 5
# ======= Relation features =======
stopwords_list = nltk.corpus.stopwords.words('english')
_digits = re.compile(r'\d')
def bow(info, arg1, arg2):
return info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()

def bow_clean(info, arg1, arg2):
bow = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
result = []
tmp = []
for word in bow:
for pun in string.punctuation:
word = word.strip(pun)
if word != '':
tmp.append(word.lower())
for word in tmp:
if word not in stopwords_list and not _digits.search(word) and not word[0].isupper():
result.append(word)
return result

def before_arg1(info, arg1, arg2):
before = info[sentence][:info[sentence].find(arg1)]
beforeSplit = before.lower().strip().split(' ')
beforeSplit = [word for word in beforeSplit if word not in string.punctuation]
# print beforeSplit
if len(beforeSplit) > 1:
return [beforeSplit[-2], beforeSplit[-1]]
elif len(beforeSplit) == 1:
if beforeSplit[0] != '':
return [beforeSplit[-1]]
else:
return []
else:
return []


def after_arg2(info, arg1, arg2):
after = info[sentence][info[sentence].rfind(arg2)+len(arg2):]
afterSplit = after.lower().strip().split(' ')
afterSplit = [word for word in afterSplit if word not in string.punctuation]
if len(afterSplit) > 1:
return [a for a in afterSplit[0: 2]]
elif len(afterSplit) == 1:
if afterSplit[0] != '':
return [afterSplit[0]]
else:
return []
else:
return []

def bigrams(info, arg1, arg2):
between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
tmp = []
for word in between:
for pun in string.punctuation:
word = word.strip(pun)
if word != '':
tmp.append(word.lower())
return [x[0]+'_'+x[1] for x in zip(tmp, tmp[1:])]

def trigrams(info, arg1, arg2):
between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
tmp = []
for word in between:
for pun in string.punctuation:
word = word.strip(pun)
if word != '':
tmp.append(word.lower())
return [x[0]+'_'+x[1]+'_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]

def skiptrigrams(info, arg1, arg2):
between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
tmp = []
for word in between:
for pun in string.punctuation:
word = word.strip(pun)
if word != '':
tmp.append(word.lower())
return [x[0]+'_X_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]

def skipfourgrams(info, arg1, arg2):
between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
tmp = []
for word in between:
for pun in string.punctuation:
word = word.strip(pun)
if word != '':
tmp.append(word.lower())
return [x[0]+'_X_'+x[2] + '_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] +\
[x[0]+'_'+x[1]+'_X_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])]

def trigger(info, arg1, arg2):
return info[trig].replace('TRIGGER:', '')

def entityTypes(info, arg1, arg2):
return info[entities]

def entity1Type(info, arg1, arg2):
return info[entities].split('-')[0]

def entity2Type(info, arg1, arg2):
return info[entities].split('-')[1]

def arg1(info, arg1, arg2):
return arg1

def arg1_lower(info, arg1, arg2):
return arg1.lower()

def arg1unigrams(info, arg1, arg2):
return arg1.lower().split()

def arg2(info, arg1, arg2):
return arg2

def arg2_lower(info, arg1, arg2):
return arg2.lower()

def arg2unigrams(info, arg1, arg2):
return arg2.lower().split()

def lexicalPattern(info, arg1, arg2):
# return info[parsing]
p = info[parsing].replace('->', ' ').replace('<-', ' ').split()
result = []
for num, x in enumerate(p):
if num % 2 != 0:
result.append(x)
return '_'.join(result)

def dependencyParsing(info, arg1, arg2):
return info[parsing]


def rightDep(info, arg1, arg2):
p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
return ''.join(p[:3])

def leftDep(info, arg1, arg2):
p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
return ''.join(p[-3:])

def posPatternPath(info, arg1, arg2):
words = info[sentence].split()
postags = info[pos].split()
    assert len(postags) == len(words), 'POS tag count does not match token count'
a = []
for w in xrange(len(words)):
a.append((words[w], postags[w]))
# a = info[4].split()
if a:
# print arg1, words
# print [a.index(item) for item in a if item[0] == arg1.split()[-1]],'aaaaaaa'
beginList = [a.index(item) for item in a if item[0] == arg1.split()[-1]]
# print beginList
endList = [a.index(item) for item in a if item[0] == arg2.split()[0]]
# print endList
if len(beginList) > 0 and len(endList) > 0:
# posPattern = [item[1] for item in a if beginList[0] > a.index(item) > endList[0]]
posPattern = []
for num, item in enumerate(a):
if beginList[0] < num < endList[0]:
posPattern.append(item[1])
# print posPattern
return '_'.join(posPattern)
else:
return ''
else:
return ''


import pickle
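# Precomputed LDA topic assignments, keyed by document path and by sentence; these
# pickles are built offline, so settings.lda_pairs_path must be set before import.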
docLDADic = pickle.load(open(settings.lda_pairs_path + "doc-topicPairs.p", "rb"))
sentLDADic = pickle.load(open(settings.lda_pairs_path + "sent-topicPairs.p", "rb"))

def sentence_theme(info, arg1, arg2):
return str(sentLDADic[info[sentence]])

def doc_theme(info, arg1, arg2):
return str(docLDADic[info[docPath]])

def getBasicCleanFeatures():
features = [trigger, entityTypes, arg1_lower, arg2_lower, bow_clean, entity1Type, entity2Type, lexicalPattern,
posPatternPath]
return features




if __name__ == '__main__':
pass
# examples = loadExamples('/Users/admin/isti/amsterdam/data/candidate-100.txt')
#
# ex = examples[0]
#
# print ex

#pathUp, pathDown = utils.getPathBetween(sent.tokens[3], sent.tokens[11])
#print pathUp
#print pathDown
# pred = sent.tokens[41]
# arg = sent.tokens[42]
#
# print pred.wordForm + ":" + stringPathBetween(pred, arg, includeLemmas = False, includePos = False) + ":" + arg.wordForm
# print pred.wordForm + ":" + stringPathToRoot(pred, includeLemmas = True, maxLen=1) + ":" + sent.tokens[0].wordForm
# print pred.wordForm + ":" + stringPosTagsBetween(sent, pred, arg, maxLen = 2, includeEnds = True) + ":" + arg.wordForm
#
# processing.Sentences.saveSentences(sents, "/Users/titovian/SRL-Parsing/data/conll08st/train/t", saveOnlyRoles=True)
#
# #features = getJohanssonPredDisFeatures()
# #features = getJohanssonArgLabFeatures()
# features = getBasicFeatures()
#
# print "pred = " + pred.wordForm + ", arg = " + arg.wordForm
#
# s = []
# for f in features:
# res = f(sent, pred, arg)
# if res is not None:
# s.append(f.__name__ + "#" + res)
#
# print s
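
A minimal usage sketch of the clean feature set; the `info` list below is a made-up example that follows the field layout defined at the top of the file (importing the module also assumes settings.lda_pairs_path points at the pickled LDA dictionaries):

info = ['nsubj->born<-prep_in',                 # dependency path between the arguments
        'PERSON-LOCATION',                      # entity-type pair
        'TRIGGER:born',                         # trigger
        'Barack Obama was born in Hawaii .',    # raw sentence
        'NNP NNP VBD VBN IN NNP .',             # POS tags, one per token
        '/path/to/document']                    # document path (used only by doc_theme)

feats = []
for f in getBasicCleanFeatures():
    res = f(info, 'Barack Obama', 'Hawaii')
    if res is not None:
        feats.append(f.__name__ + '#' + str(res))
print feats
# e.g. ['trigger#born', 'entityTypes#PERSON-LOCATION', 'arg1_lower#barack obama', ...]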
1 change: 1 addition & 0 deletions definitions/__init__.py
@@ -0,0 +1 @@
__author__ = 'admin'
16 changes: 16 additions & 0 deletions definitions/settings.py
@@ -0,0 +1,16 @@
__author__ = 'diego'

models_path = ''
clusters_path = ''

lda_pairs_path = ''
relations2IdDictionary = ''

external_embeddings_path = ''
debug = True

elems_to_visualize = 5

low = -1.e-3
high = 1.e-3
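
The path settings above are left empty in this commit; a hypothetical local configuration (all values below are placeholders, not paths from the repository) might look like:

models_path = '/home/user/relation-autoencoder/models/'
clusters_path = '/home/user/relation-autoencoder/clusters/'
lda_pairs_path = '/home/user/relation-autoencoder/lda/'
relations2IdDictionary = '/home/user/relation-autoencoder/relations2Id.p'
external_embeddings_path = '/home/user/relation-autoencoder/embeddings.txt'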
