Skip to content

Commit

Permalink
Code for computing top dissimilar questions
Browse files Browse the repository at this point in the history
  • Loading branch information
nitish-kulkarni committed Nov 12, 2017
1 parent c493d78 commit e46916d
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions data_processing/dissimlar_questions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Prints to stdout the top n dissimar questions
for every question in stdin
Similarity is measured from word2vec similarity
of nouns, adjectives and verbs in the question
"""

import sys
from gensim.models.keyedvectors import KeyedVectors

from pos_tag import postags

# Relative path of the binary word2vec vectors file that main() hands to
# pretrained_model(); it is resolved against the current working directory.
GOOGLE_WORD2VEC = 'data/word2vec_vectors/GoogleNews-vectors-negative300.bin'

def pretrained_model(filename):
    """Load word vectors stored in binary word2vec format.

    Args:
        filename: Path of the binary word2vec file to read.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for that file.
    """
    vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
    return vectors

def visual_questions():
    """Collect the questions tagged ``V`` from standard input.

    Each input line is expected to look like ``<tag>\\t<question>``.
    Surrounding whitespace is stripped before the line is split at its
    first tab, so a question that itself contains tabs is kept whole.
    Lines without any tab (for example blank lines) are skipped instead of
    aborting the run with an unpacking error.

    Returns:
        The question text of every line whose tag is exactly ``'V'``, in
        input order.
    """
    questions = []
    for line in sys.stdin:
        # partition never raises: `sep` is empty when the line has no tab.
        tag, sep, question = line.strip().partition('\t')
        if not sep:
            continue
        if tag == 'V':
            questions.append(question)
    return questions

def question_words(question):
    """Return the identifier words of a question.

    Runs ``postags`` on the question and keeps, in the order they are
    produced, the words tagged ``NOUN``, ``PROPN`` or ``VERB``.
    """
    wanted_tags = ('NOUN', 'PROPN', 'VERB')
    selected = []
    for token, pos in postags(question):
        if pos in wanted_tags:
            selected.append(token)
    return selected

def dissimilar_questions(words_all_questions, words_question, top_n):
    """Placeholder for the dissimilarity ranking (to be implemented).

    The arguments are accepted but not consulted yet; every call returns
    a fresh empty list.
    """
    return []

def main():
    """Print each visual question with its top dissimilar questions.

    Reads the ``V``-tagged questions from stdin, extracts the identifier
    words of each one, and writes one line per question to stdout in the
    form ``<question>\\t<q1>|<q2>|...``.
    """
    # The model is loaded up front, but nothing below consults it yet;
    # presumably the finished dissimilarity ranking will need it.
    model = pretrained_model(GOOGLE_WORD2VEC)
    questions = visual_questions()
    words_all_questions = [question_words(question) for question in questions]
    for question, words_question in zip(questions, words_all_questions):
        # Argument order follows the dissimilar_questions signature:
        # the words of all questions first, then this question's words.
        top_questions = dissimilar_questions(
            words_all_questions, words_question, 10)
        print('%s\t%s' % (question, '|'.join(top_questions)))


if __name__ == '__main__':
    main()

0 comments on commit e46916d

Please sign in to comment.