forked from Exaphis/HackQ-Trivia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquestion.py
199 lines (160 loc) · 7.98 KB
/
question.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import asyncio
import itertools
import re
import time
from collections import defaultdict
from colorama import Fore, Style
import search
import logging
# Every character that is treated as punctuation when normalising answers and
# fetched text. Both translation tables below are built from this one string.
_PUNCTUATION_CHARS = "!\"#$%&\'()*+,-.:;<=>?@[\\]^_`{|}~�"

# Translation table for str.translate that deletes each punctuation character.
punctuation_to_none = str.maketrans(dict.fromkeys(_PUNCTUATION_CHARS))
# Translation table for str.translate that turns each punctuation character into a space.
punctuation_to_space = str.maketrans(dict.fromkeys(_PUNCTUATION_CHARS, " "))
async def answer_question(question, original_answers):
    """
    Searches for the answer to a question and reports how long the search took.

    The answers are normalised (punctuation removed / replaced by spaces), the question is
    reduced to keywords with quoted phrases kept intact, and the search methods are awaited.
    The chosen answer is printed by the search methods; nothing is returned.

    :param question: Question text, in its original casing
    :param original_answers: List of answer strings, as received
    """
    # Token substituted for each quoted phrase so the keyword extraction treats it as one word
    placeholder = "1placeholder1"

    print("Searching")
    start = time.time()

    # Keep both normalised spellings of every answer; dict.fromkeys drops duplicates
    # while preserving the original order.
    answers = []
    for ans in original_answers:
        answers.append(ans.translate(punctuation_to_none))
        answers.append(ans.translate(punctuation_to_space))
    answers = list(dict.fromkeys(answers))
    logging.info(question)
    logging.info(original_answers)

    question_lower = question.lower()

    # Negated questions ("NOT", "NEVER", "least" but not "at least") want the answer
    # that scores the lowest instead of the highest.
    reverse = (
        "NOT" in question
        or ("least" in question_lower and "at least" not in question_lower)
        or "NEVER" in question
    )

    quoted = re.findall('"([^"]*)"', question_lower)  # Get all words in quotes
    no_quote = question_lower
    for quote in quoted:
        no_quote = no_quote.replace(f"\"{quote}\"", placeholder)

    question_keywords = search.find_keywords(no_quote)
    for quote in quoted:
        try:
            question_keywords[question_keywords.index(placeholder)] = quote
        except ValueError:
            # The placeholder did not come back as its own keyword (e.g. the quote was glued
            # to a neighbouring word, or repeated placeholders were collapsed). Keep the
            # quoted phrase as a keyword instead of aborting the whole search.
            if quote not in question_keywords:
                question_keywords.append(quote)

    logging.info(question_keywords)
    await asyncio.gather(
        search_method1_2_stub(question_keywords, answers, reverse),
        # search_method3_stub(question_keywords, quoted, question_lower, question, original_answers, reverse)
    )
    print(f"Search took {time.time() - start} seconds")
async def search_method1_2_stub(question_keywords, answers, reverse):
    """
    Runs search methods 1 and 2 over the results of a single search for the question keywords.

    Method 2 is only consulted when method 1 returns an empty string. A non-empty answer is
    printed in green; the final answer (possibly empty) is always logged.

    :param question_keywords: Keywords of the question, joined with "+" to form the query
    :param answers: List of answers
    :param reverse: True if the best answer occurs the least, False otherwise
    """
    query = "+".join(question_keywords)
    grouped_results = await search.search_google(query, 50)
    flat_results = itertools.chain.from_iterable(grouped_results)
    fetched_texts = await search.get_clean_texts(flat_results)
    search_text = [page.translate(punctuation_to_none) for page in fetched_texts]

    best_answer = await __search_method1(search_text, answers, reverse)
    if best_answer == "":
        # Method 1 was inconclusive, fall back to keyword counting
        best_answer = await __search_method2(search_text, answers, reverse)

    if best_answer != "":
        print(f"{Fore.GREEN}{best_answer}{Style.RESET_ALL}\n")
    logging.info(best_answer)
async def search_method3_stub(question_keywords, quoted, question_lower, question, original_answers, reverse):
    """
    Collects the key nouns of the question, runs search method 3 and prints its answer and timing.

    :param question_keywords: Keywords of the question
    :param quoted: Phrases that appeared in double quotes in the question
    :param question_lower: Lower-cased question text
    :param question: Question text, in its original casing
    :param original_answers: List of answers, as received
    :param reverse: True if the best answer occurs the least, False otherwise
    """
    start = time.time()

    # Get key nouns for Method 3
    key_nouns = set(quoted)

    q_word_location = search.find_q_word_location(question_lower)
    if not key_nouns:
        # No quoted phrases: take nouns from the question instead. The direction passed to
        # find_nouns depends on where the question word sits (or on it being absent, -1).
        if q_word_location > len(question) // 2 or q_word_location == -1:
            key_nouns.update(search.find_nouns(question, num_words=5))
        else:
            key_nouns.update(search.find_nouns(question, num_words=5, reverse=True))

        key_nouns -= {"type"}

    # Add consecutive capitalised words (Thanks talentoscope!)
    # NOTE(review): q_word_location is compared against a character count above but against
    # a word index here -- confirm what search.find_q_word_location returns.
    remaining_words = [w for idx, w in enumerate(question.split(" ")) if idx != q_word_location]
    # Raw string so that \s reaches the regex engine without an invalid-escape warning
    key_nouns.update(re.findall(r"([A-Z][a-z]+(?=\s[A-Z])(?:\s[A-Z][a-z]+)+)", " ".join(remaining_words)))

    key_nouns = {noun.lower() for noun in key_nouns}
    print(f"Question nouns: {key_nouns}")

    answer3 = await __search_method3(list(set(question_keywords)), key_nouns, original_answers, reverse)
    print(f"{Fore.GREEN}method3: {answer3}{Style.RESET_ALL}")
    print(f"Search method3 took {time.time() - start} seconds")
async def __search_method1(texts, answers, reverse):
    """
    Returns the answer with the maximum/minimum number of exact occurrences in the texts.

    An occurrence is the lower-cased answer with a space directly before and after it.
    Matching is case-sensitive and the answer is matched literally, never as a regex.

    :param texts: List of text to analyze
    :param answers: List of answers
    :param reverse: True if the best answer occurs the least, False otherwise
    :return: Lower-cased answer that occurs the most/least in the texts, empty string if there
             is a tie for the best count, if no answer occurs at all, or if answers is empty
    """
    print("Running method 1")

    counts = {answer.lower(): 0 for answer in answers}
    # The trailing space is a lookahead so it is not consumed: it can then serve as the
    # leading space of the next occurrence (" cat cat " counts twice, not once).
    patterns = {answer: re.compile(f" {re.escape(answer)}(?= )") for answer in counts}

    for text in texts:
        for answer, pattern in patterns.items():
            counts[answer] += len(pattern.findall(text))

    print(counts)

    if not counts:
        # min()/max() would raise on an empty sequence
        return ""

    values = list(counts.values())
    best_value = min(values) if reverse else max(values)
    # If not all answers have count of 0 and the best value doesn't occur more than once, return the best answer
    if any(values) and values.count(best_value) == 1:
        return min(counts, key=counts.get) if reverse else max(counts, key=counts.get)
    return ""
async def __search_method2(texts, answers, reverse):
    """
    Return the answer with the maximum/minimum number of keyword occurrences in the texts.

    Each answer is split into keywords with search.find_keywords; an occurrence is a keyword
    with a space directly before and after it. Unlike method 1, ties are not rejected: the
    first answer with the best total is returned.

    :param texts: List of text to analyze
    :param answers: List of answers
    :param reverse: True if the best answer occurs the least, False otherwise
    :return: Answer whose keywords occur most/least in the texts, empty string if no keyword
             occurs in any text
    """
    print("Running method 2")

    counts = {answer: {keyword: 0 for keyword in search.find_keywords(answer)} for answer in answers}

    # Compile one literal pattern per distinct keyword, outside the text loop. The trailing
    # space is a lookahead so adjacent occurrences sharing a space are all counted.
    patterns = {
        keyword: re.compile(f" {re.escape(keyword)}(?= )")
        for keyword_counts in counts.values()
        for keyword in keyword_counts
    }

    for text in texts:
        for keyword_counts in counts.values():
            for keyword in keyword_counts:
                keyword_counts[keyword] += len(patterns[keyword].findall(text))

    print(counts)

    counts_sum = {answer: sum(keyword_counts.values()) for answer, keyword_counts in counts.items()}

    if any(counts_sum.values()):
        return min(counts_sum, key=counts_sum.get) if reverse else max(counts_sum, key=counts_sum.get)
    return ""
async def __search_method3(question_keywords, question_key_nouns, answers, reverse):
    """
    Returns the answer with the maximum number of occurrences of the question keywords in its searches.

    Every answer is searched separately and the fetched texts are scanned for the question
    keywords and key nouns. Only the noun scores decide the result; keyword scores are
    computed and printed for inspection.

    :param question_keywords: Keywords of the question
    :param question_key_nouns: Key nouns of the question
    :param answers: List of answers
    :param reverse: True if the best answer occurs the least, False otherwise
    :return: Answer whose search results contain the most/least key nouns of the question,
             empty string if no key noun occurs anywhere or if answers is empty
    """
    print("Running method 3")

    search_results = await search.multiple_search(answers, 10)
    print("Search processed")
    # Remember how many results belong to each answer before flattening them
    answer_lengths = list(map(len, search_results))
    search_results = itertools.chain.from_iterable(search_results)

    all_texts = [x.translate(punctuation_to_none) for x in await search.get_clean_texts(search_results)]
    print("URLs fetched")

    # Hand each answer its own consecutive slice of the flattened texts
    answer_text_map = {}
    offset = 0
    for idx, length in enumerate(answer_lengths):
        answer_text_map[answers[idx]] = all_texts[offset:offset + length]
        offset += length

    keyword_scores = {answer: 0 for answer in answers}
    noun_scores = {answer: 0 for answer in answers}

    # Create a dictionary of word to type of score so we avoid searching for the same thing twice in the same page
    word_score_map = defaultdict(list)
    for word in question_keywords:
        word_score_map[word].append("KW")
    for word in question_key_nouns:
        word_score_map[word].append("KN")

    # Key nouns can be raw quoted phrases from the question, so escape them before building
    # a regex. The trailing space is a lookahead so adjacent occurrences are all counted.
    patterns = {word: re.compile(f" {re.escape(word)}(?= )") for word in word_score_map}

    answer_noun_scores_map = {}
    for answer, answer_texts in answer_text_map.items():
        keyword_score = 0
        noun_score = 0
        noun_score_map = defaultdict(int)

        for text in answer_texts:
            for keyword, score_types in word_score_map.items():
                score = len(patterns[keyword].findall(text))
                if "KW" in score_types:
                    keyword_score += score
                if "KN" in score_types:
                    noun_score += score
                    noun_score_map[keyword] += score

        keyword_scores[answer] = keyword_score
        noun_scores[answer] = noun_score
        answer_noun_scores_map[answer] = noun_score_map

    print()
    print("\n".join([f"{answer}: {dict(scores)}" for answer, scores in answer_noun_scores_map.items()]))
    print()

    print(f"Keyword scores: {keyword_scores}")
    print(f"Noun scores: {noun_scores}")

    # any() is False for an empty dict too, so an empty answer list never reaches min()/max()
    if any(noun_scores.values()):
        return min(noun_scores, key=noun_scores.get) if reverse else max(noun_scores, key=noun_scores.get)
    # if set(keyword_scores.values()) != {0}:
    #     return min(keyword_scores, key=keyword_scores.get) if reverse else max(keyword_scores, key=keyword_scores.get)
    return ""