-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathguess_matching.py
109 lines (84 loc) · 3.44 KB
/
guess_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from difflib import SequenceMatcher
from contextlib import closing
import typing # NamedTuple
import itertools
import sys
import argparse
import guess_helper
import guess_phrases
# One lexicon word considered as a match for an out-of-vocabulary (OOV) word.
#   oov         -- the word that was looked up
#   lexword     -- the lexicon entry it was compared against
#   i_lex       -- start of the longest common substring within lexword
#   i_oov       -- start of the longest common substring within oov
#   matchlength -- length of that common substring
#   islegal     -- result of is_legal_match(lexword, i_lex, matchlength)
CandidateWord = typing.NamedTuple('CandidateWord', [('oov', str), ('lexword', str), ('i_lex', int), ('i_oov', int), ('matchlength', int), ('islegal', bool)])
# mono3affix_min6_match4 per word
def is_legal_match(s: str, i: int, l: int) -> bool:
    """Decide whether a common substring is a good enough match for `s`.

    `s` is the lexicon entry, `i` the start offset of the match inside `s`
    and `l` the match length.  If `s` consists of several space-separated
    words, the check is narrowed to the single word the offset falls into
    (with the offset made relative to that word).

    A match is legal when it is at least 4 characters long and either
    covers more than 65% of the word, or touches one edge of the word,
    leaves at most 3 characters uncovered and is at least 6 characters long.
    """
    word, offset = s, i
    if " " in s:
        # Walk the words left to right, consuming each word plus its separator
        # until the offset lands inside one of them.
        for candidate in s.split():
            if offset < len(candidate):
                word = candidate
                break
            offset -= len(candidate) + 1

    if l < 4:
        return False
    if l / len(word) > 0.65:
        return True

    leading = offset
    trailing = len(word) - (offset + l)
    touches_edge = leading == 0 or trailing == 0
    return touches_edge and leading + trailing <= 3 and l >= 6
def get_best_match(oov: str, lexword: str, matcher: SequenceMatcher) -> (str, (int, int, int)):
    """Find the longest common substring of `oov` and the matcher's second sequence.

    `matcher` is expected to already hold `lexword` as its second sequence
    (presumably prepared when the lexicon was loaded -- confirm in
    guess_helper); only the first sequence is replaced here.

    Returns `(lexword, (start_in_oov, start_in_lexword, length))`.
    """
    matcher.set_seq1(oov)
    longest = matcher.find_longest_match(0, len(oov), 0, len(lexword))
    start_in_oov, start_in_lexword, length = longest
    return (lexword, (start_in_oov, start_in_lexword, length))
def lookup_oov(oov: str, matchers) -> "(str, [CandidateWord])":
    """Collect the lexicon words that share the longest substrings with `oov`.

    `matchers` maps each lexicon word to the matcher object handed to
    get_best_match for that word.  Every lexicon word is compared against
    `oov`, the candidates are ranked by descending match length, and whole
    groups of equal match length are collected from the top.

    Collection stops as soon as
      * more than 75 candidates share the current match length, or
      * the current match length is more than 4 below the best one, or
      * the first 500 ranked candidates have been examined.

    Progress is reported on stderr.

    Returns `(oov, candidates)`; `candidates` is empty when `matchers` is empty.
    """
    print("Looking up", oov, "...", end = '', flush = True, file = sys.stderr)

    # Longest common substring of oov with every lexicon word, in sorted word
    # order so that ties keep a deterministic order after the stable sort below.
    individual_bestmatches = [
        get_best_match(oov, lexword, matchers[lexword])
        for lexword in sorted(matchers)
    ]
    print("...", end = '', flush = True, file = sys.stderr)

    # Rank all candidates, longest match first.
    all_lexcandidates = sorted(
        (
            CandidateWord(oov, lexword, i_w, i_o, matchlength,
                          is_legal_match(lexword, i_w, matchlength))
            for (lexword, (i_o, i_w, matchlength)) in individual_bestmatches
        ),
        key = lambda cw: cw.matchlength,
        reverse = True,
    )
    if not all_lexcandidates:
        return (oov, [])

    best_matchlength = all_lexcandidates[0].matchlength
    cur_matchlength = best_matchlength
    cur_wordcount = 0
    cur_wordlist = []
    result = []
    for cw in all_lexcandidates[:500]:
        if cw.matchlength == cur_matchlength:
            cur_wordcount += 1
        else:
            # A shorter match length starts: the finished group is accepted.
            cur_matchlength = cw.matchlength
            cur_wordcount = 1
            result += cur_wordlist
            cur_wordlist = []
        # An over-full group is abandoned entirely (it is never added to result).
        if cur_wordcount > 75 or cur_matchlength + 4 < best_matchlength:
            break
        cur_wordlist.append(cw)
    # NOTE(review): a group only reaches `result` when a shorter match length
    # follows it, so the last group is dropped when the loop runs to completion
    # (small lexicon, or the 500-candidate cap) -- confirm this is intended.

    print("done!", flush = True, file = sys.stderr)
    return (oov, result)
# STDIN: phraseparts
# STDOUT: match dict items
if __name__ == '__main__':
    # Load the configuration and the per-word matchers of the lexicon.
    conf = guess_helper.load_config(None)
    (matchers, _) = guess_helper.load_dictionary(conf['global-files']['lexicon'])

    # Read all phrase parts from stdin, one per line.
    all_uniq_phraseparts = guess_helper.uninorm(sys.stdin.read()).splitlines()
    print("Found", len(all_uniq_phraseparts), "phraseparts...", file = sys.stderr)

    # Load previously calculated matches so they are not looked up again.
    try:
        with open(conf['global-files']['allmatches']) as f:
            # SECURITY: eval() executes whatever the match file contains.  It is
            # kept because the stored lines are reprs containing CandidateWord(...)
            # calls, which ast.literal_eval cannot parse -- only point
            # 'allmatches' at files this script wrote itself.
            prev_matches = dict(eval(guess_helper.uninorm(t)) for t in f.read().splitlines())
    except FileNotFoundError:
        # First run: nothing has been calculated yet.
        prev_matches = {}
    except Exception as err:
        # Still best-effort (start from scratch), but no longer silent, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Could not load previous matches:", repr(err), file = sys.stderr)
        prev_matches = {}

    new_uniq_phraseparts = [p for p in all_uniq_phraseparts if p not in prev_matches]
    print("... of which", len(new_uniq_phraseparts), "are new ones!", file = sys.stderr)

    result = dict(lookup_oov(pp, matchers) for pp in new_uniq_phraseparts)
    prev_matches.update(result)

    # Emit old and new matches together, ordered by phrase part.
    for item in sorted(prev_matches.items(), key = lambda t: t[0]):
        print(item)