Skip to content

Commit

Permalink
Changed expand_HTAG to use dictionaries
Browse files Browse the repository at this point in the history
The function previously used parallel lists and indexed into them; using
dictionaries seems neater and more efficient.
  • Loading branch information
EFord36 committed Sep 27, 2016
1 parent 5e71dcb commit ac8a4ec
Showing 1 changed file with 10 additions and 7 deletions.
17 changes: 10 additions & 7 deletions normalise/expand_HTAG.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,27 @@ def expand_HTAG(word):
def expand_URL(word):
"""Expand tokens tagged URL."""
try:
starts = ["http://", "https://", "www."]
starts_exp = ['', '', 'W W W dot']
ends = [".com", ".org", ".org.uk", ".co.uk"]
ends_exp = ["dot com", "dot org", "dot org dot U K", "dot co dot U K"]
starts = {"http://": "", "https://": "", "www.": "W W W dot"}
# starts = ["http://", "https://", "www."]
# starts_exp = ['', '', 'W W W dot']
ends = {".com": "dot com", ".org": "dot org",
".org.uk": "dot org dot U K", ".co.uk": "dot co dot UK"}
# ends = [".com", ".org", ".org.uk", ".co.uk"]
# ends_exp = ["dot com", "dot org", "dot org dot U K", "dot co dot U K"]
m = urlstart_pattern.match(word)
n = urlend_pattern.match(word)
exp = ''
if m.group(1) and n:
start = m.group(1)
middle = urlend_pattern.match(m.group(2))
end = middle.group(2)
exp += (starts_exp[starts.index(start)] + " "
exp += (starts[start] + " "
+ infer_spaces(middle.group(1))
+ " " + ends_exp[ends.index(end)])
+ " " + ends[end])
elif n:
middle = n.group(1)
end = n.group(2)
exp += infer_spaces(middle) + " " + ends_exp[ends.index(end)]
exp += infer_spaces(middle) + " " + ends[end]
else:
return word
return exp
Expand Down

0 comments on commit ac8a4ec

Please sign in to comment.