Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better sorting #50

Merged
merged 9 commits on Jan 3, 2022
Previous commit
Next commit
✨ Accept kwargs to Pattern.search
  • Loading branch information
JohnGiorgi committed Dec 29, 2021
commit d5ff9cdea0422b945f9d48b417af995ba4eabb52
34 changes: 15 additions & 19 deletions seq2rel_ds/common/util.py
Original file line number Diff line number Diff line change
@@ -50,21 +50,19 @@ class EntityHinting(str, Enum):
# Private functions #


-def _search_ent(ent: str, text: str) -> Optional[re.Match]:
-"""Search for the first occurance of `ent` in `text`, returning an `re.Match` object if found
-and `None` otherwise.
+def _first_mention(string: str, text: str, **kwargs: Any) -> Optional[re.Match]:
+"""Search for the first occurrence of `string` in `text`, returning an `re.Match` object if
+found and `None` otherwise. To match `string` to `text` more accurately, we use a type of
+"backoff" strategy. First, we look for the whole entity in text. If we cannot find it, we look
+for a lazy match of its first and last tokens. `**kwargs` are passed to `Pattern.search`.
 """
-# To match ent to text most accurately, we use a type of "backoff" strategy. First, we look for
-# the whole entity in text. If we cannot find it, we look for a lazy match of its first and last
-# tokens. In both cases, we look for whole word matches first (considering word boundaries).
-match = re.search(fr"\b{re.escape(ent)}\b", text) or re.search(re.escape(ent), text)
+match = re.compile(fr"\b{re.escape(string)}\b").search(text, **kwargs)
+
 if not match:
-ent_split = ent.split()
+ent_split = string.split()
 if len(ent_split) > 1:
 first, last = re.escape(ent_split[0]), re.escape(ent_split[-1])
-match = re.search(fr"\b{first}.*?{last}\b", text) or re.search(
-fr"{first}.*?{last}", text
-)
+match = re.compile(fr"\b{first}.*?{last}\b").search(text, **kwargs)
 return match


@@ -201,22 +199,20 @@ def parse_pubtator(
 if uid == "-1":
 continue

+offset = (start, end)
+
 # If this is a compound entity update the offsets to be as correct as possible.
 if len(mentions) > 1:
-match = _search_ent(mention, text[start:end])
+match = _first_mention(mention, text, pos=start, endpos=end)
 if match is not None:
-adj_start, adj_end = match.span()
-adj_start += start
-adj_end += start
-else:
-adj_start, adj_end = start, end
+offset = match.span()

 if uid in parsed[-1].clusters:
 parsed[-1].clusters[uid].mentions.append(mention)
-parsed[-1].clusters[uid].offsets.append((adj_start, adj_end))
+parsed[-1].clusters[uid].offsets.append(offset)
 else:
 parsed[-1].clusters[uid] = PubtatorCluster(
-mentions=[mention], offsets=[(adj_start, adj_end)], label=label
+mentions=[mention], offsets=[offset], label=label
 )
 # This is a relation
 else:
Expand Down