Skip to content

Commit

Permalink
Lowest Common Hypernyms Change & Tests
Browse files Browse the repository at this point in the history
- Set lowest_common_hypernyms to use max_depth by default
- Updated wup_similarity() to use min_depth with LCH
- Added a tests file for wordnet (only tests lowest_common_hypernyms()
at the moment)
- Added an additional LCH example to demo() which illustrates a case
where a target synset is also the LCH for itself and another synset
  • Loading branch information
dougalg committed Jul 11, 2013
1 parent c3644ab commit 609fe05
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 16 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Dan Garrette,
Jean Mark Gawron,
Sumukh Ghodke,
Yoav Goldberg,
Dougal Graham,
Brent Gray,
Simon Greenhill,
Eduardo Pereira Habkost,
Expand Down
39 changes: 23 additions & 16 deletions nltk/corpus/reader/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,14 +450,16 @@ def common_hypernyms(self, other):
for other_synset in other_synsets)
return list(self_synsets.intersection(other_synsets))

def lowest_common_hypernyms(self, other, simulate_root=False, use_max_depth=False):
"""Get a list of absolute lowest synset(s) that both synsets have as a hypernym.
By default this is calculated by finding the shortest paths for all synsets that
are hypernyms of both words, and returning that with the longest path.
def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
"""
Get a list of absolute lowest synset(s) that both synsets have as a hypernym.
This method is an implementation of Ted Pedersen's "Lowest Common Subsumer" method
from the Perl Wordnet module. It can return either "self" or "other" if they are a
hypernym of the other.
By setting the use_max_depth flag to True, lower hypernyms can be found by searching for the
longest paths of each hypernym.
By setting the use_min_depth flag to True, the behavior of NLTK2 can be preserved.
This was changed in NLTK3 to give more accurate results in a small set of cases,
generally with synsets concerning people. (eg: 'chef.n.01', 'fireman.n.01', etc.)
:type other: Synset
:param other: other input synset
Expand All @@ -470,10 +472,11 @@ def lowest_common_hypernyms(self, other, simulate_root=False, use_max_depth=Fals
there is usually a default root except for WordNet version 1.6.
If you are using wordnet 1.6, a fake root will need to be added
for nouns as well.
:type use_max_depth: bool
:param use_max_depth: If True, will use the max_depth function to
calculate the lowest common hypernyms giving results that should
be lower in the tree than when using the default settings.
:type use_min_depth: bool
:param use_min_depth: This setting mimics older (v2) behavior of NLTK wordnet
If True, will use the min_depth function to calculate the lowest common
hypernyms. This is known to give strange results for some synset pairs
(eg: 'chef.n.01', 'fireman.n.01') but is retained for backwards compatibility
:return: The synsets that are the lowest common hypernyms of both synsets
"""

Expand All @@ -494,12 +497,12 @@ def lowest_common_hypernyms(self, other, simulate_root=False, use_max_depth=Fals
synsets.intersection_update(others)

try:
if use_max_depth:
max_depth = max(s.max_depth() for s in synsets)
return [s for s in synsets if s.max_depth() == max_depth]
else:
if use_min_depth:
max_depth = max(s.min_depth() for s in synsets)
return [s for s in synsets if s.min_depth() == max_depth]
else:
max_depth = max(s.max_depth() for s in synsets)
return [s for s in synsets if s.max_depth() == max_depth]
except ValueError:
return []

Expand Down Expand Up @@ -728,7 +731,10 @@ def wup_similarity(self, other, verbose=False, simulate_root=True):
"""

need_root = self._needs_root()
subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root)
# Note that to preserve behavior from NLTK2 we set use_min_depth=True
# It is possible that more accurate results could be obtained by
# removing this setting and it should be tested later on
subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root, use_min_depth=True)

# If no LCS was found return None
if len(subsumers) == 0:
Expand Down Expand Up @@ -1748,6 +1754,7 @@ def _get_synsets(synset_strings):
print(S('fall.v.12').root_hypernyms())

print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))
print(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')))

print(S('dog.n.01').path_similarity(S('cat.n.01')))
print(S('dog.n.01').lch_similarity(S('cat.n.01')))
Expand Down
62 changes: 62 additions & 0 deletions nltk/test/unit/test_wordnet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.corpus.wordnet. See also: nltk/test/wordnet.doctest
"""
from nltk.corpus import wordnet as wn

def test_lowest_common_hypernyms():
    """
    Test Synset.lowest_common_hypernyms() in both of its depth modes.

    For every synset pair the method is called twice: once with its default
    arguments and once with ``use_min_depth=True``. Each call should

    a) generally return only 1 result (although some pairs are known to
       return 2+), and
    b) return the lowest result for the depth measure that is in use
       (max_depth by default, min_depth when ``use_min_depth=True``).

    lowest_common_hypernyms() builds its result list by iterating over a
    set, so the order of a multi-element result is arbitrary. Results are
    therefore compared as sets (plus a length check) rather than as ordered
    lists, which would make the test fail spuriously.
    """

    # (first synset, second synset,
    #  expected LCH names with default settings,
    #  expected LCH names with use_min_depth=True)
    cases = [
        ('kin.n.01', 'mother.n.01',
         ['relative.n.01'],
         ['organism.n.01']),
        ('policeman.n.01', 'chef.n.01',
         ['person.n.01'],
         ['organism.n.01']),
        # A synset can be the LCH of itself and one of its own hyponyms.
        ('woman.n.01', 'girlfriend.n.02',
         ['woman.n.01'],
         ['organism.n.01']),
        ('agency.n.01', 'office.n.01',
         ['entity.n.01'],
         ['entity.n.01']),
        ('day.n.10', 'service.n.07',
         ['writer.n.01'],
         ['organism.n.01', 'writer.n.01']),
        ('body.n.09', 'sidereal_day.n.01',
         ['measure.n.02', 'attribute.n.02'],
         ['measure.n.02', 'attribute.n.02']),
    ]

    for first, second, expected_default, expected_min_depth in cases:
        source = wn.synset(first)
        target = wn.synset(second)

        # Run the default (max_depth) mode first, then the NLTK2-compatible
        # min_depth mode, exactly as the individual assertions used to.
        runs = [
            ('default', source.lowest_common_hypernyms(target),
             expected_default),
            ('use_min_depth=True',
             source.lowest_common_hypernyms(target, use_min_depth=True),
             expected_min_depth),
        ]

        for mode, lch, expected_names in runs:
            expected = [wn.synset(name) for name in expected_names]
            label = '%s / %s (%s)' % (first, second, mode)

            # The length check guards against duplicates, which the set
            # comparison below would silently hide.
            assert len(lch) == len(expected), label
            assert set(lch) == set(expected), label

0 comments on commit 609fe05

Please sign in to comment.