Catch up to be mergeable

Discovery666 · Jan 14, 2016 · 6f91668 · 6f91668
2 parents fa7a8dc + 98a2a73
commit 6f91668
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 3 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -10,6 +10,7 @@ Changes
   - up to 300% speed up when training on very short documents (~tweets)
 * Word2vec allows non-strict unicode error handling (ignore or replace) (Gordon Mohr, #466)
 * Doc2Vec `model.docvecs[key]` now raises KeyError for unknown keys (Gordon Mohr, #520)
+* Fix `DocvecsArray.index_to_doctag` so `most_similar()` returns string doctags (Gordon Mohr, #560) 
 * On-demand loading of the `pattern` library in utils.lemmatize (Jan Zikes, #461)
   - `utils.HAS_PATTERN` flag moved to `utils.has_pattern()`
 * Forwards compatibility for NumPy > 1.10 (Matti Lyra, #494, #513)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -319,7 +319,7 @@ def _key_index(self, i_index, missing=None):
 
     def index_to_doctag(self, i_index):
         """Return string key for given i_index, if available. Otherwise return raw int doctag (same int)."""
-        candidate_offset = self.max_rawint - i_index - 1
+        candidate_offset = i_index - self.max_rawint - 1
         if 0 <= candidate_offset < len(self.offset2doctag):
             return self.offset2doctag[candidate_offset]
         else:

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -113,6 +113,8 @@ def test_string_doctags(self):
         self.assertTrue(all(model.docvecs['_*0'] == model.docvecs[0]))
         self.assertTrue(max(d.offset for d in model.docvecs.doctags.values()) < len(model.docvecs.doctags))
         self.assertTrue(max(model.docvecs._int_index(str_key) for str_key in model.docvecs.doctags.keys()) < len(model.docvecs.doctag_syn0))
+        # verify docvecs.most_similar() returns string doctags rather than indexes
+        self.assertEqual(model.docvecs.offset2doctag[0], model.docvecs.most_similar([model.docvecs[0]])[0][0])
 
     def test_empty_errors(self):
         # no input => "RuntimeError: you must first build vocabulary before training the model"
@@ -252,8 +254,6 @@ def test_mixed_tag_types(self):
         model = doc2vec.Doc2Vec()
         model.build_vocab(mixed_tag_corpus)
         expected_length = len(sentences) + len(model.docvecs.doctags)  # 9 sentences, 7 unique first tokens
-        print(model.docvecs.doctags)
-        print(model.docvecs.count)
         self.assertEquals(len(model.docvecs.doctag_syn0), expected_length)
 
     def models_equal(self, model, model2):