
Commit

Fix whoosh-community#375: Clean up writer for failed document add when exceptions were absorbed
stevennic committed May 2, 2019
1 parent f846e4a commit e31f17e
Showing 3 changed files with 99 additions and 52 deletions.
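
For context, a minimal sketch of the failure mode this commit addresses (assumes whoosh is installed and uses a RAM-backed index; it mirrors the test added below):

from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(id=fields.ID())
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    try:
        w.add_document(id=1)  # invalid: ID fields expect text, not int
    except Exception:
        pass  # caller absorbs the error and carries on
    try:
        w.add_document(id=2)  # also invalid, error not absorbed this time
    except Exception as e:
        # Before this commit: cryptic "Called start_doc when already in a
        # doc", left over from the first, half-written document.
        # After it: the real validation error, "2 is not unicode or sequence".
        print(e)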
4 changes: 4 additions & 0 deletions src/whoosh/codec/whoosh3.py
@@ -251,6 +251,10 @@ def finish_doc(self):
            sf.clear()
        self._indoc = False

    def cancel_doc(self):
        self._doccount -= 1
        self._indoc = False

    def _column_filename(self, fieldname):
        return W3Codec.column_filename(self._segment, fieldname)
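The new cancel_doc pairs with start_doc/finish_doc. As an illustration only, not Whoosh's actual class, here is a minimal self-contained sketch of the per-document writer state that cancel_doc resets (the state names come from the diff above; the start_doc body is an assumption based on the error message in issue #375):

class PerDocWriterSketch:
    """Toy model of the per-document writer lifecycle."""

    def __init__(self):
        self._doccount = 0
        self._indoc = False

    def start_doc(self, docnum):
        if self._indoc:
            # The cryptic error from issue #375 surfaced here
            raise Exception("Called start_doc when already in a doc")
        self._doccount += 1
        self._indoc = True

    def finish_doc(self):
        self._indoc = False

    def cancel_doc(self):
        # Roll back the count and clear the in-doc flag so the next
        # start_doc call succeeds after a failed add
        self._doccount -= 1
        self._indoc = False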
108 changes: 56 additions & 52 deletions src/whoosh/writing.py
@@ -735,58 +735,62 @@ def add_document(self, **fields):
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of (text, weight, vbytes)
                # tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for tbytes, freq, weight, vbytes in items:
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, tbytes, docnum, weight, vbytes))

            if field.separate_spelling():
                spellfield = field.spelling_fieldname(fieldname)
                for word in field.spellable_words(value):
                    word = utf8encode(word)[0]
                    # item = (fieldname, tbytes, docnum, weight, vbytes)
                    add_post((spellfield, word, 0, 1, vbytes))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                # Call the format's word_values method to get posting values
                vitems = vformat.word_values(value, analyzer, mode="index")
                # Remove unused frequency field from the tuple
                vitems = sorted((text, weight, vbytes)
                                for text, _, weight, vbytes in vitems)
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Allow a custom value for stored field/column
            customval = fields.get("_stored_%s" % fieldname, value)

            # Add the stored value and length for this field to the per-
            # document writer
            sv = customval if field.stored else None
            perdocwriter.add_field(fieldname, field, sv, length)

            column = field.column_type
            if column and customval is not None:
                cv = field.to_column_value(customval)
                perdocwriter.add_column_value(fieldname, column, cv)
        try:
            for fieldname in fieldnames:
                value = fields.get(fieldname)
                if value is None:
                    continue
                field = schema[fieldname]

                length = 0
                if field.indexed:
                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    fieldboost = self._field_boost(fields, fieldname, docboost)
                    # Ask the field to return a list of (text, weight, vbytes)
                    # tuples
                    items = field.index(value)
                    # Only store the length if the field is marked scorable
                    scorable = field.scorable
                    # Add the terms to the pool
                    for tbytes, freq, weight, vbytes in items:
                        weight *= fieldboost
                        if scorable:
                            length += freq
                        add_post((fieldname, tbytes, docnum, weight, vbytes))

                if field.separate_spelling():
                    spellfield = field.spelling_fieldname(fieldname)
                    for word in field.spellable_words(value):
                        word = utf8encode(word)[0]
                        # item = (fieldname, tbytes, docnum, weight, vbytes)
                        add_post((spellfield, word, 0, 1, vbytes))

                vformat = field.vector
                if vformat:
                    analyzer = field.analyzer
                    # Call the format's word_values method to get posting values
                    vitems = vformat.word_values(value, analyzer, mode="index")
                    # Remove unused frequency field from the tuple
                    vitems = sorted((text, weight, vbytes)
                                    for text, _, weight, vbytes in vitems)
                    perdocwriter.add_vector_items(fieldname, field, vitems)

                # Allow a custom value for stored field/column
                customval = fields.get("_stored_%s" % fieldname, value)

                # Add the stored value and length for this field to the per-
                # document writer
                sv = customval if field.stored else None
                perdocwriter.add_field(fieldname, field, sv, length)

                column = field.column_type
                if column and customval is not None:
                    cv = field.to_column_value(customval)
                    perdocwriter.add_column_value(fieldname, column, cv)
        except Exception as ex:
            perdocwriter.cancel_doc()
            raise ex

        perdocwriter.finish_doc()
        self._added = True
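With the try/except in place, a caller that absorbs a failed add can keep using the same writer. A rough usage sketch (the id values are illustrative, not from the commit):

from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(id=fields.ID(stored=True))
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    try:
        w.add_document(id=3)  # invalid type; error absorbed by the caller
    except Exception:
        pass
    w.add_document(id=u"3")  # a valid document still indexes cleanly

with ix.searcher() as s:
    assert s.document(id=u"3") is not None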
39 changes: 39 additions & 0 deletions tests/test_writing.py
@@ -454,3 +454,42 @@ def test_delete_by_term_has_del():
    with ix.reader() as r:
        assert not r.has_deletions()


def test_add_fail_with_absorbed_exception():
    """
    Issue #375: https://github.com/whoosh-community/whoosh/issues/375

    Test that a failed document add whose exception was absorbed does not
    leave an unfinished document state behind for the next add.

    Test case:
    1. Add a bad document (here, one with an integer ID).
    2. Absorb the resulting exception, so client code is unaware of the
       error and carries on.
    3. Attempt to add a second document, also invalid, this time without
       absorbing exceptions.

    Expected behavior: the appropriate exception for the second document's
    failure, in this case "2 is not unicode or sequence".

    Behavior prior to the fix: a cryptic "Called start_doc when already in
    a doc", because the first document left the perDocWriter in an
    unfinished state. The fix cleans up the writer when aborting a bad
    add, letting the exception for the second document's actual problem
    bubble up.
    """
    schema = fields.Schema(id=fields.ID())
    st = RamStorage()
    ix = st.create_index(schema)

    with ix.writer() as w:
        try:
            # Integer value is invalid, but the absorbed exception causes
            # a silent failure
            w.add_document(id=1)
        except:
            pass
        with pytest.raises(Exception) as ex:
            w.add_document(id=2)

        # Assert that the correct exception is raised, not the cryptic one
        assert 'already' not in ex.value.args[0]
        assert 'unicode' in ex.value.args[0]
