
Commit

Fix whoosh-community#375: Clean up writer for failed document add when exceptions were absorbed
stevennic committed May 2, 2019
1 parent f846e4a commit e31f17e
Showing 3 changed files with 99 additions and 52 deletions.
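
For context, a minimal sketch of the failure mode this commit addresses (assumes whoosh is installed and uses a RAM-backed index; it mirrors the test added below):

from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(id=fields.ID())
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    try:
        w.add_document(id=1)  # invalid: ID fields expect text, not int
    except Exception:
        pass  # caller absorbs the error and carries on
    try:
        w.add_document(id=2)  # also invalid, error not absorbed this time
    except Exception as e:
        # Before this commit: cryptic "Called start_doc when already in a
        # doc", left over from the first, half-written document.
        # After it: the real validation error, "2 is not unicode or sequence".
        print(e)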
4 changes: 4 additions & 0 deletions src/whoosh/codec/whoosh3.py
@@ -251,6 +251,10 @@ def finish_doc(self):
            sf.clear()
        self._indoc = False

    def cancel_doc(self):
        self._doccount -= 1
        self._indoc = False

    def _column_filename(self, fieldname):
        return W3Codec.column_filename(self._segment, fieldname)
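The new cancel_doc pairs with start_doc/finish_doc. As an illustration only, not Whoosh's actual class, here is a minimal self-contained sketch of the per-document writer state that cancel_doc resets (the state names come from the diff above; the start_doc body is an assumption based on the error message in issue #375):

class PerDocWriterSketch:
    """Toy model of the per-document writer lifecycle."""

    def __init__(self):
        self._doccount = 0
        self._indoc = False

    def start_doc(self, docnum):
        if self._indoc:
            # The cryptic error from issue #375 surfaced here
            raise Exception("Called start_doc when already in a doc")
        self._doccount += 1
        self._indoc = True

    def finish_doc(self):
        self._indoc = False

    def cancel_doc(self):
        # Roll back the count and clear the in-doc flag so the next
        # start_doc call succeeds after a failed add
        self._doccount -= 1
        self._indoc = False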
108 changes: 56 additions & 52 deletions src/whoosh/writing.py
@@ -735,58 +735,62 @@ def add_document(self, **fields):
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of (text, weight, vbytes)
                # tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for tbytes, freq, weight, vbytes in items:
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, tbytes, docnum, weight, vbytes))

            if field.separate_spelling():
                spellfield = field.spelling_fieldname(fieldname)
                for word in field.spellable_words(value):
                    word = utf8encode(word)[0]
                    # item = (fieldname, tbytes, docnum, weight, vbytes)
                    add_post((spellfield, word, 0, 1, vbytes))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                # Call the format's word_values method to get posting values
                vitems = vformat.word_values(value, analyzer, mode="index")
                # Remove unused frequency field from the tuple
                vitems = sorted((text, weight, vbytes)
                                for text, _, weight, vbytes in vitems)
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Allow a custom value for stored field/column
            customval = fields.get("_stored_%s" % fieldname, value)

            # Add the stored value and length for this field to the per-
            # document writer
            sv = customval if field.stored else None
            perdocwriter.add_field(fieldname, field, sv, length)

            column = field.column_type
            if column and customval is not None:
                cv = field.to_column_value(customval)
                perdocwriter.add_column_value(fieldname, column, cv)
        try:
            for fieldname in fieldnames:
                value = fields.get(fieldname)
                if value is None:
                    continue
                field = schema[fieldname]

                length = 0
                if field.indexed:
                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    fieldboost = self._field_boost(fields, fieldname, docboost)
                    # Ask the field to return a list of (text, weight, vbytes)
                    # tuples
                    items = field.index(value)
                    # Only store the length if the field is marked scorable
                    scorable = field.scorable
                    # Add the terms to the pool
                    for tbytes, freq, weight, vbytes in items:
                        weight *= fieldboost
                        if scorable:
                            length += freq
                        add_post((fieldname, tbytes, docnum, weight, vbytes))

                if field.separate_spelling():
                    spellfield = field.spelling_fieldname(fieldname)
                    for word in field.spellable_words(value):
                        word = utf8encode(word)[0]
                        # item = (fieldname, tbytes, docnum, weight, vbytes)
                        add_post((spellfield, word, 0, 1, vbytes))

                vformat = field.vector
                if vformat:
                    analyzer = field.analyzer
                    # Call the format's word_values method to get posting values
                    vitems = vformat.word_values(value, analyzer, mode="index")
                    # Remove unused frequency field from the tuple
                    vitems = sorted((text, weight, vbytes)
                                    for text, _, weight, vbytes in vitems)
                    perdocwriter.add_vector_items(fieldname, field, vitems)

                # Allow a custom value for stored field/column
                customval = fields.get("_stored_%s" % fieldname, value)

                # Add the stored value and length for this field to the per-
                # document writer
                sv = customval if field.stored else None
                perdocwriter.add_field(fieldname, field, sv, length)

                column = field.column_type
                if column and customval is not None:
                    cv = field.to_column_value(customval)
                    perdocwriter.add_column_value(fieldname, column, cv)
        except Exception as ex:
            perdocwriter.cancel_doc()
            raise ex

        perdocwriter.finish_doc()
        self._added = True
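With the try/except in place, a caller that absorbs a failed add can keep using the same writer. A rough usage sketch (the id values are illustrative, not from the commit):

from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(id=fields.ID(stored=True))
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    try:
        w.add_document(id=3)  # invalid type; error absorbed by the caller
    except Exception:
        pass
    w.add_document(id=u"3")  # a valid document still indexes cleanly

with ix.searcher() as s:
    assert s.document(id=u"3") is not None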
39 changes: 39 additions & 0 deletions tests/test_writing.py
@@ -454,3 +454,42 @@ def test_delete_by_term_has_del():
    with ix.reader() as r:
        assert not r.has_deletions()


def test_add_fail_with_absorbed_exception():
    """
    Issue #375: https://github.com/whoosh-community/whoosh/issues/375

    Test that a failed document add whose exception was absorbed does not
    leave an unfinished document state behind for the next add.

    Test case:
    1. Add a bad document (here, one with an integer ID).
    2. Absorb the resulting exception, so client code is unaware of the
       error and carries on.
    3. Attempt to add a second document, also invalid, this time without
       absorbing exceptions.

    Expected behavior: the appropriate exception for the second document's
    failure, in this case "2 is not unicode or sequence".

    Behavior prior to the fix: a cryptic "Called start_doc when already in
    a doc", because the first document left the perDocWriter in an
    unfinished state. The fix cleans up the writer when aborting a bad
    add, letting the exception for the second document's actual problem
    bubble up.
    """
    schema = fields.Schema(id=fields.ID())
    st = RamStorage()
    ix = st.create_index(schema)

    with ix.writer() as w:
        try:
            # Integer value is invalid, but the absorbed exception causes
            # a silent failure
            w.add_document(id=1)
        except:
            pass
        with pytest.raises(Exception) as ex:
            w.add_document(id=2)

        # Assert that the correct exception is raised, not the cryptic one
        assert 'already' not in ex.value.args[0]
        assert 'unicode' in ex.value.args[0]
