Work on TermInfo and better quality optimizations.

mchaput committed May 29, 2011
1 parent 3fef837 commit af65f65
Showing 19 changed files with 714 additions and 313 deletions.
27 changes: 19 additions & 8 deletions src/whoosh/filedb/fileindex.py
@@ -304,10 +304,7 @@ def _reader(self, storage, schema, segments, generation, reuse=None):
         if reuse:
             # Put all atomic readers in a dictionary keyed by their
             # generation, so we can re-use them if possible
-            if reuse.is_atomic():
-                readers = [reuse]
-            else:
-                readers = [r for r, offset in reuse.leaf_readers()]
+            readers = [r for r, offset in reuse.leaf_readers()]
             reusable = dict((r.generation(), r) for r in readers)
 
         # Make a function to open readers, which reuses reusable readers.
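
The removed is_atomic() branch relies on atomic readers reporting themselves as their only leaf. A minimal sketch of the reuse-by-generation pattern under that assumption (the Reader class below is an invented stand-in, not Whoosh's actual reader):

    # Invented stand-in class to illustrate reuse keyed by generation.
    class Reader(object):
        def __init__(self, generation):
            self._generation = generation

        def generation(self):
            return self._generation

        def leaf_readers(self):
            # An atomic reader yields itself as its only (reader, offset)
            # pair, which is why the is_atomic() special case can go away.
            return [(self, 0)]

    def reusable_readers(reuse):
        readers = [r for r, offset in reuse.leaf_readers()]
        # Key readers by generation so an unchanged segment's reader
        # can be handed back instead of reopened.
        return dict((r.generation(), r) for r in readers)

    opened = reusable_readers(Reader(5))
    assert opened[5].generation() == 5
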
@@ -383,28 +380,34 @@ class Segment(object):
     generation = 0
 
     def __init__(self, name, generation, doccount, fieldlength_totals,
-                 fieldlength_maxes, deleted=None):
+                 fieldlength_mins, fieldlength_maxes, deleted=None):
         """
         :param name: The name of the segment (the Index object computes this
             from its name and the generation).
         :param doccount: The maximum document number in the segment.
         :param term_count: Total count of all terms in all documents.
-        :param fieldlength_totals: A dictionary mapping field numbers to the
+        :param fieldlength_totals: A dictionary mapping field names to the
             total number of terms in that field across all documents in the
             segment.
+        :param fieldlength_mins: A dictionary mapping field names to the
+            minimum length of that field across all documents.
+        :param fieldlength_maxes: A dictionary mapping field names to the
+            maximum length of that field across all documents.
         :param deleted: A set of deleted document numbers, or None if no
             deleted documents exist in this segment.
         """
 
         assert isinstance(name, basestring)
         assert isinstance(doccount, (int, long))
         assert fieldlength_totals is None or isinstance(fieldlength_totals, dict), "fl_totals=%r" % fieldlength_totals
+        assert fieldlength_mins is None or isinstance(fieldlength_mins, dict), "fl_mins=%r" % fieldlength_mins
         assert fieldlength_maxes is None or isinstance(fieldlength_maxes, dict), "fl_maxes=%r" % fieldlength_maxes
 
         self.name = name
         self.generation = generation
         self.doccount = doccount
         self.fieldlength_totals = fieldlength_totals
+        self.fieldlength_mins = fieldlength_mins
         self.fieldlength_maxes = fieldlength_maxes
         self.deleted = deleted
         self.uuid = uuid.uuid4()
@@ -426,8 +429,8 @@ def __getattr__(self, name):
 
     def copy(self):
         return Segment(self.name, self.generation, self.doccount,
-                       self.fieldlength_totals, self.fieldlength_maxes,
-                       self.deleted)
+                       self.fieldlength_totals, self.fieldlength_mins,
+                       self.fieldlength_maxes, self.deleted)
 
     def make_filename(self, ext):
         return "%s.%s" % (self.name, ext)
@@ -469,10 +472,18 @@ def field_length(self, fieldname, default=0):
         """
         return self.fieldlength_totals.get(fieldname, default)
 
+    def min_field_length(self, fieldname, default=0):
+        """Returns the minimum length of the given field in any of the
+        documents in the segment.
+        """
+
+        return self.fieldlength_mins.get(fieldname, default)
+
     def max_field_length(self, fieldname, default=0):
         """Returns the maximum length of the given field in any of the
         documents in the segment.
         """
+
         return self.fieldlength_maxes.get(fieldname, default)
 
     def delete_document(self, docnum, delete=True):
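
A hedged usage sketch of the new per-field length statistics; the segment name, counts, and field values below are invented, and it assumes Segment is importable from whoosh.filedb.fileindex as defined above:

    # Invented example values; Segment as defined in the diff above.
    from whoosh.filedb.fileindex import Segment

    seg = Segment("_main_1", 0, 1000,
                  fieldlength_totals={"title": 5200, "body": 420000},
                  fieldlength_mins={"title": 1, "body": 12},
                  fieldlength_maxes={"title": 40, "body": 3500})

    assert seg.field_length("body") == 420000
    assert seg.min_field_length("body") == 12    # new in this commit
    assert seg.max_field_length("body") == 3500
    assert seg.min_field_length("missing", default=0) == 0  # absent field falls back
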
76 changes: 40 additions & 36 deletions src/whoosh/filedb/filepostings.py
@@ -31,6 +31,7 @@
 from whoosh.spans import Span
 from whoosh.system import _INT_SIZE
 from whoosh.filedb import postblocks
+from whoosh.filedb.filetables import TermInfo
 
 
 class FilePostingWriter(PostingWriter):
@@ -48,18 +49,19 @@ def __init__(self, postfile, stringids=False, blocklimit=128,
         self.blocklimit = blocklimit
         self.compression = compression
         self.block = None
 
     def _reset_block(self):
-        self.block = self.blockclass(self.postfile, self.stringids)
+        self.block = self.blockclass(self.postfile, self.format.posting_size,
+                                     stringids=self.stringids)
 
     def start(self, format):
         if self.block is not None:
             raise Exception("Called start() in a block")
 
         self.format = format
         self.blockcount = 0
-        self.posttotal = 0
         self.startoffset = self.postfile.tell()
+        self.terminfo = TermInfo()
 
         # Magic number
         self.postfile.write_int(self.blockclass.magic)
Expand All @@ -73,50 +75,51 @@ def write(self, id, weight, valuestring, dfl):
self.block.append(id, weight, valuestring, dfl)
if len(self.block) >= self.blocklimit:
self._write_block()
self.posttotal += 1

def finish(self):
def finish(self, inlinelimit=1):
if self.block is None:
raise Exception("Called finish() when not in a block")

if self.block:
self._write_block()

# Seek back to the start of this list of posting blocks and writer the
# number of blocks
pf = self.postfile
pf.flush()
offset = pf.tell()
pf.seek(self.startoffset + _INT_SIZE)
pf.write_uint(self.blockcount)
pf.seek(offset)
block = self.block
terminfo = self.terminfo

if self.blockcount < 1 and len(block) <= inlinelimit:
terminfo.add_block(block)
vals = None if not block.values else tuple(block.values)
postings = (tuple(block.ids), tuple(block.weights), vals)
else:
if block:
self._write_block()

# Seek back to the start of this list of posting blocks and write the
# number of blocks
pf = self.postfile
pf.flush()
offset = pf.tell()
pf.seek(self.startoffset + _INT_SIZE)
pf.write_uint(self.blockcount)
pf.seek(offset)
postings = self.startoffset

self.block = None
return self.posttotal

def cancel(self):
self.block = None

terminfo.postings = postings
return terminfo

def close(self):
if self.block:
self.finish()
raise Exception("Closed posting writer without finishing")
self.postfile.close()

def block_stats(self):
return self.block.stats()

def _write_block(self):
self.block.to_file(self.postfile, self.format.posting_size,
compression=self.compression)
self.block.write(compression=self.compression)
self.terminfo.add_block(self.block)
self._reset_block()
self.blockcount += 1

def as_inline(self):
block = self.block
_, maxwol, minlength = block.stats()
return (tuple(block.ids), tuple(block.weights), tuple(block.values),
maxwol, minlength)


class FilePostingReader(Matcher):
def __init__(self, postfile, offset, format, scorer=None,
@@ -172,7 +175,7 @@ def supports(self, astype):
 
     def value(self):
         if self.block.values is None:
-            self.block.read_values(self.format.posting_size)
+            self.block.read_values()
         return self.block.values[self.i]
 
     def value_as(self, astype):
@@ -241,7 +244,8 @@ def skip_to(self, id):
     def _read_block(self, offset):
         pf = self.postfile
         pf.seek(offset)
-        return self.blockclass.from_file(pf, self.stringids)
+        return self.blockclass.from_file(pf, self.format.posting_size,
+                                         stringids=self.stringids)
 
     def _consume_block(self):
         self.block.read_ids()
@@ -277,8 +281,11 @@ def _skip_to_block(self, targetfn):
 
         return skipped
 
-    def supports_quality(self):
-        return self.scorer and self.scorer.supports_quality()
+    def supports_block_quality(self):
+        return self.scorer and self.scorer.supports_block_quality()
+
+    def max_quality(self):
+        return self.scorer.max_quality()
 
     def skip_to_quality(self, minquality):
         bq = self.block_quality
@@ -298,9 +305,6 @@ def block_max_weight(self):
     def block_max_wol(self):
         return self.block.max_wol()
 
-    def block_max_id(self):
-        return self.block.max_id()
-
     def score(self):
         return self.scorer.score(self)
 
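
The rewritten finish() chooses between inlining a very short posting list into the TermInfo and leaving it on disk behind a file offset. A simplified, self-contained model of that decision (the TermInfo stand-in and function name here are invented for illustration, not the actual Whoosh classes):

    # Invented stand-in for TermInfo; models only the postings attribute.
    class TermInfoSketch(object):
        def __init__(self):
            self.postings = None  # tuple of postings if inlined, else a file offset

    def finish_term(blockcount, ids, weights, values, startoffset, inlinelimit=1):
        terminfo = TermInfoSketch()
        if blockcount < 1 and len(ids) <= inlinelimit:
            # Rare term: store the postings inline with the term info
            vals = None if not values else tuple(values)
            terminfo.postings = (tuple(ids), tuple(weights), vals)
        else:
            # Common term: postings stay in the posting file; remember
            # where its list of blocks starts
            terminfo.postings = startoffset
        return terminfo

    ti = finish_term(0, [7], [1.5], None, startoffset=1024)
    assert ti.postings == ((7,), (1.5,), None)
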
57 changes: 38 additions & 19 deletions src/whoosh/filedb/filereading.py
@@ -151,15 +151,18 @@ def all_stored_fields(self):
     def field_length(self, fieldname):
         return self.segment.field_length(fieldname)
 
+    def min_field_length(self, fieldname):
+        return self.segment.min_field_length(fieldname)
+
+    def max_field_length(self, fieldname):
+        return self.segment.max_field_length(fieldname)
+
     @protected
     def doc_field_length(self, docnum, fieldname, default=0):
         if self.fieldlengths is None:
             return default
         return self.fieldlengths.get(docnum, fieldname, default=default)
 
-    def max_field_length(self, fieldname):
-        return self.segment.max_field_length(fieldname)
-
     @protected
     def has_vector(self, docnum, fieldname):
         if self.schema[fieldname].vector:
@@ -171,10 +174,10 @@ def has_vector(self, docnum, fieldname):
     @protected
     def __iter__(self):
         schema = self.schema
-        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
+        for (fieldname, t), (freq, docfreq) in self.termsindex.terms_and_freqs():
             if fieldname not in schema:
                 continue
-            yield (fieldname, t, postcount, totalfreq)
+            yield (fieldname, t, docfreq, freq)
 
     def _test_field(self, fieldname):
         if fieldname not in self.schema:
@@ -186,10 +189,11 @@ def _test_field(self, fieldname):
     def iter_from(self, fieldname, text):
         schema = self.schema
         self._test_field(fieldname)
-        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from((fieldname, text)):
+        term = (fieldname, text)
+        for (fn, t), (freq, docfreq) in self.termsindex.terms_and_freqs(term):
             if fn not in schema:
                 continue
-            yield (fn, t, postcount, totalfreq)
+            yield (fn, t, docfreq, freq)
 
     @protected
     def _term_info(self, fieldname, text):
@@ -199,17 +203,31 @@ def _term_info(self, fieldname, text):
         except KeyError:
             raise TermNotFound("%s:%r" % (fieldname, text))
 
-    def doc_frequency(self, fieldname, text):
+    def frequency(self, fieldname, text):
         self._test_field(fieldname)
         try:
-            return self._term_info(fieldname, text)[2]
-        except TermNotFound:
+            return self.termsindex.frequency((fieldname, text))
+        except KeyError:
             return 0
 
-    def frequency(self, fieldname, text):
+    def doc_frequency(self, fieldname, text):
         self._test_field(fieldname)
         try:
-            return self._term_info(fieldname, text)[0]
-        except TermNotFound:
+            return self.termsindex.doc_frequency((fieldname, text))
+        except KeyError:
             return 0
 
+    def min_length(self, fieldname, text):
+        return self.termsindex.min_length((fieldname, text))
+
+    def max_length(self, fieldname, text):
+        return self.termsindex.max_length((fieldname, text))
+
+    def max_weight(self, fieldname, text):
+        return self.termsindex.max_weight((fieldname, text))
+
+    def max_wol(self, fieldname, text):
+        return self.termsindex.max_wol((fieldname, text))
+
     def lexicon(self, fieldname):
         # The base class has a lexicon() implementation that uses iter_from()
@@ -251,19 +269,20 @@ def expand_prefix(self, fieldname, prefix):
 
     def postings(self, fieldname, text, scorer=None):
         try:
-            offset = self.termsindex[fieldname, text][1]
+            terminfo = self.termsindex[fieldname, text]
         except KeyError:
             raise TermNotFound("%s:%r" % (fieldname, text))
 
         format = self.schema[fieldname].format
-        if isinstance(offset, (int, long)):
-            postreader = FilePostingReader(self.postfile, offset, format,
+        postings = terminfo.postings
+        if isinstance(postings, (int, long)):
+            postreader = FilePostingReader(self.postfile, postings, format,
                                            scorer=scorer, fieldname=fieldname,
                                            text=text)
         else:
-            docids, weights, values, maxwol, minlength = offset
-            postreader = ListMatcher(docids, weights, values, format, scorer,
-                                     maxwol=maxwol, minlength=minlength)
+            docids, weights, values = postings
+            postreader = ListMatcher(docids, weights, values, format,
+                                     scorer=scorer)
 
         deleted = self.segment.deleted
         if deleted:
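
These per-term statistics (min/max length, max weight, max WOL) feed the block-quality optimizations named in the commit message: a matcher can be fast-forwarded past whole blocks that cannot beat the current worst hit. A hedged sketch of that consumer pattern (this collector loop is invented for illustration, not Whoosh's actual collector):

    # Invented collector loop showing how the block-quality API might be
    # used; matcher is assumed to implement the Matcher interface above.
    def collect(matcher, minquality=0.0):
        hits = []
        use_quality = matcher.supports_block_quality()
        while matcher.is_active():
            if use_quality and matcher.block_quality() <= minquality:
                # The whole current block can't contribute a hit above
                # minquality, so skip it without scoring its postings
                matcher.skip_to_quality(minquality)
                continue
            hits.append((matcher.score(), matcher.id()))
            matcher.next()
        return hits
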