Work on TermInfo and better quality optimizations.

mchaput committed May 29, 2011
1 parent 3fef837 commit af65f65
Showing 19 changed files with 714 additions and 313 deletions.
27 changes: 19 additions & 8 deletions src/whoosh/filedb/fileindex.py
@@ -304,10 +304,7 @@ def _reader(self, storage, schema, segments, generation, reuse=None):
         if reuse:
             # Put all atomic readers in a dictionary keyed by their
             # generation, so we can re-use them if possible
-            if reuse.is_atomic():
-                readers = [reuse]
-            else:
-                readers = [r for r, offset in reuse.leaf_readers()]
+            readers = [r for r, offset in reuse.leaf_readers()]
             reusable = dict((r.generation(), r) for r in readers)
 
         # Make a function to open readers, which reuses reusable readers.
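
The removed is_atomic() branch relies on atomic readers reporting themselves as their only leaf. A minimal sketch of the reuse-by-generation pattern under that assumption (the Reader class below is an invented stand-in, not Whoosh's actual reader):

    # Invented stand-in class to illustrate reuse keyed by generation.
    class Reader(object):
        def __init__(self, generation):
            self._generation = generation

        def generation(self):
            return self._generation

        def leaf_readers(self):
            # An atomic reader yields itself as its only (reader, offset)
            # pair, which is why the is_atomic() special case can go away.
            return [(self, 0)]

    def reusable_readers(reuse):
        readers = [r for r, offset in reuse.leaf_readers()]
        # Key readers by generation so an unchanged segment's reader
        # can be handed back instead of reopened.
        return dict((r.generation(), r) for r in readers)

    opened = reusable_readers(Reader(5))
    assert opened[5].generation() == 5
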
@@ -383,28 +380,34 @@ class Segment(object):
     generation = 0
 
     def __init__(self, name, generation, doccount, fieldlength_totals,
-                 fieldlength_maxes, deleted=None):
+                 fieldlength_mins, fieldlength_maxes, deleted=None):
         """
         :param name: The name of the segment (the Index object computes this
             from its name and the generation).
         :param doccount: The maximum document number in the segment.
         :param term_count: Total count of all terms in all documents.
-        :param fieldlength_totals: A dictionary mapping field numbers to the
+        :param fieldlength_totals: A dictionary mapping field names to the
             total number of terms in that field across all documents in the
             segment.
+        :param fieldlength_mins: A dictionary mapping field names to the
+            minimum length of that field across all documents.
+        :param fieldlength_maxes: A dictionary mapping field names to the
+            maximum length of that field across all documents.
         :param deleted: A set of deleted document numbers, or None if no
             deleted documents exist in this segment.
         """
 
         assert isinstance(name, basestring)
         assert isinstance(doccount, (int, long))
         assert fieldlength_totals is None or isinstance(fieldlength_totals, dict), "fl_totals=%r" % fieldlength_totals
+        assert fieldlength_mins is None or isinstance(fieldlength_mins, dict), "fl_mins=%r" % fieldlength_mins
         assert fieldlength_maxes is None or isinstance(fieldlength_maxes, dict), "fl_maxes=%r" % fieldlength_maxes
 
         self.name = name
         self.generation = generation
         self.doccount = doccount
         self.fieldlength_totals = fieldlength_totals
+        self.fieldlength_mins = fieldlength_mins
         self.fieldlength_maxes = fieldlength_maxes
         self.deleted = deleted
         self.uuid = uuid.uuid4()
@@ -426,8 +429,8 @@ def __getattr__(self, name):
 
     def copy(self):
         return Segment(self.name, self.generation, self.doccount,
-                       self.fieldlength_totals, self.fieldlength_maxes,
-                       self.deleted)
+                       self.fieldlength_totals, self.fieldlength_mins,
+                       self.fieldlength_maxes, self.deleted)
 
     def make_filename(self, ext):
         return "%s.%s" % (self.name, ext)
@@ -469,10 +472,18 @@ def field_length(self, fieldname, default=0):
         """
         return self.fieldlength_totals.get(fieldname, default)
 
+    def min_field_length(self, fieldname, default=0):
+        """Returns the minimum length of the given field in any of the
+        documents in the segment.
+        """
+
+        return self.fieldlength_mins.get(fieldname, default)
+
     def max_field_length(self, fieldname, default=0):
         """Returns the maximum length of the given field in any of the
         documents in the segment.
         """
+
         return self.fieldlength_maxes.get(fieldname, default)
 
     def delete_document(self, docnum, delete=True):
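
A hedged usage sketch of the new per-field length statistics; the segment name, counts, and field values below are invented, and it assumes Segment is importable from whoosh.filedb.fileindex as defined above:

    # Invented example values; Segment as defined in the diff above.
    from whoosh.filedb.fileindex import Segment

    seg = Segment("_main_1", 0, 1000,
                  fieldlength_totals={"title": 5200, "body": 420000},
                  fieldlength_mins={"title": 1, "body": 12},
                  fieldlength_maxes={"title": 40, "body": 3500})

    assert seg.field_length("body") == 420000
    assert seg.min_field_length("body") == 12    # new in this commit
    assert seg.max_field_length("body") == 3500
    assert seg.min_field_length("missing", default=0) == 0  # absent field falls back
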
76 changes: 40 additions & 36 deletions src/whoosh/filedb/filepostings.py
@@ -31,6 +31,7 @@
 from whoosh.spans import Span
 from whoosh.system import _INT_SIZE
 from whoosh.filedb import postblocks
+from whoosh.filedb.filetables import TermInfo
 
 
 class FilePostingWriter(PostingWriter):
@@ -48,18 +49,19 @@ def __init__(self, postfile, stringids=False, blocklimit=128,
         self.blocklimit = blocklimit
         self.compression = compression
         self.block = None
 
     def _reset_block(self):
-        self.block = self.blockclass(self.postfile, self.stringids)
+        self.block = self.blockclass(self.postfile, self.format.posting_size,
+                                     stringids=self.stringids)
 
     def start(self, format):
         if self.block is not None:
             raise Exception("Called start() in a block")
 
         self.format = format
         self.blockcount = 0
-        self.posttotal = 0
         self.startoffset = self.postfile.tell()
+        self.terminfo = TermInfo()
 
         # Magic number
         self.postfile.write_int(self.blockclass.magic)
Expand All @@ -73,50 +75,51 @@ def write(self, id, weight, valuestring, dfl):
self.block.append(id, weight, valuestring, dfl)
if len(self.block) >= self.blocklimit:
self._write_block()
self.posttotal += 1

def finish(self):
def finish(self, inlinelimit=1):
if self.block is None:
raise Exception("Called finish() when not in a block")

if self.block:
self._write_block()

# Seek back to the start of this list of posting blocks and writer the
# number of blocks
pf = self.postfile
pf.flush()
offset = pf.tell()
pf.seek(self.startoffset + _INT_SIZE)
pf.write_uint(self.blockcount)
pf.seek(offset)
block = self.block
terminfo = self.terminfo

if self.blockcount < 1 and len(block) <= inlinelimit:
terminfo.add_block(block)
vals = None if not block.values else tuple(block.values)
postings = (tuple(block.ids), tuple(block.weights), vals)
else:
if block:
self._write_block()

# Seek back to the start of this list of posting blocks and write the
# number of blocks
pf = self.postfile
pf.flush()
offset = pf.tell()
pf.seek(self.startoffset + _INT_SIZE)
pf.write_uint(self.blockcount)
pf.seek(offset)
postings = self.startoffset

self.block = None
return self.posttotal

def cancel(self):
self.block = None

terminfo.postings = postings
return terminfo

def close(self):
if self.block:
self.finish()
raise Exception("Closed posting writer without finishing")
self.postfile.close()

def block_stats(self):
return self.block.stats()

def _write_block(self):
self.block.to_file(self.postfile, self.format.posting_size,
compression=self.compression)
self.block.write(compression=self.compression)
self.terminfo.add_block(self.block)
self._reset_block()
self.blockcount += 1

def as_inline(self):
block = self.block
_, maxwol, minlength = block.stats()
return (tuple(block.ids), tuple(block.weights), tuple(block.values),
maxwol, minlength)


class FilePostingReader(Matcher):
def __init__(self, postfile, offset, format, scorer=None,
@@ -172,7 +175,7 @@ def supports(self, astype):
 
     def value(self):
         if self.block.values is None:
-            self.block.read_values(self.format.posting_size)
+            self.block.read_values()
         return self.block.values[self.i]
 
     def value_as(self, astype):
@@ -241,7 +244,8 @@ def skip_to(self, id):
     def _read_block(self, offset):
         pf = self.postfile
         pf.seek(offset)
-        return self.blockclass.from_file(pf, self.stringids)
+        return self.blockclass.from_file(pf, self.format.posting_size,
+                                         stringids=self.stringids)
 
     def _consume_block(self):
         self.block.read_ids()
@@ -277,8 +281,11 @@ def _skip_to_block(self, targetfn):
 
         return skipped
 
-    def supports_quality(self):
-        return self.scorer and self.scorer.supports_quality()
+    def supports_block_quality(self):
+        return self.scorer and self.scorer.supports_block_quality()
+
+    def max_quality(self):
+        return self.scorer.max_quality()
 
     def skip_to_quality(self, minquality):
         bq = self.block_quality
@@ -298,9 +305,6 @@ def block_max_weight(self):
     def block_max_wol(self):
         return self.block.max_wol()
 
-    def block_max_id(self):
-        return self.block.max_id()
-
     def score(self):
         return self.scorer.score(self)
 
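
The rewritten finish() chooses between inlining a very short posting list into the TermInfo and leaving it on disk behind a file offset. A simplified, self-contained model of that decision (the TermInfo stand-in and function name here are invented for illustration, not the actual Whoosh classes):

    # Invented stand-in for TermInfo; models only the postings attribute.
    class TermInfoSketch(object):
        def __init__(self):
            self.postings = None  # tuple of postings if inlined, else a file offset

    def finish_term(blockcount, ids, weights, values, startoffset, inlinelimit=1):
        terminfo = TermInfoSketch()
        if blockcount < 1 and len(ids) <= inlinelimit:
            # Rare term: store the postings inline with the term info
            vals = None if not values else tuple(values)
            terminfo.postings = (tuple(ids), tuple(weights), vals)
        else:
            # Common term: postings stay in the posting file; remember
            # where its list of blocks starts
            terminfo.postings = startoffset
        return terminfo

    ti = finish_term(0, [7], [1.5], None, startoffset=1024)
    assert ti.postings == ((7,), (1.5,), None)
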
57 changes: 38 additions & 19 deletions src/whoosh/filedb/filereading.py
@@ -151,15 +151,18 @@ def all_stored_fields(self):
     def field_length(self, fieldname):
         return self.segment.field_length(fieldname)
 
+    def min_field_length(self, fieldname):
+        return self.segment.min_field_length(fieldname)
+
+    def max_field_length(self, fieldname):
+        return self.segment.max_field_length(fieldname)
+
     @protected
     def doc_field_length(self, docnum, fieldname, default=0):
         if self.fieldlengths is None:
             return default
         return self.fieldlengths.get(docnum, fieldname, default=default)
 
-    def max_field_length(self, fieldname):
-        return self.segment.max_field_length(fieldname)
-
     @protected
     def has_vector(self, docnum, fieldname):
         if self.schema[fieldname].vector:
@@ -171,10 +174,10 @@ def has_vector(self, docnum, fieldname):
     @protected
     def __iter__(self):
         schema = self.schema
-        for (fieldname, t), (totalfreq, _, postcount) in self.termsindex:
+        for (fieldname, t), (freq, docfreq) in self.termsindex.terms_and_freqs():
             if fieldname not in schema:
                 continue
-            yield (fieldname, t, postcount, totalfreq)
+            yield (fieldname, t, docfreq, freq)
 
     def _test_field(self, fieldname):
         if fieldname not in self.schema:
@@ -186,10 +189,11 @@ def _test_field(self, fieldname):
     def iter_from(self, fieldname, text):
         schema = self.schema
         self._test_field(fieldname)
-        for (fn, t), (totalfreq, _, postcount) in self.termsindex.items_from((fieldname, text)):
+        term = (fieldname, text)
+        for (fn, t), (freq, docfreq) in self.termsindex.terms_and_freqs(term):
             if fn not in schema:
                 continue
-            yield (fn, t, postcount, totalfreq)
+            yield (fn, t, docfreq, freq)
 
     @protected
     def _term_info(self, fieldname, text):
@@ -199,17 +203,31 @@ def _term_info(self, fieldname, text):
         except KeyError:
             raise TermNotFound("%s:%r" % (fieldname, text))
 
-    def doc_frequency(self, fieldname, text):
+    def frequency(self, fieldname, text):
         self._test_field(fieldname)
         try:
-            return self._term_info(fieldname, text)[2]
-        except TermNotFound:
+            return self.termsindex.frequency((fieldname, text))
+        except KeyError:
             return 0
 
-    def frequency(self, fieldname, text):
+    def doc_frequency(self, fieldname, text):
         self._test_field(fieldname)
         try:
-            return self._term_info(fieldname, text)[0]
-        except TermNotFound:
+            return self.termsindex.doc_frequency((fieldname, text))
+        except KeyError:
             return 0
 
+    def min_length(self, fieldname, text):
+        return self.termsindex.min_length((fieldname, text))
+
+    def max_length(self, fieldname, text):
+        return self.termsindex.max_length((fieldname, text))
+
+    def max_weight(self, fieldname, text):
+        return self.termsindex.max_weight((fieldname, text))
+
+    def max_wol(self, fieldname, text):
+        return self.termsindex.max_wol((fieldname, text))
+
     def lexicon(self, fieldname):
         # The base class has a lexicon() implementation that uses iter_from()
@@ -251,19 +269,20 @@ def expand_prefix(self, fieldname, prefix):
 
     def postings(self, fieldname, text, scorer=None):
         try:
-            offset = self.termsindex[fieldname, text][1]
+            terminfo = self.termsindex[fieldname, text]
         except KeyError:
             raise TermNotFound("%s:%r" % (fieldname, text))
 
         format = self.schema[fieldname].format
-        if isinstance(offset, (int, long)):
-            postreader = FilePostingReader(self.postfile, offset, format,
+        postings = terminfo.postings
+        if isinstance(postings, (int, long)):
+            postreader = FilePostingReader(self.postfile, postings, format,
                                            scorer=scorer, fieldname=fieldname,
                                            text=text)
         else:
-            docids, weights, values, maxwol, minlength = offset
-            postreader = ListMatcher(docids, weights, values, format, scorer,
-                                     maxwol=maxwol, minlength=minlength)
+            docids, weights, values = postings
+            postreader = ListMatcher(docids, weights, values, format,
+                                     scorer=scorer)
 
         deleted = self.segment.deleted
         if deleted:
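
These per-term statistics (min/max length, max weight, max WOL) feed the block-quality optimizations named in the commit message: a matcher can be fast-forwarded past whole blocks that cannot beat the current worst hit. A hedged sketch of that consumer pattern (this collector loop is invented for illustration, not Whoosh's actual collector):

    # Invented collector loop showing how the block-quality API might be
    # used; matcher is assumed to implement the Matcher interface above.
    def collect(matcher, minquality=0.0):
        hits = []
        use_quality = matcher.supports_block_quality()
        while matcher.is_active():
            if use_quality and matcher.block_quality() <= minquality:
                # The whole current block can't contribute a hit above
                # minquality, so skip it without scoring its postings
                matcher.skip_to_quality(minquality)
                continue
            hits.append((matcher.score(), matcher.id()))
            matcher.next()
        return hits
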