LUCENE-4299: add Terms hasPositions/hasOffsets, so you know what feat…

…ures a docs TVs have git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/branch_4x@1371720 13f79535-47bb-0310-9956-ffa450edef68
andyfowler · Aug 10, 2012 · 3011631 · 3011631
1 parent be14bb1
commit 3011631
Show file tree

Hide file tree

Showing 22 changed files with 238 additions and 93 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -8,6 +8,13 @@ http://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_6_0/lucene/contrib
 
 ======================= Lucene 4.0.0 =======================
 
+API Changes
+
+* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
+  Previously you had no real way to know that a term vector field
+  had positions or offsets, since this can be configured on a 
+  per-field-per-document basis. (Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTermsReader.java
@@ -253,6 +253,16 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
       return new SegmentTermsEnum();
     }
 
+    @Override
+    public boolean hasOffsets() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
+
     @Override
     public long size() {
       return numTerms;

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
@@ -456,6 +456,16 @@ public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
+    @Override
+    public boolean hasOffsets() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
+
     @Override
     public TermsEnum iterator(TermsEnum reuse) throws IOException {
       return new SegmentTermsEnum();

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
@@ -184,6 +184,9 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
     final FieldsEnum fieldsEnum = vectors.iterator();
     String fieldName;
     String lastFieldName = null;
+
+    TermsEnum termsEnum = null;
+    DocsAndPositionsEnum docsAndPositionsEnum = null;
 
     while((fieldName = fieldsEnum.next()) != null) {
       final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
@@ -196,39 +199,30 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
         // FieldsEnum shouldn't lie...
         continue;
       }
+
+      final boolean hasPositions = terms.hasPositions();
+      final boolean hasOffsets = terms.hasOffsets();
+
       final int numTerms = (int) terms.size();
       if (numTerms == -1) {
         throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
       }
-      final TermsEnum termsEnum = terms.iterator(null);
-
-      DocsAndPositionsEnum docsAndPositionsEnum = null;
-
-      boolean startedField = false;
-
-      // NOTE: this is tricky, because TermVectors allow
-      // indexing offsets but NOT positions.  So we must
-      // lazily init the field by checking whether first
-      // position we see is -1 or not.
+
+      startField(fieldInfo, numTerms, hasPositions, hasOffsets);
+      termsEnum = terms.iterator(termsEnum);
 
       int termCount = 0;
       while(termsEnum.next() != null) {
         termCount++;
 
         final int freq = (int) termsEnum.totalTermFreq();
+
+        startTerm(termsEnum.term(), freq);
 
-        if (startedField) {
-          startTerm(termsEnum.term(), freq);
-        }
-
-        // TODO: we need a "query" API where we can ask (via
-        // flex API) what this term was indexed with...
-        // Both positions & offsets:
-        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
-        boolean hasOffsets = false;
-        boolean hasPositions = false;
-
-        if (docsAndPositionsEnum != null) {
+        if (hasPositions || hasOffsets) {
+          docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
+          assert docsAndPositionsEnum != null;
+
           final int docID = docsAndPositionsEnum.nextDoc();
           assert docID != DocIdSetIterator.NO_MORE_DOCS;
           assert docsAndPositionsEnum.freq() == freq;
@@ -237,28 +231,10 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
             final int pos = docsAndPositionsEnum.nextPosition();
             final int startOffset = docsAndPositionsEnum.startOffset();
             final int endOffset = docsAndPositionsEnum.endOffset();
-            if (!startedField) {
-              assert numTerms > 0;
-              hasPositions = pos != -1;
-              hasOffsets = startOffset != -1;
-              startField(fieldInfo, numTerms, hasPositions, hasOffsets);
-              startTerm(termsEnum.term(), freq);
-              startedField = true;
-            }
-            if (hasOffsets) {
-              assert startOffset != -1;
-              assert endOffset != -1;
-            }
+
             assert !hasPositions || pos >= 0;
             addPosition(pos, startOffset, endOffset);
           }
-        } else {
-          if (!startedField) {
-            assert numTerms > 0;
-            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
-            startTerm(termsEnum.term(), freq);
-            startedField = true;
-          }
         }
       }
       assert termCount == numTerms;

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
@@ -314,6 +314,16 @@ public long getSumDocFreq() throws IOException {
       public int getDocCount() throws IOException {
         return delegateTerms.getDocCount();
       }
+
+      @Override
+      public boolean hasOffsets() {
+        return delegateTerms.hasOffsets();
+      }
+
+      @Override
+      public boolean hasPositions() {
+        return delegateTerms.hasPositions();
+      }
     }
 
     class BloomFilteredTermsEnum extends TermsEnum {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xFields.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xFields.java
@@ -246,6 +246,18 @@ public long getSumDocFreq() throws IOException {
     public int getDocCount() throws IOException {
       return -1;
     }
+
+    @Override
+    public boolean hasOffsets() {
+      // preflex doesn't support this
+      assert fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0;
+      return false;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
   }
 
   private class PreTermsEnum extends TermsEnum {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java
@@ -281,11 +281,16 @@ public int size() {
   private class TVTerms extends Terms {
     private final int numTerms;
     private final long tvfFPStart;
+    private final boolean storePositions;
+    private final boolean storeOffsets;
     private final boolean unicodeSortOrder;
 
     public TVTerms(long tvfFP) throws IOException {
       tvf.seek(tvfFP);
       numTerms = tvf.readVInt();
+      final byte bits = tvf.readByte();
+      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       tvfFPStart = tvf.getFilePointer();
       unicodeSortOrder = sortTermsByUnicode();
     }
@@ -301,7 +306,7 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
       } else {
         termsEnum = new TVTermsEnum();
       }
-      termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder);
+      termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets, unicodeSortOrder);
       return termsEnum;
     }
 
@@ -334,6 +339,16 @@ public Comparator<BytesRef> getComparator() {
         return BytesRef.getUTF8SortedAsUTF16Comparator();
       }
     }
+
+    @Override
+    public boolean hasOffsets() {
+      return storeOffsets;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return storePositions;
+    }
   }
 
   static class TermAndPostings {
@@ -365,13 +380,12 @@ public boolean canReuse(IndexInput tvf) {
       return tvf == origTVF;
     }
 
-    public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException {
+    public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets, boolean unicodeSortOrder) throws IOException {
       this.numTerms = numTerms;
+      this.storePositions = storePositions;
+      this.storeOffsets = storeOffsets;
       currentTerm = -1;
       tvf.seek(tvfFPStart);
-      final byte bits = tvf.readByte();
-      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
-      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       this.unicodeSortOrder = unicodeSortOrder;
       readVectors();
       if (unicodeSortOrder) {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java
@@ -296,10 +296,16 @@ public int size() {
   private class TVTerms extends Terms {
     private final int numTerms;
     private final long tvfFPStart;
+    private final boolean storePositions;
+    private final boolean storeOffsets;
+
 
     public TVTerms(long tvfFP) throws IOException {
       tvf.seek(tvfFP);
       numTerms = tvf.readVInt();
+      final byte bits = tvf.readByte();
+      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       tvfFPStart = tvf.getFilePointer();
     }
 
@@ -314,7 +320,7 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
       } else {
         termsEnum = new TVTermsEnum();
       }
-      termsEnum.reset(numTerms, tvfFPStart);
+      termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets);
       return termsEnum;
     }
 
@@ -345,6 +351,16 @@ public Comparator<BytesRef> getComparator() {
       // this...?  I guess codec could buffer and re-sort...
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
+
+    @Override
+    public boolean hasOffsets() {
+      return storeOffsets;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return storePositions;
+    }
   }
 
   private class TVTermsEnum extends TermsEnum {
@@ -373,13 +389,12 @@ public boolean canReuse(IndexInput tvf) {
       return tvf == origTVF;
     }
 
-    public void reset(int numTerms, long tvfFPStart) throws IOException {
+    public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets) throws IOException {
       this.numTerms = numTerms;
+      this.storePositions = storePositions;
+      this.storeOffsets = storeOffsets;
       nextTerm = 0;
       tvf.seek(tvfFPStart);
-      final byte bits = tvf.readByte();
-      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
-      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       tvfFP = 1+tvfFPStart;
       positions = null;
       startOffsets = null;

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
@@ -635,6 +635,16 @@ public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
+    @Override
+    public boolean hasOffsets() {
+      return hasOffsets;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return hasPos;
+    }
+
     private final class DirectTermsEnum extends TermsEnum {
 
       private final BytesRef scratch = new BytesRef();

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
@@ -834,6 +834,16 @@ public TermsEnum iterator(TermsEnum reuse) {
     public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
+
+    @Override
+    public boolean hasOffsets() {
+      return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
   }
 
   @Override

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java
@@ -609,6 +609,16 @@ public long getSumDocFreq() throws IOException {
     public int getDocCount() throws IOException {
       return docCount;
     }
+
+    @Override
+    public boolean hasOffsets() {
+      return indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
   }
 
   @Override

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java
@@ -130,7 +130,7 @@ public Fields get(int doc) throws IOException {
       assert StringHelper.startsWith(scratch, FIELDTERMCOUNT);
       int termCount = parseIntAt(FIELDTERMCOUNT.length);
 
-      SimpleTVTerms terms = new SimpleTVTerms();
+      SimpleTVTerms terms = new SimpleTVTerms(offsets, positions);
       fields.put(fieldName, terms);
 
       for (int j = 0; j < termCount; j++) {
@@ -257,8 +257,12 @@ public int size() throws IOException {
 
   private static class SimpleTVTerms extends Terms {
     final SortedMap<BytesRef,SimpleTVPostings> terms;
+    final boolean hasOffsets;
+    final boolean hasPositions;
 
-    SimpleTVTerms() {
+    SimpleTVTerms(boolean hasOffsets, boolean hasPositions) {
+      this.hasOffsets = hasOffsets;
+      this.hasPositions = hasPositions;
       terms = new TreeMap<BytesRef,SimpleTVPostings>();
     }
 
@@ -292,6 +296,16 @@ public long getSumDocFreq() throws IOException {
     public int getDocCount() throws IOException {
       return 1;
     }
+
+    @Override
+    public boolean hasOffsets() {
+      return hasOffsets;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return hasPositions;
+    }
   }
 
   private static class SimpleTVPostings {