Skip to content

Commit

Permalink
LUCENE-4299: add Terms hasPositions/hasOffsets, so you know what feat…
Browse files Browse the repository at this point in the history
…ures a doc's TVs have

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/branch_4x@1371720 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
rmuir committed Aug 10, 2012
1 parent be14bb1 commit 3011631
Show file tree
Hide file tree
Showing 22 changed files with 238 additions and 93 deletions.
7 changes: 7 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ http://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_6_0/lucene/contrib

======================= Lucene 4.0.0 =======================

API Changes

* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
Previously you had no real way to know whether a term vector field
had positions or offsets, since this can be configured on a
per-field-per-document basis. (Robert Muir)

Bug Fixes

* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,16 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
return new SegmentTermsEnum();
}

/**
 * Tells whether the field's index options reach
 * {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *
 * @return {@code true} if the configured options compare at or above that level
 */
@Override
public boolean hasOffsets() {
  final IndexOptions configured = fieldInfo.getIndexOptions();
  final int cmp = configured.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  return cmp >= 0;
}

/**
 * Tells whether the field's index options reach
 * {@code DOCS_AND_FREQS_AND_POSITIONS}.
 *
 * @return {@code true} if the configured options compare at or above that level
 */
@Override
public boolean hasPositions() {
  final IndexOptions configured = fieldInfo.getIndexOptions();
  final int cmp = configured.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  return cmp >= 0;
}

@Override
public long size() {
return numTerms;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,16 @@ public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}

/**
 * Checks the field's index options against the offsets level.
 *
 * @return {@code true} when the options are not below
 *         {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
 */
@Override
public boolean hasOffsets() {
  final IndexOptions opts = fieldInfo.getIndexOptions();
  final int order = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  return order >= 0;
}

/**
 * Checks the field's index options against the positions level.
 *
 * @return {@code true} when the options are not below
 *         {@code DOCS_AND_FREQS_AND_POSITIONS}
 */
@Override
public boolean hasPositions() {
  final IndexOptions opts = fieldInfo.getIndexOptions();
  final int order = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  return order >= 0;
}

@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return new SegmentTermsEnum();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
final FieldsEnum fieldsEnum = vectors.iterator();
String fieldName;
String lastFieldName = null;

TermsEnum termsEnum = null;
DocsAndPositionsEnum docsAndPositionsEnum = null;

while((fieldName = fieldsEnum.next()) != null) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
Expand All @@ -196,39 +199,30 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
// FieldsEnum shouldn't lie...
continue;
}

final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();

final int numTerms = (int) terms.size();
if (numTerms == -1) {
throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
}
final TermsEnum termsEnum = terms.iterator(null);

DocsAndPositionsEnum docsAndPositionsEnum = null;

boolean startedField = false;

// NOTE: this is tricky, because TermVectors allow
// indexing offsets but NOT positions. So we must
// lazily init the field by checking whether first
// position we see is -1 or not.

startField(fieldInfo, numTerms, hasPositions, hasOffsets);
termsEnum = terms.iterator(termsEnum);

int termCount = 0;
while(termsEnum.next() != null) {
termCount++;

final int freq = (int) termsEnum.totalTermFreq();

startTerm(termsEnum.term(), freq);

if (startedField) {
startTerm(termsEnum.term(), freq);
}

// TODO: we need a "query" API where we can ask (via
// flex API) what this term was indexed with...
// Both positions & offsets:
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
boolean hasOffsets = false;
boolean hasPositions = false;

if (docsAndPositionsEnum != null) {
if (hasPositions || hasOffsets) {
docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
assert docsAndPositionsEnum != null;

final int docID = docsAndPositionsEnum.nextDoc();
assert docID != DocIdSetIterator.NO_MORE_DOCS;
assert docsAndPositionsEnum.freq() == freq;
Expand All @@ -237,28 +231,10 @@ protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) thr
final int pos = docsAndPositionsEnum.nextPosition();
final int startOffset = docsAndPositionsEnum.startOffset();
final int endOffset = docsAndPositionsEnum.endOffset();
if (!startedField) {
assert numTerms > 0;
hasPositions = pos != -1;
hasOffsets = startOffset != -1;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
if (hasOffsets) {
assert startOffset != -1;
assert endOffset != -1;
}

assert !hasPositions || pos >= 0;
addPosition(pos, startOffset, endOffset);
}
} else {
if (!startedField) {
assert numTerms > 0;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
}
}
assert termCount == numTerms;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,16 @@ public long getSumDocFreq() throws IOException {
// Returns the document count reported by delegateTerms unchanged.
public int getDocCount() throws IOException {
return delegateTerms.getDocCount();
}

/**
 * Returns whatever {@code delegateTerms} reports for offsets; no logic is added here.
 */
@Override
public boolean hasOffsets() {
return delegateTerms.hasOffsets();
}

/**
 * Returns whatever {@code delegateTerms} reports for positions; no logic is added here.
 */
@Override
public boolean hasPositions() {
return delegateTerms.hasPositions();
}
}

class BloomFilteredTermsEnum extends TermsEnum {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,18 @@ public long getSumDocFreq() throws IOException {
// Always returns -1.
// NOTE(review): presumably -1 marks this statistic as unavailable for this format — confirm against the Terms contract.
public int getDocCount() throws IOException {
return -1;
}

/**
 * Always returns {@code false}.
 *
 * <p>When assertions are enabled, also verifies that the field's index options are
 * strictly below {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 */
@Override
public boolean hasOffsets() {
// preflex doesn't support this
// Sanity check only: evaluated solely when running with -ea.
assert fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0;
return false;
}

/**
 * Compares the field's index options with {@code DOCS_AND_FREQS_AND_POSITIONS}.
 *
 * @return {@code true} if the options are at or above that level
 */
@Override
public boolean hasPositions() {
  final IndexOptions current = fieldInfo.getIndexOptions();
  final int rank = current.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  return rank >= 0;
}
}

private class PreTermsEnum extends TermsEnum {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,16 @@ public int size() {
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
private final boolean storePositions;
private final boolean storeOffsets;
private final boolean unicodeSortOrder;

/**
 * Reads this field's header from {@code tvf}, starting at {@code tvfFP}.
 *
 * @param tvfFP file pointer in {@code tvf} where this field's data begins
 * @throws IOException if seeking or reading {@code tvf} fails
 */
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
// One flag byte follows the term count; its bits record whether positions and/or offsets were stored.
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
// Captured after the flag byte has been consumed, so it points past the header.
tvfFPStart = tvf.getFilePointer();
unicodeSortOrder = sortTermsByUnicode();
}
Expand All @@ -301,7 +306,7 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder);
termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets, unicodeSortOrder);
return termsEnum;
}

Expand Down Expand Up @@ -334,6 +339,16 @@ public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}

/**
 * @return the {@code storeOffsets} flag decoded when this instance was constructed
 */
@Override
public boolean hasOffsets() {
  return this.storeOffsets;
}

/**
 * @return the {@code storePositions} flag decoded when this instance was constructed
 */
@Override
public boolean hasPositions() {
  return this.storePositions;
}
}

static class TermAndPostings {
Expand Down Expand Up @@ -365,13 +380,12 @@ public boolean canReuse(IndexInput tvf) {
return tvf == origTVF;
}

public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException {
public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets, boolean unicodeSortOrder) throws IOException {
this.numTerms = numTerms;
this.storePositions = storePositions;
this.storeOffsets = storeOffsets;
currentTerm = -1;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
this.unicodeSortOrder = unicodeSortOrder;
readVectors();
if (unicodeSortOrder) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,16 @@ public int size() {
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
private final boolean storePositions;
private final boolean storeOffsets;


/**
 * Reads this field's header from {@code tvf}, starting at {@code tvfFP}.
 *
 * @param tvfFP file pointer in {@code tvf} where this field's data begins
 * @throws IOException if seeking or reading {@code tvf} fails
 */
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
// One flag byte follows the term count; its bits record whether positions and/or offsets were stored.
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
// Captured after the flag byte has been consumed, so it points past the header.
tvfFPStart = tvf.getFilePointer();
}

Expand All @@ -314,7 +320,7 @@ public TermsEnum iterator(TermsEnum reuse) throws IOException {
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart);
termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets);
return termsEnum;
}

Expand Down Expand Up @@ -345,6 +351,16 @@ public Comparator<BytesRef> getComparator() {
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}

/**
 * @return the {@code storeOffsets} flag decoded when this instance was constructed
 */
@Override
public boolean hasOffsets() {
  return this.storeOffsets;
}

/**
 * @return the {@code storePositions} flag decoded when this instance was constructed
 */
@Override
public boolean hasPositions() {
  return this.storePositions;
}
}

private class TVTermsEnum extends TermsEnum {
Expand Down Expand Up @@ -373,13 +389,12 @@ public boolean canReuse(IndexInput tvf) {
return tvf == origTVF;
}

public void reset(int numTerms, long tvfFPStart) throws IOException {
public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets) throws IOException {
this.numTerms = numTerms;
this.storePositions = storePositions;
this.storeOffsets = storeOffsets;
nextTerm = 0;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
tvfFP = 1+tvfFPStart;
positions = null;
startOffsets = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,16 @@ public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}

/**
 * Returns the value of {@code hasOffsets}.
 */
@Override
public boolean hasOffsets() {
return hasOffsets;
}

/**
 * Returns the value of {@code hasPos}.
 */
@Override
public boolean hasPositions() {
return hasPos;
}

private final class DirectTermsEnum extends TermsEnum {

private final BytesRef scratch = new BytesRef();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,16 @@ public TermsEnum iterator(TermsEnum reuse) {
// Returns the comparator obtained from BytesRef.getUTF8SortedAsUnicodeComparator().
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}

/**
 * Compares {@code field}'s index options with
 * {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *
 * @return {@code true} if the options are at or above that level
 */
@Override
public boolean hasOffsets() {
  final IndexOptions fieldOptions = field.getIndexOptions();
  final int cmp = fieldOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  return cmp >= 0;
}

/**
 * Compares {@code field}'s index options with {@code DOCS_AND_FREQS_AND_POSITIONS}.
 *
 * @return {@code true} if the options are at or above that level
 */
@Override
public boolean hasPositions() {
  final IndexOptions fieldOptions = field.getIndexOptions();
  final int cmp = fieldOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  return cmp >= 0;
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,16 @@ public long getSumDocFreq() throws IOException {
// Returns the value held in docCount.
public int getDocCount() throws IOException {
return docCount;
}

/**
 * Compares {@code indexOptions} with {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *
 * @return {@code true} if {@code indexOptions} is at or above that level
 */
@Override
public boolean hasOffsets() {
  final int relative = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  return relative >= 0;
}

/**
 * Compares {@code indexOptions} with {@code DOCS_AND_FREQS_AND_POSITIONS}.
 *
 * @return {@code true} if {@code indexOptions} is at or above that level
 */
@Override
public boolean hasPositions() {
  final int relative = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  return relative >= 0;
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public Fields get(int doc) throws IOException {
assert StringHelper.startsWith(scratch, FIELDTERMCOUNT);
int termCount = parseIntAt(FIELDTERMCOUNT.length);

SimpleTVTerms terms = new SimpleTVTerms();
SimpleTVTerms terms = new SimpleTVTerms(offsets, positions);
fields.put(fieldName, terms);

for (int j = 0; j < termCount; j++) {
Expand Down Expand Up @@ -257,8 +257,12 @@ public int size() throws IOException {

private static class SimpleTVTerms extends Terms {
final SortedMap<BytesRef,SimpleTVPostings> terms;
final boolean hasOffsets;
final boolean hasPositions;

SimpleTVTerms() {
SimpleTVTerms(boolean hasOffsets, boolean hasPositions) {
this.hasOffsets = hasOffsets;
this.hasPositions = hasPositions;
terms = new TreeMap<BytesRef,SimpleTVPostings>();
}

Expand Down Expand Up @@ -292,6 +296,16 @@ public long getSumDocFreq() throws IOException {
// Always returns 1.
// NOTE(review): presumably because these terms are built per document (see get(int doc)) — confirm.
public int getDocCount() throws IOException {
return 1;
}

/**
 * @return the {@code hasOffsets} flag supplied to the constructor
 */
@Override
public boolean hasOffsets() {
  return this.hasOffsets;
}

/**
 * @return the {@code hasPositions} flag supplied to the constructor
 */
@Override
public boolean hasPositions() {
  return this.hasPositions;
}
}

private static class SimpleTVPostings {
Expand Down
Loading

0 comments on commit 3011631

Please sign in to comment.