Skip to content

Commit

Permalink
Merge branch 'paul/ftsiterator'
Browse files Browse the repository at this point in the history
Conflicts:
	src/mongo/db/fts/fts_spec.cpp
	src/mongo/db/fts/fts_spec.h
	src/mongo/db/fts/fts_spec_test.cpp
  • Loading branch information
jrassi committed Jan 29, 2014
2 parents a195fdd + 8afbec3 commit fc2cbaf
Show file tree
Hide file tree
Showing 10 changed files with 761 additions and 317 deletions.
8 changes: 6 additions & 2 deletions src/mongo/db/fts/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Import("env")

stop_word_lanages = [
stop_word_languages = [
'danish',
'dutch',
'english',
Expand All @@ -21,7 +21,7 @@ stop_word_lanages = [
]

env.Command( [ "stop_words_list.h", "stop_words_list.cpp"],
[ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_lanages ],
[ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_languages ],
"$PYTHON $SOURCES $TARGETS" )

env.Library('base', [
Expand All @@ -32,6 +32,7 @@ env.Library('base', [
'fts_spec_legacy.cpp',
'fts_language.cpp',
'fts_util.cpp',
'fts_iterator.cpp',
'stemmer.cpp',
'stop_words.cpp',
'stop_words_list.cpp',
Expand Down Expand Up @@ -81,5 +82,8 @@ env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
LIBDEPS=["base"] )

env.CppUnitTest( "fts_iterator_test", "fts_iterator_test.cpp",
LIBDEPS=["base"] )

env.CppUnitTest( "fts_util_test", "fts_util_test.cpp",
LIBDEPS=["base","$BUILD_DIR/mongo/mongohasher"] )
16 changes: 8 additions & 8 deletions src/mongo/db/fts/fts_index_format.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ namespace mongo {


TermFrequencyMap term_freqs;
spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &term_freqs );
spec.scoreDocument( obj, &term_freqs );

// create index keys from raw scores
// only 1 per string
Expand All @@ -92,9 +92,7 @@ namespace mongo {
long long keyBSONSize = 0;
const int MaxKeyBSONSizeMB = 4;

for ( TermFrequencyMap::const_iterator i = term_freqs.begin();
i != term_freqs.end();
++i ) {
for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); i != term_freqs.end(); ++i ) {

const string& term = i->first;
double weight = i->second;
Expand All @@ -108,17 +106,18 @@ namespace mongo {
extraSize;

BSONObjBuilder b(guess); // builds a BSON object with guess length.
for ( unsigned k = 0; k < extrasBefore.size(); k++ )
for ( unsigned k = 0; k < extrasBefore.size(); k++ ) {
b.appendAs( extrasBefore[k], "" );
}
_appendIndexKey( b, weight, term );
for ( unsigned k = 0; k < extrasAfter.size(); k++ )
for ( unsigned k = 0; k < extrasAfter.size(); k++ ) {
b.appendAs( extrasAfter[k], "" );
}
BSONObj res = b.obj();

verify( guess >= res.objsize() );

keys->insert( res );

keyBSONSize += res.objsize();

uassert( 16733,
Expand All @@ -136,8 +135,9 @@ namespace mongo {
BSONObjBuilder b;

BSONObjIterator i( indexPrefix );
while ( i.more() )
while ( i.more() ) {
b.appendAs( i.next(), "" );
}

_appendIndexKey( b, weight, term );
return b.obj();
Expand Down
186 changes: 186 additions & 0 deletions src/mongo/db/fts/fts_iterator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// fts_iterator.cpp
/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/db/fts/fts_iterator.h"
#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"

#include <stack>

namespace mongo {

namespace fts {

extern const double DEFAULT_WEIGHT;
extern const double MAX_WEIGHT;

std::ostream& operator<<( std::ostream& os, FTSElementIterator::FTSIteratorFrame& frame ) {
BSONObjIterator it = frame._it;
return os << "FTSIteratorFrame["
" element=" << (*it).toString() <<
", _language=" << frame._language->str() <<
", _parentPath=" << frame._parentPath <<
", _isArray=" << frame._isArray << "]";
}

/**
 * Builds an iterator over 'obj' driven by 'spec'.
 * The root frame starts at the top level of 'obj' with the spec's default language,
 * an empty parent path and the "is array" flag cleared. The first value is fetched
 * eagerly so that more() can answer without advancing.
 */
FTSElementIterator::FTSElementIterator( const FTSSpec& spec, const BSONObj& obj )
    : _frame( obj, spec, &spec.defaultLanguage(), "", false ),
      _spec( spec )
{
    // advance() reads _frame, _frameStack and _spec. Calling it from the constructor
    // body (rather than from the mem-initializer list) guarantees every member has
    // been constructed first, whatever order the members are declared in.
    _currentValue = advance();
}

namespace {
    /**
     * Returns true when the weight key 'weight' names the field 'dottedName' itself,
     * or names something nested beneath it (i.e. 'weight' begins with dottedName + '.').
     */
    inline bool _matchPrefix( const string& dottedName, const string& weight ) {
        return weight == dottedName
            || mongoutils::str::startsWith( weight, dottedName + '.' );
    }
}

/**
 * Reports whether next() has a value to hand out.
 * Does not advance: it only inspects the value already cached in _currentValue.
 */
bool FTSElementIterator::more() {
    const bool haveValue = _currentValue.valid();
    return haveValue;
}

/**
 * Returns the cached current value and pre-fetches the following one via advance().
 * Once the iteration is exhausted the returned value has valid() == false.
 */
FTSIteratorValue FTSElementIterator::next() {
    FTSIteratorValue handedOut( _currentValue );
    _currentValue = advance();
    return handedOut;
}

/**
 * Helper method: makes sure _frame refers to an iterator that still has elements.
 *
 * While the current frame is exhausted, the most recently suspended frame is popped
 * from the stack and resumed. Returns true as soon as the current frame has more
 * elements, or false once the current frame is exhausted and the stack is empty.
 */
bool FTSElementIterator::moreFrames() {
    while ( !_frame._it.more() ) {
        if ( _frameStack.empty() ) {
            // Nothing left to resume: the whole traversal is finished.
            return false;
        }
        _frame = _frameStack.top();
        _frameStack.pop();
    }
    return true;
}

/**
 * Helper method: moves the traversal forward to the next string that should be indexed.
 *
 * Elements are consumed from the current frame (resuming suspended frames through
 * moreFrames() when one runs out). Sub-documents and arrays are entered by pushing the
 * current frame on _frameStack and replacing _frame; strings are returned together with
 * the frame's language and the weight that applies to them.
 *
 * @return the next string value, or a default-constructed FTSIteratorValue
 *         (valid() == false) when the document is exhausted.
 */
FTSIteratorValue FTSElementIterator::advance() {
    while ( moreFrames() ) {

        BSONElement elem = _frame._it.next();
        string fieldName = elem.fieldName();

        // Skip "language" specifier fields if wildcard.
        if ( _spec.wildcard() && _spec.languageOverrideField() == fieldName ) {
            continue;
        }

        // Compose the dotted name of the current field:
        //   1. parent path empty (top level): use the current field name
        //   2. parent path non-empty and obj is an array: use the parent path
        //   3. parent path non-empty and obj is a sub-doc: append field name to parent path
        string dottedName = ( _frame._parentPath.empty() ? fieldName
                            : _frame._isArray ? _frame._parentPath
                            : _frame._parentPath + '.' + fieldName );

        // Find lower bound of dottedName in the weights. lower_bound leaves us at the first
        // weight that could possibly match or be a prefix of dottedName. And if this
        // element fails to match, then no subsequent weight can match, since the weights
        // are lexicographically ordered.
        // NOTE(review): for Array elements the seek key has no trailing '.', so a weight
        // key that sorts between "x" and "x." (e.g. "x-y") is found first and the array
        // "x" is skipped in non-wildcard mode even if "x.z" is weighted -- confirm whether
        // such key combinations need to be supported.
        Weights::const_iterator i = _spec.weights().lower_bound( elem.type() == Object
                                                                 ? dottedName + '.'
                                                                 : dottedName );

        // possibleWeightMatch is set if the weight map contains an entry that is not
        // lexicographically less than the seek key. This boolean acts as a guard on
        // dereferences of iterator 'i'.
        bool possibleWeightMatch = ( i != _spec.weights().end() );

        // Optimize away two cases, when not wildcard:
        //   1. lower_bound seeks to end(): no prefix match possible
        //   2. lower_bound seeks to a name which is not a prefix
        if ( !_spec.wildcard() ) {
            if ( !possibleWeightMatch ) {
                continue;
            }
            else if ( !_matchPrefix( dottedName, i->first ) ) {
                continue;
            }
        }

        // Is the current field an exact match on a weight?
        bool exactMatch = ( possibleWeightMatch && i->first == dottedName );

        // Only an exact match carries its configured weight. Without one (reachable for
        // strings only in wildcard mode) 'i' points at some unrelated, lexicographically
        // later key whose weight must not leak onto this field, so fall back to the default.
        double weight = ( exactMatch ? i->second : DEFAULT_WEIGHT );

        switch ( elem.type() ) {
        case String:
            // Only index strings on exact match or wildcard.
            if ( exactMatch || _spec.wildcard() ) {
                return FTSIteratorValue( elem.valuestr(), _frame._language, weight );
            }
            break;

        case Object:
            // Only descend into a sub-document on proper prefix or wildcard. Note that
            // !exactMatch is a sufficient test for proper prefix match, because of the
            //      if ( !_matchPrefix( dottedName, i->first ) ) continue;
            // check above.
            if ( !exactMatch || _spec.wildcard() ) {
                _frameStack.push( _frame );
                _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, false );
            }
            break;

        case Array:
            // Only descend into arrays from non-array parents or on wildcard.
            if ( !_frame._isArray || _spec.wildcard() ) {
                _frameStack.push( _frame );
                _frame = FTSIteratorFrame( elem.Obj(), _spec, _frame._language, dottedName, true );
            }
            break;

        default:
            // Skip over all other BSON types.
            break;
        }
    }
    return FTSIteratorValue(); // valid()==false
}

} // namespace fts
} // namespace mongo
Loading

0 comments on commit fc2cbaf

Please sign in to comment.