From 692e89f79055f6e68ed90652521bfe83de8e6fa3 Mon Sep 17 00:00:00 2001 From: diml Date: Fri, 2 May 2008 11:58:37 +0000 Subject: [PATCH] Commiting all changes reported in MDL-14646 --- search/Zend/Search/Exception.php | 6 +- search/Zend/Search/Lucene.php | 501 +++++++++++++++--- .../Zend/Search/Lucene/Analysis/Analyzer.php | 33 +- .../Lucene/Analysis/Analyzer/Common/Text.php | 12 +- .../Lucene/Analysis/Analyzer/Common/Utf8.php | 126 ++--- .../Analysis/TokenFilter/LowerCaseUtf8.php | 70 +++ search/Zend/Search/Lucene/Document.php | 20 +- search/Zend/Search/Lucene/Exception.php | 6 +- .../Zend/Search/Lucene/Search/Query/Term.php | 10 +- .../Zend/Search/Lucene/Search/QueryParser.php | 259 ++++++--- 10 files changed, 766 insertions(+), 277 deletions(-) create mode 100644 search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php diff --git a/search/Zend/Search/Exception.php b/search/Zend/Search/Exception.php index 291cc43ed5032..63bdf565b2533 100644 --- a/search/Zend/Search/Exception.php +++ b/search/Zend/Search/Exception.php @@ -14,7 +14,7 @@ * * @category Zend * @package Zend_Search - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -22,13 +22,13 @@ /** * Framework base exception */ -require_once $CFG->dirroot.'/search/Zend/Exception.php'; +require_once "Zend/Exception.php"; /** * @category Zend * @package Zend_Search - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Exception extends Zend_Exception diff --git a/search/Zend/Search/Lucene.php b/search/Zend/Search/Lucene.php index 1f15c9a0ba122..0cd4d8a12c141 100644 --- a/search/Zend/Search/Lucene.php +++ b/search/Zend/Search/Lucene.php @@ -14,65 +14,68 @@ * * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ - /** Zend_Search_Lucene_Exception */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; +require_once "Zend/Search/Lucene/Exception.php"; /** Zend_Search_Lucene_Document */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php'; +require_once "Zend/Search/Lucene/Document.php"; /** Zend_Search_Lucene_Document_Html */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document/Html.php'; +require_once "Zend/Search/Lucene/Document/Html.php"; -/** Zend_Search_Lucene_Storage_Directory */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/Directory/Filesystem.php'; +/** Zend_Search_Lucene_Storage_Directory_Filesystem */ +require_once "Zend/Search/Lucene/Storage/Directory/Filesystem.php"; /** Zend_Search_Lucene_Storage_File_Memory */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Storage/File/Memory.php'; +require_once "Zend/Search/Lucene/Storage/File/Memory.php"; /** Zend_Search_Lucene_Index_Term */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; +require_once "Zend/Search/Lucene/Index/Term.php"; /** Zend_Search_Lucene_Index_TermInfo */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/TermInfo.php'; +require_once "Zend/Search/Lucene/Index/TermInfo.php"; /** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 
$CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php'; +require_once "Zend/Search/Lucene/Index/SegmentInfo.php"; /** Zend_Search_Lucene_Index_FieldInfo */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/FieldInfo.php'; +require_once "Zend/Search/Lucene/Index/FieldInfo.php"; /** Zend_Search_Lucene_Index_Writer */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Writer.php'; +require_once "Zend/Search/Lucene/Index/Writer.php"; /** Zend_Search_Lucene_Search_QueryParser */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParser.php'; +require_once "Zend/Search/Lucene/Search/QueryParser.php"; /** Zend_Search_Lucene_Search_QueryHit */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryHit.php'; +require_once "Zend/Search/Lucene/Search/QueryHit.php"; /** Zend_Search_Lucene_Search_Similarity */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity.php'; +require_once "Zend/Search/Lucene/Search/Similarity.php"; /** Zend_Search_Lucene_Index_SegmentInfoPriorityQueue */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php'; +require_once "Zend/Search/Lucene/Index/SegmentInfoPriorityQueue.php"; + +/** Zend_Search_Lucene_LockManager */ +require_once "Zend/Search/Lucene/LockManager.php"; + /** Zend_Search_Lucene_Interface */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Interface.php'; +require_once "Zend/Search/Lucene/Interface.php"; /** Zend_Search_Lucene_Proxy */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Proxy.php'; +require_once "Zend/Search/Lucene/Proxy.php"; /** * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene implements Zend_Search_Lucene_Interface @@ -86,6 +89,15 @@ class Zend_Search_Lucene implements Zend_Search_Lucene_Interface */ private static $_defaultSearchField = null; + /** + * Result set limit + * + * 0 means no limit + * + * @var integer + */ + private static $_resultSetLimit = 0; + /** * File system adapter. * @@ -129,13 +141,6 @@ class Zend_Search_Lucene implements Zend_Search_Lucene_Interface private $_hasChanges = false; - /** - * Index lock object - * - * @var Zend_Search_Lucene_Storage_File - */ - private $_lock; - /** * Signal, that index is already closed, changes are fixed and resources are cleaned up * @@ -150,7 +155,14 @@ class Zend_Search_Lucene implements Zend_Search_Lucene_Interface */ private $_refCount = 0; + /** + * Current segment generation + * + * @var integer + */ + private $_generation; + /** * Create index * @@ -173,56 +185,101 @@ public static function open($directory) return new Zend_Search_Lucene_Proxy(new Zend_Search_Lucene($directory, false)); } + /** Generation retrieving counter */ + const GENERATION_RETRIEVE_COUNT = 10; + + /** Pause between generation retrieving attempts in milliseconds */ + const GENERATION_RETRIEVE_PAUSE = 50; + /** - * Opens the index. + * Get current generation number * - * IndexReader constructor needs Directory as a parameter. It should be - * a string with a path to the index folder or a Directory object. + * Returns generation number + * 0 means pre-2.1 index format + * -1 means there are no segments files. 
* - * @param mixed $directory + * @param Zend_Search_Lucene_Storage_Directory $directory + * @return integer * @throws Zend_Search_Lucene_Exception */ - public function __construct($directory = null, $create = false) + public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory) { - if ($directory === null) { - throw new Zend_Search_Exception('No index directory specified'); - } - - if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) { - $this->_directory = $directory; - $this->_closeDirOnExit = false; - } else { - $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); - $this->_closeDirOnExit = true; - } + /** + * Zend_Search_Lucene uses segments.gen file to retrieve current generation number + * + * Apache Lucene index format documentation mentions this method only as a fallback method + * + * Nevertheless we use it according to the performance considerations + * + * @todo check if we can use some modification of Apache Lucene generation determination algorithm + * without performance problems + */ + + try { + for ($count = 0; $count < self::GENERATION_RETRIEVE_COUNT; $count++) { + // Try to get generation file + $genFile = $directory->getFileObject('segments.gen', false); + + $format = $genFile->readInt(); + if ($format != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong segments.gen file format'); + } + $gen1 = $genFile->readLong(); + $gen2 = $genFile->readLong(); - // Get a shared lock to the index - $this->_lock = $this->_directory->createFile('index.lock'); + if ($gen1 == $gen2) { + return $gen1; + } - $this->_segmentInfos = array(); + usleep(self::GENERATION_RETRIEVE_PAUSE * 1000); + } - if ($create) { - // Throw an exception if index is under processing now - if (!$this->_lock->lock(LOCK_EX, true)) { - throw new Zend_Search_Lucene_Exception('Can\'t create index. 
It\'s under processing now'); + // All passes are failed + throw new Zend_Search_Lucene_Exception('Index is under processing now'); + } catch (Zend_Search_Lucene_Exception $e) { + if (strpos($e->getMessage(), 'is not readable') !== false) { + try { + // Try to open old style segments file + $segmentsFile = $directory->getFileObject('segments', false); + + // It's pre-2.1 index + return 0; + } catch (Zend_Search_Lucene_Exception $e) { + if (strpos($e->getMessage(), 'is not readable') !== false) { + return -1; + } else { + throw $e; + } + } + } else { + throw $e; } + } - // Writer will create segments file for empty segments list - $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, $this->_segmentInfos, true); + return -1; + } - if (!$this->_lock->lock(LOCK_SH)) { - throw new Zend_Search_Lucene_Exception('Can\'t reduce lock level from Exclusive to Shared'); - } - } else { - // Wait if index is under switching from one set of segments to another (Index_Writer::_updateSegments()) - if (!$this->_lock->lock(LOCK_SH)) { - throw new Zend_Search_Lucene_Exception('Can\'t obtain shared index lock'); - } - $this->_writer = null; + /** + * Get segments file name + * + * @param integer $generation + * @return string + */ + public static function getSegmentFileName($generation) + { + if ($generation == 0) { + return 'segments'; } + return 'segments_' . 
base_convert($generation, 10, 36); + } + /** + * Read segments file for pre-2.1 Lucene index format + */ + private function _readPre21SegmentsFile() + { $segmentsFile = $this->_directory->getFileObject('segments'); $format = $segmentsFile->readInt(); @@ -248,13 +305,156 @@ public function __construct($directory = null, $create = false) $segSize = $segmentsFile->readInt(); $this->_docCount += $segSize; - $this->_segmentInfos[] = - new Zend_Search_Lucene_Index_SegmentInfo($segName, + $this->_segmentInfos[$segName] = + new Zend_Search_Lucene_Index_SegmentInfo($this->_directory, + $segName, + $segSize); + } + } + + /** + * Read segments file + * + * @throws Zend_Search_Lucene_Exception + */ + private function _readSegmentsFile() + { + $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation)); + + $format = $segmentsFile->readInt(); + + if ($format != (int)0xFFFFFFFD) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + // $segmentsFile->readLong(); + $segmentsFile->readInt(); $segmentsFile->readInt(); + + // read segment name counter + $segmentsFile->readInt(); + + $segments = $segmentsFile->readInt(); + + $this->_docCount = 0; + + // read segmentInfos + for ($count = 0; $count < $segments; $count++) { + $segName = $segmentsFile->readString(); + $segSize = $segmentsFile->readInt(); + + // 2.1+ specific properties + //$delGen = $segmentsFile->readLong(); + $delGenHigh = $segmentsFile->readInt(); + $delGenLow = $segmentsFile->readInt(); + if ($delGenHigh == (int)0xFFFFFFFF && $delGenLow == (int)0xFFFFFFFF) { + $delGen = -1; // There are no deletes + } else { + $delGen = ($delGenHigh << 32) | $delGenLow; + } + + $hasSingleNormFile = $segmentsFile->readByte(); + $numField = $segmentsFile->readInt(); + + $normGens = array(); + if ($numField != (int)0xFFFFFFFF) { + for ($count1 = 0; $count1 < $numField; $count1++) { + $normGens[] = $segmentsFile->readLong(); + } + + throw new 
Zend_Search_Lucene_Exception('Separate norm files are not supported. Optimize index to use it with Zend_Search_Lucene.'); + } + + $isCompound = $segmentsFile->readByte(); + + + $this->_docCount += $segSize; + + $this->_segmentInfos[$segName] = + new Zend_Search_Lucene_Index_SegmentInfo($this->_directory, + $segName, $segSize, - $this->_directory); + $delGen, + $hasSingleNormFile, + $isCompound); } } + /** + * Opens the index. + * + * IndexReader constructor needs Directory as a parameter. It should be + * a string with a path to the index folder or a Directory object. + * + * @param mixed $directory + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($directory = null, $create = false) + { + if ($directory === null) { + throw new Zend_Search_Exception('No index directory specified'); + } + + if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) { + $this->_directory = $directory; + $this->_closeDirOnExit = false; + } else { + $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); + $this->_closeDirOnExit = true; + } + + $this->_segmentInfos = array(); + + // Mark index as "under processing" to prevent other processes from premature index cleaning + Zend_Search_Lucene_LockManager::obtainReadLock($this->_directory); + + // Escalate read lock to prevent current generation index files to be deleted while opening process is not done +// Zend_Search_Lucene_LockManager::escalateReadLock($this->_directory); + + + $this->_generation = self::getActualGeneration($this->_directory); + + if ($create) { + try { + Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory); + } catch (Zend_Search_Lucene_Exception $e) { + if (strpos($e->getMessage(), 'Can\'t obtain exclusive index lock') === false) { + throw $e; + } else { + throw new Zend_Search_Lucene_Exception('Can\'t create index. 
It\'s under processing now'); + } + } + + if ($this->_generation == -1) { + // Directory doesn't contain existing index, start from 1 + $this->_generation = 1; + $nameCounter = 0; + } else { + // Directory contains existing index + $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation)); + $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) + + $nameCounter = $segmentsFile->readInt(); + $this->_generation++; + } + + Zend_Search_Lucene_Index_Writer::createIndex($this->_directory, $this->_generation, $nameCounter); + + Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory); + } + + if ($this->_generation == -1) { + throw new Zend_Search_Lucene_Exception('Index doesn\'t exists in the specified directory.'); + } else if ($this->_generation == 0) { + $this->_readPre21SegmentsFile(); + } else { + $this->_readSegmentsFile(); + } + + // De-escalate read lock to prevent current generation index files to be deleted while opening process is not done +// Zend_Search_Lucene_LockManager::escalateReadLock($this->_directory); + } + /** * Close current index and free resources */ @@ -267,9 +467,9 @@ private function _close() $this->commit(); - // Free shared lock - $this->_lock->unlock(); - + // Release "under processing" flag + Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory); + if ($this->_closeDirOnExit) { $this->_directory->close(); } @@ -431,6 +631,30 @@ public static function getDefaultSearchField() return self::$_defaultSearchField; } + /** + * Set result set limit. + * + * 0 (default) means no limit + * + * @param integer $limit + */ + public static function setResultSetLimit($limit) + { + self::$_resultSetLimit = $limit; + } + + /** + * Get result set limit.
+ * + * 0 means no limit + * + * @return integer + */ + public static function getResultSetLimit() + { + return self::$_resultSetLimit; + } + /** * Retrieve index maxBufferedDocs option * @@ -585,11 +809,15 @@ public function find($query) $topScore = $docScore; } } + + if (self::$_resultSetLimit != 0 && count($hits) >= self::$_resultSetLimit) { + break; + } } if (count($hits) == 0) { // skip sorting, which may cause a error on empty index - return array(); + return array(); } if ($topScore > 1) { @@ -977,6 +1205,8 @@ public function addDocument(Zend_Search_Lucene_Document $document) { $this->getIndexWriter()->addDocument($document); $this->_docCount++; + + $this->_hasChanges = true; } @@ -1002,14 +1232,12 @@ public function commit() foreach ($this->_segmentInfos as $segInfo) { $segInfo->writeChanges(); } - - $this->_hasChanges = false; - } - - if ($this->_writer !== null) { - $this->_writer->commit(); - + + $this->getIndexWriter()->commit(); + $this->_updateDocCount(); + + $this->_hasChanges = false; } } @@ -1059,9 +1287,7 @@ public function terms() $result[] = $segmentInfo->currentTerm(); } - $segmentInfo->nextTerm(); - // check, if segment dictionary is finished - if ($segmentInfo->currentTerm() !== null) { + if ($segmentInfo->nextTerm() !== null) { // Put segment back into the priority queue $segmentInfoQueue->put($segmentInfo); } @@ -1071,6 +1297,125 @@ public function terms() } + /** + * Terms stream queue + * + * @var Zend_Search_Lucene_Index_SegmentInfoPriorityQueue + */ + private $_termsStreamQueue = null; + + /** + * Last Term in a terms stream + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_lastTerm = null; + + /** + * Reset terms stream. 
+ */ + public function resetTermsStream() + { + $this->_termsStreamQueue = new Zend_Search_Lucene_Index_SegmentInfoPriorityQueue(); + + foreach ($this->_segmentInfos as $segmentInfo) { + $segmentInfo->reset(); + + // Skip "empty" segments + if ($segmentInfo->currentTerm() !== null) { + $this->_termsStreamQueue->put($segmentInfo); + } + } + + $this->nextTerm(); + } + + /** + * Skip terms stream up to specified term prefix. + * + * Prefix contains fully specified field info and portion of searched term + * + * @param Zend_Search_Lucene_Index_Term $prefix + */ + public function skipTo(Zend_Search_Lucene_Index_Term $prefix) + { + $segments = array(); + + while (($segmentInfo = $this->_termsStreamQueue->pop()) !== null) { + $segments[] = $segmentInfo; + } + + foreach ($segments as $segmentInfo) { + $segmentInfo->skipTo($prefix); + + if ($segmentInfo->currentTerm() !== null) { + $this->_termsStreamQueue->put($segmentInfo); + } + } + + $this->nextTerm(); + } + + /** + * Scans terms dictionary and returns next term + * + * @return Zend_Search_Lucene_Index_Term|null + */ + public function nextTerm() + { + while (($segmentInfo = $this->_termsStreamQueue->pop()) !== null) { + if ($this->_termsStreamQueue->top() === null || + $this->_termsStreamQueue->top()->currentTerm()->key() != + $segmentInfo->currentTerm()->key()) { + // We got new term + $this->_lastTerm = $segmentInfo->currentTerm(); + + if ($segmentInfo->nextTerm() !== null) { + // Put segment back into the priority queue + $this->_termsStreamQueue->put($segmentInfo); + } + + return $this->_lastTerm; + } + + if ($segmentInfo->nextTerm() !== null) { + // Put segment back into the priority queue + $this->_termsStreamQueue->put($segmentInfo); + } + } + + // End of stream + $this->_lastTerm = null; + + return null; + } + + /** + * Returns term in current position + * + * @return Zend_Search_Lucene_Index_Term|null + */ + public function currentTerm() + { + return $this->_lastTerm; + } + + /** + * Close terms stream + * + *
Should be used for resources clean up if stream is not read up to the end + */ + public function closeTermsStream() + { + while (($segmentInfo = $this->_termsStreamQueue->pop()) !== null) { + $segmentInfo->closeTermsStream(); + } + + $this->_termsStreamQueue = null; + $this->_lastTerm = null; + } + + /************************************************************************* @todo UNIMPLEMENTED *************************************************************************/ diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer.php b/search/Zend/Search/Lucene/Analysis/Analyzer.php index 7a29c763d90e5..def78ac5906e7 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer.php @@ -15,37 +15,43 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Token */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Token.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Token.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php"; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php"; + +/** 
Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php"; /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php"; /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php'; +require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php'; /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php"; /** @@ -61,7 +67,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -97,9 +103,10 @@ abstract class Zend_Search_Lucene_Analysis_Analyzer * @param string $data * @return array */ - public function tokenize($data, $encoding = 'UTF-8') + public function tokenize($data, $encoding = '') { $this->setInput($data, $encoding); + $tokenList = array(); while (($nextToken = $this->nextToken()) !== null) { $tokenList[] = $nextToken; @@ -160,7 +167,7 @@ public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer public static function getDefault() { if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) { - self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8(); + self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(); } return self::$_defaultImpl; diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php index d084ebc4c5f91..46bf196e12b8e 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -15,20 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Analyzer_Common */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php'; +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -53,7 +53,9 @@ public function reset() } // convert input into ascii - $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); + //$this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); + $this->_input = mb_convert_encoding($this->_input, 'ASCII', 'auto'); + $this->_encoding = 'ASCII'; } @@ -75,7 +77,7 @@ public function nextToken() if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { // It covers both cases a) there are no matches (preg_match(...) === 0) // b) error occured (preg_match(...) === FALSE) - return null; + return null; } $str = $match[0][0]; diff --git a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php index 674a3d9e64387..768e7847c6f18 100644 --- a/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php +++ b/search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php @@ -15,20 +15,20 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Analysis_Analyzer_Common */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php'; +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php'; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -47,13 +47,20 @@ class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucen * @var integer */ private $_bytePosition; - + /** - * Stream length + * Object constructor * - * @var integer + * @throws Zend_Search_Lucene_Exception */ - private $_streamLength; + public function __construct() + { + if (@preg_match('/\pL/u', 'a') != 1) { + // PCRE unicode support is turned off + require_once 'Zend/Search/Lucene/Exception.php'; + throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.'); + } + } /** * Reset token stream @@ -66,56 +73,9 @@ public function reset() // convert input into UTF-8 if (strcasecmp($this->_encoding, 'utf8' ) != 0 && strcasecmp($this->_encoding, 'utf-8') != 0 ) { - $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input); + $this->_input = @iconv($this->_encoding, 'UTF-8', $this->_input); $this->_encoding = 'UTF-8'; } - - // Get UTF-8 string length. 
- // It also checks if it's a correct utf-8 string - $this->_streamLength = iconv_strlen($this->_input, 'UTF-8'); - } - - /** - * Check, that character is a letter - * - * @param string $char - * @return boolean - */ - private static function _isAlpha($char) - { - if (strlen($char) > 1) { - // It's an UTF-8 character - return true; - } - - return ctype_alpha($char); - } - - /** - * Get next UTF-8 char - * - * @param string $char - * @return boolean - */ - private function _nextChar() - { - $char = $this->_input[$this->_bytePosition++]; - - if (( ord($char) & 0xC0 ) == 0xC0) { - $addBytes = 1; - if (ord($char) & 0x20 ) { - $addBytes++; - if (ord($char) & 0x10 ) { - $addBytes++; - } - } - $char .= substr($this->_input, $this->_bytePosition, $addBytes); - $this->_bytePosition += $addBytes; - } - - $this->_position++; - - return $char; } /** @@ -131,39 +91,35 @@ public function nextToken() return null; } - while ($this->_position < $this->_streamLength) { - // skip white space - while ($this->_position < $this->_streamLength && - !self::_isAlpha($char = $this->_nextChar())) { - $char = ''; - } - - $termStartPosition = $this->_position - 1; - $termText = $char; - - // read token - while ($this->_position < $this->_streamLength && - self::_isAlpha($char = $this->_nextChar())) { - $termText .= $char; - } - - // Empty token, end of stream. - if ($termText == '') { + do { + if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { + // It covers both cases a) there are no matches (preg_match(...) === 0) + // b) error occured (preg_match(...) 
=== FALSE) return null; } - $token = new Zend_Search_Lucene_Analysis_Token( - $termText, - $termStartPosition, - $this->_position - 1); - $token = $this->normalize($token); - if ($token !== null) { - return $token; - } - // Continue if token is skipped - } - - return null; + // matched string + $matchedWord = $match[0][0]; + + // binary position of the matched word in the input stream + $binStartPos = $match[0][1]; + + // character position of the matched word in the input stream + $startPos = $this->_position + + iconv_strlen(substr($this->_input, + $this->_bytePosition, + $binStartPos - $this->_bytePosition), + 'UTF-8'); + // character position of the end of matched word in the input stream + $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); + + $this->_bytePosition = $binStartPos + strlen($matchedWord); + $this->_position = $endPos; + + $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos)); + } while ($token === null); // try again if token is skipped + + return $token; } } diff --git a/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php b/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php new file mode 100644 index 0000000000000..78cb5e680f8d6 --- /dev/null +++ b/search/Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php @@ -0,0 +1,70 @@ +dirroot}/search/Zend/Search/Lucene/Analysis/TokenFilter.php"; + + +/** + * Lower case Token filter. + * + * @category Zend + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc.
(http://www.zend.com) + * @license http://framework.zend.com/license/new-bsd New BSD License + */ + +class Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 extends Zend_Search_Lucene_Analysis_TokenFilter +{ + /** + * Object constructor + */ + public function __construct() + { + global $CFG; + if (!function_exists('mb_strtolower')) { + // mbstring extension is disabled + require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Exception.php"; + throw new Zend_Search_Lucene_Exception('Utf8 compatible lower case filter needs mbstring extension to be enabled.'); + } + } + + /** + * Normalize Token or remove it (if null is returned) + * + * @param Zend_Search_Lucene_Analysis_Token $srcToken + * @return Zend_Search_Lucene_Analysis_Token + */ + public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) + { + $newToken = new Zend_Search_Lucene_Analysis_Token( + mb_strtolower($srcToken->getTermText(), 'UTF-8'), + $srcToken->getStartOffset(), + $srcToken->getEndOffset()); + + $newToken->setPositionIncrement($srcToken->getPositionIncrement()); + + return $newToken; + } +} + diff --git a/search/Zend/Search/Lucene/Document.php b/search/Zend/Search/Lucene/Document.php index 6309719568d44..de4281efa7c8b 100644 --- a/search/Zend/Search/Lucene/Document.php +++ b/search/Zend/Search/Lucene/Document.php @@ -15,13 +15,13 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Field */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Field.php'; +require_once "Zend/Search/Lucene/Field.php"; /** @@ -30,7 +30,7 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Document - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. 
(http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Document @@ -54,10 +54,10 @@ class Zend_Search_Lucene_Document * @param $offset * @return string */ - public function __get($offset) - { - return $this->getFieldValue($offset); - } + public function __get($offset) + { + return $this->getFieldValue($offset); + } /** @@ -78,7 +78,7 @@ public function addField(Zend_Search_Lucene_Field $field) */ public function getFieldNames() { - return array_keys($this->_fields); + return array_keys($this->_fields); } @@ -105,7 +105,7 @@ public function getField($fieldName) */ public function getFieldValue($fieldName) { - return $this->getField($fieldName)->value; + return $this->getField($fieldName)->value; } /** @@ -116,6 +116,6 @@ public function getFieldValue($fieldName) */ public function getFieldUtf8Value($fieldName) { - return $this->getField($fieldName)->getUtf8Value(); + return $this->getField($fieldName)->getUtf8Value(); } } diff --git a/search/Zend/Search/Lucene/Exception.php b/search/Zend/Search/Lucene/Exception.php index 9d06e89522a61..d08b30dbb1de9 100644 --- a/search/Zend/Search/Lucene/Exception.php +++ b/search/Zend/Search/Lucene/Exception.php @@ -14,7 +14,7 @@ * * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ @@ -22,13 +22,13 @@ /** * Framework base exception */ -require_once $CFG->dirroot.'/search/Zend/Search/Exception.php'; +require_once "Zend/Search/Exception.php"; /** * @category Zend * @package Zend_Search_Lucene - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Exception extends Zend_Search_Exception diff --git a/search/Zend/Search/Lucene/Search/Query/Term.php b/search/Zend/Search/Lucene/Search/Query/Term.php index 0240104e3258d..f9aa071b8ecfd 100644 --- a/search/Zend/Search/Lucene/Search/Query/Term.php +++ b/search/Zend/Search/Lucene/Search/Query/Term.php @@ -15,23 +15,23 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Search_Query */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query.php"; /** Zend_Search_Lucene_Search_Weight_Term */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Term.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Weight/Term.php"; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. 
(http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query @@ -65,7 +65,7 @@ class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Que * @param Zend_Search_Lucene_Index_Term $term * @param boolean $sign */ - public function __construct($term) + public function __construct(Zend_Search_Lucene_Index_Term $term) { $this->_term = $term; } diff --git a/search/Zend/Search/Lucene/Search/QueryParser.php b/search/Zend/Search/Lucene/Search/QueryParser.php index 1a3d5712de820..b1092a5af6560 100644 --- a/search/Zend/Search/Lucene/Search/QueryParser.php +++ b/search/Zend/Search/Lucene/Search/QueryParser.php @@ -15,52 +15,64 @@ * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ /** Zend_Search_Lucene_Index_Term */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/Term.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Index/Term.php"; /** Zend_Search_Lucene_Search_Query_Term */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Term.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Term.php"; /** Zend_Search_Lucene_Search_Query_MultiTerm */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/MultiTerm.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/MultiTerm.php"; /** Zend_Search_Lucene_Search_Query_Boolean */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Boolean.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Boolean.php"; /** Zend_Search_Lucene_Search_Query_Phrase */ -require_once 
$CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Phrase.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Phrase.php"; + +/** Zend_Search_Lucene_Search_Query_Wildcard */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Wildcard.php"; + +/** Zend_Search_Lucene_Search_Query_Range */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Range.php"; + +/** Zend_Search_Lucene_Search_Query_Fuzzy */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Fuzzy.php"; /** Zend_Search_Lucene_Search_Query_Empty */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query/Empty.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Empty.php"; + +/** Zend_Search_Lucene_Search_Query_Insignificant */ +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/Query/Insignificant.php"; /** Zend_Search_Lucene_Search_QueryLexer */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryLexer.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/QueryLexer.php"; /** Zend_Search_Lucene_Search_QueryParserContext */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserContext.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/QueryParserContext.php"; /** Zend_Search_Lucene_FSM */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/FSM.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/FSM.php"; /** Zend_Search_Lucene_Exception */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Exception.php"; /** Zend_Search_Lucene_Search_QueryParserException */ -require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/QueryParserException.php'; +require_once "{$CFG->dirroot}/search/Zend/Search/Lucene/Search/QueryParserException.php"; /** * @category Zend * @package Zend_Search_Lucene * @subpackage Search - * 
@copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com) + * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ class Zend_Search_Lucene_Search_QueryParser extends Zend_Search_Lucene_FSM @@ -139,6 +151,26 @@ class Zend_Search_Lucene_Search_QueryParser extends Zend_Search_Lucene_FSM */ private $_defaultEncoding = ''; + /** + * Defines query parsing mode. + * + * If this option is turned on, then query parser suppresses query parser exceptions + * and constructs multi-term query using all words from a query. + * + * That helps to avoid exceptions caused by queries, which don't conform to query language, + * but limits possibilities to check, that query entered by user has some inconsistencies. + * + * + * Default is true. + * + * Use {@link Zend_Search_Lucene_Search_QueryParser::suppressQueryParsingExceptions()}, + * {@link Zend_Search_Lucene_Search_QueryParser::dontSuppressQueryParsingExceptions()} and + * {@link Zend_Search_Lucene_Search_QueryParser::queryParsingExceptionsSuppressed()} to operate + * with this setting.
+ * + * @var boolean + */ + private $_suppressQueryParsingExceptions = true; /** * Boolean operators constants @@ -256,6 +288,18 @@ public function __construct() $this->_lexer = new Zend_Search_Lucene_Search_QueryLexer(); } + /** + * Get query parser instance + * + * @return Zend_Search_Lucene_Search_QueryParser + */ + private static function _getInstance() + { + if (self::$_instance === null) { + self::$_instance = new self(); + } + return self::$_instance; + } /** * Set query string default encoding @@ -264,11 +308,7 @@ public function __construct() */ public static function setDefaultEncoding($encoding) { - if (self::$_instance === null) { - self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); - } - - self::$_instance->_defaultEncoding = $encoding; + self::_getInstance()->_defaultEncoding = $encoding; } /** @@ -278,11 +318,7 @@ public static function setDefaultEncoding($encoding) */ public static function getDefaultEncoding() { - if (self::$_instance === null) { - self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); - } - - return self::$_instance->_defaultEncoding; + return self::_getInstance()->_defaultEncoding; } /** @@ -292,11 +328,7 @@ public static function getDefaultEncoding() */ public static function setDefaultOperator($operator) { - if (self::$_instance === null) { - self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); - } - - self::$_instance->_defaultOperator = $operator; + self::_getInstance()->_defaultOperator = $operator; } /** @@ -306,13 +338,34 @@ public static function setDefaultOperator($operator) */ public static function getDefaultOperator() { - if (self::$_instance === null) { - self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); - } + return self::_getInstance()->_defaultOperator; + } - return self::$_instance->_defaultOperator; + /** + * Turn on 'suppress query parser exceptions' mode. 
+ */ + public static function suppressQueryParsingExceptions() + { + self::_getInstance()->_suppressQueryParsingExceptions = true; + } + /** + * Turn off 'suppress query parser exceptions' mode. + */ + public static function dontSuppressQueryParsingExceptions() + { + self::_getInstance()->_suppressQueryParsingExceptions = false; } + /** + * Check 'suppress query parser exceptions' mode. + * @return boolean + */ + public static function queryParsingExceptionsSuppressed() + { + return self::_getInstance()->_suppressQueryParsingExceptions; + } + + /** * Parses a query string * @@ -323,42 +376,62 @@ public static function getDefaultOperator() */ public static function parse($strQuery, $encoding = null) { - if (self::$_instance === null) { - self::$_instance = new Zend_Search_Lucene_Search_QueryParser(); - } - - self::$_instance->_encoding = ($encoding !== null) ? $encoding : self::$_instance->_defaultEncoding; - self::$_instance->_lastToken = null; - self::$_instance->_context = new Zend_Search_Lucene_Search_QueryParserContext(self::$_instance->_encoding); - self::$_instance->_contextStack = array(); - self::$_instance->_tokens = self::$_instance->_lexer->tokenize($strQuery, self::$_instance->_encoding); - - // Empty query - if (count(self::$_instance->_tokens) == 0) { - return new Zend_Search_Lucene_Search_Query_Empty(); - } - - - foreach (self::$_instance->_tokens as $token) { - try { - self::$_instance->_currentToken = $token; - self::$_instance->process($token->type); - - self::$_instance->_lastToken = $token; - } catch (Exception $e) { - if (strpos($e->getMessage(), 'There is no any rule for') !== false) { - throw new Zend_Search_Lucene_Search_QueryParserException( 'Syntax error at char position ' . $token->position . '.' ); + self::_getInstance(); + + // Reset FSM if previous parse operation didn't return it into a correct state + self::$_instance->reset(); + + try { + self::$_instance->_encoding = ($encoding !== null) ? 
$encoding : self::$_instance->_defaultEncoding; + self::$_instance->_lastToken = null; + self::$_instance->_context = new Zend_Search_Lucene_Search_QueryParserContext(self::$_instance->_encoding); + self::$_instance->_contextStack = array(); + self::$_instance->_tokens = self::$_instance->_lexer->tokenize($strQuery, self::$_instance->_encoding); + + // Empty query + if (count(self::$_instance->_tokens) == 0) { + return new Zend_Search_Lucene_Search_Query_Insignificant(); + } + + + foreach (self::$_instance->_tokens as $token) { + try { + self::$_instance->_currentToken = $token; + self::$_instance->process($token->type); + + self::$_instance->_lastToken = $token; + } catch (Exception $e) { + if (strpos($e->getMessage(), 'There is no any rule for') !== false) { + throw new Zend_Search_Lucene_Search_QueryParserException( 'Syntax error at char position ' . $token->position . '.' ); + } + + throw $e; } - + } + + if (count(self::$_instance->_contextStack) != 0) { + throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.' ); + } + + return self::$_instance->_context->getQuery(); + } catch (Zend_Search_Lucene_Search_QueryParserException $e) { + if (self::$_instance->_suppressQueryParsingExceptions) { + $queryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($strQuery, self::$_instance->_encoding); + + $query = new Zend_Search_Lucene_Search_Query_MultiTerm(); + $termsSign = (self::$_instance->_defaultOperator == self::B_AND) ? true /* required term */ : + null /* optional term */; + + foreach ($queryTokens as $token) { + $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()), $termsSign); + } + + + return $query; + } else { throw $e; } } - - if (count(self::$_instance->_contextStack) != 0) { - throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.' 
); - } - - return self::$_instance->_context->getQuery(); } @@ -433,7 +506,7 @@ public function processModifierParameter() default: // It's not a user input exception - throw new Zend_Search_Lucene_Exception('Lexeme modifier parameter must follow lexeme modifier. Char position .' ); + throw new Zend_Search_Lucene_Exception('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' ); } } @@ -485,13 +558,31 @@ public function openedRQFirstTerm() */ public function openedRQLastTerm() { - throw new Zend_Search_Lucene_Search_QueryParserException('Range queries are not supported yet.'); + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding); + if (count($tokens) > 1) { + throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms'); + } else if (count($tokens) == 1) { + $from = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField()); + } else { + $from = null; + } - // $firstTerm = new Zend_Search_Lucene_Index_Term($this->_rqFirstTerm, $this->_context->getField()); - // $lastTerm = new Zend_Search_Lucene_Index_Term($this->_currentToken->text, $this->_context->getField()); + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding); + if (count($tokens) > 1) { + throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms'); + } else if (count($tokens) == 1) { + $to = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField()); + } else { + $to = null; + } - // $query = new Zend_Search_Lucene_Search_Query_Range($firstTerm, $lastTerm, false); - // $this->_context->addentry($query); + if ($from === null && $to === null) { + throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term'); + } + + 
$rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, false); + $entry = new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery); + $this->_context->addEntry($entry); } /** @@ -509,13 +600,31 @@ public function closedRQFirstTerm() */ public function closedRQLastTerm() { - throw new Zend_Search_Lucene_Search_QueryParserException('Range queries are not supported yet.'); + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding); + if (count($tokens) > 1) { + throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms'); + } else if (count($tokens) == 1) { + $from = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField()); + } else { + $from = null; + } - // $firstTerm = new Zend_Search_Lucene_Index_Term($this->_rqFirstTerm, $this->_context->getField()); - // $lastTerm = new Zend_Search_Lucene_Index_Term($this->_currentToken->text, $this->_context->getField()); + $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding); + if (count($tokens) > 1) { + throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms'); + } else if (count($tokens) == 1) { + $to = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField()); + } else { + $to = null; + } - // $query = new Zend_Search_Lucene_Search_Query_Range($firstTerm, $lastTerm, true); - // $this->_context->addentry($query); + if ($from === null && $to === null) { + throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term'); + } + + $rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, true); + $entry = new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery); + $this->_context->addEntry($entry); } }