pax_global_header00006660000000000000000000000064124577512560014530gustar00rootroot0000000000000052 comment=c8de2508d3413bf9d701b40c1092c1ffbdd8fc63 ZendSearch-release-2.0.0rc6/000077500000000000000000000000001245775125600156065ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/LICENSE.txt000066400000000000000000000030141245775125600174270ustar00rootroot00000000000000Copyright (c) 2005-2012, Zend Technologies USA, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Zend Technologies USA, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ZendSearch-release-2.0.0rc6/README.md000066400000000000000000000004071245775125600170660ustar00rootroot00000000000000ZendSearch component You can install using: ``` curl -s https://getcomposer.org/installer | php php composer.phar install ``` At that point, follow the instructions in the documentation folder for actual usage of the component. (Documentation is forthcoming.) ZendSearch-release-2.0.0rc6/composer.json000066400000000000000000000011461245775125600203320ustar00rootroot00000000000000{ "name": "zendframework/zendsearch", "description": "a general purpose text search engine written entirely in PHP 5", "type": "library", "license": "BSD-3-Clause", "keywords": [ "zf2", "lucene" ], "homepage": "http://packages.zendframework.com/", "autoload": { "psr-0": { "ZendSearch": "library/" } }, "repositories": [ { "type": "composer", "url": "http://packages.zendframework.com/" } ], "require": { "php": ">=5.3.3", "zendframework/zend-stdlib": "2.*" } } ZendSearch-release-2.0.0rc6/library/000077500000000000000000000000001245775125600172525ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/000077500000000000000000000000001245775125600213005ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Exception/000077500000000000000000000000001245775125600232365ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Exception/ExceptionInterface.php000066400000000000000000000007111245775125600275250ustar00rootroot00000000000000 targetState * * @var array */ private $_rules = array(); /** * List of entry actions * Each action executes when entering the state * * [state] => action * * @var array */ private $_entryActions = array(); /** * List of exit actions * Each action executes when exiting the state * * [state] => action * * @var array */ private $_exitActions = array(); /** * List of input actions * Each action executes when entering the state * * [state][input] => action * * @var array */ private $_inputActions = 
array(); /** * List of input actions * Each action executes when entering the state * * [state1][state2] => action * * @var array */ private $_transitionActions = array(); /** * Finite State machine constructor * * $states is an array of integers or strings with a list of possible machine states * constructor treats fist list element as a sturt state (assignes it to $_current state). * It may be reassigned by setState() call. * States list may be empty and can be extended later by addState() or addStates() calls. * * $inputAphabet is the same as $states, but represents input alphabet * it also may be extended later by addInputSymbols() or addInputSymbol() calls. * * $rules parameter describes FSM transitions and has a structure: * array( array(sourseState, input, targetState[, inputAction]), * array(sourseState, input, targetState[, inputAction]), * array(sourseState, input, targetState[, inputAction]), * ... * ) * Rules also can be added later by addRules() and addRule() calls. * * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(), * addInputAction() and addTransitionAction() calls. * * @param array $states * @param array $inputAphabet * @param array $rules */ public function __construct($states = array(), $inputAphabet = array(), $rules = array()) { $this->addStates($states); $this->addInputSymbols($inputAphabet); $this->addRules($rules); } /** * Add states to the state machine * * @param array $states */ public function addStates($states) { foreach ($states as $state) { $this->addState($state); } } /** * Add state to the state machine * * @param integer|string $state */ public function addState($state) { $this->_states[$state] = $state; if ($this->_currentState === null) { $this->_currentState = $state; } } /** * Set FSM state. 
* No any action is invoked * * @param integer|string $state * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function setState($state) { if (!isset($this->_states[$state])) { throw new Exception\InvalidArgumentException('State \'' . $state . '\' is not on of the possible FSM states.'); } $this->_currentState = $state; } /** * Get FSM state. * * @return integer|string $state|null */ public function getState() { return $this->_currentState; } /** * Add symbols to the input alphabet * * @param array $inputAphabet */ public function addInputSymbols($inputAphabet) { foreach ($inputAphabet as $inputSymbol) { $this->addInputSymbol($inputSymbol); } } /** * Add symbol to the input alphabet * * @param integer|string $inputSymbol */ public function addInputSymbol($inputSymbol) { $this->_inputAphabet[$inputSymbol] = $inputSymbol; } /** * Add transition rules * * array structure: * array( array(sourseState, input, targetState[, inputAction]), * array(sourseState, input, targetState[, inputAction]), * array(sourseState, input, targetState[, inputAction]), * ... * ) * * @param array $rules */ public function addRules($rules) { foreach ($rules as $rule) { $this->addrule($rule[0], $rule[1], $rule[2], isset($rule[3])?$rule[3]:null); } } /** * Add symbol to the input alphabet * * @param integer|string $sourceState * @param integer|string $input * @param integer|string $targetState * @param \ZendSearch\Lucene\FSMAction|null $inputAction * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException * @throws \ZendSearch\Lucene\Exception\RuntimeException */ public function addRule($sourceState, $input, $targetState, $inputAction = null) { if (!isset($this->_states[$sourceState])) { throw new Exception\InvalidArgumentException('Undefined source state (' . $sourceState . ').'); } if (!isset($this->_states[$targetState])) { throw new Exception\InvalidArgumentException('Undefined target state (' . $targetState . 
').'); } if (!isset($this->_inputAphabet[$input])) { throw new Exception\InvalidArgumentException('Undefined input symbol (' . $input . ').'); } if (!isset($this->_rules[$sourceState])) { $this->_rules[$sourceState] = array(); } if (isset($this->_rules[$sourceState][$input])) { throw new Exception\RuntimeException('Rule for {state,input} pair (' . $sourceState . ', '. $input . ') is already defined.'); } $this->_rules[$sourceState][$input] = $targetState; if ($inputAction !== null) { $this->addInputAction($sourceState, $input, $inputAction); } } /** * Add state entry action. * Several entry actions are allowed. * Action execution order is defined by addEntryAction() calls * * @param integer|string $state * @param \ZendSearch\Lucene\FSMAction $action * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function addEntryAction($state, FSMAction $action) { if (!isset($this->_states[$state])) { throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').'); } if (!isset($this->_entryActions[$state])) { $this->_entryActions[$state] = array(); } $this->_entryActions[$state][] = $action; } /** * Add state exit action. * Several exit actions are allowed. * Action execution order is defined by addEntryAction() calls * * @param integer|string $state * @param \ZendSearch\Lucene\FSMAction $action * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function addExitAction($state, FSMAction $action) { if (!isset($this->_states[$state])) { throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').'); } if (!isset($this->_exitActions[$state])) { $this->_exitActions[$state] = array(); } $this->_exitActions[$state][] = $action; } /** * Add input action (defined by {state, input} pair). * Several input actions are allowed. 
* Action execution order is defined by addInputAction() calls * * @param integer|string $state * @param integer|string $input * @param \ZendSearch\Lucene\FSMAction $action * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function addInputAction($state, $inputSymbol, FSMAction $action) { if (!isset($this->_states[$state])) { throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').'); } if (!isset($this->_inputAphabet[$inputSymbol])) { throw new Exception\InvalidArgumentException('Undefined input symbol (' . $inputSymbol. ').'); } if (!isset($this->_inputActions[$state])) { $this->_inputActions[$state] = array(); } if (!isset($this->_inputActions[$state][$inputSymbol])) { $this->_inputActions[$state][$inputSymbol] = array(); } $this->_inputActions[$state][$inputSymbol][] = $action; } /** * Add transition action (defined by {state, input} pair). * Several transition actions are allowed. * Action execution order is defined by addTransitionAction() calls * * @param integer|string $sourceState * @param integer|string $targetState * @param \ZendSearch\Lucene\FSMAction $action * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function addTransitionAction($sourceState, $targetState, FSMAction $action) { if (!isset($this->_states[$sourceState])) { throw new Exception\InvalidArgumentException('Undefined source state (' . $sourceState. ').'); } if (!isset($this->_states[$targetState])) { throw new Exception\InvalidArgumentException('Undefined source state (' . $targetState. 
').'); } if (!isset($this->_transitionActions[$sourceState])) { $this->_transitionActions[$sourceState] = array(); } if (!isset($this->_transitionActions[$sourceState][$targetState])) { $this->_transitionActions[$sourceState][$targetState] = array(); } $this->_transitionActions[$sourceState][$targetState][] = $action; } /** * Process an input * * @param mixed $input * @throws \ZendSearch\Lucene\Exception\RuntimeException * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function process($input) { if (!isset($this->_rules[$this->_currentState])) { throw new Exception\RuntimeException('There is no any rule for current state (' . $this->_currentState . ').'); } if (!isset($this->_rules[$this->_currentState][$input])) { throw new Exception\InvalidArgumentException('There is no any rule for {current state, input} pair (' . $this->_currentState . ', ' . $input . ').'); } $sourceState = $this->_currentState; $targetState = $this->_rules[$this->_currentState][$input]; if ($sourceState != $targetState && isset($this->_exitActions[$sourceState])) { foreach ($this->_exitActions[$sourceState] as $action) { $action->doAction(); } } if (isset($this->_inputActions[$sourceState]) && isset($this->_inputActions[$sourceState][$input])) { foreach ($this->_inputActions[$sourceState][$input] as $action) { $action->doAction(); } } $this->_currentState = $targetState; if (isset($this->_transitionActions[$sourceState]) && isset($this->_transitionActions[$sourceState][$targetState])) { foreach ($this->_transitionActions[$sourceState][$targetState] as $action) { $action->doAction(); } } if ($sourceState != $targetState && isset($this->_entryActions[$targetState])) { foreach ($this->_entryActions[$targetState] as $action) { $action->doAction(); } } } /** * @throws \ZendSearch\Lucene\Exception\RuntimeException */ public function reset() { if (count($this->_states) == 0) { throw new Exception\RuntimeException('There is no any state defined for FSM.'); } 
$this->_currentState = $this->_states[0]; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/AbstractPriorityQueue.php000066400000000000000000000073501245775125600275430ustar00rootroot00000000000000_heap); $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 ) while ($nodeId != 0 && $this->_less($element, $this->_heap[$parentId])) { // Move parent node down $this->_heap[$nodeId] = $this->_heap[$parentId]; // Move pointer to the next level of tree $nodeId = $parentId; $parentId = ($nodeId-1) >> 1; // floor( ($nodeId-1)/2 ) } // Put new node into the tree $this->_heap[$nodeId] = $element; } /** * Return least element of the queue * * Constant time * * @return mixed */ public function top() { if (count($this->_heap) == 0) { return null; } return $this->_heap[0]; } /** * Removes and return least element of the queue * * O(log(N)) time * * @return mixed */ public function pop() { if (count($this->_heap) == 0) { return null; } $top = $this->_heap[0]; $lastId = count($this->_heap) - 1; /** * Find appropriate position for last node */ $nodeId = 0; // Start from a top $childId = 1; // First child // Choose smaller child if ($lastId > 2 && $this->_less($this->_heap[2], $this->_heap[1])) { $childId = 2; } while ($childId < $lastId && $this->_less($this->_heap[$childId], $this->_heap[$lastId]) ) { // Move child node up $this->_heap[$nodeId] = $this->_heap[$childId]; $nodeId = $childId; // Go down $childId = ($nodeId << 1) + 1; // First child // Choose smaller child if (($childId+1) < $lastId && $this->_less($this->_heap[$childId+1], $this->_heap[$childId]) ) { $childId++; } } // Move last element to the new position $this->_heap[$nodeId] = $this->_heap[$lastId]; unset($this->_heap[$lastId]); return $top; } /** * Clear queue */ public function clear() { $this->_heap = array(); } /** * Compare elements * * Returns true, if $el1 is less than $el2; else otherwise * * @param mixed $el1 * @param mixed $el2 * @return boolean */ abstract protected function _less($el1, $el2); } 
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/000077500000000000000000000000001245775125600242765ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/000077500000000000000000000000001245775125600260635ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/AbstractAnalyzer.php000066400000000000000000000031371245775125600320510ustar00rootroot00000000000000setInput($data, $encoding); $tokenList = array(); while (($nextToken = $this->nextToken()) !== null) { $tokenList[] = $nextToken; } return $tokenList; } /** * Tokenization stream API * Set input * * @param string $data */ public function setInput($data, $encoding = '') { $this->_input = $data; $this->_encoding = $encoding; $this->reset(); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Analyzer.php000066400000000000000000000027071245775125600303670ustar00rootroot00000000000000_filters[] = $filter; } /** * Apply filters to the token. Can return null when the token was removed. 
* * @param \ZendSearch\Lucene\Analysis\Token $token * @return \ZendSearch\Lucene\Analysis\Token */ public function normalize(Analysis\Token $token) { foreach ($this->_filters as $filter) { $token = $filter->normalize($token); // resulting token can be null if the filter removes it if ($token === null) { return null; } } return $token; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Text.php000066400000000000000000000036571245775125600307630ustar00rootroot00000000000000_position = 0; if ($this->_input === null) { return; } // convert input into ascii if (PHP_OS != 'AIX') { $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); } $this->_encoding = 'ASCII'; } /** * Tokenization stream API * Get next token * Returns null at the end of stream * * @return \ZendSearch\Lucene\Analysis\Token|null */ public function nextToken() { if ($this->_input === null) { return null; } do { if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { // It covers both cases a) there are no matches (preg_match(...) === 0) // b) error occured (preg_match(...) 
=== FALSE) return null; } $str = $match[0][0]; $pos = $match[0][1]; $endpos = $pos + strlen($str); $this->_position = $endpos; $token = $this->normalize(new Analysis\Token($str, $pos, $endpos)); } while ($token === null); // try again if token is skipped return $token; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Text/000077500000000000000000000000001245775125600302375ustar00rootroot00000000000000CaseInsensitive.php000066400000000000000000000013231245775125600337640ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextaddFilter(new TokenFilter\LowerCase()); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextNum.php000066400000000000000000000036641245775125600314410ustar00rootroot00000000000000_position = 0; if ($this->_input === null) { return; } // convert input into ascii if (PHP_OS != 'AIX') { $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); } $this->_encoding = 'ASCII'; } /** * Tokenization stream API * Get next token * Returns null at the end of stream * * @return \ZendSearch\Lucene\Analysis\Token|null */ public function nextToken() { if ($this->_input === null) { return null; } do { if (! preg_match('/[a-zA-Z0-9]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { // It covers both cases a) there are no matches (preg_match(...) === 0) // b) error occured (preg_match(...) 
=== FALSE) return null; } $str = $match[0][0]; $pos = $match[0][1]; $endpos = $pos + strlen($str); $this->_position = $endpos; $token = $this->normalize(new Analysis\Token($str, $pos, $endpos)); } while ($token === null); // try again if token is skipped return $token; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextNum/000077500000000000000000000000001245775125600307175ustar00rootroot00000000000000CaseInsensitive.php000066400000000000000000000013311245775125600344430ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextNumaddFilter(new TokenFilter\LowerCase()); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8.php000066400000000000000000000065231245775125600306600ustar00rootroot00000000000000_position = 0; $this->_bytePosition = 0; // convert input into UTF-8 if (strcasecmp($this->_encoding, 'utf8' ) != 0 && strcasecmp($this->_encoding, 'utf-8') != 0 ) { $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input); $this->_encoding = 'UTF-8'; } } /** * Tokenization stream API * Get next token * Returns null at the end of stream * * @return \ZendSearch\Lucene\Analysis\Token|null */ public function nextToken() { if ($this->_input === null) { return null; } do { if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { // It covers both cases a) there are no matches (preg_match(...) === 0) // b) error occured (preg_match(...) 
=== FALSE) return null; } // matched string $matchedWord = $match[0][0]; // binary position of the matched word in the input stream $binStartPos = $match[0][1]; // character position of the matched word in the input stream $startPos = $this->_position + iconv_strlen(substr($this->_input, $this->_bytePosition, $binStartPos - $this->_bytePosition), 'UTF-8'); // character postion of the end of matched word in the input stream $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); $this->_bytePosition = $binStartPos + strlen($matchedWord); $this->_position = $endPos; $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos)); } while ($token === null); // try again if token is skipped return $token; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8/000077500000000000000000000000001245775125600301415ustar00rootroot00000000000000CaseInsensitive.php000066400000000000000000000013671245775125600336760ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8addFilter(new TokenFilter\LowerCaseUtf8()); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8Num.php000066400000000000000000000065361245775125600313440ustar00rootroot00000000000000_position = 0; $this->_bytePosition = 0; // convert input into UTF-8 if (strcasecmp($this->_encoding, 'utf8' ) != 0 && strcasecmp($this->_encoding, 'utf-8') != 0 ) { $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input); $this->_encoding = 'UTF-8'; } } /** * Tokenization stream API * Get next token * Returns null at the end of stream * * @return \ZendSearch\Lucene\Analysis\Token|null */ public function nextToken() { if ($this->_input === null) { return null; } do { if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { // It covers both cases a) there are no matches (preg_match(...) === 0) // b) error occured (preg_match(...) 
=== FALSE) return null; } // matched string $matchedWord = $match[0][0]; // binary position of the matched word in the input stream $binStartPos = $match[0][1]; // character position of the matched word in the input stream $startPos = $this->_position + iconv_strlen(substr($this->_input, $this->_bytePosition, $binStartPos - $this->_bytePosition), 'UTF-8'); // character postion of the end of matched word in the input stream $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); $this->_bytePosition = $binStartPos + strlen($matchedWord); $this->_position = $endPos; $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos)); } while ($token === null); // try again if token is skipped return $token; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8Num/000077500000000000000000000000001245775125600306215ustar00rootroot00000000000000CaseInsensitive.php000066400000000000000000000013751245775125600343550ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8NumaddFilter(new TokenFilter\LowerCaseUtf8()); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/Token.php000066400000000000000000000071001245775125600260650ustar00rootroot00000000000000_termText = $text; $this->_startOffset = $start; $this->_endOffset = $end; $this->_positionIncrement = 1; } /** * positionIncrement setter * * @param integer $positionIncrement */ public function setPositionIncrement($positionIncrement) { $this->_positionIncrement = $positionIncrement; } /** * Returns the position increment of this Token. * * @return integer */ public function getPositionIncrement() { return $this->_positionIncrement; } /** * Returns the Token's term text. * * @return string */ public function getTermText() { return $this->_termText; } /** * Returns this Token's starting offset, the position of the first character * corresponding to this token in the source text. 
* * Note: * The difference between getEndOffset() and getStartOffset() may not be equal * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered * by a stemmer or some other filter. * * @return integer */ public function getStartOffset() { return $this->_startOffset; } /** * Returns this Token's ending offset, one greater than the position of the * last character corresponding to this token in the source text. * * @return integer */ public function getEndOffset() { return $this->_endOffset; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/000077500000000000000000000000001245775125600265245ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/LowerCase.php000066400000000000000000000022001245775125600311130ustar00rootroot00000000000000getTermText() ), $srcToken->getStartOffset(), $srcToken->getEndOffset()); $newToken->setPositionIncrement($srcToken->getPositionIncrement()); return $newToken; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php000066400000000000000000000031541245775125600316730ustar00rootroot00000000000000getTermText(), 'UTF-8'), $srcToken->getStartOffset(), $srcToken->getEndOffset()); $newToken->setPositionIncrement($srcToken->getPositionIncrement()); return $newToken; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/ShortWords.php000066400000000000000000000026031245775125600313540ustar00rootroot00000000000000length = $length; } /** * Normalize Token or remove it (if null is returned) * * @param \ZendSearch\Lucene\Analysis\Token $srcToken * @return \ZendSearch\Lucene\Analysis\Token */ public function normalize(Token $srcToken) { if (strlen($srcToken->getTermText()) < $this->length) { return null; } else { return $srcToken; } } } 
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/StopWords.php000066400000000000000000000054311245775125600312040ustar00rootroot00000000000000 1, 'an' => '1'); * * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter. * * @category Zend * @package Zend_Search_Lucene * @subpackage Analysis */ class StopWords implements TokenFilterInterface { /** * Stop Words * @var array */ private $_stopSet; /** * Constructs new instance of this filter. * * @param array $stopwords array (set) of words that will be filtered out */ public function __construct($stopwords = array()) { $this->_stopSet = array_flip($stopwords); } /** * Normalize Token or remove it (if null is returned) * * @param \ZendSearch\Lucene\Analysis\Token $srcToken * @return \ZendSearch\Lucene\Analysis\Token */ public function normalize(Token $srcToken) { if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) { return null; } else { return $srcToken; } } /** * Fills stopwords set from a text file. Each line contains one stopword, lines with '#' in the first * column are ignored (as comments). * * You can call this method one or more times. New stopwords are always added to current set. * * @param string $filepath full path for text file with stopwords * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException * @throws \ZendSearch\Lucene\Exception\RuntimeException */ public function loadFromFile($filepath = null) { if (! $filepath || ! file_exists($filepath)) { throw new InvalidArgumentException('You have to provide valid file path'); } $fd = fopen($filepath, "r"); if (! $fd) { throw new RuntimeException('Cannot open file ' . $filepath); } while (!feof ($fd)) { $buffer = trim(fgets($fd)); if (strlen($buffer) > 0 && $buffer[0] != '#') { $this->_stopSet[$buffer] = 1; } } if (!fclose($fd)) { throw new RuntimeException('Cannot close file ' . 
$filepath); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Analysis/TokenFilter/TokenFilterInterface.php000066400000000000000000000015421245775125600333060ustar00rootroot00000000000000getFieldNames()); } /** * Proxy method for getFieldValue(), provides more convenient access to * the string value of a field. * * @param $offset * @return string */ public function __get($offset) { return $this->getFieldValue($offset); } /** * Add a field object to this document. * * @param \ZendSearch\Lucene\Document\Field $field * @return \ZendSearch\Lucene\Document */ public function addField(Document\Field $field) { $this->_fields[$field->name] = $field; return $this; } /** * Return an array with the names of the fields in this document. * * @return array */ public function getFieldNames() { return array_keys($this->_fields); } /** * Returns {@link \ZendSearch\Lucene\Document\Field} object for a named field in this document. * * @param string $fieldName * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException * @return \ZendSearch\Lucene\Document\Field */ public function getField($fieldName) { if (!array_key_exists($fieldName, $this->_fields)) { throw new InvalidArgumentException("Field name \"$fieldName\" not found in document."); } return $this->_fields[$fieldName]; } /** * Returns the string value of a named field in this document. * * @see __get() * @return string */ public function getFieldValue($fieldName) { return $this->getField($fieldName)->value; } /** * Returns the string value of a named field in UTF-8 encoding. 
* * @see __get() * @return string */ public function getFieldUtf8Value($fieldName) { return $this->getField($fieldName)->getUtf8Value(); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/000077500000000000000000000000001245775125600242715ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/AbstractOpenXML.php000066400000000000000000000071301245775125600277510ustar00rootroot00000000000000getFromName("_rels/.rels")); // Restore entity loader state libxml_disable_entity_loader($loadEntities); foreach ($relations->Relationship as $rel) { if ($rel["Type"] == self::SCHEMA_COREPROPERTIES) { // Found core properties! Read in contents... $contents = simplexml_load_string( $package->getFromName(dirname($rel["Target"]) . "/" . basename($rel["Target"])) ); foreach ($contents->children(self::SCHEMA_DUBLINCORE) as $child) { $coreProperties[$child->getName()] = (string)$child; } foreach ($contents->children(self::SCHEMA_COREPROPERTIES) as $child) { $coreProperties[$child->getName()] = (string)$child; } foreach ($contents->children(self::SCHEMA_DUBLINCORETERMS) as $child) { $coreProperties[$child->getName()] = (string)$child; } } } return $coreProperties; } /** * Determine absolute zip path * * @param string $path * @return string */ protected function absoluteZipPath($path) { $path = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path); $parts = array_filter(explode(DIRECTORY_SEPARATOR, $path), 'strlen'); $absolutes = array(); foreach ($parts as $part) { if ('.' == $part) continue; if ('..' 
== $part) { array_pop($absolutes); } else { $absolutes[] = $part; } } return implode('/', $absolutes); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/Docx.php000066400000000000000000000120761245775125600257050ustar00rootroot00000000000000open($fileName); // Read relations and search for officeDocument $relationsXml = $package->getFromName('_rels/.rels'); if ($relationsXml === false) { throw new RuntimeException('Invalid archive or corrupted .docx file.'); } // Prevent php from loading remote resources $loadEntities = libxml_disable_entity_loader(true); $relations = simplexml_load_string($relationsXml); // Restore entity loader state libxml_disable_entity_loader($loadEntities); foreach($relations->Relationship as $rel) { if ($rel ["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) { // Found office document! Read in contents... $contents = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel['Target']) . '/' . basename($rel['Target'])) )); $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML); $paragraphs = $contents->xpath('//w:body/w:p'); foreach ($paragraphs as $paragraph) { $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]'); if ($runs === false) { // Paragraph doesn't contain any text or breaks continue; } foreach ($runs as $run) { if ($run->getName() == 'br') { // Break element $documentBody[] = ' '; } else { $documentBody[] = (string)$run; } } // Add space after each paragraph. So they are not bound together. 
$documentBody[] = ' '; } break; } } // Read core properties $coreProperties = $this->extractMetaData($package); // Close file $package->close(); // Store filename $this->addField(Field::Text('filename', $fileName, 'UTF-8')); // Store contents if ($storeContent) { $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8')); } else { $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8')); } // Store meta data properties foreach ($coreProperties as $key => $value) { $this->addField(Field::Text($key, $value, 'UTF-8')); } // Store title (if not present in meta data) if (! isset($coreProperties['title'])) { $this->addField(Field::Text('title', $fileName, 'UTF-8')); } } /** * Load Docx document from a file * * @param string $fileName * @param boolean $storeContent * @throws \ZendSearch\Lucene\Document\Exception\InvalidArgumentException * @return \ZendSearch\Lucene\Document\Docx */ public static function loadDocxFile($fileName, $storeContent = false) { if (!is_readable($fileName)) { throw new InvalidArgumentException('Provided file \'' . $fileName . '\' is not readable.'); } return new self($fileName, $storeContent); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/Exception/000077500000000000000000000000001245775125600262275ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/Exception/ExceptionInterface.php000066400000000000000000000007311245775125600325200ustar00rootroot00000000000000name = $name; $this->value = $value; if (!$isBinary) { $this->encoding = $encoding; $this->isTokenized = $isTokenized; } else { $this->encoding = ''; $this->isTokenized = false; } $this->isStored = $isStored; $this->isIndexed = $isIndexed; $this->isBinary = $isBinary; $this->storeTermVector = false; $this->boost = 1.0; } /** * Constructs a String-valued Field that is not tokenized, but is indexed * and stored. Useful for non-text fields, e.g. date or url. 
* * @param string $name * @param string $value * @param string $encoding * @return \ZendSearch\Lucene\Document\Field */ public static function keyword($name, $value, $encoding = 'UTF-8') { return new self($name, $value, $encoding, true, true, false); } /** * Constructs a String-valued Field that is not tokenized nor indexed, * but is stored in the index, for return with hits. * * @param string $name * @param string $value * @param string $encoding * @return \ZendSearch\Lucene\Document\Field */ public static function unIndexed($name, $value, $encoding = 'UTF-8') { return new self($name, $value, $encoding, true, false, false); } /** * Constructs a Binary String valued Field that is not tokenized nor indexed, * but is stored in the index, for return with hits. * * @param string $name * @param string $value * @param string $encoding * @return \ZendSearch\Lucene\Document\Field */ public static function binary($name, $value) { return new self($name, $value, '', true, false, false, true); } /** * Constructs a String-valued Field that is tokenized and indexed, * and is stored in the index, for return with hits. Useful for short text * fields, like "title" or "subject". Term vector will not be stored for this field. * * @param string $name * @param string $value * @param string $encoding * @return \ZendSearch\Lucene\Document\Field */ public static function text($name, $value, $encoding = 'UTF-8') { return new self($name, $value, $encoding, true, true, true); } /** * Constructs a String-valued Field that is tokenized and indexed, * but that is not stored in the index. 
* * @param string $name * @param string $value * @param string $encoding * @return \ZendSearch\Lucene\Document\Field */ public static function unStored($name, $value, $encoding = 'UTF-8') { return new self($name, $value, $encoding, false, true, true); } /** * Get field value in UTF-8 encoding * * @return string */ public function getUtf8Value() { if (strcasecmp($this->encoding, 'utf8' ) == 0 || strcasecmp($this->encoding, 'utf-8') == 0 ) { return $this->value; } else { return (PHP_OS != 'AIX') ? iconv($this->encoding, 'UTF-8', $this->value) : iconv('ISO8859-1', 'UTF-8', $this->value); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/HTML.php000066400000000000000000000377431245775125600255640ustar00rootroot00000000000000_doc = new \DOMDocument(); $this->_doc->substituteEntities = true; if ($isFile) { $htmlData = file_get_contents($data); } else { $htmlData = $data; } ErrorHandler::start(E_WARNING); $this->_doc->loadHTML($htmlData); ErrorHandler::stop(); if ($this->_doc->encoding === null) { // Document encoding is not recognized /** @todo improve HTML vs HTML fragment recognition */ if (preg_match('//i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) { // It's an HTML document // Add additional HEAD section and recognize document $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]); ErrorHandler::start(E_WARNING); $this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset)) . '' . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset))); ErrorHandler::stop(); // Remove additional HEAD section $xpath = new \DOMXPath($this->_doc); $head = $xpath->query('/html/head')->item(0); $head->parentNode->removeChild($head); } else { // It's an HTML fragment ErrorHandler::start(E_WARNING); $this->_doc->loadHTML('' . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData) . 
''); ErrorHandler::stop(); } } /** @todo Add correction of wrong HTML encoding recognition processing * The case is: * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used, * even $this->_doc->encoding demonstrates another recognized encoding */ $xpath = new \DOMXPath($this->_doc); $docTitle = ''; $titleNodes = $xpath->query('/html/head/title'); foreach ($titleNodes as $titleNode) { // title should always have only one entry, but we process all nodeset entries $docTitle .= $titleNode->nodeValue . ' '; } $this->addField(Field::Text('title', $docTitle, 'UTF-8')); $metaNodes = $xpath->query('/html/head/meta[@name]'); foreach ($metaNodes as $metaNode) { $this->addField(Field::Text($metaNode->getAttribute('name'), $metaNode->getAttribute('content'), 'UTF-8')); } $docBody = ''; $bodyNodes = $xpath->query('/html/body'); foreach ($bodyNodes as $bodyNode) { // body should always have only one entry, but we process all nodeset entries $this->_retrieveNodeText($bodyNode, $docBody); } if ($storeContent) { $this->addField(Field::Text('body', $docBody, 'UTF-8')); } else { $this->addField(Field::UnStored('body', $docBody, 'UTF-8')); } $linkNodes = $this->_doc->getElementsByTagName('a'); foreach ($linkNodes as $linkNode) { if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' ) ) { $this->_links[] = $href; } } $linkNodes = $this->_doc->getElementsByTagName('area'); foreach ($linkNodes as $linkNode) { if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' ) ) { $this->_links[] = $href; } } $this->_links = array_unique($this->_links); $linkNodes = $xpath->query('/html/head/link'); foreach ($linkNodes as $linkNode) { if (($href = $linkNode->getAttribute('href')) != '') { $this->_headerLinks[] = $href; } } $this->_headerLinks = array_unique($this->_headerLinks); } /** * 
Set exclude nofollow links flag * * @param boolean $newValue */ public static function setExcludeNoFollowLinks($newValue) { self::$_excludeNoFollowLinks = $newValue; } /** * Get exclude nofollow links flag * * @return boolean */ public static function getExcludeNoFollowLinks() { return self::$_excludeNoFollowLinks; } /** * Get node text * * We should exclude scripts, which may be not included into comment tags, CDATA sections, * * @param \DOMNode $node * @param string &$text */ private function _retrieveNodeText(\DOMNode $node, &$text) { if ($node->nodeType == XML_TEXT_NODE) { $text .= $node->nodeValue; if(!in_array($node->parentNode->tagName, $this->_inlineTags)) { $text .= ' '; } } elseif ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') { foreach ($node->childNodes as $childNode) { $this->_retrieveNodeText($childNode, $text); } } } /** * Get document HREF links * * @return array */ public function getLinks() { return $this->_links; } /** * Get document header links * * @return array */ public function getHeaderLinks() { return $this->_headerLinks; } /** * Load HTML document from a string * * @param string $data * @param boolean $storeContent * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. * @return \ZendSearch\Lucene\Document\HTML */ public static function loadHTML($data, $storeContent = false, $defaultEncoding = '') { return new self($data, false, $storeContent, $defaultEncoding); } /** * Load HTML document from a file * * @param string $file * @param boolean $storeContent * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. 
* @return \ZendSearch\Lucene\Document\HTML */ public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '') { return new self($file, true, $storeContent, $defaultEncoding); } /** * Highlight text in text node * * @param \DOMText $node * @param array $wordsToHighlight * @param callback $callback Callback method, used to transform (highlighting) text. * @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform) * @throws \ZendSearch\Lucene\Exception\RuntimeException */ protected function _highlightTextNode(\DOMText $node, $wordsToHighlight, $callback, $params) { $analyzer = Analyzer\Analyzer::getDefault(); $analyzer->setInput($node->nodeValue, 'UTF-8'); $matchedTokens = array(); while (($token = $analyzer->nextToken()) !== null) { if (isset($wordsToHighlight[$token->getTermText()])) { $matchedTokens[] = $token; } } if (count($matchedTokens) == 0) { return; } $matchedTokens = array_reverse($matchedTokens); foreach ($matchedTokens as $token) { // Cut text after matched token $node->splitText($token->getEndOffset()); // Cut matched node $matchedWordNode = $node->splitText($token->getStartOffset()); // Retrieve HTML string representation for highlihted word $fullCallbackparamsList = $params; array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue); $highlightedWordNodeSetHTML = call_user_func_array($callback, $fullCallbackparamsList); // Transform HTML string to a DOM representation and automatically transform retrieved string // into valid XHTML (It's automatically done by loadHTML() method) $highlightedWordNodeSetDomDocument = new \DOMDocument('1.0', 'UTF-8'); ErrorHandler::start(E_WARNING); $success = $highlightedWordNodeSetDomDocument-> loadHTML('' . $highlightedWordNodeSetHTML . 
''); ErrorHandler::stop(); if (!$success) { throw new RuntimeException("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHTML'."); } $highlightedWordNodeSetXpath = new \DOMXPath($highlightedWordNodeSetDomDocument); $highlightedWordNodeSet = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes; for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) { $nodeToImport = $highlightedWordNodeSet->item($count); $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */), $matchedWordNode); } $node->parentNode->removeChild($matchedWordNode); } } /** * highlight words in content of the specified node * * @param \DOMNode $contextNode * @param array $wordsToHighlight * @param callback $callback Callback method, used to transform (highlighting) text. * @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform) */ protected function _highlightNodeRecursive(\DOMNode $contextNode, $wordsToHighlight, $callback, $params) { $textNodes = array(); if (!$contextNode->hasChildNodes()) { return; } foreach ($contextNode->childNodes as $childNode) { if ($childNode->nodeType == XML_TEXT_NODE) { // process node later to leave childNodes structure untouched $textNodes[] = $childNode; } else { // Process node if it's not a script node if ($childNode->nodeName != 'script') { $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params); } } } foreach ($textNodes as $textNode) { $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params); } } /** * Standard callback method used to highlight words. * * @param string $stringToHighlight * @return string * @internal */ public function applyColour($stringToHighlight, $colour) { return '' . $stringToHighlight . 
''; } /** * Highlight text with specified color * * @param string|array $words * @param string $colour * @return string */ public function highlight($words, $colour = '#66ffff') { return $this->highlightExtended($words, array($this, 'applyColour'), array($colour)); } /** * Highlight text using specified View helper or callback function. * * @param string|array $words Words to highlight. Words could be organized using the array or string. * @param callback $callback Callback method, used to transform (highlighting) text. * @param array $params Array of additionall callback parameters passed through into it * (first non-optional parameter is an HTML fragment for highlighting) * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException * @return string */ public function highlightExtended($words, $callback, $params = array()) { if (!is_array($words)) { $words = array($words); } $wordsToHighlightList = array(); $analyzer = Analyzer\Analyzer::getDefault(); foreach ($words as $wordString) { $wordsToHighlightList[] = $analyzer->tokenize($wordString); } $wordsToHighlight = call_user_func_array('array_merge', $wordsToHighlightList); if (count($wordsToHighlight) == 0) { return $this->_doc->saveHTML(); } $wordsToHighlightFlipped = array(); foreach ($wordsToHighlight as $id => $token) { $wordsToHighlightFlipped[$token->getTermText()] = $id; } if (!is_callable($callback)) { throw new InvalidArgumentException('$viewHelper parameter mast be a View Helper name, View Helper object or callback.'); } $xpath = new \DOMXPath($this->_doc); $matchedNodes = $xpath->query("/html/body"); foreach ($matchedNodes as $matchedNode) { $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params); } } /** * Get HTML * * @return string */ public function getHTML() { return $this->_doc->saveHTML(); } /** * Get HTML body * * @return string */ public function getHTMLBody() { $xpath = new \DOMXPath($this->_doc); $bodyNodes = 
$xpath->query('/html/body')->item(0)->childNodes; $outputFragments = array(); for ($count = 0; $count < $bodyNodes->length; $count++) { $outputFragments[] = $this->_doc->saveXML($bodyNodes->item($count)); } return implode($outputFragments); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/Pptx.php000066400000000000000000000155721245775125600257470ustar00rootroot00000000000000open($fileName); // Read relations and search for officeDocument $relationsXml = $package->getFromName('_rels/.rels'); if ($relationsXml === false) { throw new RuntimeException('Invalid archive or corrupted .pptx file.'); } // Prevent php from loading remote resources $loadEntities = libxml_disable_entity_loader(true); $relations = simplexml_load_string($relationsXml); // Restore entity loader state libxml_disable_entity_loader($loadEntities); foreach ($relations->Relationship as $rel) { if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) { // Found office document! Search for slides... $slideRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) ); foreach ($slideRelations->Relationship as $slideRel) { if ($slideRel["Type"] == self::SCHEMA_SLIDERELATION) { // Found slide! $slides[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"])) ) ); // Search for slide notes $slideNotesRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")) ); foreach ($slideNotesRelations->Relationship as $slideNoteRel) { if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) { // Found slide notes! 
$slideNotes[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])) ) ); break; } } } } break; } } // Sort slides ksort($slides); ksort($slideNotes); // Extract contents from slides foreach ($slides as $slideKey => $slide) { // Register namespaces $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); // Fetch all text $textElements = $slide->xpath('//a:t'); foreach ($textElements as $textElement) { $documentBody[] = (string)$textElement; } // Extract contents from slide notes if (isset($slideNotes[$slideKey])) { // Fetch slide note $slideNote = $slideNotes[$slideKey]; // Register namespaces $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); // Fetch all text $textElements = $slideNote->xpath('//a:t'); foreach ($textElements as $textElement) { $documentBody[] = (string)$textElement; } } } // Read core properties $coreProperties = $this->extractMetaData($package); // Close file $package->close(); // Store filename $this->addField(Field::Text('filename', $fileName, 'UTF-8')); // Store contents if ($storeContent) { $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8')); } else { $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8')); } // Store meta data properties foreach ($coreProperties as $key => $value) { $this->addField(Field::Text($key, $value, 'UTF-8')); } // Store title (if not present in meta data) if (!isset($coreProperties['title'])) { $this->addField(Field::Text('title', $fileName, 'UTF-8')); } } /** * Load Pptx document from a file * * @param string $fileName * @param boolean $storeContent * @return \ZendSearch\Lucene\Document\Pptx */ public static 
function loadPptxFile($fileName, $storeContent = false) { return new self($fileName, $storeContent); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Document/Xlsx.php000066400000000000000000000214331245775125600257430ustar00rootroot00000000000000open($fileName); // Read relations and search for officeDocument $relationsXml = $package->getFromName('_rels/.rels'); if ($relationsXml === false) { throw new RuntimeException('Invalid archive or corrupted .xlsx file.'); } // Prevent php from loading remote resources $loadEntities = libxml_disable_entity_loader(true); $relations = simplexml_load_string($relationsXml); // Restore entity loader state libxml_disable_entity_loader($loadEntities); foreach ($relations->Relationship as $rel) { if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) { // Found office document! Read relations for workbook... $workbookRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) ); $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP); // Read shared strings $sharedStringsPath = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"); $sharedStringsPath = (string)$sharedStringsPath[0]['Target']; $xmlStrings = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . $sharedStringsPath)) ); if (isset($xmlStrings) && isset($xmlStrings->si)) { foreach ($xmlStrings->si as $val) { if (isset($val->t)) { $sharedStrings[] = (string)$val->t; } elseif (isset($val->r)) { $sharedStrings[] = $this->_parseRichText($val); } } } // Loop relations for workbook and extract worksheets... 
foreach ($workbookRelations->Relationship as $workbookRelation) { if ($workbookRelation["Type"] == self::SCHEMA_WORKSHEETRELATION) { $worksheets[ str_replace( 'rId', '', (string)$workbookRelation["Id"]) ] = simplexml_load_string( $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($workbookRelation["Target"]) . "/" . basename($workbookRelation["Target"])) ) ); } } break; } } // Sort worksheets ksort($worksheets); // Extract contents from worksheets foreach ($worksheets as $sheetKey => $worksheet) { foreach ($worksheet->sheetData->row as $row) { foreach ($row->c as $c) { // Determine data type $dataType = (string)$c["t"]; switch ($dataType) { case "s": // Value is a shared string if ((string)$c->v != '') { $value = $sharedStrings[intval($c->v)]; } else { $value = ''; } break; case "b": // Value is boolean $value = (string)$c->v; if ($value == '0') { $value = false; } elseif ($value == '1') { $value = true; } else { $value = (bool)$c->v; } break; case "inlineStr": // Value is rich text inline $value = $this->_parseRichText($c->is); break; case "e": // Value is an error message if ((string)$c->v != '') { $value = (string)$c->v; } else { $value = ''; } break; default: // Value is a string $value = (string)$c->v; // Check for numeric values if (is_numeric($value) && $dataType != 's') { if ($value == (int)$value) $value = (int)$value; elseif ($value == (float)$value) $value = (float)$value; elseif ($value == (double)$value) $value = (double)$value; } } $documentBody[] = $value; } } } // Read core properties $coreProperties = $this->extractMetaData($package); // Close file $package->close(); // Store filename $this->addField(Field::Text('filename', $fileName, 'UTF-8')); // Store contents if ($storeContent) { $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8')); } else { $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8')); } // Store meta data properties foreach ($coreProperties as $key => 
$value) { $this->addField(Field::Text($key, $value, 'UTF-8')); } // Store title (if not present in meta data) if (!isset($coreProperties['title'])) { $this->addField(Field::Text('title', $fileName, 'UTF-8')); } } /** * Parse rich text XML * * @param \SimpleXMLElement $is * @return string */ private function _parseRichText($is = null) { $value = array(); if (isset($is->t)) { $value[] = (string)$is->t; } else { foreach ($is->r as $run) { $value[] = (string)$run->t; } } return implode('', $value); } /** * Load Xlsx document from a file * * @param string $fileName * @param boolean $storeContent * @return \ZendSearch\Lucene\Document\Xlsx */ public static function loadXlsxFile($fileName, $storeContent = false) { return new self($fileName, $storeContent); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Exception/000077500000000000000000000000001245775125600244515ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Exception/ExceptionInterface.php000066400000000000000000000010101245775125600307310ustar00rootroot00000000000000_object = $object; $this->_method = $method; } public function doAction() { $methodName = $this->_method; $this->_object->$methodName(); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index.php000066400000000000000000001145561245775125600243070ustar00rootroot00000000000000getFileObject('segments.gen', false); $format = $genFile->readInt(); if ($format != (int)0xFFFFFFFE) { throw new RuntimeException('Wrong segments.gen file format'); } $gen1 = $genFile->readLong(); $gen2 = $genFile->readLong(); if ($gen1 == $gen2) { return $gen1; } usleep(self::GENERATION_RETRIEVE_PAUSE * 1000); } // All passes are failed throw new RuntimeException('Index is under processing now'); } catch (\Exception $e) { if (strpos($e->getMessage(), 'is not readable') !== false) { try { // Try to open old style segments file $segmentsFile = $directory->getFileObject('segments', false); // It's pre-2.1 index return 0; } catch 
(\Exception $e) { if (strpos($e->getMessage(), 'is not readable') !== false) { return -1; } else { throw new RuntimeException($e->getMessage(), $e->getCode(), $e); } } } else { throw new RuntimeException($e->getMessage(), $e->getCode(), $e); } } return -1; } /** * Get generation number associated with this index instance * * The same generation number in pair with document number or query string * guarantees to give the same result while index retrieving. * So it may be used for search result caching. * * @return integer */ public function getGeneration() { return $this->_generation; } /** * Get segments file name * * @param integer $generation * @return string */ public static function getSegmentFileName($generation) { if ($generation == 0) { return 'segments'; } return 'segments_' . base_convert($generation, 10, 36); } /** * Get index format version * * @return integer */ public function getFormatVersion() { return $this->_formatVersion; } /** * Set index format version. * Index is converted to this format at the nearest upfdate time * * @param int $formatVersion * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function setFormatVersion($formatVersion) { if ($formatVersion != self::FORMAT_PRE_2_1 && $formatVersion != self::FORMAT_2_1 && $formatVersion != self::FORMAT_2_3) { throw new InvalidArgumentException('Unsupported index format'); } $this->_formatVersion = $formatVersion; } /** * Read segments file for pre-2.1 Lucene index format * * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException */ private function _readPre21SegmentsFile() { $segmentsFile = $this->_directory->getFileObject('segments'); $format = $segmentsFile->readInt(); if ($format != (int)0xFFFFFFFF) { throw new InvalidFileFormatException('Wrong segments file format'); } // read version $segmentsFile->readLong(); // read segment name counter $segmentsFile->readInt(); $segments = $segmentsFile->readInt(); $this->_docCount = 0; // read segmentInfos for ($count = 0; 
$count < $segments; $count++) { $segName = $segmentsFile->readString(); $segSize = $segmentsFile->readInt(); $this->_docCount += $segSize; $this->_segmentInfos[$segName] = new Index\SegmentInfo($this->_directory, $segName, $segSize); } // Use 2.1 as a target version. Index will be reorganized at update time. $this->_formatVersion = self::FORMAT_2_1; } /** * Read segments file * * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException * @throws \ZendSearch\Lucene\Exception\RuntimeException */ private function _readSegmentsFile() { $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation)); $format = $segmentsFile->readInt(); if ($format == (int)0xFFFFFFFC) { $this->_formatVersion = self::FORMAT_2_3; } elseif ($format == (int)0xFFFFFFFD) { $this->_formatVersion = self::FORMAT_2_1; } else { throw new InvalidFileFormatException('Unsupported segments file format'); } // read version $segmentsFile->readLong(); // read segment name counter $segmentsFile->readInt(); $segments = $segmentsFile->readInt(); $this->_docCount = 0; // read segmentInfos for ($count = 0; $count < $segments; $count++) { $segName = $segmentsFile->readString(); $segSize = $segmentsFile->readInt(); // 2.1+ specific properties $delGen = $segmentsFile->readLong(); if ($this->_formatVersion == self::FORMAT_2_3) { $docStoreOffset = $segmentsFile->readInt(); if ($docStoreOffset != (int)0xFFFFFFFF) { $docStoreSegment = $segmentsFile->readString(); $docStoreIsCompoundFile = $segmentsFile->readByte(); $docStoreOptions = array('offset' => $docStoreOffset, 'segment' => $docStoreSegment, 'isCompound' => ($docStoreIsCompoundFile == 1)); } else { $docStoreOptions = null; } } else { $docStoreOptions = null; } $hasSingleNormFile = $segmentsFile->readByte(); $numField = $segmentsFile->readInt(); $normGens = array(); if ($numField != (int)0xFFFFFFFF) { for ($count1 = 0; $count1 < $numField; $count1++) { $normGens[] = $segmentsFile->readLong(); } throw new RuntimeException( 
'Separate norm files are not supported. Optimize index to use it with ZendSearch\Lucene.' ); } $isCompoundByte = $segmentsFile->readByte(); if ($isCompoundByte == 0xFF) { // The segment is not a compound file $isCompound = false; } elseif ($isCompoundByte == 0x00) { // The status is unknown $isCompound = null; } elseif ($isCompoundByte == 0x01) { // The segment is a compound file $isCompound = true; } $this->_docCount += $segSize; $this->_segmentInfos[$segName] = new Index\SegmentInfo($this->_directory, $segName, $segSize, $delGen, $docStoreOptions, $hasSingleNormFile, $isCompound); } } /** * Opens the index. * * IndexReader constructor needs Directory as a parameter. It should be * a string with a path to the index folder or a Directory object. * * @param \ZendSearch\Lucene\Storage\Directory\Filesystem|string $directory * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException * @throws \ZendSearch\Lucene\Exception\RuntimeException */ public function __construct($directory = null, $create = false) { if ($directory === null) { throw new InvalidArgumentException('No index directory specified'); } if (is_string($directory)) { $this->_directory = new Directory\Filesystem($directory); $this->_closeDirOnExit = true; } else { $this->_directory = $directory; $this->_closeDirOnExit = false; } $this->_segmentInfos = array(); // Mark index as "under processing" to prevent other processes from premature index cleaning LockManager::obtainReadLock($this->_directory); $this->_generation = self::getActualGeneration($this->_directory); if ($create) { try { LockManager::obtainWriteLock($this->_directory); } catch (\Exception $e) { LockManager::releaseReadLock($this->_directory); if (strpos($e->getMessage(), 'Can\'t obtain exclusive index lock') === false) { throw new RuntimeException($e->getMessage(), $e->getCode(), $e); } else { throw new RuntimeException('Can\'t create index. 
It\'s under processing now', 0, $e); } } if ($this->_generation == -1) { // Directory doesn't contain existing index, start from 1 $this->_generation = 1; $nameCounter = 0; } else { // Directory contains existing index $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation)); $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) $nameCounter = $segmentsFile->readInt(); $this->_generation++; } Index\Writer::createIndex($this->_directory, $this->_generation, $nameCounter); LockManager::releaseWriteLock($this->_directory); } if ($this->_generation == -1) { throw new RuntimeException('Index doesn\'t exists in the specified directory.'); } elseif ($this->_generation == 0) { $this->_readPre21SegmentsFile(); } else { $this->_readSegmentsFile(); } } /** * Object destructor */ public function __destruct() { $this->commit(); // Release "under processing" flag LockManager::releaseReadLock($this->_directory); if ($this->_closeDirOnExit) { $this->_directory->close(); } $this->_directory = null; $this->_writer = null; $this->_segmentInfos = null; } /** * Returns an instance of Zend_Search_Lucene_Index_Writer for the index * * @return \ZendSearch\Lucene\Index\Writer */ private function _getIndexWriter() { if ($this->_writer === null) { $this->_writer = new Index\Writer($this->_directory, $this->_segmentInfos, $this->_formatVersion); } return $this->_writer; } /** * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. * * @return \ZendSearch\Lucene\Storage\Directory\DirectoryInterface */ public function getDirectory() { return $this->_directory; } /** * Returns the total number of documents in this index (including deleted documents). * * @return integer */ public function count() { return $this->_docCount; } /** * Returns one greater than the largest possible document number. 
* This may be used to, e.g., determine how big to allocate a structure which will have
     * an element for every document number in an index.
     *
     * @return integer
     */
    public function maxDoc()
    {
        $upperBound = $this->count();

        return $upperBound;
    }

    /**
     * Counts the documents that are not marked as deleted, summed over all segments.
     *
     * @return integer
     */
    public function numDocs()
    {
        $total = 0;

        foreach ($this->_segmentInfos as $segment) {
            $total = $total + $segment->numDocs();
        }

        return $total;
    }

    /**
     * Tells whether the document with the given index-wide id is marked as deleted.
     *
     * @param integer $id
     * @return boolean
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function isDeleted($id)
    {
        if ($id >= $this->_docCount) {
            throw new OutOfRangeException('Document id is out of the range.');
        }

        // Walk the segments until the one whose id range contains $id is reached
        $firstDocId = 0;
        $segment    = null;
        foreach ($this->_segmentInfos as $segment) {
            $segmentSize = $segment->count();

            if ($id < $firstDocId + $segmentSize) {
                break;
            }

            $firstDocId += $segmentSize;
        }

        if ($segment === null) {
            // No segment was visited at all
            return false;
        }

        return $segment->isDeleted($id - $firstDocId);
    }

    /**
     * Reads the maxBufferedDocs option from the index writer.
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * Default value is 10
     *
     * @return integer
     */
    public function getMaxBufferedDocs()
    {
        $writer = $this->_getIndexWriter();

        return $writer->maxBufferedDocs;
    }

    /**
     * Stores the maxBufferedDocs option on the index writer.
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * Default value is 10
     *
     * @param integer $maxBufferedDocs
     */
    public function setMaxBufferedDocs($maxBufferedDocs)
    {
        $writer = $this->_getIndexWriter();
        $writer->maxBufferedDocs = $maxBufferedDocs;
    }

    /**
     * Retrieve index maxMergeDocs option
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
     *
     * Default value is PHP_INT_MAX
     *
     * @return integer
     */
    public function getMaxMergeDocs()
    {
        $writer = $this->_getIndexWriter();

        return $writer->maxMergeDocs;
    }

    /**
     * Stores the maxMergeDocs option on the index writer.
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
     * Larger values are best for batched indexing and speedier searches.
     *
     * Default value is PHP_INT_MAX
     *
     * @param integer $maxMergeDocs
     */
    public function setMaxMergeDocs($maxMergeDocs)
    {
        $writer = $this->_getIndexWriter();
        $writer->maxMergeDocs = $maxMergeDocs;
    }

    /**
     * Reads the mergeFactor option from the index writer.
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * Default value is 10
     *
     * @return integer
     */
    public function getMergeFactor()
    {
        $writer = $this->_getIndexWriter();

        return $writer->mergeFactor;
    }

    /**
     * Set index mergeFactor option
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
*
     * Default value is 10
     *
     * @param integer $mergeFactor
     */
    public function setMergeFactor($mergeFactor)
    {
        $this->_getIndexWriter()->mergeFactor = $mergeFactor;
    }

    /**
     * Performs a query against the index and returns an array of QueryHit objects.
     *
     * Pending changes are committed before the query is executed. Documents whose
     * score is 0 are not reported, and collection stops once
     * Lucene::getResultSetLimit() hits have been gathered (0 means no limit).
     * If the best raw score is greater than 1, every hit's score is divided by it.
     *
     * With a single argument the hits are sorted by score (descending), then by
     * document id. Any further arguments describe the sort order instead:
     * a field name (or 'score'), optionally followed by one or two integer
     * array_multisort() flags (sort type and/or sort order), repeated for
     * as many fields as needed. Document id is always the final tie-breaker.
     *
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery|string $query
     * @return array of \ZendSearch\Lucene\Search\QueryHit (empty array if nothing matched)
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if $query is neither a string nor a query object
     * @throws \ZendSearch\Lucene\Exception\RuntimeException on a non-string or unknown sort field name
     */
    public function find($query)
    {
        if (is_string($query)) {
            $query = Search\QueryParser::parse($query);
        } elseif (!$query instanceof Search\Query\AbstractQuery) {
            throw new InvalidArgumentException('Query must be a string or ZendSearch\Lucene\Search\Query object');
        }

        $this->commit();

        // Three parallel lists; array_multisort() below reorders them together
        $hits   = array();
        $scores = array();
        $ids    = array();

        $query = $query->rewrite($this)->optimize($this);

        $query->execute($this);

        $topScore = 0;

        $resultSetLimit = Lucene::getResultSetLimit();
        foreach ($query->matchedDocs() as $id => $num) {
            $docScore = $query->score($id, $this);
            if( $docScore != 0 ) {
                $hit = new Search\QueryHit($this);
                $hit->document_id = $hit->id = $id;
                $hit->score = $docScore;

                $hits[]   = $hit;
                $ids[]    = $id;
                $scores[] = $docScore;

                if ($docScore > $topScore) {
                    $topScore = $docScore;
                }
            }

            if ($resultSetLimit != 0  &&  count($hits) >= $resultSetLimit) {
                break;
            }
        }

        if (count($hits) == 0) {
            // skip sorting, which may cause a error on empty index
            return array();
        }

        // Normalize the hit objects only; $scores keeps the raw values, which
        // does not affect the ordering produced below
        if ($topScore > 1) {
            foreach ($hits as $hit) {
                $hit->score /= $topScore;
            }
        }

        if (func_num_args() == 1) {
            // sort by scores
            array_multisort($scores, SORT_DESC, SORT_NUMERIC,
                            $ids,    SORT_ASC,  SORT_NUMERIC,
                            $hits);
        } else {
            // sort by given field names

            $argList    = func_get_args();
            $fieldNames = $this->getFieldNames();
            $sortArgs   = array();

            // PHP 5.3 now expects all arguments to array_multisort be passed by
            // reference (if it's invoked through call_user_func_array());
            // since constants can't be passed by reference, create some placeholder variables.
            $sortReg    = SORT_REGULAR;
            $sortAsc    = SORT_ASC;
            $sortNum    = SORT_NUMERIC;

            $sortFieldValues = array();

            // $argList[0] is the query itself, sort arguments start at index 1
            for ($count = 1; $count < count($argList); $count++) {
                $fieldName = $argList[$count];

                if (!is_string($fieldName)) {
                    throw new RuntimeException('Field name must be a string.');
                }

                if (strtolower($fieldName) == 'score') {
                    $sortArgs[] = &$scores;
                } else {
                    if (!in_array($fieldName, $fieldNames)) {
                        throw new RuntimeException('Wrong field name.');
                    }

                    if (!isset($sortFieldValues[$fieldName])) {
                        $valuesArray = array();
                        foreach ($hits as $hit) {
                            try {
                                $value = $hit->getDocument()->getFieldValue($fieldName);
                            } catch (\Exception $e) {
                                // A document that simply lacks the field sorts as null;
                                // any other failure is propagated
                                if (strpos($e->getMessage(), 'not found') === false) {
                                    throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
                                } else {
                                    $value = null;
                                }
                            }

                            $valuesArray[] = $value;
                        }

                        // Collect loaded values in $sortFieldValues
                        // Required for PHP 5.3 which translates references into values when source
                        // variable is destroyed
                        $sortFieldValues[$fieldName] = $valuesArray;
                    }

                    $sortArgs[] = &$sortFieldValues[$fieldName];
                }

                // Consume up to two integer flags following the field name
                if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
                    $count++;
                    $sortArgs[] = &$argList[$count];

                    if ($count + 1 < count($argList)  &&  is_integer($argList[$count+1])) {
                        $count++;
                        $sortArgs[] = &$argList[$count];
                    } else {
                        // Only one flag given: supply the default for the missing one
                        if ($argList[$count] == SORT_ASC  || $argList[$count] == SORT_DESC) {
                            $sortArgs[] = &$sortReg;
                        } else {
                            $sortArgs[] = &$sortAsc;
                        }
                    }
                } else {
                    $sortArgs[] = &$sortAsc;
                    $sortArgs[] = &$sortReg;
                }
            }

            // Sort by id's if values are equal
            $sortArgs[] = &$ids;
            $sortArgs[] = &$sortAsc;
            $sortArgs[] = &$sortNum;

            // Array to be sorted
            $sortArgs[] = &$hits;

            // Do sort
            call_user_func_array('array_multisort', $sortArgs);
        }

        return $hits;
    }

    /**
     * Returns a list of all unique field names that exist in this index.
*
     * @param boolean $indexed
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        $names = array();

        foreach ($this->_segmentInfos as $segment) {
            $segmentFields = $segment->getFields($indexed);
            $names         = array_merge($names, $segmentFields);
        }

        return $names;
    }

    /**
     * Loads the stored fields of one document and returns them as a Document object.
     *
     * $id is the index-wide document number; a QueryHit may be passed instead,
     * in which case its id is used.
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @return \ZendSearch\Lucene\Document
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function getDocument($id)
    {
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        if ($id >= $this->_docCount) {
            throw new OutOfRangeException('Document id is out of the range.');
        }

        // Locate the segment holding the document and the first id of that segment
        $firstDocId = 0;
        foreach ($this->_segmentInfos as $segment) {
            $segmentSize = $segment->count();

            if ($id < $firstDocId + $segmentSize) {
                break;
            }

            $firstDocId += $segmentSize;
        }
        $localId = $id - $firstDocId;

        // The .fdx file holds one 8-byte pointer into the .fdt file per document
        $indexFile = $segment->openCompoundFile('.fdx');
        $indexFile->seek($localId * 8, SEEK_CUR);
        $valuesPosition = $indexFile->readLong();

        $dataFile = $segment->openCompoundFile('.fdt');
        $dataFile->seek($valuesPosition, SEEK_CUR);
        $totalFields = $dataFile->readVInt();

        $document = new Document();
        for ($read = 0; $read < $totalFields; $read++) {
            $fieldNumber = $dataFile->readVInt();
            $flags       = $dataFile->readByte();

            $info = $segment->getField($fieldNumber);

            if ($flags & 2) {
                // Binary data
                $field = new Document\Field(
                    $info->name,
                    $dataFile->readBinary(),
                    '',
                    true,
                    $info->isIndexed,
                    $flags & 1,
                    true
                );
            } else {
                // Text data
                $field = new Document\Field(
                    $info->name,
                    $dataFile->readString(),
                    'UTF-8',
                    true,
                    $info->isIndexed,
                    $flags & 1
                );
            }

            $document->addField($field);
        }

        return $document;
    }

    /**
     * Returns true if index contain documents with specified term.
     *
     * Is used for query optimization.
*
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return boolean
     */
    public function hasTerm(Index\Term $term)
    {
        foreach ($this->_segmentInfos as $segInfo) {
            if ($segInfo->getTermInfo($term) !== null) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns IDs of all documents containing term.
     *
     * Each segment is asked for its matches with the segment's starting
     * document id as offset; the per-segment lists are then concatenated.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termDocs(Index\Term $term, $docsFilter = null)
    {
        $subResults = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);

            $segmentStartDocId += $segmentInfo->count();
        }

        if (count($subResults) == 0) {
            return array();
        } elseif (count($subResults) == 1) {
            // Index is optimized (only one segment)
            // Do not perform array reindexing
            return reset($subResults);
        } else {
            $result = call_user_func_array('array_merge', $subResults);
        }

        return $result;
    }

    /**
     * Returns IDs of all documents containing term.
     *
     * NOTE(review): despite its name this method has always returned the same
     * plain list of document ids as termDocs(); it never built a DocsFilter
     * object. The return value is left as it was so existing callers keep
     * working - confirm the intended contract before changing it.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termDocsFilter(Index\Term $term, $docsFilter = null)
    {
        // Must start out as an array: with no segments the loop below never runs
        // and count() would otherwise be handed an undefined variable.
        $subResults = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);

            $segmentStartDocId += $segmentInfo->count();
        }

        if (count($subResults) == 0) {
            return array();
        } elseif (count($subResults) == 1) {
            // Index is optimized (only one segment)
            // Do not perform array reindexing
            return reset($subResults);
        }

        return call_user_func_array('array_merge', $subResults);
    }

    /**
     * Returns an array of all term freqs.
     * Result array structure: array(docId => freq, ...)
*
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termFreqs(Index\Term $term, $docsFilter = null)
    {
        $freqs      = array();
        $firstDocId = 0;

        foreach ($this->_segmentInfos as $segment) {
            // Array union: entries already collected are never overwritten
            $freqs = $freqs + $segment->termFreqs($term, $firstDocId, $docsFilter);

            $firstDocId = $firstDocId + $segment->count();
        }

        return $freqs;
    }

    /**
     * Collects the positions of the term in every document that contains it.
     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termPositions(Index\Term $term, $docsFilter = null)
    {
        $positions  = array();
        $firstDocId = 0;

        foreach ($this->_segmentInfos as $segment) {
            // Array union: entries already collected are never overwritten
            $positions = $positions + $segment->termPositions($term, $firstDocId, $docsFilter);

            $firstDocId = $firstDocId + $segment->count();
        }

        return $positions;
    }

    /**
     * Sums, over all segments, the document frequency recorded for the $term.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return integer
     */
    public function docFreq(Index\Term $term)
    {
        $total = 0;

        foreach ($this->_segmentInfos as $segment) {
            $info = $segment->getTermInfo($term);

            if ($info === null) {
                // Term is absent from this segment
                continue;
            }

            $total += $info->docFreq;
        }

        return $total;
    }

    /**
     * Retrieve the similarity used by the index reader (the default one).
     *
     * @return \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity
     */
    public function getSimilarity()
    {
        $similarity = AbstractSimilarity::getDefault();

        return $similarity;
    }

    /**
     * Returns a normalization factor for "field, document" pair.
*
     * @param integer $id
     * @param string $fieldName
     * @return float
     */
    public function norm($id, $fieldName)
    {
        if ($id >= $this->_docCount) {
            return null;
        }

        // Find the segment containing the document
        $firstDocId = 0;
        foreach ($this->_segmentInfos as $segment) {
            $segmentSize = $segment->count();

            if ($id < $firstDocId + $segmentSize) {
                break;
            }

            $firstDocId += $segmentSize;
        }
        $localId = $id - $firstDocId;

        // Deleted documents get a zero norm
        return $segment->isDeleted($localId)
            ? 0
            : $segment->norm($localId, $fieldName);
    }

    /**
     * Tells whether at least one segment reports deleted documents.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        $found = false;

        foreach ($this->_segmentInfos as $segment) {
            if ($segment->hasDeletions()) {
                $found = true;
                break;
            }
        }

        return $found;
    }

    /**
     * Marks a document as deleted in the segment that holds it.
     * $id is an internal document id
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException
     */
    public function delete($id)
    {
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        if ($id >= $this->_docCount) {
            throw new OutOfRangeException('Document id is out of the range.');
        }

        // Find the segment containing the document
        $firstDocId = 0;
        foreach ($this->_segmentInfos as $segment) {
            $segmentSize = $segment->count();

            if ($id < $firstDocId + $segmentSize) {
                break;
            }

            $firstDocId += $segmentSize;
        }

        $segment->delete($id - $firstDocId);

        // Remember that commit() has work to do
        $this->_hasChanges = true;
    }

    /**
     * Hands a document to the index writer and counts it.
     *
     * @param \ZendSearch\Lucene\Document $document
     */
    public function addDocument(Document $document)
    {
        $writer = $this->_getIndexWriter();
        $writer->addDocument($document);

        $this->_docCount  += 1;
        $this->_hasChanges = true;
    }

    /**
     * Recompute the document counter from the current segment list
     */
    private function _updateDocCount()
    {
        $total = 0;

        foreach ($this->_segmentInfos as $segment) {
            $total += $segment->count();
        }

        $this->_docCount = $total;
    }

    /**
     * Commit changes resulting from delete() or undeleteAll() operations.
     *
     * @todo undeleteAll processing.
*/
    public function commit()
    {
        if (!$this->_hasChanges) {
            // Nothing pending
            return;
        }

        $writer = $this->_getIndexWriter();
        $writer->commit();

        $this->_updateDocCount();
        $this->_hasChanges = false;
    }

    /**
     * Optimize index.
     *
     * Commits first, then asks the writer to merge all segments into one.
     * Nothing is done when there is at most one segment and it has no deletions.
     */
    public function optimize()
    {
        // Commit changes if any changes have been made
        $this->commit();

        if (count($this->_segmentInfos) <= 1 && !$this->hasDeletions()) {
            return;
        }

        $writer = $this->_getIndexWriter();
        $writer->optimize();

        $this->_updateDocCount();
    }

    /**
     * Returns an array of all terms in this index.
     *
     * The per-segment term streams are merged through a priority queue;
     * a term present in several segments is reported once.
     *
     * @return array
     */
    public function terms()
    {
        $terms = array();
        $queue = new Index\TermsPriorityQueue();

        foreach ($this->_segmentInfos as $segment) {
            $segment->resetTermsStream();

            // Skip "empty" segments
            if ($segment->currentTerm() === null) {
                continue;
            }

            $queue->put($segment);
        }

        while (($segment = $queue->pop()) !== null) {
            $following = $queue->top();
            $current   = $segment->currentTerm();

            // Report the term only when the next stream in the queue is not at the same one
            if ($following === null || $following->currentTerm()->key() != $current->key()) {
                $terms[] = $current;
            }

            if ($segment->nextTerm() !== null) {
                // Stream is not exhausted, put segment back into the priority queue
                $queue->put($segment);
            }
        }

        return $terms;
    }

    /**
     * Terms stream priority queue object
     *
     * @var \ZendSearch\Lucene\TermStreamsPriorityQueue
     */
    private $_termsStream = null;

    /**
     * Reset terms stream.
     *
     * Creates the stream over the current segments on first use,
     * rewinds the existing one otherwise.
     */
    public function resetTermsStream()
    {
        if ($this->_termsStream !== null) {
            $this->_termsStream->resetTermsStream();
            return;
        }

        $this->_termsStream = new TermStreamsPriorityQueue($this->_segmentInfos);
    }

    /**
     * Skip terms stream up to specified term preffix.
*
     * Prefix contains fully specified field info and portion of searched term
     *
     * @param \ZendSearch\Lucene\Index\Term $prefix
     */
    public function skipTo(Index\Term $prefix)
    {
        $stream = $this->_termsStream;
        $stream->skipTo($prefix);
    }

    /**
     * Advances the terms stream and returns the term it moved to
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function nextTerm()
    {
        $stream = $this->_termsStream;

        return $stream->nextTerm();
    }

    /**
     * Returns the term the terms stream is currently positioned at
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function currentTerm()
    {
        $stream = $this->_termsStream;

        return $stream->currentTerm();
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     */
    public function closeTermsStream()
    {
        $stream = $this->_termsStream;
        $stream->closeTermsStream();

        $this->_termsStream = null;
    }


    /*************************************************************************
    @todo UNIMPLEMENTED
    *************************************************************************/
    /**
     * Undeletes all documents currently marked as deleted in this index.
     *
     * @todo Implementation
     */
    public function undeleteAll()
    {}
}
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index/000077500000000000000000000000001245775125600235625ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index/DictionaryLoader.php000066400000000000000000000222531245775125600275330ustar00rootroot00000000000000.tii index file data and * returns two arrays - term and tremInfo lists. 
*
     * See \ZendSearch\Lucene\Index\SegmentInfo class for details
     *
     * The result is array($termDictionary, $termInfos) where each dictionary
     * entry is array($termFieldNum, $termValue) and each info entry is
     * array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
     *
     * @param string $data
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos = 0;

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos]) << 56  |
                              ord($data[$pos+1]) << 48  |
                              ord($data[$pos+2]) << 40  |
                              ord($data[$pos+3]) << 32  |
                              ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers: the upper half of the long must be empty
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                throw new InvalidFileFormatException('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        //                  $tiiFile->readInt();  // IndexInterval
        $pos += 4;

        // $skipInterval   = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            throw new InvalidFileFormatException('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            // Number of leading characters shared with the previous term
            $termPrefixLength = self::_readVInt($data, $pos);

            // $termSuffix = $tiiFile->readString();
            // The stored length counts characters; bytes of multi-byte characters
            // are appended while scanning.
            $len = self::_readVInt($data, $pos);
            if ($len == 0) {
                $termSuffix = '';
            } else {
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20 ) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if (ord($termSuffix[$count1]) & 0x10 ) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        // NOTE(review): assigning integer 0 to a string offset stores
                        // the character "0", not a NUL byte. Left as is because the
                        // other term readers of the library may rely on the same
                        // result - confirm before changing.
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80   ) {
                            $termSuffix[$count1] = 0;
                            $termSuffix = substr($termSuffix,0,$count1+1)
                                        . substr($termSuffix,$count1+2);
                        }
                        $count1 += $addBytes;
                    }
                }
            }

            // Convert the shared prefix length from characters ($pc) to bytes ($pb)
            $pb = 0; $pc = 0;
            while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20 ) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                // The character must fit into the previous term itself. The bound
                // used to be the length of the whole data buffer, which made this
                // check unreachable for any real input.
                if ($pb + $charBytes > strlen($prevTerm)) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            $termFieldNum = self::_readVInt($data, $pos);
            $docFreq      = self::_readVInt($data, $pos);

            // Pointers are stored as deltas against the previous entry
            $freqPointer += self::_readVInt($data, $pos);
            $proxPointer += self::_readVInt($data, $pos);

            if( $docFreq >= $skipInterval ) {
                $skipDelta = self::_readVInt($data, $pos);
            } else {
                $skipDelta = 0;
            }

            $indexPointer += self::_readVInt($data, $pos);

            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] = array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }

    /**
     * Decodes one variable-length integer from $data starting at $pos.
     *
     * Each byte contributes its low 7 bits, least significant group first;
     * a set high bit means another byte follows. $pos is advanced past the
     * bytes consumed.
     *
     * @param string  $data
     * @param integer $pos  read position, updated by reference
     * @return integer
     */
    private static function _readVInt($data, &$pos)
    {
        $nbyte = ord($data[$pos++]);
        $value = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte  = ord($data[$pos++]);
            $value |= ($nbyte & 0x7F) << $shift;
        }

        return $value;
    }
}
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index/DocsFilter.php000066400000000000000000000032041245775125600263300ustar00rootroot00000000000000 => array( => , * => , * => , * ... ), * => array( => , * => , * => , * ... ), * => array( => , * => , * => , * ... ), * ... * ) * * @var array */ public $segmentFilters = array(); } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index/FieldInfo.php000066400000000000000000000020121245775125600261250ustar00rootroot00000000000000name = $name; $this->isIndexed = $isIndexed; $this->number = $number; $this->storeTermVector = $storeTermVector; $this->normsOmitted = $normsOmitted; $this->payloadsStored = $payloadsStored; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Index/SegmentInfo.php000066400000000000000000002140221245775125600265120ustar00rootroot00000000000000 $termValue * [1] -> $termFieldNum * * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos * * @var array */ private $_termDictionary; /** * Term Dictionary Index TermInfos * * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because * of performance considerations) * [0] -> $docFreq * [1] -> $freqPointer * [2] -> $proxPointer * [3] -> $skipOffset * [4] -> $indexPointer * * @var array */ private $_termDictionaryInfos; /** * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment * * @var array */ private $_fields; /** * Field positions in a dictionary. * (Term dictionary contains filelds ordered by names) * * @var array */ private $_fieldsDicPositions; /** * Associative array where the key is the file name and the value is data offset * in a compound segment file (.csf). * * @var array */ private $_segFiles; /** * Associative array where the key is the file name and the value is file size (.csf). 
*
     * @var array
     */
    private $_segFileSizes;

    /**
     * Delete file generation number
     *
     * -2 means autodetect latest delete generation
     * -1 means 'there is no delete file'
     *  0 means pre-2.1 format delete file
     *  X specifies used delete file
     *
     * @var integer
     */
    private $_delGen;

    /**
     * Segment has single norms file
     *
     * If true then one .nrm file is used for all fields
     * Otherwise .fN files are used
     *
     * @var boolean
     */
    private $_hasSingleNormFile;

    /**
     * Use compound segment file (*.cfs) to collect all other segment files
     * (excluding .del files)
     *
     * @var boolean
     */
    private $_isCompound;


    /**
     * File system adapter.
     *
     * @var \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
     */
    private $_directory;

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity::encodeNorm())
     *
     * @var array
     */
    private $_norms = array();

    /**
     * List of deleted documents.
     * bitset if bitset extension is loaded or array otherwise.
     * null until a deletions file has been loaded or when there is none.
     *
     * @var mixed
     */
    private $_deleted = null;

    /**
     * $this->_deleted update flag
     *
     * @var boolean
     */
    private $_deletedDirty = false;

    /**
     * True if segment uses shared doc store
     *
     * @var boolean
     */
    private $_usesSharedDocStore;

    /**
     * Shared doc store options.
     * It's an associative array with the following items:
     * - 'offset'     => $docStoreOffset           The starting document in the shared doc store files where this segment's documents begin
     * - 'segment'    => $docStoreSegment          The name of the segment that has the shared doc store files.
     * - 'isCompound' => $docStoreIsCompoundFile   True, if compound file format is used for the shared doc store files (.cfx file).
*/
    private $_sharedDocStoreOptions;


    /**
     * SegmentInfo constructor
     *
     * Reads the segment's metadata from the directory: the file table of the
     * shared doc store compound file (if one is used), the file table of the
     * segment's own compound file (if one is used), the field infos from the
     * '.fnm' file and finally the deletions file.
     *
     * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
     * @param string     $name               segment name, used as file name prefix
     * @param integer    $docCount
     * @param integer    $delGen             delete file generation (-2 autodetect, -1 none, 0 pre-2.1 format)
     * @param array|null $docStoreOptions    shared doc store options ('offset', 'segment', 'isCompound') or null
     * @param boolean    $hasSingleNormFile
     * @param boolean    $isCompound         null means "unknown": the '.cfs' file is probed instead
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    public function __construct(Directory\DirectoryInterface $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
        $this->_docCount  = $docCount;

        if ($docStoreOptions !== null) {
            $this->_usesSharedDocStore    = true;
            $this->_sharedDocStoreOptions = $docStoreOptions;

            if ($docStoreOptions['isCompound']) {
                // Read the file table of the shared doc store compound file:
                // a count followed by (data offset, file name) pairs
                $cfxFile       = $this->_directory->getFileObject($docStoreOptions['segment'] . '.cfx');
                $cfxFilesCount = $cfxFile->readVInt();

                $cfxFiles     = array();
                $cfxFileSizes = array();

                for ($count = 0; $count < $cfxFilesCount; $count++) {
                    $dataOffset = $cfxFile->readLong();
                    if ($count != 0) {
                        // Size of the previous file = this offset - previous offset
                        $cfxFileSizes[$fileName] = $dataOffset - end($cfxFiles);
                    }
                    $fileName            = $cfxFile->readString();
                    $cfxFiles[$fileName] = $dataOffset;
                }
                if ($count != 0) {
                    // The last file extends to the end of the compound file
                    $cfxFileSizes[$fileName] = $this->_directory->fileLength($docStoreOptions['segment'] . '.cfx') - $dataOffset;
                }

                $this->_sharedDocStoreOptions['files']     = $cfxFiles;
                $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
            }
        }

        $this->_hasSingleNormFile = $hasSingleNormFile;
        $this->_delGen            = $delGen;
        $this->_termDictionary    = null;


        if ($isCompound !== null) {
            $this->_isCompound    = $isCompound;
        } else {
            // It's a pre-2.1 segment or isCompound is set to 'unknown'
            // Detect if segment uses compound file
            try {
                // Try to open compound file
                $this->_directory->getFileObject($name . '.cfs');

                // Compound file is found
                $this->_isCompound = true;
            } catch (ExceptionInterface $e) {
                if (strpos($e->getMessage(), 'is not readable') !== false) {
                    // Compound file is not found or is not readable
                    $this->_isCompound = false;
                } else {
                    throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
                }
            }
        }

        $this->_segFiles = array();
        if ($this->_isCompound) {
            // Read the file table of the segment compound file, same layout as above
            $cfsFile = $this->_directory->getFileObject($name . '.cfs');
            $segFilesCount = $cfsFile->readVInt();

            for ($count = 0; $count < $segFilesCount; $count++) {
                $dataOffset = $cfsFile->readLong();
                if ($count != 0) {
                    $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
                }
                $fileName = $cfsFile->readString();
                $this->_segFiles[$fileName] = $dataOffset;
            }
            if ($count != 0) {
                $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
            }
        }

        // Load field infos
        $fnmFile = $this->openCompoundFile('.fnm');
        $fieldsCount = $fnmFile->readVInt();
        $fieldNames = array();
        $fieldNums  = array();
        $this->_fields = array();

        for ($count=0; $count < $fieldsCount; $count++) {
            $fieldName = $fnmFile->readString();
            $fieldBits = $fnmFile->readByte();
            $this->_fields[$count] = new FieldInfo($fieldName,
                                                   $fieldBits & 0x01 /* field is indexed */,
                                                   $count,
                                                   $fieldBits & 0x02 /* termvectors are stored */,
                                                   $fieldBits & 0x10 /* norms are omitted */,
                                                   $fieldBits & 0x20 /* payloads are stored */);
            if ($fieldBits & 0x10) {
                // norms are omitted for the indexed field
                // Use a vector of encoded 1.0 values, one byte per document
                $this->_norms[$count] = str_repeat(chr(AbstractSimilarity::encodeNorm(1.0)), $docCount);
            }

            $fieldNums[$count]  = $count;
            $fieldNames[$count] = $fieldName;
        }
        // Order field numbers by field name, then map field number => position in that order
        array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
        $this->_fieldsDicPositions = array_flip($fieldNums);

        if ($this->_delGen == -2) {
            // SegmentInfo constructor is invoked from index writer
            // Autodetect current delete file generation number
            $this->_delGen = $this->_detectLatestDelGen();
        }

        // Load deletions
        $this->_deleted = $this->_loadDelFile();
    }

    /**
     * Load deletions file
     *
     * Returns bitset or
an array depending on bitset extension availability
     *
     * @return mixed
     */
    private function _loadDelFile()
    {
        if ($this->_delGen == -1) {
            // There is no delete file for this segment
            return null;
        } elseif ($this->_delGen == 0) {
            // It's a segment with pre-2.1 format delete file
            // Try to load deletions file
            return $this->_loadPre21DelFile();
        } else {
            // It's 2.1+ format deleteions file
            return $this->_load21DelFile();
        }
    }

    /**
     * Load pre-2.1 deletions file
     *
     * Returns bitset or an array depending on bitset extension availability.
     * If the file can't be read, the delete generation is reset to -1
     * ("no delete file") and null is returned.
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @return mixed
     */
    private function _loadPre21DelFile()
    {
        try {
            // '.del' files always stored in a separate file
            // Segment compound is not used
            $delFile = $this->_directory->getFileObject($this->_name . '.del');

            $byteCount = $delFile->readInt();
            $byteCount = ceil($byteCount/8);
            $bitCount  = $delFile->readInt();

            if ($bitCount == 0) {
                $delBytes = '';
            } else {
                $delBytes = $delFile->readBytes($byteCount);
            }

            if (extension_loaded('bitset')) {
                return $delBytes;
            } else {
                $deletions = array();
                for ($count = 0; $count < $byteCount; $count++) {
                    $byte = ord($delBytes[$count]);
                    for ($bit = 0; $bit < 8; $bit++) {
                        if ($byte & (1<<$bit)) {
                            $deletions[$count*8 + $bit] = 1;
                        }
                    }
                }

                return $deletions;
            }
        } catch(ExceptionInterface $e) {
            if (strpos($e->getMessage(), 'is not readable') === false) {
                throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
            }
            // There is no deletion file
            $this->_delGen = -1;

            return null;
        }
    }

    /**
     * Load 2.1+ format deletions file
     *
     * Returns bitset or an array depending on bitset extension availability.
     * The array form is null when no document is marked as deleted.
     *
     * Two layouts are handled: a file starting with the 0xFFFFFFFF marker holds
     * a list of (gap, non-zero byte) pairs; otherwise the first integer is the
     * size of a plain bit vector that follows.
     *
     * @return mixed
     */
    private function _load21DelFile()
    {
        $delFile = $this->_directory->getFileObject($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');

        $format = $delFile->readInt();

        if ($format == (int)0xFFFFFFFF) {
            $useBitset = extension_loaded('bitset');
            $deletions = $useBitset ? bitset_empty() : array();

            // Header values are not needed here but must be consumed
            $byteCount = $delFile->readInt();
            $bitCount  = $delFile->readInt();

            $delFileSize = $this->_directory->fileLength($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
            $byteNum = 0;

            // Every entry has to be decoded before the result is returned; the
            // previous code returned from inside this loop and so only ever
            // honoured the first entry of the file.
            do {
                $dgap        = $delFile->readVInt();
                $nonZeroByte = $delFile->readByte();

                // The gap is relative to the byte position of the previous entry
                $byteNum += $dgap;

                for ($bit = 0; $bit < 8; $bit++) {
                    if ($nonZeroByte & (1<<$bit)) {
                        if ($useBitset) {
                            bitset_incl($deletions, $byteNum*8 + $bit);
                        } else {
                            $deletions[$byteNum*8 + $bit] = 1;
                        }
                    }
                }
            } while ($delFile->tell() < $delFileSize);

            if ($useBitset) {
                return $deletions;
            }

            return (count($deletions) > 0) ? $deletions : null;
        } else {
            // $format is actually byte count
            $byteCount = ceil($format/8);
            $bitCount  = $delFile->readInt();

            if ($bitCount == 0) {
                $delBytes = '';
            } else {
                $delBytes = $delFile->readBytes($byteCount);
            }

            if (extension_loaded('bitset')) {
                return $delBytes;
            } else {
                $deletions = array();
                for ($count = 0; $count < $byteCount; $count++) {
                    $byte = ord($delBytes[$count]);
                    for ($bit = 0; $bit < 8; $bit++) {
                        if ($byte & (1<<$bit)) {
                            $deletions[$count*8 + $bit] = 1;
                        }
                    }
                }

                return (count($deletions) > 0) ? $deletions : null;
            }
        }
    }

    /**
     * Opens index file stoted within compound index file
     *
     * @param string $extension
     * @param boolean $shareHandler
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     * @return \ZendSearch\Lucene\Storage\File\FileInterface
     */
    public function openCompoundFile($extension, $shareHandler = true)
    {
        if (($extension == '.fdx'  ||  $extension == '.fdt')  &&  $this->_usesSharedDocStore) {
            $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
            $fdtFName = $this->_sharedDocStoreOptions['segment'] .
'.fdt'; if (!$this->_sharedDocStoreOptions['isCompound']) { $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler); $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR); if ($extension == '.fdx') { // '.fdx' file is requested return $fdxFile; } else { // '.fdt' file is requested $fdtStartOffset = $fdxFile->readLong(); $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler); $fdtFile->seek($fdtStartOffset, SEEK_CUR); return $fdtFile; } } if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) { throw new InvalidFileFormatException('Shared doc storage segment compound file doesn\'t contain ' . $fdxFName . ' file.' ); } if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) { throw new InvalidFileFormatException('Shared doc storage segment compound file doesn\'t contain ' . $fdtFName . ' file.' ); } // Open shared docstore segment file $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler); // Seek to the start of '.fdx' file within compound file $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]); // Seek to the start of current segment documents section $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR); if ($extension == '.fdx') { // '.fdx' file is requested return $cfxFile; } else { // '.fdt' file is requested $fdtStartOffset = $cfxFile->readLong(); // Seek to the start of '.fdt' file within compound file $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]); // Seek to the start of current segment documents section $cfxFile->seek($fdtStartOffset, SEEK_CUR); return $fdtFile; } } $filename = $this->_name . $extension; if (!$this->_isCompound) { return $this->_directory->getFileObject($filename, $shareHandler); } if( !isset($this->_segFiles[$filename]) ) { throw new InvalidFileFormatException('Segment compound file doesn\'t contain ' . $filename . ' file.' 
); } $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler); $file->seek($this->_segFiles[$filename]); return $file; } /** * Get compound file length * * @param string $extension * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException * @return integer */ public function compoundFileLength($extension) { if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) { $filename = $this->_sharedDocStoreOptions['segment'] . $extension; if (!$this->_sharedDocStoreOptions['isCompound']) { return $this->_directory->fileLength($filename); } if( !isset($this->_sharedDocStoreOptions['fileSizes'][$filename]) ) { throw new InvalidFileFormatException('Shared doc store compound file doesn\'t contain ' . $filename . ' file.' ); } return $this->_sharedDocStoreOptions['fileSizes'][$filename]; } $filename = $this->_name . $extension; // Try to get common file first if ($this->_directory->fileExists($filename)) { return $this->_directory->fileLength($filename); } if( !isset($this->_segFileSizes[$filename]) ) { throw new InvalidFileFormatException('Index compound file doesn\'t contain ' . $filename . ' file.' ); } return $this->_segFileSizes[$filename]; } /** * Returns field index or -1 if field is not found * * @param string $fieldName * @return integer */ public function getFieldNum($fieldName) { foreach( $this->_fields as $field ) { if( $field->name == $fieldName ) { return $field->number; } } return -1; } /** * Returns field info for specified field * * @param integer $fieldNum * @return \ZendSearch\Lucene\Index\FieldInfo */ public function getField($fieldNum) { return $this->_fields[$fieldNum]; } /** * Returns array of fields. * if $indexed parameter is true, then returns only indexed fields. 
*
     * @param boolean $indexed
     * @return array
     */
    public function getFields($indexed = false)
    {
        $names = array();

        foreach ($this->_fields as $info) {
            // Skip non-indexed fields only when the caller asked for indexed ones
            if ($indexed && !$info->isIndexed) {
                continue;
            }
            $names[$info->name] = $info->name;
        }

        return $names;
    }

    /**
     * Gives access to the segment's FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Current generation number of the deletions file.
     *
     * @return integer
     */
    public function getDelGen()
    {
        return $this->_delGen;
    }

    /**
     * Total number of documents stored in the segment; deleted ones are counted too.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Number of documents currently marked as deleted.
     *
     * @return integer
     */
    private function _deletedCount()
    {
        if ($this->_deleted === null) {
            return 0;
        }

        $deletedIds = extension_loaded('bitset')
                    ? bitset_to_array($this->_deleted)
                    : $this->_deleted;

        return count($deletedIds);
    }

    /**
     * Number of documents in the segment that are not deleted.
     *
     * @return integer
     */
    public function numDocs()
    {
        if (!$this->hasDeletions()) {
            return $this->_docCount;
        }

        return $this->_docCount - $this->_deletedCount();
    }

    /**
     * Translate a field number into its position within the fields dictionary
     *
     * @param integer $fieldNum
     * @return integer
     */
    private function _getFieldPosition($fieldNum)
    {
        if (isset($this->_fieldsDicPositions[$fieldNum])) {
            return $this->_fieldsDicPositions[$fieldNum];
        }

        // Numbers missing from the translation table are used as they are
        return $fieldNum;
    }

    /**
     * Segment name accessor
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * TermInfo cache
     *
     * Size is 1024.
* Numbers are used instead of class constants because of performance considerations
     *
     * @var array
     */
    private $_termInfoCache = array();

    /**
     * Drops the oldest cache entries until 768 remain.
     *
     * Entries are removed in array order; getTermInfo() re-appends an entry on
     * every hit, so the entries left are the most recently used ones.
     */
    private function _cleanUpTermInfoCache()
    {
        foreach ($this->_termInfoCache as $key => $termInfo) {
            unset($this->_termInfoCache[$key]);

            // leave 768 last used term infos
            if (count($this->_termInfoCache) <= 768) {
                break;
            }
        }
    }

    /**
     * Load terms dictionary index
     *
     * Uses the serialized '.sti' copy when it exists and unserializes cleanly;
     * otherwise parses the '.tii' data and writes a fresh '.sti' file.
     *
     * @throws \ZendSearch\Lucene\Exception\ExceptionInterface
     */
    private function _loadDictionaryIndex()
    {
        // Check, if index is already serialized
        if ($this->_directory->fileExists($this->_name . '.sti')) {
            // Load serialized dictionary index data
            $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
            $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));

            // Load dictionary index data
            // NOTE(review): unserialize() on file contents is only safe while the
            // index directory is trusted - confirm for your deployment.
            if (($unserializedData = @unserialize($stiFileData)) !== false) {
                list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
                return;
            }
        }

        // Load data from .tii file and generate .sti file

        // Prefetch dictionary index data
        $tiiFile = $this->openCompoundFile('.tii');
        $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

        // Load dictionary index data
        list($this->_termDictionary, $this->_termDictionaryInfos) = DictionaryLoader::load($tiiFileData);

        $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
        $stiFile = $this->_directory->createFile($this->_name . '.sti');
        $stiFile->writeBytes($stiFileData);
    }

    /**
     * Scans terms dictionary and returns term info
     *
     * Lookup order: in-memory cache, binary search over the loaded dictionary
     * index, then a sequential scan of the '.tis' file starting from the
     * closest preceding index entry.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     * @return \ZendSearch\Lucene\Index\TermInfo|null  null if the term is not in the segment
     */
    public function getTermInfo(Term $term)
    {
        $termKey = $term->key();
        // array_key_exists() rather than isset(): "term not found" is cached as
        // null, and isset() would never report a hit for it.
        if (array_key_exists($termKey, $this->_termInfoCache)) {
            $termInfo = $this->_termInfoCache[$termKey];

            // Move termInfo to the end of cache
            unset($this->_termInfoCache[$termKey]);
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }

        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($term->field);

        if ($searchField == -1) {
            return null;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // We got it!
                $a = $this->_termDictionaryInfos[$mid];
                $termInfo = new TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

                // Put loaded termInfo into cache, keeping the cache bounded on
                // this path too (the original only checked the size after a scan).
                $this->_termInfoCache[$termKey] = $termInfo;
                if (count($this->_termInfoCache) >= 1024) {
                    $this->_cleanUpTermInfoCache();
                }

                return $termInfo;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new InvalidFileFormatException('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // Value is unused, but the field has to be consumed
            $maxSkipLevels = $tisFile->readInt();
        }

        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm[1] /* text */;
        $termFieldNum = $prevTerm[0] /* field */;
        $freqPointer  = $prevTermInfo[1] /* freqPointer */;
        $proxPointer  = $prevTermInfo[2] /* proxPointer */;
        for ($count = $prevPosition*$indexInterval + 1;
             $count <= $termCount &&
             ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
              ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        // Compare texts as strings: a loose == treats numeric-looking terms
        // such as "1e1" and "10" as equal and would report a false match.
        if ($termFieldNum == $searchField && (string)$termValue === (string)$term->text) {
            $termInfo = new TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            $termInfo = null;
        }

        // Put loaded termInfo into cache
        $this->_termInfoCache[$termKey] = $termInfo;

        // >= rather than ==, so the cleanup still triggers if the size ever
        // steps past the threshold.
        if (count($this->_termInfoCache) >= 1024) {
            $this->_cleanUpTermInfoCache();
        }

        return $termInfo;
    }

    /**
     * Returns IDs of all the documents containing term.
     *
     * When a docs filter is given, it is updated for this segment to the set
     * of segment-local ids returned. If the filter already holds data for the
     * segment, only ids present in it are returned.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param integer $shift  value added to every returned document id
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termDocs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof TermInfo) {
            if ($docsFilter !== null  &&  $docsFilter instanceof DocsFilter) {
                $docsFilter->segmentFilters[$this->_name] = array();
            }
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

        // null means "no restriction for this segment yet"
        $filter = null;
        if ($docsFilter !== null  &&  isset($docsFilter->segmentFilters[$this->_name])) {
            $filter = $docsFilter->segmentFilters[$this->_name];

            if (count($filter) == 0) {
                return array();
            }
        }

        // The original had separate "fetch" and "full scan" branches with
        // identical bodies; a single loop covers them and the unfiltered cases.
        $docId   = 0;
        $result  = array();
        $matched = array();
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            // The low bit set means the frequency is 1 and is not stored
            $docId += $docDelta >> 1;
            if ($docDelta % 2 != 1) {
                // read freq (value is not needed here, but must be consumed)
                $frqFile->readVInt();
            }

            if ($filter === null  ||  isset($filter[$docId])) {
                $result[] = $shift + $docId;
                $matched[$docId] = 1; // 1 is just some constant value
            }
        }

        if ($docsFilter !== null) {
            $docsFilter->segmentFilters[$this->_name] = $matched;
        }

        return $result;
    }

    /**
     * Returns term freqs array.
     * Result array structure: array(docId => freq, ...)
*
     * When a docs filter is given, it is updated for this segment to the set
     * of segment-local ids returned. If the filter already holds data for the
     * segment, only ids present in it are returned.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param integer $shift  value added to every document id used as a result key
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termFreqs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof TermInfo) {
            if ($docsFilter !== null  &&  $docsFilter instanceof DocsFilter) {
                $docsFilter->segmentFilters[$this->_name] = array();
            }
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

        // null means "no restriction for this segment yet"
        $filter = null;
        if ($docsFilter !== null  &&  isset($docsFilter->segmentFilters[$this->_name])) {
            $filter = $docsFilter->segmentFilters[$this->_name];

            if (count($filter) == 0) {
                return array();
            }
        }

        $docId   = 0;
        $result  = array();
        $matched = array();
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            $docId += $docDelta >> 1;

            // The frequency must be consumed for EVERY posting that stores one.
            // The original skipped this read for documents rejected by the
            // filter, so the next readVInt() took the frequency as a doc delta
            // and all later postings were decoded wrongly.
            $freq = ($docDelta % 2 == 1) ? 1 : $frqFile->readVInt();

            if ($filter === null  ||  isset($filter[$docId])) {
                $result[$shift + $docId] = $freq;
                $matched[$docId] = 1; // 1 is just some constant value
            }
        }

        if ($docsFilter !== null) {
            $docsFilter->segmentFilters[$this->_name] = $matched;
        }

        return $result;
    }

    /**
     * Returns term positions array.
     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
*
     * When a docs filter is given, it is updated for this segment to the set
     * of segment-local ids returned. If the filter already holds data for the
     * segment, only ids present in it are returned.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param integer $shift  value added to every document id used as a result key
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termPositions(Term $term, $shift = 0, DocsFilter $docsFilter = null)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof TermInfo) {
            if ($docsFilter !== null  &&  $docsFilter instanceof DocsFilter) {
                $docsFilter->segmentFilters[$this->_name] = array();
            }
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

        // null means "no restriction for this segment yet"
        $filter = null;
        if ($docsFilter !== null  &&  isset($docsFilter->segmentFilters[$this->_name])) {
            $filter = $docsFilter->segmentFilters[$this->_name];

            if (count($filter) == 0) {
                return array();
            }
        }

        // Read all frequencies before '.prx' is opened: with a shared handler
        // both openCompoundFile() calls may return the same '.cfs' file object.
        $docId = 0;
        $freqs = array();
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            $docId += $docDelta >> 1;
            $freqs[$docId] = ($docDelta % 2 == 1) ? 1 : $frqFile->readVInt();
        }

        $prxFile = $this->openCompoundFile('.prx');
        $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

        $result  = array();
        $matched = array();
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions = array();

            // we have to read .prx file to get right position for next doc
            // even filter doesn't match current document
            for ($count = 0; $count < $freq; $count++ ) {
                $termPosition += $prxFile->readVInt();
                $positions[] = $termPosition;
            }

            // Include into updated filter and into result only if doc is matched by filter
            if ($filter === null  ||  isset($filter[$docId])) {
                $matched[$docId] = 1; // 1 is just some constant value
                $result[$shift + $docId] = $positions;
            }
        }

        if ($docsFilter !== null) {
            $docsFilter->segmentFilters[$this->_name] = $matched;
        }

        return $result;
    }

    /**
     * Load normalization factors from an index file
     *
     * With a single norms file ('.nrm') the norms of every indexed field are
     * loaded at once; otherwise only the '.f<fieldNum>' file is read.
     *
     * @param integer $fieldNum
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     */
    private function _loadNorm($fieldNum)
    {
        if ($this->_hasSingleNormFile) {
            $normfFile = $this->openCompoundFile('.nrm');

            $header              = $normfFile->readBytes(3);
            $headerFormatVersion = $normfFile->readByte();

            if ($header != 'NRM'  ||  $headerFormatVersion != (int)0xFF) {
                throw new InvalidFileFormatException('Wrong norms file format.');
            }

            foreach ($this->_fields as $fNum => $fieldInfo) {
                if ($fieldInfo->isIndexed) {
                    $this->_norms[$fNum] = $normfFile->readBytes($this->_docCount);
                }
            }
        } else {
            $fFile = $this->openCompoundFile('.f' . $fieldNum);
            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
        }
    }

    /**
     * Returns normalization factor for specified documents
     *
     * @param integer $id
     * @param string $fieldName
     * @return float|null  null if the field is unknown or not indexed
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        // getFieldNum() returns -1 for an unknown field; the original then
        // dereferenced $this->_fields[-1].
        if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
            return null;
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return AbstractSimilarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
    }

    /**
     * Returns norm vector, encoded in a byte string
     *
     * For an unknown or non-indexed field the vector is filled with the
     * default similarity's encoded lengthNorm($fieldName, 0).
     *
     * @param string $fieldName
     * @return string
     */
    public function normVector($fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
            $similarity = AbstractSimilarity::getDefault();

            return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                              $this->_docCount);
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }

    /**
     * Returns true if any documents have been deleted from this index segment.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        return $this->_deleted !== null;
    }

    /**
     * Returns true if segment has single norms file.
     *
     * @return boolean
     */
    public function hasSingleNormFile()
    {
        return $this->_hasSingleNormFile ? true : false;
    }

    /**
     * Returns true if segment is stored using compound segment file.
     *
     * @return boolean
     */
    public function isCompound()
    {
        return $this->_isCompound;
    }

    /**
     * Deletes a document from the index segment.
* $id is an internal document id
     *
     * Only the in-memory deletions set is changed and marked dirty here.
     *
     * @param integer $id
     */
    public function delete($id)
    {
        $this->_deletedDirty = true;

        if (extension_loaded('bitset')) {
            if ($this->_deleted === null) {
                $this->_deleted = bitset_empty($id);
            }
            bitset_incl($this->_deleted, $id);
        } else {
            if ($this->_deleted === null) {
                $this->_deleted = array();
            }

            $this->_deleted[$id] = 1;
        }
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer $id
     * @return boolean
     */
    public function isDeleted($id)
    {
        if ($this->_deleted === null) {
            return false;
        }

        if (extension_loaded('bitset')) {
            return bitset_in($this->_deleted, $id);
        } else {
            return isset($this->_deleted[$id]);
        }
    }

    /**
     * Detect latest delete generation
     *
     * Is actually used from writeChanges() method or from the constructor if it's invoked from
     * Index writer. In both cases index write lock is already obtained, so we shouldn't care
     * about it
     *
     * @return integer  highest generation found (0 for a plain '<name>.del' file),
     *                  or -1 if the segment has no deletions file
     */
    private function _detectLatestDelGen()
    {
        // Quote the segment name so characters in it cannot act as regex syntax
        $pattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

        $delFileList = array();
        foreach ($this->_directory->fileList() as $file) {
            if ($file == $this->_name . '.del') {
                // Matches <segment_name>.del file name
                $delFileList[] = 0;
            } elseif (preg_match($pattern, $file, $matches)) {
                // Matches <segment_name>_NNN.del file names
                $delFileList[] = (int)base_convert($matches[1], 36, 10);
            }
        }

        if (count($delFileList) == 0) {
            // There is no deletions file for current segment in the directory
            // Report "no deletions file" generation
            return -1;
        } else {
            // There are some deletions files for current segment in the directory
            // Use the highest generation number
            return max($delFileList);
        }
    }

    /**
     * Write changes if it's necessary.
     *
     * This method must be invoked only from the Writer _updateSegments() method,
     * so index Write lock has to be already obtained.
*
     * Without local deletions the method only reloads the deletions file when a
     * newer generation exists. With local deletions it merges them with the
     * newest generation on disk and writes the result as the next generation.
     *
     * @internal
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    public function writeChanges()
    {
        // Get new generation number
        $latestDelGen = $this->_detectLatestDelGen();

        if (!$this->_deletedDirty) {
            // There was no deletions by current process

            if ($latestDelGen == $this->_delGen) {
                // Delete file hasn't been updated by any concurrent process
                return;
            } elseif ($latestDelGen > $this->_delGen) {
                // Delete file has been updated by some concurrent process
                // Reload deletions file
                $this->_delGen  = $latestDelGen;
                $this->_deleted = $this->_loadDelFile();

                return;
            } else {
                throw new RuntimeException(
                    'Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.'
                );
            }
        }

        if ($latestDelGen > $this->_delGen) {
            // Merge current deletions with latest deletions file
            $this->_delGen = $latestDelGen;

            $latestDelete = $this->_loadDelFile();

            // _loadDelFile() returns null when the file holds no deletions or is
            // not readable; "array += null" is a fatal error, so merge only real data.
            if ($latestDelete !== null) {
                if (extension_loaded('bitset')) {
                    $this->_deleted = bitset_union($this->_deleted, $latestDelete);
                } else {
                    $this->_deleted += $latestDelete;
                }
            }
        }

        if (extension_loaded('bitset')) {
            $delBytes = $this->_deleted;
            $bitCount = count(bitset_to_array($delBytes));
        } else {
            // Pack the id => 1 map into a bit string, 8 documents per byte
            $byteCount = floor($this->_docCount/8)+1;
            $delBytes = str_repeat(chr(0), $byteCount);
            for ($count = 0; $count < $byteCount; $count++) {
                $byte = 0;
                for ($bit = 0; $bit < 8; $bit++) {
                    if (isset($this->_deleted[$count*8 + $bit])) {
                        $byte |= (1<<$bit);
                    }
                }
                $delBytes[$count] = chr($byte);
            }
            $bitCount = count($this->_deleted);
        }

        if ($this->_delGen == -1) {
            // Set delete file generation number to 1
            $this->_delGen = 1;
        } else {
            // Increase delete file generation number by 1
            $this->_delGen++;
        }

        $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
        $delFile->writeInt($this->_docCount);
        $delFile->writeInt($bitCount);
        $delFile->writeBytes($delBytes);

        $this->_deletedDirty = false;
    }

    /**
     * Term Dictionary File object for stream like terms reading
     *
     * @var \ZendSearch\Lucene\Storage\File\FileInterface
     */
    private $_tisFile = null;

    /**
     * Actual offset of the .tis file data
     *
     * @var integer
     */
    private $_tisFileOffset;

    /**
     * Frequencies File object for stream like terms reading
     *
     * @var \ZendSearch\Lucene\Storage\File\FileInterface
     */
    private $_frqFile = null;

    /**
     * Actual offset of the .frq file data
     *
     * @var integer
     */
    private $_frqFileOffset;

    /**
     * Positions File object for stream like terms reading
     *
     * @var \ZendSearch\Lucene\Storage\File\FileInterface
     */
    private $_prxFile = null;

    /**
     * Actual offset of the .prx file in the compound file
     *
     * @var integer
     */
    private $_prxFileOffset;

    /**
     * Actual number of terms in term stream
     *
     * @var integer
     */
    private $_termCount = 0;

    /**
     * Overall number of terms in term stream
     *
     * @var integer
     */
    private $_termNum = 0;

    /**
     * Segment index interval
     *
     * @var integer
     */
    private $_indexInterval;

    /**
     * Segment skip interval
     *
     * @var integer
     */
    private $_skipInterval;

    /**
     * Last TermInfo in a terms stream
     *
     * @var \ZendSearch\Lucene\Index\TermInfo
     */
    private $_lastTermInfo = null;

    /**
     * Last Term in a terms stream
     *
     * @var \ZendSearch\Lucene\Index\Term
     */
    private $_lastTerm = null;

    /**
     * Map of the document IDs
     * Used to get new docID after removing deleted documents.
     * It's not very efficient from a memory usage point of view,
     * but much faster than other methods
     *
     * @var array|null
     */
    private $_docMap = null;

    /**
     * An array of all term positions in the documents.
     * Array structure: array( docId => array( pos1, pos2, ...), ...)
*
     * Is set to null if term positions loading has to be skipped
     *
     * @var array|null
     */
    private $_lastTermPositions;

    /**
     * Terms scan mode
     *
     * Values:
     *
     * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
     * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
     * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
     *                       document numbers are compacted (shifted if segment has deleted documents)
     *
     * @var integer
     */
    private $_termsScanMode;

    /** Scan modes */
    const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
    const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
    const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
                                // document numbers are compacted (shifted if segment contains deleted documents)

    /**
     * Rewind the terms stream to its first term
     *
     * Accepts up to two optional positional arguments:
     *   $startId - id assigned to the first document (default 0)
     *   $mode    - one of the self::SM_* scan modes (default self::SM_TERMS_ONLY)
     *
     * The return value is the start document id to be used for the next segment.
     *
     * @param integer $startId
     * @param integer $mode
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     * @return integer
     */
    public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
    {
        // Arguments are taken from func_get_args() because the signature
        // declares none.
        $args     = func_get_args();
        $argCount = count($args);

        if ($argCount > 2) {
            throw new InvalidArgumentException('Wrong number of arguments');
        }

        $startId = ($argCount >= 1) ? $args[0] : 0;
        $mode    = ($argCount == 2) ? $args[1] : self::SM_TERMS_ONLY;

        // Drop the previous dictionary file object before opening a new one
        $this->_tisFile = null;

        $this->_tisFile = $this->openCompoundFile('.tis', false);
        $this->_tisFileOffset = $this->_tisFile->tell();

        $formatVersion = $this->_tisFile->readInt();
        $isPre21 = ($formatVersion == (int)0xFFFFFFFE); /* pre-2.1 format */
        $is21    = ($formatVersion == (int)0xFFFFFFFD); /* 2.1+ format    */
        if (!$isPre21 && !$is21) {
            throw new InvalidFileFormatException('Wrong TermInfoFile file format');
        }

        // Terms count
        $this->_termNum   = $this->_tisFile->readLong();
        $this->_termCount = $this->_termNum;

        $this->_indexInterval = $this->_tisFile->readInt(); // Index interval
        $this->_skipInterval  = $this->_tisFile->readInt(); // Skip interval
        if ($is21) {
            // Value is unused, but the field has to be consumed
            $maxSkipLevels = $this->_tisFile->readInt();
        }

        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_docMap = array();

        $this->_lastTerm          = new Term('', -1);
        $this->_lastTermInfo      = new TermInfo(0, 0, 0, 0);
        $this->_lastTermPositions = null;

        $this->_termsScanMode = $mode;

        $compact = ($mode == self::SM_MERGE_INFO);

        if ($mode == self::SM_TERMS_ONLY) {
            // Nothing else to prepare
        } elseif ($mode == self::SM_FULL_INFO || $compact) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_frqFileOffset = $this->_frqFile->tell();

            $this->_prxFile = $this->openCompoundFile('.prx', false);
            $this->_prxFileOffset = $this->_prxFile->tell();

            // Map every live document to its id in the output numbering
            for ($docNum = 0; $docNum < $this->_docCount; $docNum++) {
                if ($this->isDeleted($docNum)) {
                    continue;
                }
                $this->_docMap[$docNum] = $startId + ($compact ? count($this->_docMap) : $docNum);
            }
        } else {
            throw new InvalidArgumentException('Wrong terms scaning mode specified.');
        }

        // Calculate next segment start id (since $this->_docMap structure may be cleaned by $this->nextTerm() call)
        $nextSegmentStartId = $startId + ($compact ? count($this->_docMap) : $this->_docCount);
        $this->nextTerm();

        return $nextSegmentStartId;
    }

    /**
     * Skip terms stream up to specified term preffix.
/**
 * Skip terms stream up to the specified term prefix.
 *
 * Prefix contains fully specified field info and portion of searched term.
 *
 * The dictionary index is binary-searched for the last index entry which is
 * not greater than the prefix, the .tis stream is positioned right after that
 * entry, and then terms are read sequentially until the current term is equal
 * to or greater than the prefix. If there is no such term, the stream is
 * closed and currentTerm() returns null.
 *
 * @param \ZendSearch\Lucene\Index\Term $prefix
 */
public function skipTo(Term $prefix)
{
    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($prefix->field);

    if ($searchField == -1) {
        /**
         * Field is not presented in this segment
         * Go to the end of dictionary
         */
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($prefix->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We have reached term we are looking for.
            // $highIndex is used below as the position to start from, so it has
            // to point to the matched entry. Otherwise the stream would be
            // positioned at a later index entry and terms would be skipped.
            $highIndex = $mid;
            break;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }

    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    if ($this->_tisFile === null) {
        // The end of terms stream is reached and terms dictionary file is closed
        // Perform mini-reset operation
        $this->_tisFile = $this->openCompoundFile('.tis', false);

        if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_prxFile = $this->openCompoundFile('.prx', false);
        }
    }
    $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

    $this->_lastTerm     = new Term($prevTerm[1] /* text */,
                                    ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
    $this->_lastTermInfo = new TermInfo($prevTermInfo[0] /* docFreq */,
                                        $prevTermInfo[1] /* freqPointer */,
                                        $prevTermInfo[2] /* proxPointer */,
                                        $prevTermInfo[3] /* skipOffset */);
    $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval;

    if ($highIndex == 0) {
        // skip start entry
        $this->nextTerm();
    } elseif ($prefix->field == $this->_lastTerm->field && strcmp($prefix->text, $this->_lastTerm->text) == 0) {
        // We got exact match in the dictionary index.
        // Term text is compared with strcmp() (as in the binary search above):
        // loose "==" treats numeric-looking strings such as "100" and "1e2" as equal.

        if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_lastTermPositions = array();

            // Read document ids and term frequencies
            $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
            $freqs = array();
            $docId = 0;
            for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) {
                $docDelta = $this->_frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    // Odd value: frequency is 1 and is not stored separately
                    $docId += ($docDelta-1)/2;
                    $freqs[$docId] = 1;
                } else {
                    $docId += $docDelta/2;
                    $freqs[$docId] = $this->_frqFile->readVInt();
                }
            }

            // Read delta-encoded term positions
            $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;
                $positions    = array();

                for ($count = 0; $count < $freq; $count++) {
                    $termPosition += $this->_prxFile->readVInt();
                    $positions[]   = $termPosition;
                }

                // Documents missing from the map (deleted ones) are dropped
                if (isset($this->_docMap[$docId])) {
                    $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                }
            }
        }

        return;
    }

    // Search term matching specified prefix
    while ($this->_lastTerm !== null) {
        if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
             ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
            // Current term matches or greater than the pattern
            return;
        }

        $this->nextTerm();
    }
}
/**
 * Scans terms dictionary and returns next term.
 *
 * When the stream is exhausted (or was never opened) all stream related
 * state is released and null is returned.
 *
 * @return \ZendSearch\Lucene\Index\Term|null
 */
public function nextTerm()
{
    $streamFinished = ($this->_tisFile === null || $this->_termCount == 0);
    if ($streamFinished) {
        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;
        $this->_docMap            = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term entry: shared prefix length, suffix, field number
    $sharedPrefixLength = $this->_tisFile->readVInt();
    $suffix             = $this->_tisFile->readString();
    $fieldNumber        = $this->_tisFile->readVInt();

    $text = Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;
    $this->_lastTerm = new Term($text, $this->_fields[$fieldNumber]->name);

    // Term info: pointers are stored as deltas against the previous term
    $docFreq   = $this->_tisFile->readVInt();
    $freqDelta = $this->_tisFile->readVInt();
    $proxDelta = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $freqDelta;
    $proxPointer = $this->_lastTermInfo->proxPointer + $proxDelta;

    $skipOffset = 0;
    if ($docFreq >= $this->_skipInterval) {
        $skipOffset = $this->_tisFile->readVInt();
    }

    $this->_lastTermInfo = new TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $loadPositions = ($this->_termsScanMode == self::SM_FULL_INFO
                   || $this->_termsScanMode == self::SM_MERGE_INFO);
    if ($loadPositions) {
        $this->_lastTermPositions = array();

        // Document ids and frequencies
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $frequencies  = array();
        $currentDocId = 0;
        for ($entry = 0; $entry < $this->_lastTermInfo->docFreq; $entry++) {
            $encodedDelta = $this->_frqFile->readVInt();
            if ($encodedDelta % 2 == 1) {
                // Low bit is set: frequency is 1 and is not stored
                $currentDocId += ($encodedDelta - 1)/2;
                $frequencies[$currentDocId] = 1;
            } else {
                $currentDocId += $encodedDelta/2;
                $frequencies[$currentDocId] = $this->_frqFile->readVInt();
            }
        }

        // Delta-encoded positions, in the same document order
        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($frequencies as $sourceDocId => $frequency) {
            $position     = 0;
            $docPositions = array();
            for ($entry = 0; $entry < $frequency; $entry++) {
                $position      += $this->_prxFile->readVInt();
                $docPositions[] = $position;
            }

            // Documents which are not in the map are dropped
            if (!isset($this->_docMap[$sourceDocId])) {
                continue;
            }
            $this->_lastTermPositions[$this->_docMap[$sourceDocId]] = $docPositions;
        }
    }

    $this->_termCount--;
    if ($this->_termCount == 0) {
        // Last term is read, files are not needed any more
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Merge field's normalization factors.
 *
 * For each indexed field of the target segment the norm vectors of all
 * source segments are appended to the writer one after another. Bytes which
 * belong to deleted documents are left out.
 */
private function _mergeNorms()
{
    foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
        if (!$fieldInfo->isIndexed) {
            continue;
        }

        $fieldName = $fieldInfo->name;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $hasDeletions = $segmentInfo->hasDeletions();
            $normVector   = $segmentInfo->normVector($fieldName);

            if ($hasDeletions) {
                // Keep norm bytes of alive documents only
                $sourceVector = $normVector;
                $normVector   = '';
                $docTotal     = $segmentInfo->count();
                for ($docId = 0; $docId < $docTotal; $docId++) {
                    if ($segmentInfo->isDeleted($docId)) {
                        continue;
                    }
                    $normVector .= $sourceVector[$docId];
                }
            }

            $this->_writer->addNorm($fieldName, $normVector);
        }
    }
}
/**
 * Merge term dictionaries of the source segments.
 *
 * All source term streams are put into a priority queue. The stream taken
 * from the queue contributes positions of its current term; when the next
 * stream in the queue is at a different term (or the queue is empty), the
 * collected positions are written as one term of the target segment.
 */
private function _mergeTerms()
{
    $queue = new TermsPriorityQueue();

    $nextStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $nextStartId = $segmentInfo->resetTermsStream($nextStartId, SegmentInfo::SM_MERGE_INFO);

        // "Empty" segments have no current term and are not queued
        if ($segmentInfo->currentTerm() === null) {
            continue;
        }
        $queue->put($segmentInfo);
    }

    $this->_writer->initializeDictionaryFiles();

    $collectedDocs = array();
    while (($stream = $queue->pop()) !== null) {
        // Merge positions array
        $collectedDocs += $stream->currentTermPositions();

        $following  = $queue->top();
        $termIsDone = ($following === null)
                   || ($following->currentTerm()->key() != $stream->currentTerm()->key());

        if ($termIsDone) {
            ksort($collectedDocs, SORT_NUMERIC);

            // Term is written only if it's contained in any document
            if (count($collectedDocs) > 0) {
                $this->_writer->addTerm($stream->currentTerm(), $collectedDocs);
            }
            $collectedDocs = array();
        }

        $stream->nextTerm();
        if ($stream->currentTerm() !== null) {
            // Segment dictionary is not finished, put it back into the queue
            $queue->put($stream);
        }
    }

    $this->_writer->closeDictionaryFiles();
}
/**
 * Add field to the segment.
 *
 * A field name is registered only once. If it's already known, its
 * isIndexed and storeTermVector flags are OR-ed with the flags of the
 * given field.
 *
 * Returns actual field number
 *
 * @param \ZendSearch\Lucene\Document\Field $field
 * @return integer
 */
public function addField(Document\Field $field)
{
    $fieldName = $field->name;

    if (isset($this->_fields[$fieldName])) {
        $registered = $this->_fields[$fieldName];
        $registered->isIndexed       |= $field->isIndexed;
        $registered->storeTermVector |= $field->storeTermVector;

        return $registered->number;
    }

    // New field, it gets the next free number
    $newNumber = count($this->_fields);
    $this->_fields[$fieldName] = new Index\FieldInfo(
        $fieldName,
        $field->isIndexed,
        $newNumber,
        $field->storeTermVector
    );

    return $newNumber;
}
/**
 * Add stored fields information.
 *
 * Writes one document record to the '.fdt' file and its start offset to
 * the '.fdx' file. Both files are created on the first call.
 *
 * @param array $storedFields array of \ZendSearch\Lucene\Document\Field objects
 */
public function addStoredFields($storedFields)
{
    if (null === $this->_fdxFile) {
        $indexName = $this->_name . '.fdx';
        $dataName  = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($indexName);
        $this->_fdtFile = $this->_directory->createFile($dataName);

        $this->_files[] = $indexName;
        $this->_files[] = $dataName;
    }

    // Index entry points to the start of the document record
    $this->_fdxFile->writeLong($this->_fdtFile->tell());

    $this->_fdtFile->writeVInt(count($storedFields));
    foreach ($storedFields as $storedField) {
        $this->_fdtFile->writeVInt($this->_fields[$storedField->name]->number);

        // 0x01 - tokenized, 0x02 - binary, 0x04 - compressed (ZLIB), never set here
        $flags = 0x00;
        if ($storedField->isTokenized) {
            $flags |= 0x01;
        }
        if ($storedField->isBinary) {
            $flags |= 0x02;
        }
        $this->_fdtFile->writeByte($flags);

        if ($storedField->isBinary) {
            $this->_fdtFile->writeVInt(strlen($storedField->value));
            $this->_fdtFile->writeBytes($storedField->value);
            continue;
        }

        $this->_fdtFile->writeString($storedField->getUtf8Value());
    }

    $this->_docCount++;
}
/**
 * Create dictionary, frequency and positions files and write necessary headers.
 *
 * '.tis' and '.tii' get the same header; the terms count in it is a
 * placeholder which is overwritten by closeDictionaryFiles(). The '.tii'
 * file additionally gets a special start entry (empty term).
 */
public function initializeDictionaryFiles()
{
    $segmentName = $this->_name;

    foreach (array('_tisFile' => '.tis', '_tiiFile' => '.tii') as $property => $extension) {
        $this->$property = $this->_directory->createFile($segmentName . $extension);

        $dictionaryFile = $this->$property;
        $dictionaryFile->writeInt((int)0xFFFFFFFD);
        $dictionaryFile->writeLong(0 /* dummy data for terms count */);
        $dictionaryFile->writeInt(self::$indexInterval);
        $dictionaryFile->writeInt(self::$skipInterval);
        $dictionaryFile->writeInt(self::$maxSkipLevels);
    }

    /** Dump dictionary header */
    $this->_tiiFile->writeVInt(0);                // prefix length
    $this->_tiiFile->writeString('');             // suffix
    $this->_tiiFile->writeInt((int)0xFFFFFFFF);   // field number
    $this->_tiiFile->writeByte((int)0x0F);
    $this->_tiiFile->writeVInt(0);                // DocFreq
    $this->_tiiFile->writeVInt(0);                // FreqDelta
    $this->_tiiFile->writeVInt(0);                // ProxDelta
    $this->_tiiFile->writeVInt(24);               // IndexDelta

    $this->_frqFile = $this->_directory->createFile($segmentName . '.frq');
    $this->_prxFile = $this->_directory->createFile($segmentName . '.prx');

    foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
        $this->_files[] = $segmentName . $extension;
    }

    // Nothing is written yet
    $this->_prevTerm          = null;
    $this->_prevTermInfo      = null;
    $this->_prevIndexTerm     = null;
    $this->_prevIndexTermInfo = null;
    $this->_lastIndexPosition = 24;
    $this->_termCount         = 0;
}
/**
 * Add term.
 *
 * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * Document ids/frequencies go to the '.frq' file, positions go to the '.prx'
 * file (both delta-encoded), and the term itself goes to the '.tis' file.
 * Every self::$indexInterval-th term is also written to the '.tii' file.
 *
 * @param \ZendSearch\Lucene\Index\Term $termEntry
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $freqStart = $this->_frqFile->tell();
    $proxStart = $this->_prxFile->tell();

    $previousDocId = 0;
    foreach ($termDocs as $docId => $positions) {
        $encodedDelta  = ($docId - $previousDocId)*2;
        $previousDocId = $docId;

        $frequency = count($positions);
        if ($frequency > 1) {
            $this->_frqFile->writeVInt($encodedDelta);
            $this->_frqFile->writeVInt($frequency);
        } else {
            // Low bit set: frequency value is omitted
            $this->_frqFile->writeVInt($encodedDelta + 1);
        }

        $previousPosition = 0;
        foreach ($positions as $position) {
            $this->_prxFile->writeVInt($position - $previousPosition);
            $previousPosition = $position;
        }
    }

    $docFreq = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFreq >= self::$skipInterval)
                ? $this->_frqFile->tell() - $freqStart
                : 0;

    $numberedTerm = new Index\Term($termEntry->text, $this->_fields[$termEntry->field]->number);
    $termInfo     = new Index\TermInfo($docFreq, $freqStart, $proxStart, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $numberedTerm, $this->_prevTermInfo, $termInfo);

    $isIndexedTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
    if ($isIndexedTerm) {
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $numberedTerm, $this->_prevIndexTermInfo, $termInfo);

        // Index entry also stores '.tis' position delta
        $tisPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $tisPosition;
    }

    $this->_termCount++;
}
/**
 * Dump Term Dictionary segment file entry.
 * Used to write entry to .tis or .tii files
 *
 * Entry layout, in write order: prefix length (in UTF-8 characters shared
 * with the previous term of the same field), suffix, field number, DocFreq,
 * FreqDelta, ProxDelta and - only when it's not 0 - SkipOffset.
 *
 * $prevTerm and $prevTermInfo are passed by reference and are replaced with
 * $term and $termInfo, so the next call encodes its entry against this one.
 *
 * @param \ZendSearch\Lucene\Storage\File\FileInterface $dicFile
 * @param \ZendSearch\Lucene\Index\Term     $prevTerm
 * @param \ZendSearch\Lucene\Index\Term     $term
 * @param \ZendSearch\Lucene\Index\TermInfo $prevTermInfo
 * @param \ZendSearch\Lucene\Index\TermInfo $termInfo
 */
protected function _dumpTermDictEntry(File\FileInterface $dicFile,
                                      &$prevTerm,     Index\Term     $term,
                                      &$prevTermInfo, Index\TermInfo $termInfo)
{
    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        // Count leading bytes shared by previous and current term text
        $matchedBytes = 0;
        $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
        while ($matchedBytes < $maxBytes  &&
               $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
            $matchedBytes++;
        }

        // Calculate actual matched UTF-8 pattern:
        // convert matched bytes into a number of whole characters
        $prefixBytes = 0;
        $prefixChars = 0;
        while ($prefixBytes < $matchedBytes) {
            // Sequence length is derived from the bits of the lead byte
            $charBytes = 1;
            if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                $charBytes++;
                if (ord($term->text[$prefixBytes]) & 0x20 ) {
                    $charBytes++;
                    if (ord($term->text[$prefixBytes]) & 0x10 ) {
                        $charBytes++;
                    }
                }
            }

            if ($prefixBytes + $charBytes > $matchedBytes) {
                // char crosses matched bytes boundary
                // skip char
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        // Write prefix length
        $dicFile->writeVInt($prefixChars);
        // Write suffix
        $dicFile->writeString(substr($term->text, $prefixBytes));
    } else {
        // First entry or another field: nothing is shared with the previous term
        // Write prefix length
        $dicFile->writeVInt(0);
        // Write suffix
        $dicFile->writeString($term->text);
    }
    // Write field number
    $dicFile->writeVInt($term->field);
    // DocFreq (the count of documents which contain the term)
    $dicFile->writeVInt($termInfo->docFreq);

    $prevTerm = $term;

    if (!isset($prevTermInfo)) {
        // No previous entry: pointers are written as they are
        // Write FreqDelta
        $dicFile->writeVInt($termInfo->freqPointer);
        // Write ProxDelta
        $dicFile->writeVInt($termInfo->proxPointer);
    } else {
        // Write FreqDelta
        $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
        // Write ProxDelta
        $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
    }
    // Write SkipOffset - it's written only when it's not 0
    // (addTerm() sets it when $termInfo->docFreq >= self::$skipInterval)
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}
* * @param \ZendSearch\Lucene\Document $document * @throws LuceneException\UnsupportedMethodCallException */ public function addDocument(Document $document) { $storedFields = array(); $docNorms = array(); $similarity = AbstractSimilarity::getDefault(); foreach ($document->getFieldNames() as $fieldName) { $field = $document->getField($fieldName); if ($field->storeTermVector) { /** * @todo term vector storing support */ throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.'); } if ($field->isIndexed) { if ($field->isTokenized) { $analyzer = Analyzer\Analyzer::getDefault(); $analyzer->setInput($field->value, $field->encoding); $position = 0; $tokenCounter = 0; while (($token = $analyzer->nextToken()) !== null) { $tokenCounter++; $term = new Index\Term($token->getTermText(), $field->name); $termKey = $term->key(); if (!isset($this->_termDictionary[$termKey])) { // New term $this->_termDictionary[$termKey] = $term; $this->_termDocs[$termKey] = array(); $this->_termDocs[$termKey][$this->_docCount] = array(); } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) { // Existing term, but new term entry $this->_termDocs[$termKey][$this->_docCount] = array(); } $position += $token->getPositionIncrement(); $this->_termDocs[$termKey][$this->_docCount][] = $position; } if ($tokenCounter == 0) { // Field contains empty value. Treat it as non-indexed and non-tokenized $field = clone($field); $field->isIndexed = $field->isTokenized = false; } else { $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, $tokenCounter)* $document->boost* $field->boost )); } } elseif (($fieldUtf8Value = $field->getUtf8Value()) == '') { // Field contains empty value. 
/**
 * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files.
 *
 * Terms are written in the order of their keys (sorted as strings).
 */
protected function _dumpDictionary()
{
    ksort($this->_termDictionary, SORT_STRING);

    $this->initializeDictionaryFiles();

    $sortedKeys = array_keys($this->_termDictionary);
    foreach ($sortedKeys as $termKey) {
        $term     = $this->_termDictionary[$termKey];
        $termDocs = $this->_termDocs[$termKey];

        $this->addTerm($term, $termDocs);
    }

    $this->closeDictionaryFiles();
}
/**
 * Get term prefix.
 *
 * Returns at most $length leading UTF-8 characters of $str. Length of each
 * character is derived from its lead byte; a character which is cut off by
 * the end of the string is not included.
 *
 * @param string $str
 * @param integer $length
 * @return string
 */
public static function getPrefix($str, $length)
{
    /**
     * @todo !!!!!!! use mb_string or iconv functions if they are available
     */
    $offset = 0;

    for ($taken = 0; $taken < $length && isset($str[$offset]); $taken++) {
        $leadByte = ord($str[$offset]);

        if (($leadByte & 0xC0) != 0xC0) {
            // Single byte (or stray continuation byte)
            $width = 1;
        } elseif (!($leadByte & 0x20)) {
            $width = 2;
        } elseif (!($leadByte & 0x10)) {
            $width = 3;
        } else {
            $width = 4;
        }

        if (!isset($str[$offset + $width - 1])) {
            // wrong character
            break;
        }

        $offset += $width;
    }

    return substr($str, 0, $offset);
}
*
     * @var integer
     */
    private $_versionUpdate = 0;

    /**
     * Segments created by this writer that are not yet listed in the segments file.
     * Maps segment name => SegmentInfo object.
     *
     * @var array
     */
    private $_newSegments = array();

    /**
     * Names of the segments that have to be dropped at the next commit
     * (segment name => segment name)
     *
     * @var array
     */
    private $_segmentsToDelete = array();

    /**
     * Segment writer collecting the documents added since the last commit,
     * or null when no segment is open
     *
     * @var \ZendSearch\Lucene\Index\SegmentWriter\DocumentWriter
     */
    private $_currentSegment = null;

    /**
     * SegmentInfo objects of this index, keyed by segment name.
     *
     * Stored by reference: it is the very array passed to the constructor,
     * i.e. the corresponding Zend_Search_Lucene::$_segmentInfos array
     *
     * @var array|\ZendSearch\Lucene\Index\SegmentInfo
     */
    private $_segmentInfos;

    /**
     * Format version the index is written in
     *
     * @var integer
     */
    private $_targetFormatVersion;

    /**
     * Known index file extensions (extension => extension, for isset() lookups)
     *
     * @var array
     */
    private static $_indexExtensions = array('.cfs' => '.cfs',
                                             '.cfx' => '.cfx',
                                             '.fnm' => '.fnm',
                                             '.fdx' => '.fdx',
                                             '.fdt' => '.fdt',
                                             '.tis' => '.tis',
                                             '.tii' => '.tii',
                                             '.frq' => '.frq',
                                             '.prx' => '.prx',
                                             '.tvx' => '.tvx',
                                             '.tvd' => '.tvd',
                                             '.tvf' => '.tvf',
                                             '.del' => '.del',
                                             '.sti' => '.sti' );

    /**
     * Create empty index
     *
     * Generation 0 produces the pre-2.1 layout ('segments' + 'deletable' files, any
     * existing index files are removed first); any other generation produces a
     * 'segments.gen' file plus the generation specific segments file.
     *
     * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
     * @param integer $generation
     * @param integer $nameCount
     */
    public static function createIndex(Directory\DirectoryInterface $directory, $generation, $nameCount)
    {
        if ($generation != 0) {
            $genFile = $directory->createFile('segments.gen');

            $genFile->writeInt((int)0xFFFFFFFE);
            // The generation is stored twice
            $genFile->writeLong($generation);
            $genFile->writeLong($generation);

            $segmentsFile = $directory->createFile(Lucene\Index::getSegmentFileName($generation));
            $segmentsFile->writeInt((int)0xFFFFFFFD);

            // version, initialized by current time
            $segmentsFile->writeLong(round(microtime(true)));

            // name counter
            $segmentsFile->writeInt($nameCount);
            // segment counter
            $segmentsFile->writeInt(0);

            return;
        }

        // Pre-2.1 mode: wipe everything that looks like an index file first
        foreach ($directory->fileList() as $file) {
            $isIndexFile = $file == 'deletable'
                        || $file == 'segments'
                        || isset(self::$_indexExtensions[ substr($file, strlen($file)-4)])
                        || preg_match('/\.f\d+$/i', $file) /* matches '<segment_name>.f<decimal_number>' file names */;

            if ($isIndexFile) {
                $directory->deleteFile($file);
            }
        }

        $segmentsFile = $directory->createFile('segments');
        $segmentsFile->writeInt((int)0xFFFFFFFF);

        // version, initialized by current time
        $segmentsFile->writeLong(round(microtime(true)));

        // name counter
        $segmentsFile->writeInt($nameCount);
        // segment counter
        $segmentsFile->writeInt(0);

        $deletableFile = $directory->createFile('deletable');
        // counter
        $deletableFile->writeInt(0);
    }

    /**
     * Open the index for writing
     *
     * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
     * @param array $segmentInfos          segment list, taken by reference and kept in sync by the writer
     * @param integer $targetFormatVersion
     */
    public function __construct(Directory\DirectoryInterface $directory, &$segmentInfos, $targetFormatVersion)
    {
        $this->_directory           = $directory;
        $this->_segmentInfos        = &$segmentInfos;
        $this->_targetFormatVersion = $targetFormatVersion;
    }

    /**
     * Adds a document to this index.
*
     * @param \ZendSearch\Lucene\Document $document
     */
    public function addDocument(Document $document)
    {
        if ($this->_currentSegment === null) {
            $this->_currentSegment =
                new SegmentWriter\DocumentWriter($this->_directory, $this->_newSegmentName());
        }
        $this->_currentSegment->addDocument($document);

        // Flush the in-memory segment once it holds maxBufferedDocs documents
        if ($this->_currentSegment->count() >= $this->maxBufferedDocs) {
            $this->commit();
        }

        $this->_maybeMergeSegments();

        $this->_versionUpdate++;
    }

    /**
     * Check if we have anything to merge
     *
     * Walks the segments from the smallest to the largest one, grouping them into
     * size levels (maxBufferedDocs, maxBufferedDocs*mergeFactor, ...), and reports
     * whether any level has accumulated enough documents to be merged.
     * Levels above maxMergeDocs are never merged.
     *
     * @return boolean
     */
    private function _hasAnythingToMerge()
    {
        $segmentSizes = array();
        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
            $segmentSizes[$segName] = $segmentInfo->count();
        }

        $poolSize    = 0;
        $sizeToMerge = $this->maxBufferedDocs;
        asort($segmentSizes, SORT_NUMERIC);
        foreach ($segmentSizes as $size) {
            // Check, if segment comes into a new merging block
            while ($size >= $sizeToMerge) {
                // Merge previous block if it's large enough
                if ($poolSize >= $sizeToMerge) {
                    return true;
                }
                $poolSize = 0;

                $sizeToMerge *= $this->mergeFactor;

                if ($sizeToMerge > $this->maxMergeDocs) {
                    return false;
                }
            }

            $poolSize += $size;
        }

        if ($poolSize >= $sizeToMerge) {
            return true;
        }

        return false;
    }

    /**
     * Merge segments if necessary
     *
     * Does nothing if the optimization lock is held by somebody else.
     * The optimization lock is released on every exit path, including the case
     * when refreshing or merging segments throws; the exception is re-thrown unchanged.
     *
     * @throws \Exception whatever _hasAnythingToMerge(), _updateSegments() or _mergeSegments() throw
     */
    private function _maybeMergeSegments()
    {
        if (Lucene\LockManager::obtainOptimizationLock($this->_directory) === false) {
            return;
        }

        try {
            if (!$this->_hasAnythingToMerge()) {
                Lucene\LockManager::releaseOptimizationLock($this->_directory);
                return;
            }

            // Update segments list to be sure all segments are not merged yet by another process
            //
            // Segment merging functionality is concentrated in this class and surrounded
            // by optimization lock obtaining/releasing.
            // _updateSegments() refreshes segments list from the latest index generation.
            // So only new segments can be added to the index while we are merging some already existing
            // segments.
            // Newly added segments will be also included into the index by the _updateSegments() call
            // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
            // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
            $this->_updateSegments();

            // Perform standard auto-optimization procedure
            $segmentSizes = array();
            foreach ($this->_segmentInfos as $segName => $segmentInfo) {
                $segmentSizes[$segName] = $segmentInfo->count();
            }

            $mergePool   = array();
            $poolSize    = 0;
            $sizeToMerge = $this->maxBufferedDocs;
            asort($segmentSizes, SORT_NUMERIC);
            foreach ($segmentSizes as $segName => $size) {
                // Check, if segment comes into a new merging block
                while ($size >= $sizeToMerge) {
                    // Merge previous block if it's large enough
                    if ($poolSize >= $sizeToMerge) {
                        $this->_mergeSegments($mergePool);
                    }
                    $mergePool = array();
                    $poolSize  = 0;

                    $sizeToMerge *= $this->mergeFactor;

                    if ($sizeToMerge > $this->maxMergeDocs) {
                        Lucene\LockManager::releaseOptimizationLock($this->_directory);
                        return;
                    }
                }

                $mergePool[] = $this->_segmentInfos[$segName];
                $poolSize += $size;
            }

            if ($poolSize >= $sizeToMerge) {
                $this->_mergeSegments($mergePool);
            }
        } catch (\Exception $e) {
            // Do not keep the index locked against optimization after a failure
            Lucene\LockManager::releaseOptimizationLock($this->_directory);
            throw $e;
        }

        Lucene\LockManager::releaseOptimizationLock($this->_directory);
    }

    /**
     * Merge specified segments
     *
     * The source segments are scheduled for deletion, the merged segment (if any)
     * is registered as a new one and the change is committed immediately.
     *
     * $segments is an array of SegmentInfo objects
     *
     * @param array $segments
     */
    private function _mergeSegments($segments)
    {
        $newName = $this->_newSegmentName();

        $merger = new SegmentMerger($this->_directory, $newName);
        foreach ($segments as $segmentInfo) {
            $merger->addSource($segmentInfo);
            $this->_segmentsToDelete[$segmentInfo->getName()] = $segmentInfo->getName();
        }

        $newSegment = $merger->merge();
        if ($newSegment !== null) {
            $this->_newSegments[$newSegment->getName()] = $newSegment;
        }

        $this->commit();
    }

    /**
     * Update segments file by adding current segment to a list
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
     */
private function _updateSegments()
    {
        // Writes the next generation of the segments file under the exclusive write lock:
        //  1. flush pending per-segment changes;
        //  2. copy the entries of the current segments file, skipping segments scheduled for deletion;
        //  3. append the segments created by this writer;
        //  4. publish the new generation through 'segments.gen';
        //  5. if the read lock can be escalated, delete the files nobody needs any more.
        // Any failure in steps 2-3 rolls 'segments.gen' back to the previous generation and is
        // re-thrown wrapped into a RuntimeException (the original exception is kept as "previous").

        // Get an exclusive index lock
        Lucene\LockManager::obtainWriteLock($this->_directory);

        // Write down changes for the segments
        foreach ($this->_segmentInfos as $segInfo) {
            $segInfo->writeChanges();
        }

        $generation = Lucene\Index::getActualGeneration($this->_directory);
        $segmentsFile   = $this->_directory->getFileObject(Lucene\Index::getSegmentFileName($generation), false);
        $newSegmentFile = $this->_directory->createFile(Lucene\Index::getSegmentFileName(++$generation), false);

        try {
            $genFile = $this->_directory->getFileObject('segments.gen', false);
        } catch (ExceptionInterface $e) {
            if (strpos($e->getMessage(), 'is not readable') !== false) {
                $genFile = $this->_directory->createFile('segments.gen');
            } else {
                // Nothing has been published yet, so just give the write lock back before failing
                Lucene\LockManager::releaseWriteLock($this->_directory);

                throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
            }
        }

        $genFile->writeInt((int)0xFFFFFFFE);
        // Write generation (first copy)
        $genFile->writeLong($generation);

        try {
            // Write format marker
            if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_1) {
                $newSegmentFile->writeInt((int)0xFFFFFFFD);
            } elseif ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
                $newSegmentFile->writeInt((int)0xFFFFFFFC);
            }

            // Read src file format identifier
            $format = $segmentsFile->readInt();
            if ($format == (int)0xFFFFFFFF) {
                $srcFormat = Lucene\Index::FORMAT_PRE_2_1;
            } elseif ($format == (int)0xFFFFFFFD) {
                $srcFormat = Lucene\Index::FORMAT_2_1;
            } elseif ($format == (int)0xFFFFFFFC) {
                $srcFormat = Lucene\Index::FORMAT_2_3;
            } else {
                throw new InvalidFileFormatException('Unsupported segments file format');
            }

            $version = $segmentsFile->readLong() + $this->_versionUpdate;
            $this->_versionUpdate = 0;
            $newSegmentFile->writeLong($version);

            // Write segment name counter
            $newSegmentFile->writeInt($segmentsFile->readInt());

            // Get number of segments offset
            $numOfSegmentsOffset = $newSegmentFile->tell();
            // Write dummy data (segment counter)
            $newSegmentFile->writeInt(0);

            // Read number of segments
            $segmentsCount = $segmentsFile->readInt();

            $segments = array();
            for ($count = 0; $count < $segmentsCount; $count++) {
                $segName = $segmentsFile->readString();
                $segSize = $segmentsFile->readInt();

                if ($srcFormat == Lucene\Index::FORMAT_PRE_2_1) {
                    // pre-2.1 index format
                    $delGen            = 0;
                    $hasSingleNormFile = false;
                    $numField          = (int)0xFFFFFFFF;
                    $isCompoundByte    = 0;
                    $docStoreOptions   = null;
                } else {
                    $delGen = $segmentsFile->readLong();

                    if ($srcFormat == Lucene\Index::FORMAT_2_3) {
                        $docStoreOffset = $segmentsFile->readInt();

                        if ($docStoreOffset != (int)0xFFFFFFFF) {
                            $docStoreSegment        = $segmentsFile->readString();
                            $docStoreIsCompoundFile = $segmentsFile->readByte();

                            $docStoreOptions = array('offset'     => $docStoreOffset,
                                                     'segment'    => $docStoreSegment,
                                                     'isCompound' => ($docStoreIsCompoundFile == 1));
                        } else {
                            $docStoreOptions = null;
                        }
                    } else {
                        $docStoreOptions = null;
                    }

                    $hasSingleNormFile = $segmentsFile->readByte();
                    $numField          = $segmentsFile->readInt();

                    $normGens = array();
                    if ($numField != (int)0xFFFFFFFF) {
                        for ($count1 = 0; $count1 < $numField; $count1++) {
                            $normGens[] = $segmentsFile->readLong();
                        }
                    }
                    $isCompoundByte = $segmentsFile->readByte();
                }

                // _segmentsToDelete is keyed by segment name, so a key lookup is sufficient
                if (!isset($this->_segmentsToDelete[$segName])) {
                    // Load segment if necessary
                    if (!isset($this->_segmentInfos[$segName])) {
                        if ($isCompoundByte == 0xFF) {
                            // The segment is not a compound file
                            $isCompound = false;
                        } elseif ($isCompoundByte == 0x01) {
                            // The segment is a compound file
                            $isCompound = true;
                        } else {
                            // 0x00: the status is unknown.
                            // Any other marker value is treated as unknown too, instead of
                            // silently reusing the value computed for a previous segment.
                            $isCompound = null;
                        }

                        $this->_segmentInfos[$segName] =
                                    new SegmentInfo($this->_directory,
                                                    $segName,
                                                    $segSize,
                                                    $delGen,
                                                    $docStoreOptions,
                                                    $hasSingleNormFile,
                                                    $isCompound);
                    } else {
                        // Retrieve actual deletions file generation number
                        $delGen = $this->_segmentInfos[$segName]->getDelGen();
                    }

                    $newSegmentFile->writeString($segName);
                    $newSegmentFile->writeInt($segSize);
                    $newSegmentFile->writeLong($delGen);
                    if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
                        if ($docStoreOptions !== null) {
                            $newSegmentFile->writeInt($docStoreOffset);
                            $newSegmentFile->writeString($docStoreSegment);
                            $newSegmentFile->writeByte($docStoreIsCompoundFile);
                        } else {
                            // Set DocStoreOffset to -1
                            $newSegmentFile->writeInt((int)0xFFFFFFFF);
                        }
                    } elseif ($docStoreOptions !== null) {
                        // The write lock is released by the catch block below, after
                        // 'segments.gen' has been rolled back under its protection
                        throw new RuntimeException('Index conversion to lower format version is not supported.');
                    }

                    $newSegmentFile->writeByte($hasSingleNormFile);
                    $newSegmentFile->writeInt($numField);
                    if ($numField != (int)0xFFFFFFFF) {
                        foreach ($normGens as $normGen) {
                            $newSegmentFile->writeLong($normGen);
                        }
                    }
                    $newSegmentFile->writeByte($isCompoundByte);

                    $segments[$segName] = $segSize;
                }
            }
            $segmentsFile->close();

            $segmentsCount = count($segments) + count($this->_newSegments);

            foreach ($this->_newSegments as $segName => $segmentInfo) {
                $newSegmentFile->writeString($segName);
                $newSegmentFile->writeInt($segmentInfo->count());

                // delete file generation: -1 (there is no delete file yet), written as two 32-bit halves
                $newSegmentFile->writeInt((int)0xFFFFFFFF);
                $newSegmentFile->writeInt((int)0xFFFFFFFF);
                if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
                    // docStoreOffset: -1 (segment doesn't use shared doc store)
                    $newSegmentFile->writeInt((int)0xFFFFFFFF);
                }
                // HasSingleNormFile
                $newSegmentFile->writeByte($segmentInfo->hasSingleNormFile());
                // NumField
                $newSegmentFile->writeInt((int)0xFFFFFFFF);
                // IsCompoundFile
                $newSegmentFile->writeByte($segmentInfo->isCompound() ? 1 : -1);

                $segments[$segmentInfo->getName()] = $segmentInfo->count();
                $this->_segmentInfos[$segName] = $segmentInfo;
            }
            $this->_newSegments = array();

            $newSegmentFile->seek($numOfSegmentsOffset);
            $newSegmentFile->writeInt($segmentsCount);  // Update segments count
            $newSegmentFile->close();
        } catch (\Exception $e) {
            /** Restore previous index generation */
            $generation--;
            $genFile->seek(4, SEEK_SET);
            // Write generation number twice
            $genFile->writeLong($generation);
            $genFile->writeLong($generation);

            // Release index write lock
            Lucene\LockManager::releaseWriteLock($this->_directory);

            // Throw the exception
            throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
        }

        // Write generation (second copy)
        $genFile->writeLong($generation);


        // Check if another update or read process is not running now
        // If yes, skip clean-up procedure
        if (Lucene\LockManager::escalateReadLock($this->_directory)) {
            /**
             * Clean-up directory
             */
            $filesToDelete = array();
            $filesTypes    = array();
            $filesNumbers  = array();

            // list of .del files of currently used segments
            // each segment can have several generations of .del files
            // only last should not be deleted
            $delFiles = array();

            foreach ($this->_directory->fileList() as $file) {
                if ($file == 'deletable') {
                    // 'deletable' file
                    $filesToDelete[] = $file;
                    $filesTypes[]    = 0; // delete this file first, since it's not used starting from Lucene v2.1
                    $filesNumbers[]  = 0;
                } elseif ($file == 'segments') {
                    // 'segments' file
                    $filesToDelete[] = $file;
                    $filesTypes[]    = 1; // second file to be deleted "zero" version of segments file (Lucene pre-2.1)
                    $filesNumbers[]  = 0;
                } elseif (preg_match('/^segments_[a-zA-Z0-9]+$/i', $file)) {
                    // 'segments_xxx' file
                    // Check if it's not a just created generation file
                    if ($file != Lucene\Index::getSegmentFileName($generation)) {
                        $filesToDelete[] = $file;
                        $filesTypes[]    = 2; // first group of files for deletions
                        $filesNumbers[]  = (int)base_convert(substr($file, 9), 36, 10); // ordered by segment generation numbers
                    }
                } elseif (preg_match('/(^_([a-zA-Z0-9]+))\.f\d+$/i', $file, $matches)) {
                    // one of per segment files ('<segment_name>.f<decimal_number>')
                    // Check if it's not one of the segments in the current segments set
                    if (!isset($segments[$matches[1]])) {
                        $filesToDelete[] = $file;
                        $filesTypes[]    = 3; // second group of files for deletions
                        $filesNumbers[]  = (int)base_convert($matches[2], 36, 10); // order by segment number
                    }
                } elseif (preg_match('/(^_([a-zA-Z0-9]+))(_([a-zA-Z0-9]+))\.del$/i', $file, $matches)) {
                    // one of per segment files ('<segment_name>_<del_generation>.del' where <segment_name> is '_<segment_number>')
                    // Check if it's not one of the segments in the current segments set
                    if (!isset($segments[$matches[1]])) {
                        $filesToDelete[] = $file;
                        $filesTypes[]    = 3; // second group of files for deletions
                        $filesNumbers[]  = (int)base_convert($matches[2], 36, 10); // order by segment number
                    } else {
                        $segmentNumber = (int)base_convert($matches[2], 36, 10);
                        $delGeneration = (int)base_convert($matches[4], 36, 10);
                        if (!isset($delFiles[$segmentNumber])) {
                            $delFiles[$segmentNumber] = array();
                        }
                        $delFiles[$segmentNumber][$delGeneration] = $file;
                    }
                } elseif (isset(self::$_indexExtensions[substr($file, strlen($file)-4)])) {
                    // one of per segment files ('<segment_name>.<ext>')
                    $segmentName = substr($file, 0, strlen($file) - 4);
                    // Check if it's not one of the segments in the current segments set
                    if (!isset($segments[$segmentName])  &&
                        ($this->_currentSegment === null  ||  $this->_currentSegment->getName() != $segmentName)) {
                        $filesToDelete[] = $file;
                        $filesTypes[]    = 3; // second group of files for deletions
                        $filesNumbers[]  = (int)base_convert(substr($file, 1 /* skip '_' */, strlen($file)-5), 36, 10); // order by segment number
                    }
                }
            }

            $maxGenNumber = 0;
            // process .del files of currently used segments
            foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
                ksort($delFiles[$segmentNumber], SORT_NUMERIC);
                array_pop($delFiles[$segmentNumber]); // remove last delete file generation from candidates for deleting

                end($delFiles[$segmentNumber]);
                $lastGenNumber = key($delFiles[$segmentNumber]);
                if ($lastGenNumber > $maxGenNumber) {
                    $maxGenNumber = $lastGenNumber;
                }
            }
            foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
                foreach ($segmentDelFiles as $delGeneration => $file) {
                    $filesToDelete[] = $file;
                    $filesTypes[]    = 4; // third group of files for deletions
                    $filesNumbers[]  = $segmentNumber*$maxGenNumber + $delGeneration; // order by <segment_number>, <del_generation> pair
                }
            }

            // Reorder files for deleting
            array_multisort($filesTypes,    SORT_ASC, SORT_NUMERIC,
                            $filesNumbers,  SORT_ASC, SORT_NUMERIC,
                            $filesToDelete, SORT_ASC, SORT_STRING);

            foreach ($filesToDelete as $file) {
                try {
                    /** Skip shared docstore segments deleting */
                    /** @todo Process '.cfx' files to check if them are already unused */
                    if (substr($file, strlen($file)-4) != '.cfx') {
                        $this->_directory->deleteFile($file);
                    }
                } catch (ExceptionInterface $e) {
                    if (strpos($e->getMessage(), 'Can\'t delete file') === false) {
                        // That's not "file is under processing or already deleted" exception
                        // Pass it through
                        throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
                    }
                }
            }

            // Return read lock into the previous state
            Lucene\LockManager::deEscalateReadLock($this->_directory);
        } else {
            // Only release resources if another index reader is running now
            foreach ($this->_segmentsToDelete as $segName) {
                foreach (self::$_indexExtensions as $ext) {
                    $this->_directory->purgeFile($segName . $ext);
                }
            }
        }

        // Clean-up _segmentsToDelete container
        $this->_segmentsToDelete = array();


        // Release index write lock
        Lucene\LockManager::releaseWriteLock($this->_directory);

        // Remove unused segments from segments list
        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
            if (!isset($segments[$segName])) {
                unset($this->_segmentInfos[$segName]);
            }
        }
    }

    /**
     * Commit current changes
     *
     * Closes the segment being built (if any), registers it as a new segment
     * and writes a new generation of the segments file.
     */
    public function commit()
    {
        if ($this->_currentSegment !== null) {
            $newSegment = $this->_currentSegment->close();
            if ($newSegment !== null) {
                $this->_newSegments[$newSegment->getName()] = $newSegment;
            }
            $this->_currentSegment = null;
        }

        $this->_updateSegments();
    }


    /**
     * Merges the provided indexes into this index.
     *
     * Not implemented yet: the call currently does nothing.
     *
     * @param array $readers
     * @return void
     */
    public function addIndexes($readers)
    {
        /**
         * @todo implementation
         */
    }

    /**
     * Merges all segments together into new one
     *
     * Returns true on success and false if another optimization or auto-optimization process
     * is running now
     *
     * @return boolean
     */
    public function optimize()
    {
        if (Lucene\LockManager::obtainOptimizationLock($this->_directory) === false) {
            return false;
        }

        // Update segments list to be sure all segments are not merged yet by another process
        //
        // Segment merging functionality is concentrated in this class and surrounded
        // by optimization lock obtaining/releasing.
        // _updateSegments() refreshes segments list from the latest index generation.
        // So only new segments can be added to the index while we are merging some already existing
        // segments.
        // Newly added segments will be also included into the index by the _updateSegments() call
        // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
        // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
$this->_updateSegments(); $this->_mergeSegments($this->_segmentInfos); Lucene\LockManager::releaseOptimizationLock($this->_directory); return true; } /** * Get name for new segment * * @return string */ private function _newSegmentName() { Lucene\LockManager::obtainWriteLock($this->_directory); $generation = Lucene\Index::getActualGeneration($this->_directory); $segmentsFile = $this->_directory->getFileObject(Lucene\Index::getSegmentFileName($generation), false); $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) $segmentNameCounter = $segmentsFile->readInt(); $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version) $segmentsFile->writeInt($segmentNameCounter + 1); // Flash output to guarantee that wrong value will not be loaded between unlock and // return (which calls $segmentsFile destructor) $segmentsFile->flush(); Lucene\LockManager::releaseWriteLock($this->_directory); return '_' . base_convert($segmentNameCounter, 10, 36); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/LockManager.php000066400000000000000000000170541245775125600254160ustar00rootroot00000000000000createFile(self::WRITE_LOCK_FILE); if (!$lock->lock(LOCK_EX)) { throw new RuntimeException('Can\'t obtain exclusive index lock'); } return $lock; } /** * Release exclusive write lock * * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory */ public static function releaseWriteLock(Directory $lockDirectory) { $lock = $lockDirectory->getFileObject(self::WRITE_LOCK_FILE); $lock->unlock(); } /** * Obtain the exclusive "read escalation/de-escalation" lock * * Required to protect the escalate/de-escalate read lock process * on GFS (and potentially other) mounted filesystems. * * Why we need this: * While GFS supports cluster-wide locking via flock(), it's * implementation isn't quite what it should be. The locking * semantics that work consistently on a local filesystem tend to * fail on GFS mounted filesystems. 
This appears to be a design defect
     * in the implementation of GFS. How this manifests itself is that
     * conditional promotion of a shared lock to exclusive will always
     * fail, lock release requests are honored but not immediately
     * processed (causing erratic failures of subsequent conditional
     * requests) and the releasing of the exclusive lock before the
     * shared lock is set when a lock is demoted (which can open a window
     * of opportunity for another process to gain an exclusive lock when
     * it shouldn't be allowed to).
     *
     * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
     * @return \ZendSearch\Lucene\Storage\File\FileInterface the locked processing-lock file
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the exclusive lock can't be taken
     */
    private static function _startReadLockProcessing(Directory $lockDirectory)
    {
        $processingLock = $lockDirectory->createFile(self::READ_LOCK_PROCESSING_LOCK_FILE);

        if ($processingLock->lock(LOCK_EX)) {
            return $processingLock;
        }

        throw new RuntimeException('Can\'t obtain exclusive lock for the read lock processing file');
    }

    /**
     * Release the exclusive "read escalation/de-escalation" lock
     *
     * Required to protect the escalate/de-escalate read lock process
     * on GFS (and potentially other) mounted filesystems.
*
     * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
     */
    private static function _stopReadLockProcessing(Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::READ_LOCK_PROCESSING_LOCK_FILE)->unlock();
    }


    /**
     * Take the shared read lock on the index
     *
     * Other readers and updaters are not blocked by it; it only keeps the index
     * from being cleaned up prematurely
     *
     * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
     * @return \ZendSearch\Lucene\Storage\File\FileInterface the locked read-lock file
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the shared lock can't be taken
     */
    public static function obtainReadLock(Directory $lockDirectory)
    {
        $readLock = $lockDirectory->createFile(self::READ_LOCK_FILE);

        if ($readLock->lock(LOCK_SH)) {
            return $readLock;
        }

        throw new RuntimeException('Can\'t obtain shared reading index lock');
    }

    /**
     * Drop the shared read lock
     *
     * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
     */
    public static function releaseReadLock(Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::READ_LOCK_FILE)->unlock();
    }

    /**
     * Escalate Read lock to exclusive level
     *
     * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
     * @return boolean
     */
    public static function escalateReadLock(Directory $lockDirectory)
    {
        self::_startReadLockProcessing($lockDirectory);

        $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE);

        // First, release the shared lock for the benefit of GFS since
        // it will fail the conditional request to promote the lock to
        // "exclusive" while the shared lock is held (even when we are
        // the only holder).
        $lock->unlock();

        // GFS is really poor. While the above "unlock" returns, GFS
        // doesn't clean up it's tables right away (which will potentially
        // cause the conditional locking for the "exclusive" lock to fail.
        // We will retry the conditional lock request several times on a
        // failure to get past this.
The performance hit is negligible // in the grand scheme of things and only will occur with GFS // filesystems or if another local process has the shared lock // on local filesystems. for ($retries = 0; $retries < 10; $retries++) { if ($lock->lock(LOCK_EX, true)) { // Exclusive lock is obtained! self::_stopReadLockProcessing($lockDirectory); return true; } // wait 1 microsecond usleep(1); } // Restore lock state $lock->lock(LOCK_SH); self::_stopReadLockProcessing($lockDirectory); return false; } /** * De-escalate Read lock to shared level * * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory */ public static function deEscalateReadLock(Directory $lockDirectory) { $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); $lock->lock(LOCK_SH); } /** * Obtain exclusive optimization lock on the index * * Returns lock object on success and false otherwise (doesn't block execution) * * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory * @return mixed */ public static function obtainOptimizationLock(Directory $lockDirectory) { $lock = $lockDirectory->createFile(self::OPTIMIZATION_LOCK_FILE); if (!$lock->lock(LOCK_EX, true)) { return false; } return $lock; } /** * Release exclusive optimization lock * * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory */ public static function releaseOptimizationLock(Directory $lockDirectory) { $lock = $lockDirectory->getFileObject(self::OPTIMIZATION_LOCK_FILE); $lock->unlock(); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Lucene.php000066400000000000000000000062361245775125600244460ustar00rootroot00000000000000_indices = $indices; foreach ($this->_indices as $index) { if (!$index instanceof SearchIndexInterface) { throw new InvalidArgumentException('sub-index objects have to implement ZendSearch\Lucene\Interface.'); } } } /** * Add index for searching. 
*
     * The index is appended to the list of sub-indices this multi-searcher iterates over.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     */
    public function addIndex(SearchIndexInterface $index)
    {
        $this->_indices[] = $index;
    }


    /**
     * Get current generation number
     *
     * Not supported by the multi-searcher: this implementation always throws.
     *
     * @param Storage\Directory\DirectoryInterface $directory
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public static function getActualGeneration(Storage\Directory\DirectoryInterface $directory)
    {
        throw new UnsupportedMethodCallException("Generation number can't be retrieved for multi-searcher");
    }

    /**
     * Get segments file name
     *
     * Delegates to Index::getSegmentFileName().
     *
     * @param integer $generation
     * @return string
     */
    public static function getSegmentFileName($generation)
    {
        return Index::getSegmentFileName($generation);
    }

    /**
     * Get index format version
     *
     * Not supported by the multi-searcher: this implementation always throws.
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function getFormatVersion()
    {
        throw new UnsupportedMethodCallException("Format version can't be retrieved for multi-searcher");
    }

    /**
     * Set index format version.
     * Index is converted to this format at the nearest update time
     *
     * The value is forwarded to every sub-index.
     *
     * @param int $formatVersion
     */
    public function setFormatVersion($formatVersion)
    {
        foreach ($this->_indices as $index) {
            $index->setFormatVersion($formatVersion);
        }
    }

    /**
     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
     *
     * Not supported by the multi-searcher: this implementation always throws.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
     */
    public function getDirectory()
    {
        throw new UnsupportedMethodCallException("Index directory can't be retrieved for multi-searcher");
    }

    /**
     * Returns the total number of documents in this index (including deleted documents).
*
     * The result is the sum of count() over all sub-indices.
     *
     * @return integer
     */
    public function count()
    {
        $count = 0;

        foreach ($this->_indices as $index) {
            // Ask each sub-index; $this->_indices itself is a plain array and has no count() method
            $count += $index->count();
        }

        return $count;
    }

    /**
     * Returns one greater than the largest possible document number.
     * This may be used to, e.g., determine how big to allocate a structure which will have
     * an element for every document number in an index.
     *
     * @return integer
     */
    public function maxDoc()
    {
        return $this->count();
    }

    /**
     * Returns the total number of non-deleted documents in this index.
     *
     * The result is the sum of numDocs() over all sub-indices.
     *
     * @return integer
     */
    public function numDocs()
    {
        $docs = 0;

        foreach ($this->_indices as $index) {
            $docs += $index->numDocs();
        }

        return $docs;
    }

    /**
     * Checks, that document is deleted
     *
     * Document ids are global: sub-indices are laid out one after another, so the id is
     * reduced by the size of every preceding sub-index until it falls into one of them.
     *
     * @param integer $id
     * @return boolean
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function isDeleted($id)
    {
        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                return $index->isDeleted($id);
            }

            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Retrieve index maxBufferedDocs option
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * Default value is 10
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if there are no sub-indices
     *         or they do not agree on the option value
     */
    public function getMaxBufferedDocs()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $maxBufferedDocs = reset($this->_indices)->getMaxBufferedDocs();

        foreach ($this->_indices as $index) {
            if ($index->getMaxBufferedDocs() !== $maxBufferedDocs) {
                throw new RuntimeException('Indices have different maxBufferedDocs option values.');
            }
        }

        return $maxBufferedDocs;
    }

    /**
     * Set index maxBufferedDocs option
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * Default value is 10
     *
     * @param integer $maxBufferedDocs
     */
public function setMaxBufferedDocs($maxBufferedDocs)
    {
        // The option is simply propagated to every sub-index.
        foreach ($this->_indices as $index) {
            $index->setMaxBufferedDocs($maxBufferedDocs);
        }
    }

    /**
     * Retrieve index maxMergeDocs option
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
     * Larger values are best for batched indexing and speedier searches.
     *
     * Default value is PHP_INT_MAX
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the indices list is empty
     *         or the indices do not share the same maxMergeDocs value
     */
    public function getMaxMergeDocs()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $maxMergeDocs = reset($this->_indices)->getMaxMergeDocs();

        // A single value can only be reported if every sub-index agrees on it.
        foreach ($this->_indices as $index) {
            if ($index->getMaxMergeDocs() !== $maxMergeDocs) {
                throw new RuntimeException('Indices have different maxMergeDocs option.');
            }
        }

        return $maxMergeDocs;
    }

    /**
     * Set index maxMergeDocs option
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
     * Larger values are best for batched indexing and speedier searches.
     *
     * Default value is PHP_INT_MAX
     *
     * @param integer $maxMergeDocs
     */
    public function setMaxMergeDocs($maxMergeDocs)
    {
        foreach ($this->_indices as $index) {
            $index->setMaxMergeDocs($maxMergeDocs);
        }
    }

    /**
     * Retrieve index mergeFactor option
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * Default value is 10
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the indices list is empty
     *         or the indices do not share the same mergeFactor value
     */
    public function getMergeFactor()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $mergeFactor = reset($this->_indices)->getMergeFactor();

        // A single value can only be reported if every sub-index agrees on it.
        foreach ($this->_indices as $index) {
            if ($index->getMergeFactor() !== $mergeFactor) {
                throw new RuntimeException('Indices have different mergeFactor option.');
            }
        }

        return $mergeFactor;
    }

    /**
     * Set index mergeFactor option
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * Default value is 10
     *
     * @param integer $mergeFactor
     */
    public function setMergeFactor($mergeFactor)
    {
        // Propagate to the matching setter of every sub-index (not to
        // setMaxMergeDocs(), which controls a different option).
        foreach ($this->_indices as $index) {
            $index->setMergeFactor($mergeFactor);
        }
    }

    /**
     * Performs a query against the index and returns an array
     * of Zend_Search_Lucene_Search_QueryHit objects.
     * Input is a string or Zend_Search_Lucene_Search_Query.
*
     * @param mixed $query
     * @return \ZendSearch\Lucene\Search\QueryHit[]
     */
    public function find($query)
    {
        if (count($this->_indices) == 0) {
            return array();
        }

        $hitsList   = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            $hits = $index->find($query);

            // Hit ids are local to their sub-index; shift them by the number of
            // documents in all preceding sub-indices to obtain global ids.
            if ($indexShift != 0) {
                foreach ($hits as $hit) {
                    $hit->id += $indexShift;
                }
            }

            $indexShift += $index->count();
            $hitsList[] = $hits;
        }

        /** @todo Implement advanced sorting */

        return call_user_func_array('array_merge', $hitsList);
    }

    /**
     * Returns a list of all unique field names that exist in this index.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        // array_merge() must not be invoked without arguments, so an empty
        // indices list is answered directly (same guard as in find()).
        if (count($this->_indices) == 0) {
            return array();
        }

        $fieldNamesList = array();

        foreach ($this->_indices as $index) {
            $fieldNamesList[] = $index->getFieldNames($indexed);
        }

        return array_unique(call_user_func_array('array_merge', $fieldNamesList));
    }

    /**
     * Returns a Zend_Search_Lucene_Document object for the document
     * number $id in this index.
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @return \ZendSearch\Lucene\Document
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function getDocument($id)
    {
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        // Convert the global id into an id local to the owning sub-index.
        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                return $index->getDocument($id);
            }

            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Returns true if index contain documents with specified term.
     *
     * Is used for query optimization.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return boolean
     */
    public function hasTerm(Index\Term $term)
    {
        foreach ($this->_indices as $index) {
            if ($index->hasTerm($term)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns IDs of all the documents containing term.
*
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is supplied
     */
    public function termDocs(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $docsList   = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            $docs = $index->termDocs($term);

            // Ids are local to the sub-index; shift them by the number of
            // documents in all preceding sub-indices to obtain global ids.
            if ($indexShift != 0) {
                foreach ($docs as $id => $docId) {
                    $docs[$id] += $indexShift;
                }
            }

            $indexShift += $index->count();
            $docsList[]  = $docs;
        }

        // array_merge() must not be invoked without arguments.
        if (count($docsList) == 0) {
            return array();
        }

        return call_user_func_array('array_merge', $docsList);
    }

    /**
     * Returns documents filter for all documents containing term.
     *
     * It performs the same operation as termDocs, but return result as
     * Zend_Search_Lucene_Index_DocsFilter object
     *
     * This implementation always throws: filters are not supported here.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return \ZendSearch\Lucene\Index\DocsFilter
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function termDocsFilter(Index\Term $term, $docsFilter = null)
    {
        throw new UnsupportedMethodCallException('Document filters could not used with multi-searcher');
    }

    /**
     * Returns an array of all term freqs.
     * Return array structure: array( docId => freq, ...)
*
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is supplied
     */
    public function termFreqs(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $freqs      = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            // Keys are document ids local to the sub-index. They are shifted into
            // the global id space and written straight into the result, because
            // array_merge() would renumber integer keys and lose the ids.
            foreach ($index->termFreqs($term) as $docId => $freq) {
                $freqs[$docId + $indexShift] = $freq;
            }

            $indexShift += $index->count();
        }

        return $freqs;
    }

    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is supplied
     * @return array
     */
    public function termPositions(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $termPositions = array();
        $indexShift    = 0;

        foreach ($this->_indices as $index) {
            // Same key handling as in termFreqs(): shift local document ids into
            // the global id space and keep them as keys of the result.
            foreach ($index->termPositions($term) as $docId => $positions) {
                $termPositions[$docId + $indexShift] = $positions;
            }

            $indexShift += $index->count();
        }

        return $termPositions;
    }

    /**
     * Returns the number of documents in this index containing the $term.
*
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return integer
     */
    public function docFreq(Index\Term $term)
    {
        $total = 0;

        foreach ($this->_indices as $subIndex) {
            $total += $subIndex->docFreq($term);
        }

        return $total;
    }

    /**
     * Retrieve the similarity shared by all sub-indices
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the indices list is empty
     *         or the sub-indices do not all report the identical similarity
     * @return \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity
     */
    public function getSimilarity()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        // The first sub-index provides the reference value all others must match.
        $reference = reset($this->_indices)->getSimilarity();

        foreach ($this->_indices as $subIndex) {
            if ($subIndex->getSimilarity() === $reference) {
                continue;
            }

            throw new RuntimeException('Indices have different similarity.');
        }

        return $reference;
    }

    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id
     * @param string $fieldName
     * @return float|null null is returned when $id is beyond the last document
     */
    public function norm($id, $fieldName)
    {
        $localId = $id;

        // Subtract the size of every sub-index that precedes the owning one.
        foreach ($this->_indices as $subIndex) {
            $size = $subIndex->count();

            if ($size > $localId) {
                return $subIndex->norm($localId, $fieldName);
            }

            $localId -= $size;
        }

        return null;
    }

    /**
     * Returns true if any documents have been deleted from this index.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        foreach ($this->_indices as $subIndex) {
            if ($subIndex->hasDeletions()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Deletes a document from the index.
* $id is an internal document id
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function delete($id)
    {
        // A query hit may be passed instead of a plain id (same as getDocument()).
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        // Convert the global id into an id local to the owning sub-index.
        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                $index->delete($id);
                return;
            }

            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Callback used to choose target index for new documents
     *
     * Function/method signature:
     *    Zend_Search_Lucene_Interface  callbackFunction(Zend_Search_Lucene_Document $document, array $indices);
     *
     * null means "default documents distributing algorithm"
     *
     * @var callback
     */
    protected $_documentDistributorCallBack = null;

    /**
     * Set callback for choosing target index.
     *
     * @param callback $callback  a valid callback, or null to restore the default algorithm
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if $callback is neither null nor callable
     */
    public function setDocumentDistributorCallback($callback)
    {
        if ($callback !== null  &&  !is_callable($callback)) {
            throw new InvalidArgumentException('$callback parameter must be a valid callback.');
        }

        $this->_documentDistributorCallBack = $callback;
    }

    /**
     * Get callback for choosing target index.
     *
     * @return callback
     */
    public function getDocumentDistributorCallback()
    {
        return $this->_documentDistributorCallBack;
    }

    /**
     * Adds a document to this index.
     *
     * The target sub-index is chosen by the distributor callback when one is
     * set, otherwise a sub-index is picked at random.
     *
     * @param \ZendSearch\Lucene\Document $document
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if no callback is set and the indices list is empty
     */
    public function addDocument(Document $document)
    {
        if ($this->_documentDistributorCallBack !== null) {
            $index = call_user_func($this->_documentDistributorCallBack, $document, $this->_indices);
        } else {
            // array_rand() cannot pick from an empty array.
            if (count($this->_indices) == 0) {
                throw new RuntimeException('Indices list is empty');
            }

            $index = $this->_indices[array_rand($this->_indices)];
        }

        $index->addDocument($document);
    }

    /**
     * Commit changes resulting from delete() or undeleteAll() operations.
     */
    public function commit()
    {
        foreach ($this->_indices as $index) {
            $index->commit();
        }
    }

    /**
     * Optimize index.
*
     * Merges all segments into one
     */
    public function optimize()
    {
        // Delegate to the optimize() method of every sub-index.
        foreach ($this->_indices as $index) {
            $index->optimize();
        }
    }

    /**
     * Returns an array of all terms in this index.
     *
     * @return array
     */
    public function terms()
    {
        // array_merge() must not be invoked without arguments.
        if (count($this->_indices) == 0) {
            return array();
        }

        $termsList = array();

        foreach ($this->_indices as $index) {
            $termsList[] = $index->terms();
        }

        return array_unique(call_user_func_array('array_merge', $termsList));
    }

    /**
     * Terms stream priority queue object
     *
     * @var \ZendSearch\Lucene\TermStreamsPriorityQueue
     */
    private $_termsStream = null;

    /**
     * Reset terms stream.
     *
     * The priority queue is created on first use and reset on later calls.
     */
    public function resetTermsStream()
    {
        if ($this->_termsStream === null) {
            $this->_termsStream = new TermStreamsPriorityQueue($this->_indices);
        } else {
            $this->_termsStream->resetTermsStream();
        }
    }

    /**
     * Skip terms stream up to specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * @param \ZendSearch\Lucene\Index\Term $prefix
     */
    public function skipTo(Index\Term $prefix)
    {
        $this->_termsStream->skipTo($prefix);
    }

    /**
     * Scans terms dictionary and returns next term
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function nextTerm()
    {
        return $this->_termsStream->nextTerm();
    }

    /**
     * Returns term in current position
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function currentTerm()
    {
        return $this->_termsStream->currentTerm();
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     */
    public function closeTermsStream()
    {
        $this->_termsStream->closeTermsStream();
        $this->_termsStream = null;
    }

    /**
     * Undeletes all documents currently marked as deleted in this index.
*/ public function undeleteAll() { foreach ($this->_indices as $index) { $index->undeleteAll(); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/000077500000000000000000000000001245775125600237205ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/BooleanExpressionRecognizer.php000066400000000000000000000200451245775125600321210ustar00rootroot00000000000000, ) * * So, it has a structure: * array( array( array(, ), // first literal of first conjuction * array(, ), // second literal of first conjuction * ... * array(, ) * ), // end of first conjuction * array( array(, ), // first literal of second conjuction * array(, ), // second literal of second conjuction * ... * array(, ) * ), // end of second conjuction * ... * ) // end of structure * * @var array */ private $_conjunctions = array(); /** * Current conjuction * * @var array */ private $_currentConjunction = array(); /** * Object constructor */ public function __construct() { parent::__construct( array(self::ST_START, self::ST_LITERAL, self::ST_NOT_OPERATOR, self::ST_AND_OPERATOR, self::ST_OR_OPERATOR), array(self::IN_LITERAL, self::IN_NOT_OPERATOR, self::IN_AND_OPERATOR, self::IN_OR_OPERATOR)); $emptyOperatorAction = new Lucene\FSMAction($this, 'emptyOperatorAction'); $emptyNotOperatorAction = new Lucene\FSMAction($this, 'emptyNotOperatorAction'); $this->addRules(array( array(self::ST_START, self::IN_LITERAL, self::ST_LITERAL), array(self::ST_START, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), array(self::ST_LITERAL, self::IN_AND_OPERATOR, self::ST_AND_OPERATOR), array(self::ST_LITERAL, self::IN_OR_OPERATOR, self::ST_OR_OPERATOR), array(self::ST_LITERAL, self::IN_LITERAL, self::ST_LITERAL, $emptyOperatorAction), array(self::ST_LITERAL, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR, $emptyNotOperatorAction), array(self::ST_NOT_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), array(self::ST_AND_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), 
array(self::ST_AND_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), array(self::ST_OR_OPERATOR, self::IN_LITERAL, self::ST_LITERAL), array(self::ST_OR_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR), )); $notOperatorAction = new Lucene\FSMAction($this, 'notOperatorAction'); $orOperatorAction = new Lucene\FSMAction($this, 'orOperatorAction'); $literalAction = new Lucene\FSMAction($this, 'literalAction'); $this->addEntryAction(self::ST_NOT_OPERATOR, $notOperatorAction); $this->addEntryAction(self::ST_OR_OPERATOR, $orOperatorAction); $this->addEntryAction(self::ST_LITERAL, $literalAction); } /** * Process next operator. * * Operators are defined by class constants: IN_AND_OPERATOR, IN_OR_OPERATOR and IN_NOT_OPERATOR * * @param integer $operator */ public function processOperator($operator) { $this->process($operator); } /** * Process expression literal. * * @param integer $operator */ public function processLiteral($literal) { $this->_literal = $literal; $this->process(self::IN_LITERAL); } /** * Finish an expression and return result * * Result is a set of boolean query conjunctions * * Each conjunction is an array of conjunction elements * Each conjunction element is presented with two-elements array: * array(, ) * * So, it has a structure: * array( array( array(, ), // first literal of first conjuction * array(, ), // second literal of first conjuction * ... * array(, ) * ), // end of first conjuction * array( array(, ), // first literal of second conjuction * array(, ), // second literal of second conjuction * ... * array(, ) * ), // end of second conjuction * ... 
* ) // end of structure * * @throws \ZendSearch\Lucene\Exception\UnexpectedValueException * @return array */ public function finishExpression() { if ($this->getState() != self::ST_LITERAL) { throw new Lucene\Exception\UnexpectedValueException('Literal expected.'); } $this->_conjunctions[] = $this->_currentConjunction; return $this->_conjunctions; } /********************************************************************* * Actions implementation *********************************************************************/ /** * default (omitted) operator processing */ public function emptyOperatorAction() { if (QueryParser::getDefaultOperator() == QueryParser::B_AND) { // Do nothing } else { $this->orOperatorAction(); } // Process literal $this->literalAction(); } /** * default (omitted) + NOT operator processing */ public function emptyNotOperatorAction() { if (QueryParser::getDefaultOperator() == QueryParser::B_AND) { // Do nothing } else { $this->orOperatorAction(); } // Process NOT operator $this->notOperatorAction(); } /** * NOT operator processing */ public function notOperatorAction() { $this->_negativeLiteral = true; } /** * OR operator processing * Close current conjunction */ public function orOperatorAction() { $this->_conjunctions[] = $this->_currentConjunction; $this->_currentConjunction = array(); } /** * Literal processing */ public function literalAction() { // Add literal to the current conjunction $this->_currentConjunction[] = array($this->_literal, !$this->_negativeLiteral); // Switch off negative signal $this->_negativeLiteral = false; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Exception/000077500000000000000000000000001245775125600256565ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Exception/ExceptionInterface.php000066400000000000000000000007331245775125600321510ustar00rootroot00000000000000_doc = $document; } /** * Get document for highlighting. 
* * @return \ZendSearch\Lucene\Document\HTML $document */ public function getDocument() { return $this->_doc; } /** * Highlight specified words * * @param string|array $words Words to highlight. They could be organized using the array or string. */ public function highlight($words) { $color = $this->_highlightColors[$this->_currentColorIndex]; $this->_currentColorIndex = ($this->_currentColorIndex + 1) % count($this->_highlightColors); $this->_doc->highlight($words, $color); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Highlighter/HighlighterInterface.php000066400000000000000000000021241245775125600327450ustar00rootroot00000000000000_boost; } /** * Sets the boost for this query clause to $boost. * * @param float $boost */ public function setBoost($boost) { $this->_boost = $boost; } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ abstract public function score($docId, Lucene\SearchIndexInterface $reader); /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @return array */ abstract public function matchedDocs(); /** * Execute query in context of index reader * It also initializes necessary internal structures * * AbstractQuery specific implementation * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ abstract public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null); /** * Constructs an appropriate Weight implementation for this query. * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight */ abstract public function createWeight(Lucene\SearchIndexInterface $reader); /** * Constructs an initializes a Weight for a _top-level_query_. 
*
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight
     */
    protected function _initWeight(Lucene\SearchIndexInterface $reader)
    {
        // Check, that it's a top-level query and query weight is not initialized yet.
        if ($this->_weight !== null) {
            return $this->_weight;
        }

        // createWeight() is relied upon to store the new weight in $this->_weight.
        $this->createWeight($reader);
        $sum = $this->_weight->sumOfSquaredWeights();
        $queryNorm = $reader->getSimilarity()->queryNorm($sum);
        $this->_weight->normalize($queryNorm);

        // Return the weight on this path as well, so both paths agree.
        return $this->_weight;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    abstract public function rewrite(Lucene\SearchIndexInterface $index);

    /**
     * Optimize query in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    abstract public function optimize(Lucene\SearchIndexInterface $index);

    /**
     * Reset query, so it can be reused within other queries or
     * with other indices
     */
    public function reset()
    {
        $this->_weight = null;
    }

    /**
     * Print a query
     *
     * @return string
     */
    abstract public function __toString();

    /**
     * Return query terms
     *
     * @return array
     */
    abstract public function getQueryTerms();

    /**
     * AbstractQuery specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    abstract protected function _highlightMatches(Highlighter $highlighter);

    /**
     * Highlight matches in $inputHTML
     *
     * @param string $inputHTML
     * @param string $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
* @param Highlighter|null $highlighter * @return string */ public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null) { if ($highlighter === null) { $highlighter = new DefaultHighlighter(); } $doc = Document\HTML::loadHTML($inputHTML, false, $defaultEncoding); $highlighter->setDocument($doc); $this->_highlightMatches($highlighter); return $doc->getHTML(); } /** * Highlight matches in $inputHTMLFragment and return it (without HTML header and body tag) * * @param string $inputHTMLFragment * @param string $encoding Input HTML string encoding * @param Highlighter|null $highlighter * @return string */ public function htmlFragmentHighlightMatches($inputHTMLFragment, $encoding = 'UTF-8', $highlighter = null) { if ($highlighter === null) { $highlighter = new DefaultHighlighter(); } $inputHTML = '' . iconv($encoding, 'UTF-8//IGNORE', $inputHTMLFragment) . ''; $doc = Document\HTML::loadHTML($inputHTML); $highlighter->setDocument($doc); $this->_highlightMatches($highlighter); return $doc->getHTMLBody(); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Boolean.php000066400000000000000000000621251245775125600271230ustar00rootroot00000000000000_subqueries = $subqueries; $this->_signs = null; // Check if all subqueries are required if (is_array($signs)) { foreach ($signs as $sign ) { if ($sign !== true) { $this->_signs = $signs; break; } } } } } /** * Add a $subquery (Zend_Search_Lucene_Search_Query) to this query. 
*
     * The sign is specified as:
     *     TRUE  - subquery is required
     *     FALSE - subquery is prohibited
     *     NULL  - subquery is neither prohibited, nor required
     *
     * The signs list is kept as null while every subquery added so far is
     * required; it is materialized on the first non-required subquery.
     *
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $subquery
     * @param boolean|null $sign
     * @return void
     */
    public function addSubquery(AbstractQuery $subquery, $sign=null)
    {
        if ($sign !== true || $this->_signs !== null) {       // Skip, if all subqueries are required
            if ($this->_signs === null) {                     // Check, if all previous subqueries are required
                // Back-fill an explicit "required" sign for every subquery added so far
                $this->_signs = array();
                foreach ($this->_subqueries as $prevSubquery) {
                    $this->_signs[] = true;
                }
            }
            $this->_signs[] = $sign;
        }

        $this->_subqueries[] = $subquery;
    }

    /**
     * Re-write queries into primitive queries
     *
     * Builds a new boolean query with the same boost, whose subqueries are the
     * rewritten subqueries of this query with their signs preserved.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        $query = new self();
        $query->setBoost($this->getBoost());

        foreach ($this->_subqueries as $subqueryId => $subquery) {
            // A null signs list means "all subqueries are required"
            $query->addSubquery($subquery->rewrite($index),
                                ($this->_signs === null)?  true : $this->_signs[$subqueryId]);
        }

        return $query;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Steps, in order: optimize subqueries; drop insignificant ones; resolve
     * empty-result subqueries; unwrap a single remaining subquery; decompose
     * Term/MultiTerm subqueries into terms and regroup them where possible.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        $subqueries = array();
        $signs      = array();

        // Optimize all subqueries
        foreach ($this->_subqueries as $id => $subquery) {
            $subqueries[] = $subquery->optimize($index);
            $signs[]      = ($this->_signs === null)? true : $this->_signs[$id];
        }

        // Remove insignificant subqueries
        foreach ($subqueries as $id => $subquery) {
            if ($subquery instanceof Insignificant) {
                // Insignificant subquery has to be removed anyway
                unset($subqueries[$id]);
                unset($signs[$id]);
            }
        }
        if (count($subqueries) == 0) {
            // Boolean query doesn't have non-insignificant subqueries
            return new Insignificant();
        }
        // Check if all non-insignificant subqueries are prohibited
        $allProhibited = true;
        foreach ($signs as $sign) {
            if ($sign !== false) {
                $allProhibited = false;
                break;
            }
        }
        if ($allProhibited) {
            return new Insignificant();
        }

        // Check for empty subqueries
        foreach ($subqueries as $id => $subquery) {
            if ($subquery instanceof EmptyResult) {
                if ($signs[$id] === true) {
                    // Matching is required, but is actually empty
                    return new EmptyResult();
                } else {
                    // Matching is optional or prohibited, but is empty
                    // Remove it from subqueries and signs list
                    unset($subqueries[$id]);
                    unset($signs[$id]);
                }
            }
        }

        // Check, if reduced subqueries list is empty
        if (count($subqueries) == 0) {
            return new EmptyResult();
        }

        // Check if all non-empty subqueries are prohibited
        $allProhibited = true;
        foreach ($signs as $sign) {
            if ($sign !== false) {
                $allProhibited = false;
                break;
            }
        }
        if ($allProhibited) {
            return new EmptyResult();
        }

        // Check, if reduced subqueries list has only one entry
        if (count($subqueries) == 1) {
            // It's a query with only one required or optional clause
            // (it's already checked, that it's not a prohibited clause)

            if ($this->getBoost() == 1) {
                return reset($subqueries);
            }

            // Clone, so the boost of the original subquery object is left untouched
            $optimizedQuery = clone reset($subqueries);
            $optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost());

            return $optimizedQuery;
        }

        // Prepare first candidate for optimized query
        $optimizedQuery = new self($subqueries, $signs);
        $optimizedQuery->setBoost($this->getBoost());

        $terms        = array();
        $tsigns       = array();
        $boostFactors = array();

        // Try to decompose term and multi-term subqueries
        foreach ($subqueries as $id => $subquery) {
            if ($subquery instanceof Term) {
                $terms[]        = $subquery->getTerm();
                $tsigns[]       = $signs[$id];
                $boostFactors[] = $subquery->getBoost();

                // remove subquery from a subqueries list
                unset($subqueries[$id]);
                unset($signs[$id]);
            } elseif ($subquery instanceof MultiTerm) {
                $subTerms = $subquery->getTerms();
                $subSigns = $subquery->getSigns();

                if ($signs[$id] === true) {
                    // It's a required multi-term subquery.
                    // Something like '... +(+term1 -term2 term3 ...) ...'

                    // Multi-term required subquery can be decomposed only if it contains
                    // required terms and doesn't contain prohibited terms:
                    // ... +(+term1 term2 ...) ... => ... +term1 term2 ...
                    //
                    // Check this
                    $hasRequired   = false;
                    $hasProhibited = false;
                    if ($subSigns === null) {
                        // All subterms are required
                        $hasRequired = true;
                    } else {
                        foreach ($subSigns as $sign) {
                            if ($sign === true) {
                                $hasRequired   = true;
                            } elseif ($sign === false) {
                                $hasProhibited = true;
                                break;
                            }
                        }
                    }
                    // Continue if subquery has prohibited terms or doesn't have required terms
                    if ($hasProhibited  ||  !$hasRequired) {
                        continue;
                    }

                    foreach ($subTerms as $termId => $term) {
                        $terms[]        = $term;
                        $tsigns[]       = ($subSigns === null)? true : $subSigns[$termId];
                        $boostFactors[] = $subquery->getBoost();
                    }

                    // remove subquery from a subqueries list
                    unset($subqueries[$id]);
                    unset($signs[$id]);

                } else { // $signs[$id] === null  ||  $signs[$id] === false
                    // It's an optional or prohibited multi-term subquery.
                    // Something like '... (+term1 -term2 term3 ...) ...'
                    // or
                    // something like '... -(+term1 -term2 term3 ...) ...'

                    // Multi-term optional and prohibited subqueries can be decomposed
                    // only if all terms are optional.
                    //
                    // Check if all terms are optional.
                    $onlyOptional = true;
                    if ($subSigns === null) {
                        // All subterms are required
                        $onlyOptional = false;
                    } else {
                        foreach ($subSigns as $sign) {
                            if ($sign !== null) {
                                $onlyOptional = false;
                                break;
                            }
                        }
                    }

                    // Continue if non-optional terms are presented in this multi-term subquery
                    if (!$onlyOptional) {
                        continue;
                    }

                    // Every decomposed term inherits the sign of the enclosing subquery
                    foreach ($subTerms as $termId => $term) {
                        $terms[]        = $term;
                        $tsigns[]       = ($signs[$id] === null)? null  /* optional */ :
                                                                  false /* prohibited */;
                        $boostFactors[] = $subquery->getBoost();
                    }

                    // remove subquery from a subqueries list
                    unset($subqueries[$id]);
                    unset($signs[$id]);
                }
            }
        }

        // Check, if there are no decomposed subqueries
        if (count($terms) == 0 ) {
            // return prepared candidate
            return $optimizedQuery;
        }

        // Check, if all subqueries have been decomposed and all terms have the same boost factor
        if (count($subqueries) == 0  &&  count(array_unique($boostFactors)) == 1) {
            $optimizedQuery = new MultiTerm($terms, $tsigns);
            $optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost());

            return $optimizedQuery;
        }

        // This boolean query can't be transformed to Term/MultiTerm query and still contains
        // several subqueries

        // Separate prohibited terms
        $prohibitedTerms = array();
        foreach ($terms as $id => $term) {
            if ($tsigns[$id] === false) {
                $prohibitedTerms[] = $term;

                unset($terms[$id]);
                unset($tsigns[$id]);
                unset($boostFactors[$id]);
            }
        }

        if (count($terms) == 1) {
            $clause = new Term(reset($terms));
            $clause->setBoost(reset($boostFactors));

            $subqueries[] = $clause;
            $signs[]      = reset($tsigns);

            // Clear terms list
            $terms = array();
        } elseif (count($terms) > 1  &&  count(array_unique($boostFactors)) == 1) {
            $clause = new MultiTerm($terms, $tsigns);
            $clause->setBoost(reset($boostFactors));

            $subqueries[] = $clause;
            // Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
            $signs[]      = (in_array(true, $tsigns))? true : null;

            // Clear terms list
            $terms = array();
        }

        if (count($prohibitedTerms) == 1) {
            // (boost factors are not significant for prohibited clauses)
            $subqueries[] = new Term(reset($prohibitedTerms));
            $signs[]      = false;

            // Clear prohibited terms list
            $prohibitedTerms = array();
        } elseif (count($prohibitedTerms) > 1) {
            // prepare signs array
            $prohibitedSigns = array();
            foreach ($prohibitedTerms as $id => $term) {
                // all prohibited terms are grouped as optional into multi-term query
                $prohibitedSigns[$id] = null;
            }

            // (boost factors are not significant for prohibited clauses)
            $subqueries[] = new MultiTerm($prohibitedTerms, $prohibitedSigns);
            // Clause sign is 'prohibited'
            $signs[]      = false;

            // Clear terms list
            $prohibitedTerms = array();
        }

        /** @todo Group terms with the same boost factors together */

        // Check, that all terms are processed
        // Replace candidate for optimized query
        // (otherwise the first candidate prepared above is returned)
        if (count($terms) == 0  &&  count($prohibitedTerms) == 0) {
            $optimizedQuery = new self($subqueries, $signs);
            $optimizedQuery->setBoost($this->getBoost());
        }

        return $optimizedQuery;
    }

    /**
     * Returns subqueries
     *
     * @return array
     */
    public function getSubqueries()
    {
        return $this->_subqueries;
    }

    /**
     * Return subqueries signs
     *
     * @return array|null  null is treated by rewrite() and optimize() as "all subqueries are required"
     */
    public function getSigns()
    {
        return $this->_signs;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
* * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return \ZendSearch\Lucene\Search\Weight\Boolean */ public function createWeight(Lucene\SearchIndexInterface $reader) { $this->_weight = new Weight\Boolean($this, $reader); return $this->_weight; } /** * Calculate result vector for Conjunction query * (like ' AND AND ') */ private function _calculateConjunctionResult() { $this->_resVector = null; if (count($this->_subqueries) == 0) { $this->_resVector = array(); } $resVectors = array(); $resVectorsSizes = array(); $resVectorsIds = array(); // is used to prevent arrays comparison foreach ($this->_subqueries as $subqueryId => $subquery) { $resVectors[] = $subquery->matchedDocs(); $resVectorsSizes[] = count(end($resVectors)); $resVectorsIds[] = $subqueryId; } // sort resvectors in order of subquery cardinality increasing array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC, $resVectorsIds, SORT_ASC, SORT_NUMERIC, $resVectors); foreach ($resVectors as $nextResVector) { if($this->_resVector === null) { $this->_resVector = $nextResVector; } else { //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector); /** * This code is used as workaround for array_intersect_key() slowness problem. 
*/ $updatedVector = array(); foreach ($this->_resVector as $id => $value) { if (isset($nextResVector[$id])) { $updatedVector[$id] = $value; } } $this->_resVector = $updatedVector; } if (count($this->_resVector) == 0) { // Empty result set, we don't need to check other terms break; } } // ksort($this->_resVector, SORT_NUMERIC); // Used algorithm doesn't change elements order } /** * Calculate result vector for non Conjunction query * (like ' AND AND NOT OR ') */ private function _calculateNonConjunctionResult() { $requiredVectors = array(); $requiredVectorsSizes = array(); $requiredVectorsIds = array(); // is used to prevent arrays comparison $optional = array(); foreach ($this->_subqueries as $subqueryId => $subquery) { if ($this->_signs[$subqueryId] === true) { // required $requiredVectors[] = $subquery->matchedDocs(); $requiredVectorsSizes[] = count(end($requiredVectors)); $requiredVectorsIds[] = $subqueryId; } elseif ($this->_signs[$subqueryId] === false) { // prohibited // Do nothing. matchedDocs() may include non-matching id's // Calculating prohibited vector may take significant time, but do not affect the result // Skipped. } else { // neither required, nor prohibited // array union $optional += $subquery->matchedDocs(); } } // sort resvectors in order of subquery cardinality increasing array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC, $requiredVectorsIds, SORT_ASC, SORT_NUMERIC, $requiredVectors); $required = null; foreach ($requiredVectors as $nextResVector) { if($required === null) { $required = $nextResVector; } else { //$required = array_intersect_key($required, $nextResVector); /** * This code is used as workaround for array_intersect_key() slowness problem. 
*/
                $updatedVector = array();
                foreach ($required as $id => $value) {
                    if (isset($nextResVector[$id])) {
                        $updatedVector[$id] = $value;
                    }
                }
                $required = $updatedVector;
            }

            if (count($required) == 0) {
                // Empty result set, we don't need to check other terms
                break;
            }
        }

        if ($required !== null) {
            $this->_resVector = &$required;
        } else {
            $this->_resVector = &$optional;
        }

        ksort($this->_resVector, SORT_NUMERIC);
    }


    /**
     * Score calculator for conjunction queries (all subqueries are required)
     *
     * Returns 0 as soon as any subquery scores 0 for the document; otherwise
     * the subquery scores are summed and scaled by the coord factor and boost.
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float
     */
    public function _conjunctionScore($docId, Lucene\SearchIndexInterface $reader)
    {
        if ($this->_coord === null) {
            // Lazily computed once per query: every subquery must match, so overlap == max overlap
            $this->_coord = $reader->getSimilarity()->coord(count($this->_subqueries),
                                                            count($this->_subqueries) );
        }

        $score = 0;

        foreach ($this->_subqueries as $subquery) {
            $subscore = $subquery->score($docId, $reader);

            if ($subscore == 0) {
                return 0;
            }

            // Reuse the score computed above instead of scoring the subquery a second time
            $score += $subscore * $this->_coord;
        }

        return $score * $this->_coord * $this->getBoost();
    }


    /**
     * Score calculator for non conjunction queries (not all subqueries are required)
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float
     */
    public function _nonConjunctionScore($docId, Lucene\SearchIndexInterface $reader)
    {
        if ($this->_coord === null) {
            // Precompute coord factors for every possible number of matched, non-prohibited subqueries
            $this->_coord = array();

            $maxCoord = 0;
            foreach ($this->_signs as $sign) {
                if ($sign !== false /* not prohibited */) {
                    $maxCoord++;
                }
            }

            for ($count = 0; $count <= $maxCoord; $count++) {
                $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
            }
        }

        $score = 0;
        $matchedSubqueries = 0;
        foreach ($this->_subqueries as $subqueryId => $subquery) {
            $subscore = $subquery->score($docId, $reader);

            // Prohibited
            if ($this->_signs[$subqueryId] === false && $subscore != 0) {
                return 0;
            }

            // is required, but doesn't match
            if ($this->_signs[$subqueryId] === true && $subscore == 0) {
                return 0;
            }

            if ($subscore != 0) {
$matchedSubqueries++; $score += $subscore; } } return $score * $this->_coord[$matchedSubqueries] * $this->getBoost(); } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { // Initialize weight if it's not done yet $this->_initWeight($reader); if ($docsFilter === null) { // Create local documents filter if it's not provided by upper query $docsFilter = new Index\DocsFilter(); } foreach ($this->_subqueries as $subqueryId => $subquery) { if ($this->_signs == null || $this->_signs[$subqueryId] === true) { // Subquery is required $subquery->execute($reader, $docsFilter); } else { $subquery->execute($reader); } } if ($this->_signs === null) { $this->_calculateConjunctionResult(); } else { $this->_calculateNonConjunctionResult(); } } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @return array */ public function matchedDocs() { return $this->_resVector; } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { if (isset($this->_resVector[$docId])) { if ($this->_signs === null) { return $this->_conjunctionScore($docId, $reader); } else { return $this->_nonConjunctionScore($docId, $reader); } } else { return 0; } } /** * Return query terms * * @return array */ public function getQueryTerms() { $terms = array(); foreach ($this->_subqueries as $id => $subquery) { if ($this->_signs === null || $this->_signs[$id] !== false) { $terms = array_merge($terms, $subquery->getQueryTerms()); } } return $terms; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also 
contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { foreach ($this->_subqueries as $id => $subquery) { if ($this->_signs === null || $this->_signs[$id] !== false) { $subquery->_highlightMatches($highlighter); } } } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping $query = ''; foreach ($this->_subqueries as $id => $subquery) { if ($id != 0) { $query .= ' '; } if ($this->_signs === null || $this->_signs[$id] === true) { $query .= '+'; } elseif ($this->_signs[$id] === false) { $query .= '-'; } $query .= '(' . $subquery->__toString() . ')'; } if ($this->getBoost() != 1) { $query = '(' . $query . ')^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/EmptyResult.php000066400000000000000000000061141245775125600300350ustar00rootroot00000000000000'; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Fuzzy.php000066400000000000000000000431721245775125600266740ustar00rootroot00000000000000= 1) { throw new InvalidArgumentException('minimumSimilarity cannot be greater than or equal to 1'); } if ($prefixLength < 0) { throw new InvalidArgumentException('prefixLength cannot be less than 0'); } $this->_term = $term; $this->_minimumSimilarity = $minimumSimilarity; $this->_prefixLength = ($prefixLength !== null)? 
$prefixLength : self::$_defaultPrefixLength; } /** * Get default non-fuzzy prefix length * * @return integer */ public static function getDefaultPrefixLength() { return self::$_defaultPrefixLength; } /** * Set default non-fuzzy prefix length * * @param integer $defaultPrefixLength */ public static function setDefaultPrefixLength($defaultPrefixLength) { self::$_defaultPrefixLength = $defaultPrefixLength; } /** * Calculate maximum distance for specified word length * * @param integer $prefixLength * @param integer $termLength * @param integer $length * @return integer */ private function _calculateMaxDistance($prefixLength, $termLength, $length) { $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength)); return $this->_maxDistances[$length]; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { $this->_matches = array(); $this->_scores = array(); $this->_termKeys = array(); if ($this->_term->field === null) { // Search through all fields $fields = $index->getFieldNames(true /* indexed fields list */); } else { $fields = array($this->_term->field); } $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength); $prefixByteLength = strlen($prefix); $prefixUtf8Length = Index\Term::getLength($prefix); $termLength = Index\Term::getLength($this->_term->text); $termRest = substr($this->_term->text, $prefixByteLength); // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible $termRestLength = strlen($termRest); $scaleFactor = 1/(1 - $this->_minimumSimilarity); $maxTerms = Lucene\Lucene::getTermsPerQueryLimit(); foreach ($fields as $field) { $index->resetTermsStream(); if ($prefix != '') { $index->skipTo(new 
Index\Term($prefix, $field)); while ($index->currentTerm() !== null && $index->currentTerm()->field == $field && substr($index->currentTerm()->text, 0, $prefixByteLength) == $prefix) { // Calculate similarity $target = substr($index->currentTerm()->text, $prefixByteLength); $maxDistance = isset($this->_maxDistances[strlen($target)])? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target)); if ($termRestLength == 0) { // we don't have anything to compare. That means if we just add // the letters for current term we get the new word $similarity = (($prefixUtf8Length == 0)? 0 : 1 - strlen($target)/$prefixUtf8Length); } elseif (strlen($target) == 0) { $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length); } elseif ($maxDistance < abs($termRestLength - strlen($target))){ //just adding the characters of term to target or vice-versa results in too many edits //for example "pre" length is 3 and "prefixes" length is 8. We can see that //given this optimal circumstance, the edit distance cannot be less than 5. //which is 8-3 or more precisesly abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. 
$similarity = 0; } else { $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, strlen($target))); } if ($similarity > $this->_minimumSimilarity) { $this->_matches[] = $index->currentTerm(); $this->_termKeys[] = $index->currentTerm()->key(); $this->_scores[] = ($similarity - $this->_minimumSimilarity)*$scaleFactor; if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } } $index->nextTerm(); } } else { $index->skipTo(new Index\Term('', $field)); while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) { // Calculate similarity $target = $index->currentTerm()->text; $maxDistance = isset($this->_maxDistances[strlen($target)])? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance(0, $termRestLength, strlen($target)); if ($maxDistance < abs($termRestLength - strlen($target))){ //just adding the characters of term to target or vice-versa results in too many edits //for example "pre" length is 3 and "prefixes" length is 8. We can see that //given this optimal circumstance, the edit distance cannot be less than 5. //which is 8-3 or more precisesly abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. 
$similarity = 0; } else { $similarity = 1 - levenshtein($termRest, $target)/min($termRestLength, strlen($target)); } if ($similarity > $this->_minimumSimilarity) { $this->_matches[] = $index->currentTerm(); $this->_termKeys[] = $index->currentTerm()->key(); $this->_scores[] = ($similarity - $this->_minimumSimilarity)*$scaleFactor; if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } } $index->nextTerm(); } } $index->closeTermsStream(); } if (count($this->_matches) == 0) { return new EmptyResult(); } elseif (count($this->_matches) == 1) { return new Term(reset($this->_matches)); } else { $rewrittenQuery = new Boolean(); array_multisort($this->_scores, SORT_DESC, SORT_NUMERIC, $this->_termKeys, SORT_ASC, SORT_STRING, $this->_matches); $termCount = 0; foreach ($this->_matches as $id => $matchedTerm) { $subquery = new Term($matchedTerm); $subquery->setBoost($this->_scores[$id]); $rewrittenQuery->addSubquery($subquery); $termCount++; if ($termCount >= self::MAX_CLAUSE_COUNT) { break; } } return $rewrittenQuery; } } /** * Optimize query in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { throw new UnsupportedMethodCallException('Fuzzy query should not be directly used for search. Use $query->rewrite($index)'); } /** * Return query terms * * @throws \ZendSearch\Lucene\Exception\RuntimeException * @return array */ public function getQueryTerms() { if ($this->_matches === null) { throw new RuntimeException('Search or rewrite operations have to be performed before.'); } return $this->_matches; } /** * Constructs an appropriate Weight implementation for this query. 
* * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight */ public function createWeight(Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException( 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { throw new UnsupportedMethodCallException( 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return array */ public function matchedDocs() { throw new UnsupportedMethodCallException( 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException( 'Fuzzy query should not be directly used for search. 
Use $query->rewrite($index)' ); } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $words = array(); $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength); $prefixByteLength = strlen($prefix); $prefixUtf8Length = Index\Term::getLength($prefix); $termLength = Index\Term::getLength($this->_term->text); $termRest = substr($this->_term->text, $prefixByteLength); // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible $termRestLength = strlen($termRest); $scaleFactor = 1/(1 - $this->_minimumSimilarity); $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); foreach ($tokens as $token) { $termText = $token->getTermText(); if (substr($termText, 0, $prefixByteLength) == $prefix) { // Calculate similarity $target = substr($termText, $prefixByteLength); $maxDistance = isset($this->_maxDistances[strlen($target)])? $this->_maxDistances[strlen($target)] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target)); if ($termRestLength == 0) { // we don't have anything to compare. That means if we just add // the letters for current term we get the new word $similarity = (($prefixUtf8Length == 0)? 0 : 1 - strlen($target)/$prefixUtf8Length); } elseif (strlen($target) == 0) { $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length); } elseif ($maxDistance < abs($termRestLength - strlen($target))){ //just adding the characters of term to target or vice-versa results in too many edits //for example "pre" length is 3 and "prefixes" length is 8. We can see that //given this optimal circumstance, the edit distance cannot be less than 5. //which is 8-3 or more precisesly abs(3-8). 
//if our maximum edit distance is 4, then we can discard this word //without looking at it. $similarity = 0; } else { $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, strlen($target))); } if ($similarity > $this->_minimumSimilarity) { $words[] = $termText; } } } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping return (($this->_term->field === null)? '' : $this->_term->field . ':') . $this->_term->text . '~' . (($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)? round($this->_minimumSimilarity, 4) : '') . (($this->getBoost() != 1)? '^' . round($this->getBoost(), 4) : ''); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Insignificant.php000066400000000000000000000061571245775125600303340ustar00rootroot00000000000000'; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/MultiTerm.php000066400000000000000000000450171245775125600274670ustar00rootroot00000000000000 (docId => freq, ...) * term2Id => (docId => freq, ...) * * @var array */ private $_termsFreqs = array(); /** * A score factor based on the fraction of all query terms * that a document contains. * float for conjunction queries * array of float for non conjunction queries * * @var mixed */ private $_coord = null; /** * Terms weights * array of Zend_Search_Lucene_Search_Weight * * @var array */ private $_weights = array(); /** * Class constructor. Create a new multi-term query object. * * if $signs array is omitted then all terms are required * it differs from addTerm() behavior, but should never be used * * @param array $terms Array of \ZendSearch\Lucene\Index\Term objects * @param array $signs Array of signs. Sign is boolean|null. 
* @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     */
    public function __construct($terms = null, $signs = null)
    {
        if (is_array($terms)) {
            // Fuzzy::rewrite() treats a terms-per-query limit of 0 as "no limit";
            // apply the same rule here so a zero limit does not reject every non-empty term list.
            $termsLimit = Lucene\Lucene::getTermsPerQueryLimit();
            if ($termsLimit != 0 && count($terms) > $termsLimit) {
                throw new InvalidArgumentException('Terms per query limit is reached.');
            }

            $this->_terms = $terms;

            $this->_signs = null;
            // Check if all terms are required
            if (is_array($signs)) {
                foreach ($signs as $sign ) {
                    if ($sign !== true) {
                        // At least one term is optional/prohibited: keep explicit signs
                        $this->_signs = $signs;
                        break;
                    }
                }
            }
        }
    }


    /**
     * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
     *
     * The sign is specified as:
     *     TRUE  - term is required
     *     FALSE - term is prohibited
     *     NULL  - term is neither prohibited, nor required
     *
     * @param  \ZendSearch\Lucene\Index\Term $term
     * @param  boolean|null $sign
     * @return void
     */
    public function addTerm(Index\Term $term, $sign = null)
    {
        if ($sign !== true || $this->_signs !== null) {       // Skip, if all terms are required
            if ($this->_signs === null) {                     // Check, If all previous terms are required
                // Materialize the implicit "all required" signs under the same keys as the
                // existing terms, so that signs and terms stay aligned even when the terms
                // array has gaps in its keys.
                $this->_signs = array_fill_keys(array_keys($this->_terms), true);
            }
            $this->_signs[] = $sign;
        }

        $this->_terms[] = $term;
    }


    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        if (count($this->_terms) == 0) {
            return new EmptyResult();
        }

        // Check, that all fields are qualified
        $allQualified = true;
        foreach ($this->_terms as $term) {
            if ($term->field === null) {
                $allQualified = false;
                break;
            }
        }

        if ($allQualified) {
            return $this;
        } else {
            /** transform multiterm query to boolean and apply rewrite() method to subqueries. */
            $query = new Boolean();
            $query->setBoost($this->getBoost());

            foreach ($this->_terms as $termId => $term) {
                $subquery = new Term($term);

                $query->addSubquery($subquery->rewrite($index),
                                    ($this->_signs === null)?
true : $this->_signs[$termId]); } return $query; } } /** * Optimize query in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { $terms = $this->_terms; $signs = $this->_signs; foreach ($terms as $id => $term) { if (!$index->hasTerm($term)) { if ($signs === null || $signs[$id] === true) { // Term is required return new EmptyResult(); } else { // Term is optional or prohibited // Remove it from terms and signs list unset($terms[$id]); unset($signs[$id]); } } } // Check if all presented terms are prohibited $allProhibited = true; if ($signs === null) { $allProhibited = false; } else { foreach ($signs as $sign) { if ($sign !== false) { $allProhibited = false; break; } } } if ($allProhibited) { return new EmptyResult(); } /** * @todo make an optimization for repeated terms * (they may have different signs) */ if (count($terms) == 1) { // It's already checked, that it's not a prohibited term // It's one term query with one required or optional element $optimizedQuery = new Term(reset($terms)); $optimizedQuery->setBoost($this->getBoost()); return $optimizedQuery; } if (count($terms) == 0) { return new EmptyResult(); } $optimizedQuery = new MultiTerm($terms, $signs); $optimizedQuery->setBoost($this->getBoost()); return $optimizedQuery; } /** * Returns query term * * @return array */ public function getTerms() { return $this->_terms; } /** * Return terms signs * * @return array */ public function getSigns() { return $this->_signs; } /** * Set weight for specified term * * @param integer $num * @param \ZendSearch\Lucene\Search\Weight\Term $weight */ public function setWeight($num, $weight) { $this->_weights[$num] = $weight; } /** * Constructs an appropriate Weight implementation for this query. 
*
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\MultiTerm
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        $this->_weight = new Weight\MultiTerm($this, $reader);
        return $this->_weight;
    }


    /**
     * Calculate result vector for Conjunction query
     * (like '+something +another')
     *
     * Terms are ordered by ascending document frequency and read through a shared
     * docs filter; the docs returned for the last term are used as the result set.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     */
    private function _calculateConjunctionResult(Lucene\SearchIndexInterface $reader)
    {
        $this->_resVector = null;

        if (count($this->_terms) == 0) {
            // Nothing to match: stop here, otherwise $termDocs below would be undefined
            $this->_resVector = array();
            return;
        }

        // Order terms by selectivity
        $docFreqs = array();
        $ids      = array();
        foreach ($this->_terms as $id => $term) {
            $docFreqs[] = $reader->docFreq($term);
            $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
        }
        array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                        $ids,      SORT_ASC, SORT_NUMERIC,
                        $this->_terms);

        $docsFilter = new Lucene\Index\DocsFilter();
        foreach ($this->_terms as $termId => $term) {
            $termDocs = $reader->termDocs($term, $docsFilter);
        }
        // Treat last retrieved docs vector as a result set
        // (filter collects data for other terms)
        $this->_resVector = array_flip($termDocs);

        foreach ($this->_terms as $termId => $term) {
            $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
        }

        // ksort($this->_resVector, SORT_NUMERIC);
        // Docs are returned ordered. Used algorithms doesn't change elements order.
} /** * Calculate result vector for non Conjunction query * (like '+something -another') * * @param \ZendSearch\Lucene\SearchIndexInterface $reader */ private function _calculateNonConjunctionResult(Lucene\SearchIndexInterface $reader) { $requiredVectors = array(); $requiredVectorsSizes = array(); $requiredVectorsIds = array(); // is used to prevent arrays comparison $optional = array(); $prohibited = array(); foreach ($this->_terms as $termId => $term) { $termDocs = array_flip($reader->termDocs($term)); if ($this->_signs[$termId] === true) { // required $requiredVectors[] = $termDocs; $requiredVectorsSizes[] = count($termDocs); $requiredVectorsIds[] = $termId; } elseif ($this->_signs[$termId] === false) { // prohibited // array union $prohibited += $termDocs; } else { // neither required, nor prohibited // array union $optional += $termDocs; } $this->_termsFreqs[$termId] = $reader->termFreqs($term); } // sort resvectors in order of subquery cardinality increasing array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC, $requiredVectorsIds, SORT_ASC, SORT_NUMERIC, $requiredVectors); $required = null; foreach ($requiredVectors as $nextResVector) { if($required === null) { $required = $nextResVector; } else { //$required = array_intersect_key($required, $nextResVector); /** * This code is used as workaround for array_intersect_key() slowness problem. */ $updatedVector = array(); foreach ($required as $id => $value) { if (isset($nextResVector[$id])) { $updatedVector[$id] = $value; } } $required = $updatedVector; } if (count($required) == 0) { // Empty result set, we don't need to check other terms break; } } if ($required !== null) { $this->_resVector = $required; } else { $this->_resVector = $optional; } if (count($prohibited) != 0) { // $this->_resVector = array_diff_key($this->_resVector, $prohibited); /** * This code is used as workaround for array_diff_key() slowness problem. 
*/ if (count($this->_resVector) < count($prohibited)) { $updatedVector = $this->_resVector; foreach ($this->_resVector as $id => $value) { if (isset($prohibited[$id])) { unset($updatedVector[$id]); } } $this->_resVector = $updatedVector; } else { $updatedVector = $this->_resVector; foreach ($prohibited as $id => $value) { unset($updatedVector[$id]); } $this->_resVector = $updatedVector; } } ksort($this->_resVector, SORT_NUMERIC); } /** * Score calculator for conjunction queries (all terms are required) * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function _conjunctionScore($docId, Lucene\SearchIndexInterface $reader) { if ($this->_coord === null) { $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), count($this->_terms) ); } $score = 0.0; foreach ($this->_terms as $termId => $term) { /** * We don't need to check that term freq is not 0 * Score calculation is performed only for matched docs */ $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) * $this->_weights[$termId]->getValue() * $reader->norm($docId, $term->field); } return $score * $this->_coord * $this->getBoost(); } /** * Score calculator for non conjunction queries (not all terms are required) * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function _nonConjunctionScore($docId, $reader) { if ($this->_coord === null) { $this->_coord = array(); $maxCoord = 0; foreach ($this->_signs as $sign) { if ($sign !== false /* not prohibited */) { $maxCoord++; } } for ($count = 0; $count <= $maxCoord; $count++) { $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); } } $score = 0.0; $matchedTerms = 0; foreach ($this->_terms as $termId=>$term) { // Check if term is if ($this->_signs[$termId] !== false && // not prohibited isset($this->_termsFreqs[$termId][$docId]) // matched ) { $matchedTerms++; /** * We don't need to check that term 
freq is not 0 * Score calculation is performed only for matched docs */ $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) * $this->_weights[$termId]->getValue() * $reader->norm($docId, $term->field); } } return $score * $this->_coord[$matchedTerms] * $this->getBoost(); } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { if ($this->_signs === null) { $this->_calculateConjunctionResult($reader); } else { $this->_calculateNonConjunctionResult($reader); } // Initialize weight if it's not done yet $this->_initWeight($reader); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @return array */ public function matchedDocs() { return $this->_resVector; } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { if (isset($this->_resVector[$docId])) { if ($this->_signs === null) { return $this->_conjunctionScore($docId, $reader); } else { return $this->_nonConjunctionScore($docId, $reader); } } else { return 0; } } /** * Return query terms * * @return array */ public function getQueryTerms() { if ($this->_signs === null) { return $this->_terms; } $terms = array(); foreach ($this->_signs as $id => $sign) { if ($sign !== false) { $terms[] = $this->_terms[$id]; } } return $terms; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $words = array(); if ($this->_signs === null) { foreach ($this->_terms as $term) { $words[] = 
$term->text; } } else { foreach ($this->_signs as $id => $sign) { if ($sign !== false) { $words[] = $this->_terms[$id]->text; } } } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping $query = ''; foreach ($this->_terms as $id => $term) { if ($id != 0) { $query .= ' '; } if ($this->_signs === null || $this->_signs[$id] === true) { $query .= '+'; } elseif ($this->_signs[$id] === false) { $query .= '-'; } if ($term->field !== null) { $query .= $term->field . ':'; } $query .= $term->text; } if ($this->getBoost() != 1) { $query = '(' . $query . ')^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Phrase.php000066400000000000000000000406541245775125600267710ustar00rootroot00000000000000 (docId => array( pos1, pos2, ... ), ...) * term2Id => (docId => array( pos1, pos2, ... ), ...) * * @var array */ private $_termsPositions = array(); /** * Class constructor. Create a new prase query. * * @param string $field Field to search. * @param array $terms Terms to search Array of strings. * @param array $offsets Relative term positions. Array of integers. * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function __construct($terms = null, $offsets = null, $field = null) { $this->_slop = 0; if (is_array($terms)) { $this->_terms = array(); foreach ($terms as $termId => $termText) { $this->_terms[$termId] = ($field !== null)? 
new Index\Term($termText, $field): new Index\Term($termText); } } elseif ($terms === null) { $this->_terms = array(); } else { throw new InvalidArgumentException('terms argument must be array of strings or null'); } if (is_array($offsets)) { if (count($this->_terms) != count($offsets)) { throw new InvalidArgumentException('terms and offsets arguments must have the same size.'); } $this->_offsets = $offsets; } elseif ($offsets === null) { $this->_offsets = array(); foreach ($this->_terms as $termId => $term) { $position = count($this->_offsets); $this->_offsets[$termId] = $position; } } else { throw new InvalidArgumentException('offsets argument must be array of strings or null'); } } /** * Set slop * * @param integer $slop */ public function setSlop($slop) { $this->_slop = $slop; } /** * Get slop * * @return integer */ public function getSlop() { return $this->_slop; } /** * Adds a term to the end of the query phrase. * The relative position of the term is specified explicitly or the one immediately * after the last term added. * * @param \ZendSearch\Lucene\Index\Term $term * @param integer $position * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException */ public function addTerm(Index\Term $term, $position = null) { if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) { throw new InvalidArgumentException('All phrase terms must be in the same field: ' . $term->field . ':' . 
$term->text); } $this->_terms[] = $term; if ($position !== null) { $this->_offsets[] = $position; } elseif (count($this->_offsets) != 0) { $this->_offsets[] = end($this->_offsets) + 1; } else { $this->_offsets[] = 0; } } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { if (count($this->_terms) == 0) { return new EmptyResult(); } elseif ($this->_terms[0]->field !== null) { return $this; } else { $query = new Boolean(); $query->setBoost($this->getBoost()); foreach ($index->getFieldNames(true) as $fieldName) { $subquery = new self(); $subquery->setSlop($this->getSlop()); foreach ($this->_terms as $termId => $term) { $qualifiedTerm = new Index\Term($term->text, $fieldName); $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]); } $query->addSubquery($subquery); } return $query; } } /** * Optimize query in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { // Check, that index contains all phrase terms foreach ($this->_terms as $term) { if (!$index->hasTerm($term)) { return new EmptyResult(); } } if (count($this->_terms) == 1) { // It's one term query $optimizedQuery = new Term(reset($this->_terms)); $optimizedQuery->setBoost($this->getBoost()); return $optimizedQuery; } if (count($this->_terms) == 0) { return new EmptyResult(); } return $this; } /** * Returns query term * * @return array */ public function getTerms() { return $this->_terms; } /** * Set weight for specified term * * @param integer $num * @param \ZendSearch\Lucene\Search\Weight\Term $weight */ public function setWeight($num, $weight) { $this->_weights[$num] = $weight; } /** * Constructs an appropriate Weight implementation for this 
query. * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return \ZendSearch\Lucene\Search\Weight\Phrase */ public function createWeight(Lucene\SearchIndexInterface $reader) { $this->_weight = new Weight\Phrase($this, $reader); return $this->_weight; } /** * Score calculator for exact phrase queries (terms sequence is fixed) * * @param integer $docId * @return float */ public function _exactPhraseFreq($docId) { $freq = 0; // Term Id with lowest cardinality $lowCardTermId = null; // Calculate $lowCardTermId foreach ($this->_terms as $termId => $term) { if ($lowCardTermId === null || count($this->_termsPositions[$termId][$docId]) < count($this->_termsPositions[$lowCardTermId][$docId]) ) { $lowCardTermId = $termId; } } // Walk through positions of the term with lowest cardinality foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { // We expect phrase to be found $freq++; // Walk through other terms foreach ($this->_terms as $termId => $term) { if ($termId != $lowCardTermId) { $expectedPosition = $lowCardPos + ($this->_offsets[$termId] - $this->_offsets[$lowCardTermId]); if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { $freq--; // Phrase wasn't found. break; } } } } return $freq; } /** * Score calculator for sloppy phrase queries (terms sequence is fixed) * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function _sloppyPhraseFreq($docId, Lucene\SearchIndexInterface $reader) { $freq = 0; $phraseQueue = array(); $phraseQueue[0] = array(); // empty phrase $lastTerm = null; // Walk through the terms to create phrases. foreach ($this->_terms as $termId => $term) { $queueSize = count($phraseQueue); $firstPass = true; // Walk through the term positions. // Each term position produces a set of phrases. 
foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) { if ($firstPass) { for ($count = 0; $count < $queueSize; $count++) { $phraseQueue[$count][$termId] = $termPosition; } } else { for ($count = 0; $count < $queueSize; $count++) { if ($lastTerm !== null && abs( $termPosition - $phraseQueue[$count][$lastTerm] - ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) { continue; } $newPhraseId = count($phraseQueue); $phraseQueue[$newPhraseId] = $phraseQueue[$count]; $phraseQueue[$newPhraseId][$termId] = $termPosition; } } $firstPass = false; } $lastTerm = $termId; } foreach ($phraseQueue as $phrasePos) { $minDistance = null; for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) { $distance = 0; $start = reset($phrasePos) - reset($this->_offsets) + $shift; foreach ($this->_terms as $termId => $term) { $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start); if($distance > $this->_slop) { break; } } if ($minDistance === null || $distance < $minDistance) { $minDistance = $distance; } } if ($minDistance <= $this->_slop) { $freq += $reader->getSimilarity()->sloppyFreq($minDistance); } } return $freq; } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { $this->_resVector = null; if (count($this->_terms) == 0) { $this->_resVector = array(); } $resVectors = array(); $resVectorsSizes = array(); $resVectorsIds = array(); // is used to prevent arrays comparison foreach ($this->_terms as $termId => $term) { $resVectors[] = array_flip($reader->termDocs($term)); $resVectorsSizes[] = count(end($resVectors)); $resVectorsIds[] = $termId; $this->_termsPositions[$termId] = $reader->termPositions($term); } // sort resvectors in order of subquery cardinality increasing 
array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC, $resVectorsIds, SORT_ASC, SORT_NUMERIC, $resVectors); foreach ($resVectors as $nextResVector) { if($this->_resVector === null) { $this->_resVector = $nextResVector; } else { //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector); /** * This code is used as workaround for array_intersect_key() slowness problem. */ $updatedVector = array(); foreach ($this->_resVector as $id => $value) { if (isset($nextResVector[$id])) { $updatedVector[$id] = $value; } } $this->_resVector = $updatedVector; } if (count($this->_resVector) == 0) { // Empty result set, we don't need to check other terms break; } } // ksort($this->_resVector, SORT_NUMERIC); // Docs are returned ordered. Used algorithm doesn't change elements order. // Initialize weight if it's not done yet $this->_initWeight($reader); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @return array */ public function matchedDocs() { return $this->_resVector; } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { if (isset($this->_resVector[$docId])) { if ($this->_slop == 0) { $freq = $this->_exactPhraseFreq($docId); } else { $freq = $this->_sloppyPhraseFreq($docId, $reader); } if ($freq != 0) { $tf = $reader->getSimilarity()->tf($freq); $weight = $this->_weight->getValue(); $norm = $reader->norm($docId, reset($this->_terms)->field); return $tf * $weight * $norm * $this->getBoost(); } // Included in result, but culculated freq is zero return 0; } else { return 0; } } /** * Return query terms * * @return array */ public function getQueryTerms() { return $this->_terms; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function 
_highlightMatches(Highlighter $highlighter) { $words = array(); foreach ($this->_terms as $term) { $words[] = $term->text; } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping if (isset($this->_terms[0]) && $this->_terms[0]->field !== null) { $query = $this->_terms[0]->field . ':'; } else { $query = ''; } $query .= '"'; foreach ($this->_terms as $id => $term) { if ($id != 0) { $query .= ' '; } $query .= $term->text; } $query .= '"'; if ($this->_slop != 0) { $query .= '~' . $this->_slop; } if ($this->getBoost() != 1) { $query .= '^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Preprocessing/000077500000000000000000000000001245775125600276505ustar00rootroot00000000000000AbstractPreprocessing.php000066400000000000000000000071021245775125600346110ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Preprocessing_word = $word; $this->_encoding = $encoding; $this->_field = $fieldName; $this->_minimumSimilarity = $minimumSimilarity; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { if ($this->_field === null) { $query = new Search\Query\Boolean(); $hasInsignificantSubqueries = false; if (Lucene\Lucene::getDefaultSearchField() === null) { $searchFields = $index->getFieldNames(true); } else { $searchFields = array(Lucene\Lucene::getDefaultSearchField()); } foreach ($searchFields as $fieldName) { $subquery = new self($this->_word, $this->_encoding, $fieldName, $this->_minimumSimilarity); $rewrittenSubquery = $subquery->rewrite($index); if ( 
!($rewrittenSubquery instanceof Query\Insignificant || $rewrittenSubquery instanceof Query\EmptyResult) ) { $query->addSubquery($rewrittenSubquery); } if ($rewrittenSubquery instanceof Query\Insignificant) { $hasInsignificantSubqueries = true; } } $subqueries = $query->getSubqueries(); if (count($subqueries) == 0) { $this->_matches = array(); if ($hasInsignificantSubqueries) { return new Query\Insignificant(); } else { return new Query\EmptyResult(); } } if (count($subqueries) == 1) { $query = reset($subqueries); } $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } // ------------------------------------- // Recognize exact term matching (it corresponds to Keyword fields stored in the index) // encoding is not used since we expect binary matching $term = new Index\Term($this->_word, $this->_field); if ($index->hasTerm($term)) { $query = new Query\Fuzzy($term, $this->_minimumSimilarity); $query->setBoost($this->getBoost()); // Get rewritten query. Important! It also fills terms matching container. 
$rewrittenQuery = $query->rewrite($index); $this->_matches = $query->getQueryTerms(); return $rewrittenQuery; } // ------------------------------------- // Recognize wildcard queries /** * @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word)); } else { $subPatterns = preg_split('/[*?]/', $this->_word); } if (count($subPatterns) > 1) { throw new QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).'); } // ------------------------------------- // Recognize one-term multi-term and "insignificant" queries $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); if (count($tokens) == 0) { $this->_matches = array(); return new Query\Insignificant(); } if (count($tokens) == 1) { $term = new Index\Term($tokens[0]->getTermText(), $this->_field); $query = new Query\Fuzzy($term, $this->_minimumSimilarity); $query->setBoost($this->getBoost()); // Get rewritten query. Important! It also fills terms matching container. $rewrittenQuery = $query->rewrite($index); $this->_matches = $query->getQueryTerms(); return $rewrittenQuery; } // Word is tokenized into several tokens throw new QueryParserException('Fuzzy search is supported only for non-multiple word terms'); } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { /** Skip fields detection. 
We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ /** Skip exact term matching recognition, keyword fields highlighting is not supported */ // ------------------------------------- // Recognize wildcard queries /** * @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word)); } else { $subPatterns = preg_split('/[*?]/', $this->_word); } if (count($subPatterns) > 1) { // Do nothing return; } // ------------------------------------- // Recognize one-term multi-term and "insignificant" queries $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); if (count($tokens) == 0) { // Do nothing return; } if (count($tokens) == 1) { $term = new Index\Term($tokens[0]->getTermText(), $this->_field); $query = new Query\Fuzzy($term, $this->_minimumSimilarity); $query->_highlightMatches($highlighter); return; } // Word is tokenized into several tokens // But fuzzy search is supported only for non-multiple word terms // Do nothing } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping if ($this->_field !== null) { $query = $this->_field . ':'; } else { $query = ''; } $query .= $this->_word; if ($this->getBoost() != 1) { $query .= '^' . 
round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Preprocessing/Phrase.php000066400000000000000000000172321245775125600316100ustar00rootroot00000000000000_phrase = $phrase; $this->_phraseEncoding = $phraseEncoding; $this->_field = $fieldName; } /** * Set slop * * @param integer $slop */ public function setSlop($slop) { $this->_slop = $slop; } /** * Get slop * * @return integer */ public function getSlop() { return $this->_slop; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { // Allow to use wildcards within phrases // They are either removed by text analyzer or used as a part of keyword for keyword fields // // if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) { // require_once 'Zend/Search/Lucene/Search/QueryParserException.php'; // throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.'); // } // Split query into subqueries if field name is not specified if ($this->_field === null) { $query = new Query\Boolean(); $query->setBoost($this->getBoost()); if (Lucene\Lucene::getDefaultSearchField() === null) { $searchFields = $index->getFieldNames(true); } else { $searchFields = array(Lucene\Lucene::getDefaultSearchField()); } foreach ($searchFields as $fieldName) { $subquery = new Phrase($this->_phrase, $this->_phraseEncoding, $fieldName); $subquery->setSlop($this->getSlop()); $query->addSubquery($subquery->rewrite($index)); } $this->_matches = $query->getQueryTerms(); return $query; } // Recognize exact term matching (it corresponds to Keyword fields stored in the index) // encoding is not used since we expect binary matching $term = new Index\Term($this->_phrase, $this->_field); if ($index->hasTerm($term)) { 
$query = new Query\Term($term); $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } // tokenize phrase using current analyzer and process it as a phrase query $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); if (count($tokens) == 0) { $this->_matches = array(); return new Query\Insignificant(); } if (count($tokens) == 1) { $term = new Index\Term($tokens[0]->getTermText(), $this->_field); $query = new Query\Term($term); $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } //It's non-trivial phrase query $position = -1; $query = new Query\Phrase(); foreach ($tokens as $token) { $position += $token->getPositionIncrement(); $term = new Index\Term($token->getTermText(), $this->_field); $query->addTerm($term, $position); $query->setSlop($this->getSlop()); } $this->_matches = $query->getQueryTerms(); return $query; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ /** Skip exact term matching recognition, keyword fields highlighting is not supported */ /** Skip wildcard queries recognition. 
Supported wildcards are removed by text analyzer */ // tokenize phrase using current analyzer and process it as a phrase query $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); if (count($tokens) == 0) { // Do nothing return; } if (count($tokens) == 1) { $highlighter->highlight($tokens[0]->getTermText()); return; } //It's non-trivial phrase query $words = array(); foreach ($tokens as $token) { $words[] = $token->getTermText(); } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping if ($this->_field !== null) { $query = $this->_field . ':'; } else { $query = ''; } $query .= '"' . $this->_phrase . '"'; if ($this->_slop != 0) { $query .= '~' . $this->_slop; } if ($this->getBoost() != 1) { $query .= '^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Preprocessing/Term.php000066400000000000000000000251421245775125600312740ustar00rootroot00000000000000_word = $word; $this->_encoding = $encoding; $this->_field = $fieldName; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { if ($this->_field === null) { $query = new Query\MultiTerm(); $query->setBoost($this->getBoost()); $hasInsignificantSubqueries = false; if (Lucene\Lucene::getDefaultSearchField() === null) { $searchFields = $index->getFieldNames(true); } else { $searchFields = array(Lucene\Lucene::getDefaultSearchField()); } foreach ($searchFields as $fieldName) { $subquery = new Term($this->_word, $this->_encoding, $fieldName); $rewrittenSubquery = $subquery->rewrite($index); foreach 
($rewrittenSubquery->getQueryTerms() as $term) { $query->addTerm($term); } if ($rewrittenSubquery instanceof Query\Insignificant) { $hasInsignificantSubqueries = true; } } if (count($query->getTerms()) == 0) { $this->_matches = array(); if ($hasInsignificantSubqueries) { return new Query\Insignificant(); } else { return new Query\EmptyResult(); } } $this->_matches = $query->getQueryTerms(); return $query; } // ------------------------------------- // Recognize exact term matching (it corresponds to Keyword fields stored in the index) // encoding is not used since we expect binary matching $term = new Index\Term($this->_word, $this->_field); if ($index->hasTerm($term)) { $query = new Query\Term($term); $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } // ------------------------------------- // Recognize wildcard queries /** * @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { $word = iconv($this->_encoding, 'UTF-8', $this->_word); $wildcardsPattern = '/[*?]/u'; $subPatternsEncoding = 'UTF-8'; } else { $word = $this->_word; $wildcardsPattern = '/[*?]/'; $subPatternsEncoding = $this->_encoding; } $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE); if (count($subPatterns) > 1) { // Wildcard query is recognized $pattern = ''; foreach ($subPatterns as $id => $subPattern) { // Append corresponding wildcard character to the pattern before each sub-pattern (except first) if ($id != 0) { $pattern .= $word[ $subPattern[1] - 1 ]; } // Check if each subputtern is a single word in terms of current analyzer $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding); if (count($tokens) > 1) { throw new QueryParserException('Wildcard search is supported only for non-multiple word terms'); } foreach ($tokens as $token) 
{ $pattern .= $token->getTermText(); } } $term = new Index\Term($pattern, $this->_field); $query = new Query\Wildcard($term); $query->setBoost($this->getBoost()); // Get rewritten query. Important! It also fills terms matching container. $rewrittenQuery = $query->rewrite($index); $this->_matches = $query->getQueryTerms(); return $rewrittenQuery; } // ------------------------------------- // Recognize one-term multi-term and "insignificant" queries $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); if (count($tokens) == 0) { $this->_matches = array(); return new Query\Insignificant(); } if (count($tokens) == 1) { $term = new Index\Term($tokens[0]->getTermText(), $this->_field); $query = new Query\Term($term); $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } //It's not insignificant or one term query $query = new Query\MultiTerm(); /** * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other * analizer design features */ foreach ($tokens as $token) { $term = new Index\Term($token->getTermText(), $this->_field); $query->addTerm($term, true); // all subterms are required } $query->setBoost($this->getBoost()); $this->_matches = $query->getQueryTerms(); return $query; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { /** Skip fields detection. 
We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ /** Skip exact term matching recognition, keyword fields highlighting is not supported */ // ------------------------------------- // Recognize wildcard queries /** * @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { $word = iconv($this->_encoding, 'UTF-8', $this->_word); $wildcardsPattern = '/[*?]/u'; $subPatternsEncoding = 'UTF-8'; } else { $word = $this->_word; $wildcardsPattern = '/[*?]/'; $subPatternsEncoding = $this->_encoding; } $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE); if (count($subPatterns) > 1) { // Wildcard query is recognized $pattern = ''; foreach ($subPatterns as $id => $subPattern) { // Append corresponding wildcard character to the pattern before each sub-pattern (except first) if ($id != 0) { $pattern .= $word[ $subPattern[1] - 1 ]; } // Check if each subputtern is a single word in terms of current analyzer $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding); if (count($tokens) > 1) { // Do nothing (nothing is highlighted) return; } foreach ($tokens as $token) { $pattern .= $token->getTermText(); } } $term = new Index\Term($pattern, $this->_field); $query = new Query\Wildcard($term); $query->_highlightMatches($highlighter); return; } // ------------------------------------- // Recognize one-term multi-term and "insignificant" queries $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); if (count($tokens) == 0) { // Do nothing return; } if (count($tokens) == 1) { $highlighter->highlight($tokens[0]->getTermText()); return; } //It's not insignificant or one term query $words = array(); foreach ($tokens as $token) { $words[] = $token->getTermText(); } $highlighter->highlight($words); 
} /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping if ($this->_field !== null) { $query = $this->_field . ':'; } else { $query = ''; } $query .= $this->_word; if ($this->getBoost() != 1) { $query .= '^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Range.php000066400000000000000000000256251245775125600266040ustar00rootroot00000000000000field != $upperTerm->field) { throw new InvalidArgumentException('Both terms must be for the same field'); } $this->_field = ($lowerTerm !== null)? $lowerTerm->field : $upperTerm->field; $this->_lowerTerm = $lowerTerm; $this->_upperTerm = $upperTerm; $this->_inclusive = $inclusive; } /** * Get query field name * * @return string|null */ public function getField() { return $this->_field; } /** * Get lower term * * @return \ZendSearch\Lucene\Index\Term|null */ public function getLowerTerm() { return $this->_lowerTerm; } /** * Get upper term * * @return \ZendSearch\Lucene\Index\Term|null */ public function getUpperTerm() { return $this->_upperTerm; } /** * Get upper term * * @return boolean */ public function isInclusive() { return $this->_inclusive; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { $this->_matches = array(); if ($this->_field === null) { // Search through all fields $fields = $index->getFieldNames(true /* indexed fields list */); } else { $fields = array($this->_field); } $maxTerms = Lucene\Lucene::getTermsPerQueryLimit(); foreach ($fields as $field) { $index->resetTermsStream(); if ($this->_lowerTerm !== null) { $lowerTerm = new Index\Term($this->_lowerTerm->text, $field); 
$index->skipTo($lowerTerm); if (!$this->_inclusive && $index->currentTerm() == $lowerTerm) { // Skip lower term $index->nextTerm(); } } else { $index->skipTo(new Index\Term('', $field)); } if ($this->_upperTerm !== null) { // Walk up to the upper term $upperTerm = new Index\Term($this->_upperTerm->text, $field); while ($index->currentTerm() !== null && $index->currentTerm()->field == $field && $index->currentTerm()->text < $upperTerm->text) { $this->_matches[] = $index->currentTerm(); if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } $index->nextTerm(); } if ($this->_inclusive && $index->currentTerm() == $upperTerm) { // Include upper term into result $this->_matches[] = $upperTerm; } } else { // Walk up to the end of field data while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) { $this->_matches[] = $index->currentTerm(); if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } $index->nextTerm(); } } $index->closeTermsStream(); } if (count($this->_matches) == 0) { return new EmptyResult(); } elseif (count($this->_matches) == 1) { return new Term(reset($this->_matches)); } else { $rewrittenQuery = new MultiTerm(); foreach ($this->_matches as $matchedTerm) { $rewrittenQuery->addTerm($matchedTerm); } return $rewrittenQuery; } } /** * Optimize query in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { throw new UnsupportedMethodCallException( 'Range query should not be directly used for search. 
Use $query->rewrite($index)' ); } /** * Return query terms * * @return array * @throws \ZendSearch\Lucene\Exception\RuntimeException */ public function getQueryTerms() { if ($this->_matches === null) { throw new RuntimeException('Search or rewrite operations have to be performed before.'); } return $this->_matches; } /** * Constructs an appropriate Weight implementation for this query. * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException */ public function createWeight(Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException( 'Range query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { throw new UnsupportedMethodCallException( 'Range query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return array */ public function matchedDocs() { throw new UnsupportedMethodCallException( 'Range query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException( 'Range query should not be directly used for search. 
Use $query->rewrite($index)' ); } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $words = array(); $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); $lowerTermText = ($this->_lowerTerm !== null)? $this->_lowerTerm->text : null; $upperTermText = ($this->_upperTerm !== null)? $this->_upperTerm->text : null; if ($this->_inclusive) { foreach ($tokens as $token) { $termText = $token->getTermText(); if (($lowerTermText == null || $lowerTermText <= $termText) && ($upperTermText == null || $termText <= $upperTermText)) { $words[] = $termText; } } } else { foreach ($tokens as $token) { $termText = $token->getTermText(); if (($lowerTermText == null || $lowerTermText < $termText) && ($upperTermText == null || $termText < $upperTermText)) { $words[] = $termText; } } } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping return (($this->_field === null)? '' : $this->_field . ':') . (($this->_inclusive)? '[' : '{') . (($this->_lowerTerm !== null)? $this->_lowerTerm->text : 'null') . ' TO ' . (($this->_upperTerm !== null)? $this->_upperTerm->text : 'null') . (($this->_inclusive)? ']' : '}') . (($this->getBoost() != 1)? '^' . round($this->getBoost(), 4) : ''); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Term.php000066400000000000000000000124261245775125600264520ustar00rootroot00000000000000 freq, ...) 
* * @var array */ private $_termFreqs; /** * Zend_Search_Lucene_Search_Query_Term constructor * * @param \ZendSearch\Lucene\Index\Term $term * @param boolean $sign */ public function __construct(Index\Term $term) { $this->_term = $term; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { if ($this->_term->field != null) { return $this; } else { $query = new MultiTerm(); $query->setBoost($this->getBoost()); foreach ($index->getFieldNames(true) as $fieldName) { $term = new Index\Term($this->_term->text, $fieldName); $query->addTerm($term); } return $query->rewrite($index); } } /** * Optimize query in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { // Check, that index contains specified term if (!$index->hasTerm($this->_term)) { return new EmptyResult(); } return $this; } /** * Constructs an appropriate Weight implementation for this query. 
* * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return \ZendSearch\Lucene\Search\Weight\Term */ public function createWeight(Lucene\SearchIndexInterface $reader) { $this->_weight = new Weight\Term($this->_term, $this, $reader); return $this->_weight; } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { $this->_docVector = array_flip($reader->termDocs($this->_term, $docsFilter)); $this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter); // Initialize weight if it's not done yet $this->_initWeight($reader); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @return array */ public function matchedDocs() { return $this->_docVector; } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { if (isset($this->_docVector[$docId])) { return $reader->getSimilarity()->tf($this->_termFreqs[$docId]) * $this->_weight->getValue() * $reader->norm($docId, $this->_term->field) * $this->getBoost(); } else { return 0; } } /** * Return query terms * * @return array */ public function getQueryTerms() { return array($this->_term); } /** * Return query term * * @return \ZendSearch\Lucene\Index\Term */ public function getTerm() { return $this->_term; } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $highlighter->highlight($this->_term->text); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query 
visualisation, so we don't care about characters escaping if ($this->_term->field !== null) { $query = $this->_term->field . ':'; } else { $query = ''; } $query .= $this->_term->text; if ($this->getBoost() != 1) { $query = $query . '^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Query/Wildcard.php000066400000000000000000000256641245775125600273040ustar00rootroot00000000000000_pattern = $pattern; } /** * Get minimum prefix length * * @return integer */ public static function getMinPrefixLength() { return self::$_minPrefixLength; } /** * Set minimum prefix length * * @param integer $minPrefixLength */ public static function setMinPrefixLength($minPrefixLength) { self::$_minPrefixLength = $minPrefixLength; } /** * Get terms prefix * * @param string $word * @return string */ private static function _getPrefix($word) { $questionMarkPosition = strpos($word, '?'); $astrericPosition = strpos($word, '*'); if ($questionMarkPosition !== false) { if ($astrericPosition !== false) { return substr($word, 0, min($questionMarkPosition, $astrericPosition)); } return substr($word, 0, $questionMarkPosition); } elseif ($astrericPosition !== false) { return substr($word, 0, $astrericPosition); } return $word; } /** * Re-write query into primitive queries in the context of specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\RuntimeException * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function rewrite(Lucene\SearchIndexInterface $index) { $this->_matches = array(); if ($this->_pattern->field === null) { // Search through all fields $fields = $index->getFieldNames(true /* indexed fields list */); } else { $fields = array($this->_pattern->field); } $prefix = self::_getPrefix($this->_pattern->text); $prefixLength = strlen($prefix); $matchExpression = '/^' . 
str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/'; if ($prefixLength < self::$_minPrefixLength) { throw new RuntimeException( 'At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.' ); } /** * @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { // PCRE unicode support is turned on // add Unicode modifier to the match expression $matchExpression .= 'u'; } $maxTerms = Lucene\Lucene::getTermsPerQueryLimit(); foreach ($fields as $field) { $index->resetTermsStream(); if ($prefix != '') { $index->skipTo(new Index\Term($prefix, $field)); while ($index->currentTerm() !== null && $index->currentTerm()->field == $field && substr($index->currentTerm()->text, 0, $prefixLength) == $prefix) { if (preg_match($matchExpression, $index->currentTerm()->text) === 1) { $this->_matches[] = $index->currentTerm(); if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } } $index->nextTerm(); } } else { $index->skipTo(new Index\Term('', $field)); while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) { if (preg_match($matchExpression, $index->currentTerm()->text) === 1) { $this->_matches[] = $index->currentTerm(); if ($maxTerms != 0 && count($this->_matches) > $maxTerms) { throw new OutOfBoundsException('Terms per query limit is reached.'); } } $index->nextTerm(); } } $index->closeTermsStream(); } if (count($this->_matches) == 0) { return new EmptyResult(); } elseif (count($this->_matches) == 1) { return new Term(reset($this->_matches)); } else { $rewrittenQuery = new MultiTerm(); foreach ($this->_matches as $matchedTerm) { $rewrittenQuery->addTerm($matchedTerm); } return $rewrittenQuery; } } /** * Optimize query in the context of 
specified index * * @param \ZendSearch\Lucene\SearchIndexInterface $index * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function optimize(Lucene\SearchIndexInterface $index) { throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. Use $query->rewrite($index)'); } /** * Returns query pattern * * @return \ZendSearch\Lucene\Index\Term */ public function getPattern() { return $this->_pattern; } /** * Return query terms * * @throws \ZendSearch\Lucene\Exception\RuntimeException * @return array */ public function getQueryTerms() { if ($this->_matches === null) { throw new RuntimeException('Search has to be performed first to get matched terms'); } return $this->_matches; } /** * Constructs an appropriate Weight implementation for this query. * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException */ public function createWeight(Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. Use $query->rewrite($index)'); } /** * Execute query in context of index reader * It also initializes necessary internal structures * * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException */ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) { throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. 
Use $query->rewrite($index)'); } /** * Get document ids likely matching the query * * It's an array with document ids as keys (performance considerations) * * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return array */ public function matchedDocs() { throw new UnsupportedMethodCallException( 'Wildcard query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Score specified document * * @param integer $docId * @param \ZendSearch\Lucene\SearchIndexInterface $reader * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException * @return float */ public function score($docId, Lucene\SearchIndexInterface $reader) { throw new UnsupportedMethodCallException( 'Wildcard query should not be directly used for search. Use $query->rewrite($index)' ); } /** * Query specific matches highlighting * * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) */ protected function _highlightMatches(Highlighter $highlighter) { $words = array(); $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/'; ErrorHandler::start(E_WARNING); $result = preg_match('/\pL/u', 'a'); ErrorHandler::stop(); if ($result == 1) { // PCRE unicode support is turned on // add Unicode modifier to the match expression $matchExpression .= 'u'; } $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); $tokens = Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); foreach ($tokens as $token) { if (preg_match($matchExpression, $token->getTermText()) === 1) { $words[] = $token->getTermText(); } } $highlighter->highlight($words); } /** * Print a query * * @return string */ public function __toString() { // It's used only for query visualisation, so we don't care about characters escaping if ($this->_pattern->field !== null) { $query = $this->_pattern->field . 
':'; } else { $query = ''; } $query .= $this->_pattern->text; if ($this->getBoost() != 1) { $query = $query . '^' . round($this->getBoost(), 4); } return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryEntry/000077500000000000000000000000001245775125600260475ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryEntry/AbstractQueryEntry.php000066400000000000000000000021761245775125600324010ustar00rootroot00000000000000_boost *= $boostFactor; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryEntry/Phrase.php000066400000000000000000000045121245775125600300040ustar00rootroot00000000000000_phrase = $phrase; $this->_field = $field; } /** * Process modifier ('~') * * @param mixed $parameter */ public function processFuzzyProximityModifier($parameter = null) { $this->_proximityQuery = true; if ($parameter !== null) { $this->_wordsDistance = $parameter; } } /** * Transform entry to a subquery * * @param string $encoding * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function getQuery($encoding) { $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Phrase($this->_phrase, $encoding, ($this->_field !== null)? 
iconv($encoding, 'UTF-8', $this->_field) : null); if ($this->_proximityQuery) { $query->setSlop($this->_wordsDistance); } $query->setBoost($this->_boost); return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryEntry/Subquery.php000066400000000000000000000030021245775125600303720ustar00rootroot00000000000000_query = $query; } /** * Process modifier ('~') * * @param mixed $parameter * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException */ public function processFuzzyProximityModifier($parameter = null) { throw new \ZendSearch\Lucene\Search\Exception\QueryParserException( '\'~\' sign must follow term or phrase' ); } /** * Transform entry to a subquery * * @param string $encoding * @return \ZendSearch\Lucene\Search\Query\AbstractQuery */ public function getQuery($encoding) { $this->_query->setBoost($this->_boost); return $this->_query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryEntry/Term.php000066400000000000000000000061511245775125600274720ustar00rootroot00000000000000_term = $term; $this->_field = $field; } /** * Process modifier ('~') * * @param mixed $parameter */ public function processFuzzyProximityModifier($parameter = null) { $this->_fuzzyQuery = true; if ($parameter !== null) { $this->_similarity = $parameter; } else { $this->_similarity = \ZendSearch\Lucene\Search\Query\Fuzzy::DEFAULT_MIN_SIMILARITY; } } /** * Transform entry to a subquery * * @param string $encoding * @return \ZendSearch\Lucene\Search\Query\AbstractQuery * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException */ public function getQuery($encoding) { if ($this->_fuzzyQuery) { $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Fuzzy($this->_term, $encoding, ($this->_field !== null)? 
iconv($encoding, 'UTF-8', $this->_field) : null, $this->_similarity ); $query->setBoost($this->_boost); return $query; } $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Term($this->_term, $encoding, ($this->_field !== null)? iconv($encoding, 'UTF-8', $this->_field) : null ); $query->setBoost($this->_boost); return $query; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryHit.php000066400000000000000000000047311245775125600262100ustar00rootroot00000000000000_index = $index; } /** * Magic method for checking the existence of a field * * @param string $offset * @return boolean TRUE if the field exists else FALSE */ public function __isset($offset) { return isset($this->getDocument()->$offset); } /** * Convenience function for getting fields from the document * associated with this hit. * * @param string $offset * @return string */ public function __get($offset) { return $this->getDocument()->getFieldValue($offset); } /** * Return the document object for this hit * * @return \ZendSearch\Lucene\Document */ public function getDocument() { if (!$this->_document instanceof Document) { $this->_document = $this->_index->getDocument($this->document_id); } return $this->_document; } /** * Return the index object for this hit * * @return \ZendSearch\Lucene\SearchIndexInterface */ public function getIndex() { return $this->_index; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryLexer.php000066400000000000000000000561461245775125600265520ustar00rootroot00000000000000addRules(array( array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), array(self::ST_WHITE_SPACE, self::IN_QUOTE, self::ST_QUOTED_LEXEME), 
array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT, self::ST_LEXEME), array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT, self::ST_LEXEME), array(self::ST_WHITE_SPACE, self::IN_CHAR, self::ST_LEXEME) )); $this->addRules(array( array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), array(self::ST_SYNT_LEXEME, self::IN_QUOTE, self::ST_QUOTED_LEXEME), array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME), array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME), array(self::ST_SYNT_LEXEME, self::IN_CHAR, self::ST_LEXEME) )); $this->addRules(array( array(self::ST_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_LEXEME, self::IN_MUTABLE_CHAR, self::ST_LEXEME), array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), array(self::ST_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR), // IN_QUOTE not allowed array(self::ST_LEXEME, self::IN_QUOTE, self::ST_ERROR, $quoteWithinLexemeErrorAction), array(self::ST_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME), array(self::ST_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME), array(self::ST_LEXEME, self::IN_CHAR, self::ST_LEXEME) )); $this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_QCHAR), array(self::ST_QUOTED_LEXEME, self::IN_QUOTE, 
self::ST_WHITE_SPACE), array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME), array(self::ST_QUOTED_LEXEME, self::IN_CHAR, self::ST_QUOTED_LEXEME) )); $this->addRules(array( array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_QUOTE, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT, self::ST_LEXEME), array(self::ST_ESCAPED_CHAR, self::IN_CHAR, self::ST_LEXEME) )); $this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME), array(self::ST_ESCAPED_QCHAR, self::IN_CHAR, self::ST_QUOTED_LEXEME) )); $this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), // IN_ESCAPE_CHAR not allowed 
array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $lexemeModifierErrorAction), // IN_QUOTE not allowed array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE, self::ST_ERROR, $lexemeModifierErrorAction), array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT, self::ST_MANTISSA), array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT, self::ST_NUMBER), // IN_CHAR not allowed array(self::ST_LEXEME_MODIFIER, self::IN_CHAR, self::ST_ERROR, $lexemeModifierErrorAction), )); $this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_NUMBER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_NUMBER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), // IN_ESCAPE_CHAR not allowed array(self::ST_NUMBER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction), // IN_QUOTE not allowed array(self::ST_NUMBER, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction), array(self::ST_NUMBER, self::IN_DECIMAL_POINT, self::ST_MANTISSA), array(self::ST_NUMBER, self::IN_ASCII_DIGIT, self::ST_NUMBER), // IN_CHAR not allowed array(self::ST_NUMBER, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction), )); $this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE), array(self::ST_MANTISSA, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME), array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME), array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER), // IN_ESCAPE_CHAR not allowed array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction), // IN_QUOTE not allowed array(self::ST_MANTISSA, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction), // IN_DECIMAL_POINT not allowed array(self::ST_MANTISSA, self::IN_DECIMAL_POINT, self::ST_ERROR, $wrongNumberErrorAction), array(self::ST_MANTISSA, self::IN_ASCII_DIGIT, self::ST_MANTISSA), // IN_CHAR not allowed 
array(self::ST_MANTISSA, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction), )); /** Actions */ $syntaxLexemeAction = new Lucene\FSMAction($this, 'addQuerySyntaxLexeme'); $lexemeModifierAction = new Lucene\FSMAction($this, 'addLexemeModifier'); $addLexemeAction = new Lucene\FSMAction($this, 'addLexeme'); $addQuotedLexemeAction = new Lucene\FSMAction($this, 'addQuotedLexeme'); $addNumberLexemeAction = new Lucene\FSMAction($this, 'addNumberLexeme'); $addLexemeCharAction = new Lucene\FSMAction($this, 'addLexemeChar'); /** Syntax lexeme */ $this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction); // Two lexemes in succession $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction); /** Lexeme */ $this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction); // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE, $addLexemeAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME, $addLexemeAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER, $addLexemeAction); $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA, $addLexemeAction); /** Quoted lexeme */ // We don't need entry action (skeep quote) $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction); $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction); // Closing quote changes state to the ST_WHITE_SPACE other states are not used $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction); /** Lexeme modifier */ $this->addEntryAction(self::ST_LEXEME_MODIFIER, 
$lexemeModifierAction); /** Number */ $this->addEntryAction(self::ST_NUMBER, $addLexemeCharAction); $this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction); $this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction); // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction); $this->addTransitionAction(self::ST_NUMBER, self::ST_WHITE_SPACE, $addNumberLexemeAction); $this->addTransitionAction(self::ST_NUMBER, self::ST_SYNT_LEXEME, $addNumberLexemeAction); $this->addTransitionAction(self::ST_NUMBER, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction); $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE, $addNumberLexemeAction); $this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME, $addNumberLexemeAction); $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction); } /** * Translate input char to an input symbol of state machine * * @param string $char * @return integer */ private function _translateInput($char) { if (strpos(self::QUERY_WHITE_SPACE_CHARS, $char) !== false) { return self::IN_WHITE_SPACE; } elseif (strpos(self::QUERY_SYNT_CHARS, $char) !== false) { return self::IN_SYNT_CHAR; } elseif (strpos(self::QUERY_MUTABLE_CHARS, $char) !== false) { return self::IN_MUTABLE_CHAR; } elseif (strpos(self::QUERY_LEXEMEMODIFIER_CHARS, $char) !== false) { return self::IN_LEXEME_MODIFIER; } elseif (strpos(self::QUERY_ASCIIDIGITS_CHARS, $char) !== false) { return self::IN_ASCII_DIGIT; } elseif ($char === '"' ) { return self::IN_QUOTE; } elseif ($char === '.' 
) { return self::IN_DECIMAL_POINT; } elseif ($char === '\\') { return self::IN_ESCAPE_CHAR; } else { return self::IN_CHAR; } } /** * This method is used to tokenize query string into lexemes * * @param string $inputString * @param string $encoding * @return array * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException */ public function tokenize($inputString, $encoding) { $this->reset(); $this->_lexemes = array(); $this->_queryString = array(); if (PHP_OS == 'AIX' && $encoding == '') { $encoding = 'ISO8859-1'; } $strLength = iconv_strlen($inputString, $encoding); // Workaround for iconv_substr bug $inputString .= ' '; for ($count = 0; $count < $strLength; $count++) { $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding); } for ($this->_queryStringPosition = 0; $this->_queryStringPosition < count($this->_queryString); $this->_queryStringPosition++) { $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition])); } $this->process(self::IN_WHITE_SPACE); if ($this->getState() != self::ST_WHITE_SPACE) { throw new QueryParserException('Unexpected end of query'); } $this->_queryString = null; return $this->_lexemes; } /********************************************************************* * Actions implementation * * Actions affect on recognized lexemes list *********************************************************************/ /** * Add query syntax lexeme * * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException */ public function addQuerySyntaxLexeme() { $lexeme = $this->_queryString[$this->_queryStringPosition]; // Process two char lexemes if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) { // increase current position in a query string $this->_queryStringPosition++; // check, if ($this->_queryStringPosition == count($this->_queryString) || $this->_queryString[$this->_queryStringPosition] != $lexeme) { throw new QueryParserException('Two chars lexeme expected. ' . 
$this->_positionMsg()); } // duplicate character $lexeme .= $lexeme; } $token = new QueryToken(QueryToken::TC_SYNTAX_ELEMENT, $lexeme, $this->_queryStringPosition); // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word' if ($token->type == QueryToken::TT_FIELD_INDICATOR) { $token = array_pop($this->_lexemes); if ($token === null || $token->type != QueryToken::TT_WORD) { throw new QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg()); } $token->type = QueryToken::TT_FIELD; } $this->_lexemes[] = $token; } /** * Add lexeme modifier */ public function addLexemeModifier() { $this->_lexemes[] = new QueryToken(QueryToken::TC_SYNTAX_ELEMENT, $this->_queryString[$this->_queryStringPosition], $this->_queryStringPosition); } /** * Add lexeme */ public function addLexeme() { $this->_lexemes[] = new QueryToken(QueryToken::TC_WORD, $this->_currentLexeme, $this->_queryStringPosition - 1); $this->_currentLexeme = ''; } /** * Add quoted lexeme */ public function addQuotedLexeme() { $this->_lexemes[] = new QueryToken(QueryToken::TC_PHRASE, $this->_currentLexeme, $this->_queryStringPosition); $this->_currentLexeme = ''; } /** * Add number lexeme */ public function addNumberLexeme() { $this->_lexemes[] = new QueryToken(QueryToken::TC_NUMBER, $this->_currentLexeme, $this->_queryStringPosition - 1); $this->_currentLexeme = ''; } /** * Extend lexeme by one char */ public function addLexemeChar() { $this->_currentLexeme .= $this->_queryString[$this->_queryStringPosition]; } /** * Position message * * @return string */ private function _positionMsg() { return 'Position is ' . $this->_queryStringPosition . 
'.'; } /********************************************************************* * Syntax errors actions *********************************************************************/ public function lexModifierErrException() { throw new QueryParserException('Lexeme modifier character can be followed only by number, white space or query syntax element. ' . $this->_positionMsg()); } public function quoteWithinLexemeErrException() { throw new QueryParserException('Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg()); } public function wrongNumberErrException() { throw new QueryParserException('Wrong number syntax.' . $this->_positionMsg()); } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryParser.php000066400000000000000000000530331245775125600267170ustar00rootroot00000000000000addRules( array(array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_WORD, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_PHRASE, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_FIELD, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_REQUIRED, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_PROHIBITED, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_FUZZY_PROX_MARK, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_BOOSTING_MARK, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_RANGE_INCL_START, self::ST_CLOSEDINT_RQ_START), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_RANGE_EXCL_START, self::ST_OPENEDINT_RQ_START), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_SUBQUERY_START, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_SUBQUERY_END, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_AND_LEXEME, self::ST_COMMON_QUERY_ELEMENT), 
array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_OR_LEXEME, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_NOT_LEXEME, self::ST_COMMON_QUERY_ELEMENT), array(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_NUMBER, self::ST_COMMON_QUERY_ELEMENT) )); $this->addRules( array(array(self::ST_CLOSEDINT_RQ_START, QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_FIRST_TERM), array(self::ST_CLOSEDINT_RQ_FIRST_TERM, QueryToken::TT_TO_LEXEME, self::ST_CLOSEDINT_RQ_TO_TERM), array(self::ST_CLOSEDINT_RQ_TO_TERM, QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_LAST_TERM), array(self::ST_CLOSEDINT_RQ_LAST_TERM, QueryToken::TT_RANGE_INCL_END, self::ST_COMMON_QUERY_ELEMENT) )); $this->addRules( array(array(self::ST_OPENEDINT_RQ_START, QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_FIRST_TERM), array(self::ST_OPENEDINT_RQ_FIRST_TERM, QueryToken::TT_TO_LEXEME, self::ST_OPENEDINT_RQ_TO_TERM), array(self::ST_OPENEDINT_RQ_TO_TERM, QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_LAST_TERM), array(self::ST_OPENEDINT_RQ_LAST_TERM, QueryToken::TT_RANGE_EXCL_END, self::ST_COMMON_QUERY_ELEMENT) )); $addTermEntryAction = new Lucene\FSMAction($this, 'addTermEntry'); $addPhraseEntryAction = new Lucene\FSMAction($this, 'addPhraseEntry'); $setFieldAction = new Lucene\FSMAction($this, 'setField'); $setSignAction = new Lucene\FSMAction($this, 'setSign'); $setFuzzyProxAction = new Lucene\FSMAction($this, 'processFuzzyProximityModifier'); $processModifierParameterAction = new Lucene\FSMAction($this, 'processModifierParameter'); $subqueryStartAction = new Lucene\FSMAction($this, 'subqueryStart'); $subqueryEndAction = new Lucene\FSMAction($this, 'subqueryEnd'); $logicalOperatorAction = new Lucene\FSMAction($this, 'logicalOperator'); $openedRQFirstTermAction = new Lucene\FSMAction($this, 'openedRQFirstTerm'); $openedRQLastTermAction = new Lucene\FSMAction($this, 'openedRQLastTerm'); $closedRQFirstTermAction = new Lucene\FSMAction($this, 'closedRQFirstTerm'); $closedRQLastTermAction = new 
Lucene\FSMAction($this, 'closedRQLastTerm'); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_WORD, $addTermEntryAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_PHRASE, $addPhraseEntryAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_FIELD, $setFieldAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_REQUIRED, $setSignAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_PROHIBITED, $setSignAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_FUZZY_PROX_MARK, $setFuzzyProxAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_NUMBER, $processModifierParameterAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_SUBQUERY_START, $subqueryStartAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_SUBQUERY_END, $subqueryEndAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_AND_LEXEME, $logicalOperatorAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_OR_LEXEME, $logicalOperatorAction); $this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, QueryToken::TT_NOT_LEXEME, $logicalOperatorAction); $this->addEntryAction(self::ST_OPENEDINT_RQ_FIRST_TERM, $openedRQFirstTermAction); $this->addEntryAction(self::ST_OPENEDINT_RQ_LAST_TERM, $openedRQLastTermAction); $this->addEntryAction(self::ST_CLOSEDINT_RQ_FIRST_TERM, $closedRQFirstTermAction); $this->addEntryAction(self::ST_CLOSEDINT_RQ_LAST_TERM, $closedRQLastTermAction); $this->_lexer = new QueryLexer(); } /** * Get query parser instance * * @return \ZendSearch\Lucene\Search\QueryParser */ private static function _getInstance() { if (self::$_instance === null) { self::$_instance = new self(); } return self::$_instance; } /** * Set query string default encoding * * @param string $encoding */ public static function setDefaultEncoding($encoding) { 
self::_getInstance()->_defaultEncoding = $encoding;
    }

    /**
     * Get query string default encoding
     *
     * @return string
     */
    public static function getDefaultEncoding()
    {
        return self::_getInstance()->_defaultEncoding;
    }

    /**
     * Set default boolean operator
     *
     * @param integer $operator
     */
    public static function setDefaultOperator($operator)
    {
        self::_getInstance()->_defaultOperator = $operator;
    }

    /**
     * Get default boolean operator
     *
     * @return integer
     */
    public static function getDefaultOperator()
    {
        return self::_getInstance()->_defaultOperator;
    }

    /**
     * Turn on 'suppress query parser exceptions' mode.
     */
    public static function suppressQueryParsingExceptions()
    {
        self::_getInstance()->_suppressQueryParsingExceptions = true;
    }

    /**
     * Turn off 'suppress query parser exceptions' mode.
     */
    public static function dontSuppressQueryParsingExceptions()
    {
        self::_getInstance()->_suppressQueryParsingExceptions = false;
    }

    /**
     * Check 'suppress query parser exceptions' mode.
     *
     * @return boolean
     */
    public static function queryParsingExceptionsSuppressed()
    {
        return self::_getInstance()->_suppressQueryParsingExceptions;
    }

    /**
     * Escape keyword to force it to be parsed as one term
     *
     * Every byte of the keyword is prefixed with a backslash. An empty keyword
     * is returned unchanged instead of producing a dangling escape character.
     *
     * NOTE(review): the split is byte-wise (str_split); presumably keywords in a
     * multi-byte encoding need a character-wise split - verify against the lexer.
     *
     * @param string $keyword
     * @return string
     */
    public static function escape($keyword)
    {
        $keyword = (string) $keyword;
        if ($keyword === '') {
            return '';
        }

        return '\\' . implode('\\', str_split($keyword));
    }

    /**
     * Parses a query string
     *
     * Syntax errors (QueryParserException) are handled in one place: when
     * 'suppress query parser exceptions' mode is on, the query string is
     * tokenized by the default analyzer and returned as a multi-term query;
     * otherwise the error is re-thrown as a RuntimeException carrying the
     * same message and code.
     *
     * @param string $strQuery
     * @param string $encoding  Query string encoding; the default encoding is used when null
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public static function parse($strQuery, $encoding = null)
    {
        $parser = self::_getInstance();

        // Reset FSM if previous parse operation didn't return it into a correct state
        $parser->reset();

        try {
            $parser->_encoding     = ($encoding !== null) ? $encoding : $parser->_defaultEncoding;
            $parser->_lastToken    = null;
            $parser->_context      = new QueryParserContext($parser->_encoding);
            $parser->_contextStack = array();
            $parser->_tokens       = $parser->_lexer->tokenize($strQuery, $parser->_encoding);

            // Empty query
            if (count($parser->_tokens) == 0) {
                return new Query\Insignificant();
            }

            foreach ($parser->_tokens as $token) {
                try {
                    $parser->_currentToken = $token;
                    $parser->process($token->type);

                    $parser->_lastToken = $token;
                } catch (QueryParserException $e) {
                    // Syntax error reported by one of the parser actions: pass it to the
                    // outer handler unchanged so that suppress mode applies to it as well
                    throw $e;
                } catch (\Exception $e) {
                    if (strpos($e->getMessage(), 'There is no any rule for') !== false) {
                        throw new QueryParserException( 'Syntax error at char position ' . $token->position . '.', 0, $e);
                    }

                    throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
                }
            }

            if (count($parser->_contextStack) != 0) {
                throw new QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.' );
            }

            return $parser->_context->getQuery();
        } catch (QueryParserException $e) {
            if ($parser->_suppressQueryParsingExceptions) {
                // Fall back to a plain multi-term query built from the analyzed query string
                $queryTokens = Analyzer\Analyzer::getDefault()->tokenize($strQuery, $parser->_encoding);

                $query = new Query\MultiTerm();
                $termsSign = ($parser->_defaultOperator == self::B_AND) ? true /* required term */ :
                                                                          null /* optional term */;

                foreach ($queryTokens as $token) {
                    $query->addTerm(new Index\Term($token->getTermText()), $termsSign);
                }

                return $query;
            } else {
                throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
            }
        }
    }

    /*********************************************************************
     * Actions implementation
     *
     * Actions affect on recognized lexemes list
     *********************************************************************/

    /**
     * Add term to a query
     */
    public function addTermEntry()
    {
        $entry = new QueryEntry\Term($this->_currentToken->text, $this->_context->getField());
        $this->_context->addEntry($entry);
    }

    /**
     * Add phrase to a query
     */
    public function addPhraseEntry()
    {
        $entry = new QueryEntry\Phrase($this->_currentToken->text, $this->_context->getField());
        $this->_context->addEntry($entry);
    }

    /**
     * Set entry field
     */
    public function setField()
    {
        $this->_context->setNextEntryField($this->_currentToken->text);
    }

    /**
     * Set entry sign
     */
    public function setSign()
    {
        $this->_context->setNextEntrySign($this->_currentToken->type);
    }

    /**
     * Process fuzzy search/proximity modifier - '~'
     */
    public function processFuzzyProximityModifier()
    {
        $this->_context->processFuzzyProximityModifier();
    }

    /**
     * Process modifier parameter
     *
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    public function processModifierParameter()
    {
        if ($this->_lastToken === null) {
            throw new QueryParserException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
        }

        switch ($this->_lastToken->type) {
            case QueryToken::TT_FUZZY_PROX_MARK:
                $this->_context->processFuzzyProximityModifier($this->_currentToken->text);
                break;

            case QueryToken::TT_BOOSTING_MARK:
                $this->_context->boost($this->_currentToken->text);
                break;

            default:
                // It's not a user input exception
                throw new RuntimeException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.'
);
        }
    }

    /**
     * Start subquery
     *
     * Saves the current context on the stack and opens a nested context
     * which inherits the current field.
     */
    public function subqueryStart()
    {
        $this->_contextStack[] = $this->_context;
        $this->_context        = new QueryParserContext($this->_encoding, $this->_context->getField());
    }

    /**
     * End subquery
     *
     * Restores the enclosing context and adds the finished subquery to it.
     *
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function subqueryEnd()
    {
        if (count($this->_contextStack) == 0) {
            throw new QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing. Char position ' . $this->_currentToken->position . '.' );
        }

        $query          = $this->_context->getQuery();
        $this->_context = array_pop($this->_contextStack);

        $this->_context->addEntry(new QueryEntry\Subquery($query));
    }

    /**
     * Process logical operator
     */
    public function logicalOperator()
    {
        $this->_context->addLogicalOperator($this->_currentToken->type);
    }

    /**
     * Process first range query term (opened interval)
     */
    public function openedRQFirstTerm()
    {
        $this->_rqFirstTerm = $this->_currentToken->text;
    }

    /**
     * Process last range query term (opened interval)
     *
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function openedRQLastTerm()
    {
        $this->_addRangeQueryEntry(false);
    }

    /**
     * Process first range query term (closed interval)
     */
    public function closedRQFirstTerm()
    {
        $this->_rqFirstTerm = $this->_currentToken->text;
    }

    /**
     * Process last range query term (closed interval)
     *
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function closedRQLastTerm()
    {
        $this->_addRangeQueryEntry(true);
    }

    /**
     * Convert a range boundary text into an index term
     *
     * The text is run through the default analyzer using the current encoding.
     *
     * @param string $text
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if the text yields more than one token
     * @return \ZendSearch\Lucene\Index\Term|null  null if the text yields no tokens
     */
    private function _rangeBoundaryTerm($text)
    {
        $tokens = Analyzer\Analyzer::getDefault()->tokenize($text, $this->_encoding);

        if (count($tokens) > 1) {
            throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
        }

        if (count($tokens) == 1) {
            return new Index\Term(reset($tokens)->getTermText(), $this->_context->getField());
        }

        return null;
    }

    /**
     * Build a range query from the remembered first term and the current token
     * and add it to the current context as a subquery entry
     *
     * @param boolean $inclusive  Passed to Query\Range as its third argument
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    private function _addRangeQueryEntry($inclusive)
    {
        // Lower boundary is analyzed first so that errors are reported in query order
        $from = $this->_rangeBoundaryTerm($this->_rqFirstTerm);
        $to   = $this->_rangeBoundaryTerm($this->_currentToken->text);

        if ($from === null  &&  $to === null) {
            throw new QueryParserException('At least one range query boundary term must be non-empty term');
        }

        $rangeQuery = new Query\Range($from, $to, $inclusive);
        $entry      = new QueryEntry\Subquery($rangeQuery);
        $this->_context->addEntry($entry);
    }
}
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryParserContext.php000066400000000000000000000253401245775125600302640ustar00rootroot00000000000000_encoding = $encoding;
        $this->_defaultField = $defaultField;
    }

    /**
     * Get context default field
     *
     * @return string|null
     */
    public function getField()
    {
        return ($this->_nextEntryField !== null) ?
$this->_nextEntryField : $this->_defaultField;
    }

    /**
     * Remember the field to be used by the next added entry
     *
     * @param string $field
     */
    public function setNextEntryField($field)
    {
        $this->_nextEntryField = $field;
    }

    /**
     * Remember the sign to be used by the next added entry
     *
     * Switches the context into 'signs' mode.
     *
     * @param integer $sign
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     * @throws \ZendSearch\Lucene\Exception\UnexpectedValueException
     */
    public function setNextEntrySign($sign)
    {
        if ($this->_mode === self::GM_BOOLEAN) {
            throw new QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
        }

        $this->_mode = self::GM_SIGNS;

        switch ($sign) {
            case QueryToken::TT_REQUIRED:
                $this->_nextEntrySign = true;
                break;

            case QueryToken::TT_PROHIBITED:
                $this->_nextEntrySign = false;
                break;

            default:
                throw new UnexpectedValueException('Unrecognized sign type.');
        }
    }

    /**
     * Append an entry to the context
     *
     * The pending sign is recorded alongside the entry unless the context is
     * in boolean mode; pending field and sign are cleared afterwards.
     *
     * @param \ZendSearch\Lucene\Search\QueryEntry\AbstractQueryEntry $entry
     */
    public function addEntry(QueryEntry\AbstractQueryEntry $entry)
    {
        $signsAreTracked = ($this->_mode !== self::GM_BOOLEAN);
        if ($signsAreTracked) {
            $this->_signs[] = $this->_nextEntrySign;
        }

        $this->_entries[] = $entry;

        $this->_nextEntrySign  = null;
        $this->_nextEntryField = null;
    }

    /**
     * Apply fuzzy search or proximity search modifier to the last entry
     *
     * @param mixed $parameter  Optional modifier parameter, forwarded to the entry
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        // A pending field or sign means the modifier did not come right after a word or phrase
        $hasPendingPrefix = ($this->_nextEntryField !== null || $this->_nextEntrySign !== null);
        if ($hasPendingPrefix) {
            throw new QueryParserException('\'~\' modifier must follow word or phrase.');
        }

        $target = array_pop($this->_entries);

        if (!($target instanceof QueryEntry\AbstractQueryEntry)) {
            // there are no entries or last entry is boolean operator
            throw new QueryParserException('\'~\' modifier must follow word or phrase.');
        }

        $target->processFuzzyProximityModifier($parameter);

        $this->_entries[] = $target;
    }

    /**
     * Apply boost factor to the last entry
     *
     * @param float $boostFactor
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function boost($boostFactor)
    {
        // A pending field or sign means the modifier did not come right after an entry
        $hasPendingPrefix = ($this->_nextEntryField !== null || $this->_nextEntrySign !== null);
        if ($hasPendingPrefix) {
            throw new QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
        }

        $target = array_pop($this->_entries);

        if (!($target instanceof QueryEntry\AbstractQueryEntry)) {
            // there are no entries or last entry is boolean operator
            throw new QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
        }

        $target->boost($boostFactor);

        $this->_entries[] = $target;
    }

    /**
     * Record a logical operator
     *
     * Switches the context into 'boolean' mode.
     *
     * @param integer $operator
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function addLogicalOperator($operator)
    {
        if ($this->_mode === self::GM_SIGNS) {
            throw new QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
        }

        $this->_mode      = self::GM_BOOLEAN;
        $this->_entries[] = $operator;
    }

    /**
     * Generate 'signs style' query from the context
     * '+term1 term2 -term3 +(<subquery1>) ...'
     *
     * Entries without an explicit sign get the sign implied by the
     * parser's default operator.
     *
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function _signStyleExpressionQuery()
    {
        $query = new Query\Boolean();

        // true - required, null - optional
        $defaultSign = (QueryParser::getDefaultOperator() == QueryParser::B_AND) ? true : null;

        foreach ($this->_entries as $entryId => $entry) {
            $sign = $this->_signs[$entryId];
            if ($sign === null) {
                $sign = $defaultSign;
            }

            $query->addSubquery($entry->getQuery($this->_encoding), $sign);
        }

        return $query;
    }

    /**
     * Generate 'boolean style' query from the context
     * 'term1 and term2 or term3 and (<subquery1>) and not (<subquery2>)'
     *
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private function _booleanExpressionQuery()
    {
        /**
         * We treat each level of an expression as a boolean expression in
         * a Disjunctive Normal Form
         *
         * AND operator has higher precedence than OR
         *
         * Thus logical query is a disjunction of one or more conjunctions of
         * one or more query entries
         */

        $expressionRecognizer = new BooleanExpressionRecognizer();

        try {
            foreach ($this->_entries as $entry) {
                if ($entry instanceof QueryEntry\AbstractQueryEntry) {
                    $expressionRecognizer->processLiteral($entry);
                    continue;
                }

                // Anything that is not a query entry is an operator token type
                switch ($entry) {
                    case QueryToken::TT_AND_LEXEME:
                        $expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_AND_OPERATOR);
                        break;

                    case QueryToken::TT_OR_LEXEME:
                        $expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_OR_OPERATOR);
                        break;

                    case QueryToken::TT_NOT_LEXEME:
                        $expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_NOT_OPERATOR);
                        break;

                    default:
                        throw new UnexpectedValueException('Boolean expression error. Unknown operator type.');
                }
            }

            $conjuctions = $expressionRecognizer->finishExpression();
        } catch (ExceptionInterface $e) {
            // It's query syntax error message and it should be user friendly. So FSM message is omitted
            throw new QueryParserException('Boolean expression error.', 0, $e);
        }

        $subqueries = array();
        foreach ($conjuctions as $conjuction) {
            // Skip 'only negative' conjunctions
            $nonNegativeEntryFound = false;
            foreach ($conjuction as $conjuctionEntry) {
                if ($conjuctionEntry[1]) {
                    $nonNegativeEntryFound = true;
                    break;
                }
            }
            if (!$nonNegativeEntryFound) {
                continue;
            }

            // A one term conjunction is used as is
            if (count($conjuction) == 1) {
                $subqueries[] = $conjuction[0][0]->getQuery($this->_encoding);
                continue;
            }

            $subquery = new Query\Boolean();
            foreach ($conjuction as $conjuctionEntry) {
                $subquery->addSubquery($conjuctionEntry[0]->getQuery($this->_encoding), $conjuctionEntry[1]);
            }
            $subqueries[] = $subquery;
        }

        $subqueryCount = count($subqueries);

        if ($subqueryCount == 0) {
            return new Query\Insignificant();
        }

        if ($subqueryCount == 1) {
            return $subqueries[0];
        }

        $query = new Query\Boolean();
        foreach ($subqueries as $subquery) {
            // Non-required entry/subquery
            $query->addSubquery($subquery);
        }

        return $query;
    }

    /**
     * Generate query from current context
     *
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function getQuery()
    {
        if ($this->_mode === self::GM_BOOLEAN) {
            return $this->_booleanExpressionQuery();
        }

        return $this->_signStyleExpressionQuery();
    }
}
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/QueryToken.php000066400000000000000000000154411245775125600265440ustar00rootroot00000000000000 or field:() pairs
    const TT_FIELD_INDICATOR  = 3;  // ':'
    const TT_REQUIRED         = 4;  // '+'
    const TT_PROHIBITED       = 5;  // '-'
    const TT_FUZZY_PROX_MARK  = 6;  // '~'
    const TT_BOOSTING_MARK    = 7;  // '^'
    const TT_RANGE_INCL_START = 8;  // '['
    const TT_RANGE_INCL_END   = 9;  // ']'
    const TT_RANGE_EXCL_START = 10; // '{'
    const TT_RANGE_EXCL_END   = 11; // '}'
    const TT_SUBQUERY_START   = 12; // '('
    const TT_SUBQUERY_END     = 13; // ')'
    const TT_AND_LEXEME       = 14; // 'AND' or 'and'
    const TT_OR_LEXEME        = 15;
// 'OR' or 'or'
    const TT_NOT_LEXEME       = 16; // 'NOT' or 'not'
    const TT_TO_LEXEME        = 17; // 'TO' or 'to'
    const TT_NUMBER           = 18; // Number, like: 10, 0.8, .64, ....

    /**
     * Returns all possible lexeme types.
     * It's used for syntax analyzer state machine initialization
     *
     * @return array
     */
    public static function getTypes()
    {
        return array( self::TT_WORD,
                      self::TT_PHRASE,
                      self::TT_FIELD,
                      self::TT_FIELD_INDICATOR,
                      self::TT_REQUIRED,
                      self::TT_PROHIBITED,
                      self::TT_FUZZY_PROX_MARK,
                      self::TT_BOOSTING_MARK,
                      self::TT_RANGE_INCL_START,
                      self::TT_RANGE_INCL_END,
                      self::TT_RANGE_EXCL_START,
                      self::TT_RANGE_EXCL_END,
                      self::TT_SUBQUERY_START,
                      self::TT_SUBQUERY_END,
                      self::TT_AND_LEXEME,
                      self::TT_OR_LEXEME,
                      self::TT_NOT_LEXEME,
                      self::TT_TO_LEXEME,
                      self::TT_NUMBER
                    );
    }

    /**
     * TokenCategories
     */
    const TC_WORD           = 0; // Word
    const TC_PHRASE         = 1; // Phrase (one or several quoted words)
    const TC_NUMBER         = 2; // Numbers, which are used with syntax elements. Ex. roam~0.8
    const TC_SYNTAX_ELEMENT = 3; // +  -  ( )  [ ]  { }  !  ||  && ~ ^

    /**
     * Token type (one of the TT_* constants).
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;

    /**
     * Token position within query (1-based).
     *
     * @var integer
     */
    public $position;

    /**
     * QueryToken constructor.
     *
     * Derives the token type (TT_*) from the token category (TC_*) and the
     * token text. Words 'and', 'or', 'not' and 'to' (in any letter case) are
     * recognized as the corresponding operator lexemes.
     *
     * @param integer $tokenCategory  One of the TC_* constants
     * @param string  $tokenText
     * @param integer $position       Zero-based position; stored incremented by one
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     */
    public function __construct($tokenCategory, $tokenText, $position)
    {
        $this->text     = $tokenText;
        $this->position = $position + 1; // Start from 1

        switch ($tokenCategory) {
            case self::TC_WORD:
                // Lower-case once and pick the keyword lexeme, if any
                switch (strtolower($tokenText)) {
                    case 'and':
                        $this->type = self::TT_AND_LEXEME;
                        break;
                    case 'or':
                        $this->type = self::TT_OR_LEXEME;
                        break;
                    case 'not':
                        $this->type = self::TT_NOT_LEXEME;
                        break;
                    case 'to':
                        $this->type = self::TT_TO_LEXEME;
                        break;
                    default:
                        $this->type = self::TT_WORD;
                }
                break;

            case self::TC_PHRASE:
                $this->type = self::TT_PHRASE;
                break;

            case self::TC_NUMBER:
                $this->type = self::TT_NUMBER;
                break;

            case self::TC_SYNTAX_ELEMENT:
                switch ($tokenText) {
                    case ':':
                        $this->type = self::TT_FIELD_INDICATOR;
                        break;

                    case '+':
                        $this->type = self::TT_REQUIRED;
                        break;

                    case '-':
                        $this->type = self::TT_PROHIBITED;
                        break;

                    case '~':
                        $this->type = self::TT_FUZZY_PROX_MARK;
                        break;

                    case '^':
                        $this->type = self::TT_BOOSTING_MARK;
                        break;

                    case '[':
                        $this->type = self::TT_RANGE_INCL_START;
                        break;

                    case ']':
                        $this->type = self::TT_RANGE_INCL_END;
                        break;

                    case '{':
                        $this->type = self::TT_RANGE_EXCL_START;
                        break;

                    case '}':
                        $this->type = self::TT_RANGE_EXCL_END;
                        break;

                    case '(':
                        $this->type = self::TT_SUBQUERY_START;
                        break;

                    case ')':
                        $this->type = self::TT_SUBQUERY_END;
                        break;

                    case '!':
                        $this->type = self::TT_NOT_LEXEME;
                        break;

                    case '&&':
                        $this->type = self::TT_AND_LEXEME;
                        break;

                    case '||':
                        $this->type = self::TT_OR_LEXEME;
                        break;

                    default:
                        throw new Lucene\Exception\InvalidArgumentException(
                            'Unrecognized query syntax lexeme: \'' . $tokenText . '\''
                        );
                }
                break;

            default:
                throw new Lucene\Exception\InvalidArgumentException(
                    'Unrecognized lexeme type: \'' . $tokenCategory .
'\'' ); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Similarity/000077500000000000000000000000001245775125600260465ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Similarity/AbstractSimilarity.php000066400000000000000000000571071245775125600324030ustar00rootroot00000000000000 0.0, 1 => 5.820766E-10, 2 => 6.9849193E-10, 3 => 8.1490725E-10, 4 => 9.313226E-10, 5 => 1.1641532E-9, 6 => 1.3969839E-9, 7 => 1.6298145E-9, 8 => 1.8626451E-9, 9 => 2.3283064E-9, 10 => 2.7939677E-9, 11 => 3.259629E-9, 12 => 3.7252903E-9, 13 => 4.656613E-9, 14 => 5.5879354E-9, 15 => 6.519258E-9, 16 => 7.4505806E-9, 17 => 9.313226E-9, 18 => 1.1175871E-8, 19 => 1.3038516E-8, 20 => 1.4901161E-8, 21 => 1.8626451E-8, 22 => 2.2351742E-8, 23 => 2.6077032E-8, 24 => 2.9802322E-8, 25 => 3.7252903E-8, 26 => 4.4703484E-8, 27 => 5.2154064E-8, 28 => 5.9604645E-8, 29 => 7.4505806E-8, 30 => 8.940697E-8, 31 => 1.0430813E-7, 32 => 1.1920929E-7, 33 => 1.4901161E-7, 34 => 1.7881393E-7, 35 => 2.0861626E-7, 36 => 2.3841858E-7, 37 => 2.9802322E-7, 38 => 3.5762787E-7, 39 => 4.172325E-7, 40 => 4.7683716E-7, 41 => 5.9604645E-7, 42 => 7.1525574E-7, 43 => 8.34465E-7, 44 => 9.536743E-7, 45 => 1.1920929E-6, 46 => 1.4305115E-6, 47 => 1.66893E-6, 48 => 1.9073486E-6, 49 => 2.3841858E-6, 50 => 2.861023E-6, 51 => 3.33786E-6, 52 => 3.8146973E-6, 53 => 4.7683716E-6, 54 => 5.722046E-6, 55 => 6.67572E-6, 56 => 7.6293945E-6, 57 => 9.536743E-6, 58 => 1.1444092E-5, 59 => 1.335144E-5, 60 => 1.5258789E-5, 61 => 1.9073486E-5, 62 => 2.2888184E-5, 63 => 2.670288E-5, 64 => 3.0517578E-5, 65 => 3.8146973E-5, 66 => 4.5776367E-5, 67 => 5.340576E-5, 68 => 6.1035156E-5, 69 => 7.6293945E-5, 70 => 9.1552734E-5, 71 => 1.0681152E-4, 72 => 1.2207031E-4, 73 => 1.5258789E-4, 74 => 1.8310547E-4, 75 => 2.1362305E-4, 76 => 2.4414062E-4, 77 => 3.0517578E-4, 78 => 3.6621094E-4, 79 => 4.272461E-4, 80 => 4.8828125E-4, 81 => 6.1035156E-4, 82 => 7.324219E-4, 83 => 8.544922E-4, 84 => 9.765625E-4, 85 => 
0.0012207031, 86 => 0.0014648438, 87 => 0.0017089844, 88 => 0.001953125, 89 => 0.0024414062, 90 => 0.0029296875, 91 => 0.0034179688, 92 => 0.00390625, 93 => 0.0048828125, 94 => 0.005859375, 95 => 0.0068359375, 96 => 0.0078125, 97 => 0.009765625, 98 => 0.01171875, 99 => 0.013671875, 100 => 0.015625, 101 => 0.01953125, 102 => 0.0234375, 103 => 0.02734375, 104 => 0.03125, 105 => 0.0390625, 106 => 0.046875, 107 => 0.0546875, 108 => 0.0625, 109 => 0.078125, 110 => 0.09375, 111 => 0.109375, 112 => 0.125, 113 => 0.15625, 114 => 0.1875, 115 => 0.21875, 116 => 0.25, 117 => 0.3125, 118 => 0.375, 119 => 0.4375, 120 => 0.5, 121 => 0.625, 122 => 0.75, 123 => 0.875, 124 => 1.0, 125 => 1.25, 126 => 1.5, 127 => 1.75, 128 => 2.0, 129 => 2.5, 130 => 3.0, 131 => 3.5, 132 => 4.0, 133 => 5.0, 134 => 6.0, 135 => 7.0, 136 => 8.0, 137 => 10.0, 138 => 12.0, 139 => 14.0, 140 => 16.0, 141 => 20.0, 142 => 24.0, 143 => 28.0, 144 => 32.0, 145 => 40.0, 146 => 48.0, 147 => 56.0, 148 => 64.0, 149 => 80.0, 150 => 96.0, 151 => 112.0, 152 => 128.0, 153 => 160.0, 154 => 192.0, 155 => 224.0, 156 => 256.0, 157 => 320.0, 158 => 384.0, 159 => 448.0, 160 => 512.0, 161 => 640.0, 162 => 768.0, 163 => 896.0, 164 => 1024.0, 165 => 1280.0, 166 => 1536.0, 167 => 1792.0, 168 => 2048.0, 169 => 2560.0, 170 => 3072.0, 171 => 3584.0, 172 => 4096.0, 173 => 5120.0, 174 => 6144.0, 175 => 7168.0, 176 => 8192.0, 177 => 10240.0, 178 => 12288.0, 179 => 14336.0, 180 => 16384.0, 181 => 20480.0, 182 => 24576.0, 183 => 28672.0, 184 => 32768.0, 185 => 40960.0, 186 => 49152.0, 187 => 57344.0, 188 => 65536.0, 189 => 81920.0, 190 => 98304.0, 191 => 114688.0, 192 => 131072.0, 193 => 163840.0, 194 => 196608.0, 195 => 229376.0, 196 => 262144.0, 197 => 327680.0, 198 => 393216.0, 199 => 458752.0, 200 => 524288.0, 201 => 655360.0, 202 => 786432.0, 203 => 917504.0, 204 => 1048576.0, 205 => 1310720.0, 206 => 1572864.0, 207 => 1835008.0, 208 => 2097152.0, 209 => 2621440.0, 210 => 3145728.0, 211 => 3670016.0, 212 => 4194304.0, 213 => 
5242880.0, 214 => 6291456.0, 215 => 7340032.0, 216 => 8388608.0, 217 => 1.048576E7, 218 => 1.2582912E7, 219 => 1.4680064E7, 220 => 1.6777216E7, 221 => 2.097152E7, 222 => 2.5165824E7, 223 => 2.9360128E7, 224 => 3.3554432E7, 225 => 4.194304E7, 226 => 5.0331648E7, 227 => 5.8720256E7, 228 => 6.7108864E7, 229 => 8.388608E7, 230 => 1.00663296E8, 231 => 1.17440512E8, 232 => 1.34217728E8, 233 => 1.6777216E8, 234 => 2.01326592E8, 235 => 2.34881024E8, 236 => 2.68435456E8, 237 => 3.3554432E8, 238 => 4.02653184E8, 239 => 4.69762048E8, 240 => 5.3687091E8, 241 => 6.7108864E8, 242 => 8.0530637E8, 243 => 9.395241E8, 244 => 1.07374182E9, 245 => 1.34217728E9, 246 => 1.61061274E9, 247 => 1.87904819E9, 248 => 2.14748365E9, 249 => 2.68435456E9, 250 => 3.22122547E9, 251 => 3.75809638E9, 252 => 4.2949673E9, 253 => 5.3687091E9, 254 => 6.4424509E9, 255 => 7.5161928E9 ); /** * Set the default Similarity implementation used by indexing and search * code. * * @param AbstractSimilarity $similarity */ public static function setDefault(AbstractSimilarity $similarity) { self::$_defaultImpl = $similarity; } /** * Return the default Similarity implementation used by indexing and search * code. * * @return AbstractSimilarity */ public static function getDefault() { if (!self::$_defaultImpl instanceof AbstractSimilarity) { self::$_defaultImpl = new DefaultSimilarity(); } return self::$_defaultImpl; } /** * Computes the normalization value for a field given the total number of * terms contained in a field. These values, together with field boosts, are * stored in an index and multipled into scores for hits on each field by the * search code. * * Matches in longer fields are less precise, so implemenations of this * method usually return smaller values when 'numTokens' is large, * and larger values when 'numTokens' is small. * * That these values are computed under * IndexWriter::addDocument(Document) and stored then using * encodeNorm(float). 
Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multiplied into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);

    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low eight bits of the argument are used as the table index.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }

    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from around 7x10^9 to 2x10^-9 with about one
     * significant decimal digit of accuracy.  Zero is also represented.
     * Negative numbers are rounded up to zero.  Values too large to represent
     * are rounded down to the largest representable value.  Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion
     *
     * Finds the index of the norm table entry closest to the given value.
     *
     * @param float $f
     * @return integer  Table index in the range 0..255
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // binary search for appropriate value (the norm table is in ascending order)
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid + 1;
            } else {
                return $mid; // We got it!
            }
        }

        // No exact match: $highIndex is the last entry below $f, round to closest value
        if ($highIndex != 255 &&
            $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
            return $highIndex + 1;
        }

        // Index 0 decodes to zero; a positive value must not collapse to it,
        // so it is rounded up to the smallest positive representable value
        return ($highIndex == 0) ? 1 : $highIndex;
    }

    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
*
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);

    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * A single term yields idfFreq() of its document frequency and the
     * reader's document count; an array of terms yields the sum of those
     * per-term factors.
     *
     * input - the term in question or array of terms
     * reader - reader the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float a score factor for the term
     */
    public function idf($input, \ZendSearch\Lucene\SearchIndexInterface $reader)
    {
        // Simple term: nothing to accumulate
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        }

        // List of terms: add up the factor of every term
        $total = 0.0;
        foreach ($input as $singleTerm) {
            $termFactor = $this->idfFreq($reader->docFreq($singleTerm), $reader->count());
            $total     += $termFactor;
        }

        return $total;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
* * The presence of a large portion of the query terms indicates a better * match with the query, so implemenations of this method usually return * larger values when the ratio between these parameters is large and smaller * values when the ratio between them is small. * * overlap - the number of query terms matched in the document * maxOverlap - the total number of terms in the query * Returns a score factor based on term overlap with the query * * @param integer $overlap * @param integer $maxOverlap * @return float */ abstract public function coord($overlap, $maxOverlap); } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Similarity/DefaultSimilarity.php000066400000000000000000000040331245775125600322120ustar00rootroot00000000000000createWeight(). * The sumOfSquaredWeights() method is then called on the top-level * query to compute the query normalization factor Similarity->queryNorm(float). * This factor is then passed to normalize(float). At this point the weighting * is complete. * * @category Zend * @package Zend_Search_Lucene * @subpackage Search */ abstract class AbstractWeight { /** * Normalization factor. * This value is stored only for query expanation purpose and not used in any other place * * @var float */ protected $_queryNorm; /** * AbstractWeight value * * AbstractWeight value may be initialized in sumOfSquaredWeights() or normalize() * because they both are invoked either in Query::_initWeight (for top-level query) or * in corresponding methods of parent query's weights * * @var float */ protected $_value; /** * The weight for this query. * * @return float */ public function getValue() { return $this->_value; } /** * The sum of squared weights of contained query clauses. * * @return float */ abstract public function sumOfSquaredWeights(); /** * Assigns the query normalization factor to this. 
* * @param $norm */ abstract public function normalize($norm); } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Weight/Boolean.php000066400000000000000000000055051245775125600272440ustar00rootroot00000000000000_query = $query; $this->_reader = $reader; $this->_weights = array(); $signs = $query->getSigns(); foreach ($query->getSubqueries() as $num => $subquery) { if ($signs === null || $signs[$num] === null || $signs[$num]) { $this->_weights[$num] = $subquery->createWeight($reader); } } } /** * The weight for this query * Standard Weight::$_value is not used for boolean queries * * @return float */ public function getValue() { return $this->_query->getBoost(); } /** * The sum of squared weights of contained query clauses. * * @return float */ public function sumOfSquaredWeights() { $sum = 0; foreach ($this->_weights as $weight) { // sum sub weights $sum += $weight->sumOfSquaredWeights(); } // boost each sub-weight $sum *= $this->_query->getBoost() * $this->_query->getBoost(); // check for empty query (like '-something -another') if ($sum == 0) { $sum = 1.0; } return $sum; } /** * Assigns the query normalization factor to this. 
* * @param float $queryNorm */ public function normalize($queryNorm) { // incorporate boost $queryNorm *= $this->_query->getBoost(); foreach ($this->_weights as $weight) { $weight->normalize($queryNorm); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Weight/EmptyResultWeight.php000066400000000000000000000015321245775125600313260ustar00rootroot00000000000000_query = $query; $this->_reader = $reader; $this->_weights = array(); $signs = $query->getSigns(); foreach ($query->getTerms() as $id => $term) { if ($signs === null || $signs[$id] === null || $signs[$id]) { $this->_weights[$id] = new Term($term, $query, $reader); $query->setWeight($id, $this->_weights[$id]); } } } /** * The weight for this query * Standard Weight::$_value is not used for boolean queries * * @return float */ public function getValue() { return $this->_query->getBoost(); } /** * The sum of squared weights of contained query clauses. * * @return float */ public function sumOfSquaredWeights() { $sum = 0; foreach ($this->_weights as $weight) { // sum sub weights $sum += $weight->sumOfSquaredWeights(); } // boost each sub-weight $sum *= $this->_query->getBoost() * $this->_query->getBoost(); // check for empty query (like '-something -another') if ($sum == 0) { $sum = 1.0; } return $sum; } /** * Assigns the query normalization factor to this. * * @param float $queryNorm */ public function normalize($queryNorm) { // incorporate boost $queryNorm *= $this->_query->getBoost(); foreach ($this->_weights as $weight) { $weight->normalize($queryNorm); } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Weight/Phrase.php000066400000000000000000000041051245775125600271020ustar00rootroot00000000000000_query = $query; $this->_reader = $reader; } /** * The sum of squared weights of contained query clauses. 
* * @return float */ public function sumOfSquaredWeights() { // compute idf $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); // compute query weight $this->_queryWeight = $this->_idf * $this->_query->getBoost(); // square it return $this->_queryWeight * $this->_queryWeight; } /** * Assigns the query normalization factor to this. * * @param float $queryNorm */ public function normalize($queryNorm) { $this->_queryNorm = $queryNorm; // normalize query weight $this->_queryWeight *= $queryNorm; // idf for documents $this->_value = $this->_queryWeight * $this->_idf; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Search/Weight/Term.php000066400000000000000000000050101245775125600265630ustar00rootroot00000000000000_term = $term; $this->_query = $query; $this->_reader = $reader; } /** * The sum of squared weights of contained query clauses. * * @return float */ public function sumOfSquaredWeights() { // compute idf $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader); // compute query weight $this->_queryWeight = $this->_idf * $this->_query->getBoost(); // square it return $this->_queryWeight * $this->_queryWeight; } /** * Assigns the query normalization factor to this. * * @param float $queryNorm */ public function normalize($queryNorm) { $this->_queryNorm = $queryNorm; // normalize query weight $this->_queryWeight *= $queryNorm; // idf for documents $this->_value = $this->_queryWeight * $this->_idf; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/SearchIndexInterface.php000066400000000000000000000230041245775125600272410ustar00rootroot00000000000000 10) are best for batch index creation, * and smaller values (< 10) for indices that are interactively maintained. * * @return integer */ public function getMergeFactor(); /** * Set index mergeFactor option * * mergeFactor determines how often segment indices are merged by addDocument(). 
* With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * @param integer $mergeFactor
     */
    public function setMergeFactor($mergeFactor);

    /**
     * Performs a query against the index and returns an array
     * of \ZendSearch\Lucene\Search\QueryHit objects.
     * Input is a string or a query object.
     *
     * @param mixed $query
     * @return \ZendSearch\Lucene\Search\QueryHit[]
     * @throws \ZendSearch\Lucene\Exception\ExceptionInterface
     */
    public function find($query);

    /**
     * Returns a list of all unique field names that exist in this index.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFieldNames($indexed = false);

    /**
     * Returns a \ZendSearch\Lucene\Document object for the document
     * number $id in this index.
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @return \ZendSearch\Lucene\Document
     */
    public function getDocument($id);

    /**
     * Returns true if the index contains documents with the specified term.
     *
     * Is used for query optimization.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return boolean
     */
    public function hasTerm(Index\Term $term);

    /**
     * Returns IDs of all the documents containing term.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termDocs(Index\Term $term, $docsFilter = null);

    /**
     * Returns documents filter for all documents containing term.
*
     * It performs the same operation as termDocs, but returns the result as
     * a \ZendSearch\Lucene\Index\DocsFilter object
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return \ZendSearch\Lucene\Index\DocsFilter
     */
    public function termDocsFilter(Index\Term $term, $docsFilter = null);

    /**
     * Returns an array of all term freqs.
     * Return array structure: array( docId => freq, ...)
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termFreqs(Index\Term $term, $docsFilter = null);

    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return array
     */
    public function termPositions(Index\Term $term, $docsFilter = null);

    /**
     * Returns the number of documents in this index containing the $term.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return integer
     */
    public function docFreq(Index\Term $term);

    /**
     * Retrieve similarity used by index reader
     *
     * @return \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity
     */
    public function getSimilarity();

    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id
     * @param string $fieldName
     * @return float
     */
    public function norm($id, $fieldName);

    /**
     * Returns true if any documents have been deleted from this index.
     *
     * @return boolean
     */
    public function hasDeletions();

    /**
     * Deletes a document from the index.
     * $id is an internal document id
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @throws \ZendSearch\Lucene\Exception\ExceptionInterface
     */
    public function delete($id);

    /**
     * Adds a document to this index.
* * @param \ZendSearch\Lucene\Document $document */ public function addDocument(Document $document); /** * Commit changes resulting from delete() or undeleteAll() operations. */ public function commit(); /** * Optimize index. * * Merges all segments into one */ public function optimize(); /** * Returns an array of all terms in this index. * * @return array */ public function terms(); /** * Undeletes all documents currently marked as deleted in this index. */ public function undeleteAll(); } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/000077500000000000000000000000001245775125600241175ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/Directory/000077500000000000000000000000001245775125600260635ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/Directory/DirectoryInterface.php000066400000000000000000000056151245775125600323700ustar00rootroot00000000000000 Zend_Search_Lucene_Storage_File object * * @var array * @throws \ZendSearch\Lucene\Exception\ExceptionInterface */ protected $_fileHandlers; /** * Default file permissions * * @var integer */ protected static $_defaultFilePermissions = 0666; /** * Get default file permissions * * @return integer */ public static function getDefaultFilePermissions() { return self::$_defaultFilePermissions; } /** * Set default file permissions * * @param integer $mode */ public static function setDefaultFilePermissions($mode) { self::$_defaultFilePermissions = $mode; } /** * Utility function to recursive directory creation * * @param string $dir * @param integer $mode * @param boolean $recursive * @return boolean */ public static function mkdirs($dir, $mode = 0777, $recursive = true) { if (($dir === null) || $dir === '') { return false; } if (is_dir($dir) || $dir === '/') { return true; } if (self::mkdirs(dirname($dir), $mode, $recursive)) { return mkdir($dir, $mode); } return false; } /** * Object constructor * Checks if $path is a 
directory or tries to create it.
     *
     * @param string $path
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     */
    public function __construct($path)
    {
        if (!is_dir($path)) {
            if (file_exists($path)) {
                throw new Lucene\Exception\InvalidArgumentException(
                    'Path exists, but it\'s not a directory'
                );
            } else {
                if (!self::mkdirs($path)) {
                    throw new Lucene\Exception\InvalidArgumentException(
                        "Can't create directory '$path'."
                    );
                }
            }
        }
        $this->_dirPath      = $path;
        $this->_fileHandlers = array();
    }

    /**
     * Closes the store.
     *
     * Closes every cached file object and empties the handler cache.
     *
     * @return void
     */
    public function close()
    {
        foreach ($this->_fileHandlers as $fileObject) {
            $fileObject->close();
        }

        $this->_fileHandlers = array();
    }

    /**
     * Returns an array of strings, one for each file in the directory.
     *
     * Sub-directories and the '.' / '..' entries are left out.
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the directory cannot be opened
     * @return array
     */
    public function fileList()
    {
        $dirContent = opendir($this->_dirPath);
        if ($dirContent === false) {
            // Without this guard a failed opendir() hands false to readdir()
            // and closedir(), which is an error of its own.
            throw new Lucene\Exception\RuntimeException(
                "Can't read directory '" . $this->_dirPath . "'."
            );
        }

        $result = array();
        while (($file = readdir($dirContent)) !== false) {
            if (($file == '..') || ($file == '.')) {
                continue;
            }

            if (!is_dir($this->_dirPath . '/' . $file)) {
                $result[] = $file;
            }
        }
        closedir($dirContent);

        return $result;
    }

    /**
     * Creates a new, empty file in the directory with the given $filename.
     *
     * A previously cached handler for the same name is closed first, and the
     * new file object is cached in its place.
     *
     * @param string $filename
     * @return \ZendSearch\Lucene\Storage\File\FileInterface
     */
    public function createFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
        $this->_fileHandlers[$filename] = new File\Filesystem($this->_dirPath . '/' . $filename, 'w+b');

        // Set file permissions, but don't care about any possible failures, since file may be already
        // created by another user which has to care about right permissions
        ErrorHandler::start(E_WARNING);
        chmod($this->_dirPath . '/' . $filename, self::$_defaultFilePermissions);
        ErrorHandler::stop();

        return $this->_fileHandlers[$filename];
    }

    /**
     * Removes an existing $filename in the directory.
*
     * @param string $filename
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @return void
     */
    public function deleteFile($filename)
    {
        // Release a cached handler first so the file is not held open
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);

        $fullPath = $this->_dirPath . '/' . $filename;

        if (!@unlink($fullPath)) {
            // $php_errormsg / track_errors no longer exist in current PHP;
            // error_get_last() still reports warnings silenced with '@'.
            $lastError = error_get_last();
            if ($lastError !== null && strpos($lastError['message'], 'unlink(') === 0) {
                $reason = $lastError['message'];
            } else {
                // the last recorded error is not ours (or none was recorded)
                $reason = 'unlink(' . $fullPath . ') failed';
            }

            throw new Lucene\Exception\RuntimeException('Can\'t delete file: ' . $reason);
        }
    }

    /**
     * Purge file if it's cached by directory object
     *
     * Method is used to prevent 'too many open files' error
     *
     * @param string $filename
     * @return void
     */
    public function purgeFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
    }

    /**
     * Returns true if a file with the given $filename exists.
     *
     * A file counts as existing when a handler for it is cached or when it is
     * present on disk.
     *
     * @param string $filename
     * @return boolean
     */
    public function fileExists($filename)
    {
        return isset($this->_fileHandlers[$filename]) ||
               file_exists($this->_dirPath . '/' . $filename);
    }

    /**
     * Returns the length of a $filename in the directory.
     *
     * The cached file object is asked for its size when there is one;
     * otherwise the size is taken from the filesystem.
     *
     * @param string $filename
     * @return integer
     */
    public function fileLength($filename)
    {
        if (isset( $this->_fileHandlers[$filename] )) {
            return $this->_fileHandlers[$filename]->size();
        }
        return filesize($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns the UNIX timestamp $filename was last modified.
     *
     * @param string $filename
     * @return integer
     */
    public function fileModified($filename)
    {
        return filemtime($this->_dirPath .'/'. $filename);
    }

    /**
     * Renames an existing file in the directory.
*
     * @param string $from
     * @param string $to
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @return boolean true on success (failures are reported by exception)
     */
    public function renameFile($from, $to)
    {
        // Cached handlers for either name would point at the wrong file
        // afterwards, so close and forget both.
        foreach (array($from, $to) as $name) {
            if (isset($this->_fileHandlers[$name])) {
                $this->_fileHandlers[$name]->close();
            }
            unset($this->_fileHandlers[$name]);
        }

        $target = $this->_dirPath . '/' . $to;

        if (file_exists($target)) {
            if (!unlink($target)) {
                throw new Lucene\Exception\RuntimeException(
                    'Delete operation failed'
                );
            }
        }

        // The warning raised by a failing rename() is captured by ErrorHandler,
        // so the failure reason has to be taken from what stop() hands back
        // ($php_errormsg is not populated in that situation).
        ErrorHandler::start(E_WARNING);
        $success = rename($this->_dirPath . '/' . $from, $target);
        $error   = ErrorHandler::stop();

        if (!$success) {
            throw new Lucene\Exception\RuntimeException(
                ($error instanceof \Exception) ? $error->getMessage() : 'Rename operation failed'
            );
        }

        return $success;
    }

    /**
     * Sets the modified time of $filename to now.
     *
     * @param string $filename
     * @return boolean result of touch()
     */
    public function touchFile($filename)
    {
        return touch($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
     *
     * If $shareHandler option is true, then file handler can be shared between File Object
     * requests. It speed-ups performance, but makes problems with file position.
     * Shared handler are good for short atomic requests.
     * Non-shared handlers are useful for stream file reading (especial for compound files).
     *
     * @param string $filename
     * @param boolean $shareHandler
     * @return \ZendSearch\Lucene\Storage\File\FileInterface
     */
    public function getFileObject($filename, $shareHandler = true)
    {
        $fullFilename = $this->_dirPath . '/' .
$filename; if (!$shareHandler) { return new File\Filesystem($fullFilename); } if (isset( $this->_fileHandlers[$filename] )) { $this->_fileHandlers[$filename]->seek(0); return $this->_fileHandlers[$filename]; } $this->_fileHandlers[$filename] = new File\Filesystem($fullFilename); return $this->_fileHandlers[$filename]; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/File/000077500000000000000000000000001245775125600247765ustar00rootroot00000000000000ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/File/AbstractFile.php000066400000000000000000000272121245775125600300560ustar00rootroot00000000000000_fread(1)); } /** * Writes a byte to the end of the file. * * @param integer $byte */ public function writeByte($byte) { return $this->_fwrite(chr($byte), 1); } /** * Read num bytes from the current position in the file * and advances the file pointer. * * @param integer $num * @return string */ public function readBytes($num) { return $this->_fread($num); } /** * Writes num bytes of data (all, if $num===null) to the end * of the string. * * @param string $data * @param integer $num */ public function writeBytes($data, $num=null) { $this->_fwrite($data, $num); } /** * Reads an integer from the current position in the file * and advances the file pointer. * * @return integer */ public function readInt() { $str = $this->_fread(4); return ord($str[0]) << 24 | ord($str[1]) << 16 | ord($str[2]) << 8 | ord($str[3]); } /** * Writes an integer to the end of file. * * @param integer $value */ public function writeInt($value) { settype($value, 'integer'); $this->_fwrite( chr($value>>24 & 0xFF) . chr($value>>16 & 0xFF) . chr($value>>8 & 0xFF) . chr($value & 0xFF), 4 ); } /** * Returns a long integer from the current position in the file * and advances the file pointer. * * @return integer|float */ public function readLong() { /** * Check, that we work in 64-bit mode. * fseek() uses long for offset. 
Thus, largest index segment file size in 32bit mode is 2Gb
         */
        if (PHP_INT_SIZE > 4) {
            $str = $this->_fread(8);

            // big-endian: first byte read is the most significant one
            return  ord($str[0]) << 56  |
                    ord($str[1]) << 48  |
                    ord($str[2]) << 40  |
                    ord($str[3]) << 32  |
                    ord($str[4]) << 24  |
                    ord($str[5]) << 16  |
                    ord($str[6]) << 8   |
                    ord($str[7]);
        } else {
            return $this->_readLong32Bit();
        }
    }

    /**
     * Writes long integer to the end of file
     *
     * With 64-bit integers the value is written as eight bytes, most
     * significant byte first; otherwise the work is handed to
     * _writeLong32Bit().
     *
     * @param integer $value
     */
    public function writeLong($value)
    {
        /**
         * Check, that we work in 64-bit mode.
         * fseek() and ftell() use long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
         */
        if (PHP_INT_SIZE > 4) {
            settype($value, 'integer');
            $this->_fwrite( chr($value>>56 & 0xFF) .
                            chr($value>>48 & 0xFF) .
                            chr($value>>40 & 0xFF) .
                            chr($value>>32 & 0xFF) .
                            chr($value>>24 & 0xFF) .
                            chr($value>>16 & 0xFF) .
                            chr($value>>8  & 0xFF) .
                            chr($value     & 0xFF),   8  );
        } else {
            $this->_writeLong32Bit($value);
        }
    }

    /**
     * Returns a long integer from the current position in the file,
     * advances the file pointer and return it as float (for 32-bit platforms).
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     * @return integer|float
     */
    protected function _readLong32Bit()
    {
        $wordHigh = $this->readInt();
        $wordLow  = $this->readInt();

        if ($wordHigh & (int)0x80000000) {
            // It's a negative value since the highest bit is set
            if ($wordHigh == (int)0xFFFFFFFF  &&  ($wordLow & (int)0x80000000)) {
                return $wordLow;
            } else {
                throw new Lucene\Exception\RuntimeException(
                    'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.'
                );
            }
        }

        if ($wordLow < 0) {
            // Value is large than 0x7FFF FFFF. Represent low word as float.
$wordLow &= 0x7FFFFFFF;
            $wordLow += (float)0x80000000;
        }

        if ($wordHigh == 0) {
            // Return value as integer if possible
            return $wordLow;
        }

        return $wordHigh*(float)0x100000000/* 0x00000001 00000000 */ + $wordLow;
    }

    /**
     * Writes long integer to the end of file (32-bit platforms implementation)
     *
     * The value is split into a high and a low 32-bit word, each written with
     * writeInt(), high word first. Negative values get 0xFFFFFFFF as the
     * high word.
     *
     * @param integer|float $value
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    protected function _writeLong32Bit($value)
    {
        if ($value < (int)0x80000000) {
            throw new Lucene\Exception\RuntimeException(
                'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.'
            );
        }

        if ($value < 0) {
            $wordHigh = (int)0xFFFFFFFF;
            $wordLow  = (int)$value;
        } else {
            $wordHigh = (int)($value/(float)0x100000000/* 0x00000001 00000000 */);
            $wordLow  = $value - $wordHigh*(float)0x100000000/* 0x00000001 00000000 */;

            if ($wordLow > 0x7FFFFFFF) {
                // Highest bit of low word is set. Translate it to the corresponding negative integer value
                $wordLow -= 0x80000000;
                $wordLow |= 0x80000000;
            }
        }

        $this->writeInt($wordHigh);
        $this->writeInt($wordLow);
    }

    /**
     * Returns a variable-length integer from the current
     * position in the file and advances the file pointer.
     *
     * Every byte contributes its low seven bits, least significant group
     * first; a set high bit means that another byte follows.
     *
     * @return integer
     */
    public function readVInt()
    {
        $nextByte = ord($this->_fread(1));
        $val = $nextByte & 0x7F;

        for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) {
            $nextByte = ord($this->_fread(1));
            $val |= ($nextByte & 0x7F) << $shift;
        }
        return $val;
    }

    /**
     * Writes a variable-length integer to the end of file.
     *
     * Emits seven bits per byte, least significant group first, setting the
     * high bit on every byte except the last one.
     *
     * @param integer $value
     */
    public function writeVInt($value)
    {
        settype($value, 'integer');
        while ($value > 0x7F) {
            $this->_fwrite(chr( ($value & 0x7F)|0x80 ));
            $value >>= 7;
        }
        $this->_fwrite(chr($value));
    }

    /**
     * Reads a string from the current position in the file
     * and advances the file pointer.
*
     * @return string
     */
    public function readString()
    {
        $strlen = $this->readVInt();
        if ($strlen == 0) {
            return '';
        }

        /**
         * This implementation supports only Basic Multilingual Plane
         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
         * "supplementary characters" (characters whose code points are
         * greater than 0xFFFF)
         * Java 2 represents these characters as a pair of char (16-bit)
         * values, the first from the high-surrogates range (0xD800-0xDBFF),
         * the second from the low-surrogates range (0xDC00-0xDFFF). Then
         * they are encoded as usual UTF-8 characters in six bytes.
         * Standard UTF-8 representation uses four bytes for supplementary
         * characters.
         */
        // The stored length counts characters, so read that many bytes first
        // and fetch the continuation bytes as lead bytes are discovered.
        $str_val = $this->_fread($strlen);

        for ($count = 0; $count < $strlen; $count++) {
            $lead = ord($str_val[$count]);
            if (($lead & 0xC0) != 0xC0) {
                continue; // single-byte character
            }

            $addBytes = 1;
            if ($lead & 0x20) {
                $addBytes++;

                // Never used. Java2 doesn't encode strings in four bytes
                if ($lead & 0x10) {
                    $addBytes++;
                }
            }
            $str_val .= $this->_fread($addBytes);
            $strlen  += $addBytes;

            // Check for null character. Java2 encodes null character
            // in two bytes.
            if ($lead == 0xC0 && ord($str_val[$count + 1]) == 0x80) {
                // Collapse the pair into one real NUL byte. The string is one
                // byte shorter now, so the bound shrinks and the cursor stays
                // on the NUL (the loop increment moves past it).
                $str_val = substr($str_val, 0, $count) . "\x00" . substr($str_val, $count + 2);
                $strlen--;
            } else {
                $count += $addBytes;
            }
        }

        return $str_val;
    }

    /**
     * Writes a string to the end of file.
     *
     * The character count is written as a VInt, followed by the string bytes.
     * NUL bytes are written as the two-byte sequence \xC0\x80.
     *
     * @param string $str
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     */
    public function writeString($str)
    {
        /**
         * This implementation supports only Basic Multilingual Plane
         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
         * "supplementary characters" (characters whose code points are
         * greater than 0xFFFF)
         * Java 2 represents these characters as a pair of char (16-bit)
         * values, the first from the high-surrogates range (0xD800-0xDBFF),
         * the second from the low-surrogates range (0xDC00-0xDFFF). Then
         * they are encoded as usual UTF-8 characters in six bytes.
         * Standard UTF-8 representation uses four bytes for supplementary
         * characters.
         */

        // convert input to a string before iterating string characters
        settype($str, 'string');

        $chars = $strlen = strlen($str);
        $containNullChars = false;

        for ($count = 0; $count < $strlen; $count++) {
            /**
             * String is already in Java 2 representation.
             * We should only calculate actual string length and replace
             * \x00 by \xC0\x80
             */
            $byte = ord($str[$count]);

            if ($byte == 0) {
                // A NUL is never a lead byte, so it has to be detected here
                // and not inside the multi-byte branch.
                $containNullChars = true;
            } elseif (($byte & 0xC0) == 0xC0) {
                $addBytes = 1;
                if ($byte & 0x20) {
                    $addBytes++;

                    // Never used. Java2 doesn't encode strings in four bytes
                    // and we don't support non-BMP characters
                    if ($byte & 0x10) {
                        $addBytes++;
                    }
                }
                // continuation bytes belong to the same character
                $chars -= $addBytes;
                $count += $addBytes;
            }
        }

        if ($chars < 0) {
            throw new Lucene\Exception\InvalidArgumentException('Invalid UTF-8 string');
        }

        $this->writeVInt($chars);
        if ($containNullChars) {
            // str_replace() takes (search, replace, subject)
            $this->_fwrite(str_replace("\x00", "\xC0\x80", $str));
        } else {
            $this->_fwrite($str);
        }
    }

    /**
     * Reads binary data from the current position in the file
     * and advances the file pointer.
     *
     * The data is preceded by its length, stored as a VInt.
     *
     * @return string
     */
    public function readBinary()
    {
        return $this->_fread($this->readVInt());
    }
}
ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/File/FileInterface.php000066400000000000000000000077311245775125600302170ustar00rootroot00000000000000_fileHandle = @fopen($filename, $mode); if ($this->_fileHandle === false) { ini_set('track_errors', $trackErrors); throw new Lucene\Exception\RuntimeException($php_errormsg); } ini_set('track_errors', $trackErrors); } /** * Sets the file position indicator and advances the file pointer. * The new position, measured in bytes from the beginning of the file, * is obtained by adding offset to the position specified by whence, * whose values are defined as follows: * SEEK_SET - Set position equal to offset bytes. * SEEK_CUR - Set position to current location plus offset. * SEEK_END - Set position to end-of-file plus offset. 
(To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * SEEK_CUR is the only supported offset type for compound files
     *
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        return fseek($this->_fileHandle, $offset, $whence);
    }

    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return ftell($this->_fileHandle);
    }

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    public function flush()
    {
        return fflush($this->_fileHandle);
    }

    /**
     * Close File object
     *
     * Does nothing when the handle has already been released. Warnings raised
     * by fclose() are captured and discarded.
     */
    public function close()
    {
        if ($this->_fileHandle !== null ) {
            ErrorHandler::start(E_WARNING);
            fclose($this->_fileHandle);
            ErrorHandler::stop();
            $this->_fileHandle = null;
        }
    }

    /**
     * Get the size of the already opened file
     *
     * Seeks to the end to measure the file and then restores the previous
     * position.
     *
     * @return integer
     */
    public function size()
    {
        $position = ftell($this->_fileHandle);
        fseek($this->_fileHandle, 0, SEEK_END);
        $size = ftell($this->_fileHandle);
        fseek($this->_fileHandle,$position);

        return $size;
    }

    /**
     * Read a $length bytes from the file and advance the file pointer.
     *
     * Requests below 1024 bytes are served by a single fread() call. Larger
     * requests are read in a loop until $length bytes have arrived or the
     * stream reports an error or end of file, so the result may be shorter
     * than requested.
     *
     * @param integer $length
     * @return string
     */
    protected function _fread($length=1)
    {
        if ($length == 0) {
            return '';
        }

        if ($length < 1024) {
            return fread($this->_fileHandle, $length);
        }

        $data = '';
        while ($length > 0) {
            $nextBlock = fread($this->_fileHandle, $length);

            // Strict checks on purpose: a chunk consisting of the single
            // character "0" is loosely equal to false but is valid data.
            if ($nextBlock === false || $nextBlock === '') {
                break; // read error or end of file
            }

            $data   .= $nextBlock;
            $length -= strlen($nextBlock);
        }

        return $data;
    }

    /**
     * Writes $length number of bytes (all, if $length===null) to the end
     * of the file.
* * @param string $data * @param integer $length */ protected function _fwrite($data, $length=null) { if ($length === null ) { fwrite($this->_fileHandle, $data); } else { fwrite($this->_fileHandle, $data, $length); } } /** * Lock file * * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock) * * @param integer $lockType * @param boolean $nonBlockingLock * @return boolean */ public function lock($lockType, $nonBlockingLock = false) { if ($nonBlockingLock) { return flock($this->_fileHandle, $lockType | LOCK_NB); } else { return flock($this->_fileHandle, $lockType); } } /** * Unlock file * * Returns true on success * * @return boolean */ public function unlock() { if ($this->_fileHandle !== null ) { return flock($this->_fileHandle, LOCK_UN); } else { return true; } } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/Storage/File/Memory.php000066400000000000000000000407751245775125600267740ustar00rootroot00000000000000_data = $data; } /** * Reads $length number of bytes at the current position in the * file and advances the file pointer. * * @param integer $length * @return string */ protected function _fread($length = 1) { $returnValue = substr($this->_data, $this->_position, $length); $this->_position += $length; return $returnValue; } /** * Sets the file position indicator and advances the file pointer. * The new position, measured in bytes from the beginning of the file, * is obtained by adding offset to the position specified by whence, * whose values are defined as follows: * SEEK_SET - Set position equal to offset bytes. * SEEK_CUR - Set position to current location plus offset. * SEEK_END - Set position to end-of-file plus offset. (To move to * a position before the end-of-file, you need to pass a negative value * in offset.) 
* Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        switch ($whence) {
            case SEEK_SET:
                $this->_position = $offset;
                break;

            case SEEK_CUR:
                $this->_position += $offset;
                break;

            case SEEK_END:
                $this->_position  = strlen($this->_data);
                $this->_position += $offset;
                break;

            default:
                // Unknown $whence: the position is left untouched and the
                // failure is reported as documented.
                return -1;
        }

        return 0;
    }

    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return $this->_position;
    }

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    public function flush()
    {
        // Do nothing

        return true;
    }

    /**
     * Writes $length number of bytes (all, if $length===null) to the end
     * of the file.
     *
     * The data is always appended and the position is moved to the new end.
     *
     * @param string $data
     * @param integer $length
     */
    protected function _fwrite($data, $length=null)
    {
        // We do not need to check if file position points to the end of "file".
        // Only append operation is supported now

        if ($length !== null) {
            $this->_data .= substr($data, 0, $length);
        } else {
            $this->_data .= $data;
        }

        $this->_position = strlen($this->_data);
    }

    /**
     * Lock file
     *
     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
     *
     * @param integer $lockType
     * @param boolean $nonBlockinLock accepted but not used
     * @return boolean
     */
    public function lock($lockType, $nonBlockinLock = false)
    {
        // Memory files can't be shared
        // do nothing

        return true;
    }

    /**
     * Unlock file
     *
     * Returns true, like the filesystem implementation does on success.
     *
     * @return boolean
     */
    public function unlock()
    {
        // Memory files can't be shared
        // do nothing

        return true;
    }

    /**
     * Reads a byte from the current position in the file
     * and advances the file pointer.
     *
     * @return integer
     */
    public function readByte()
    {
        return ord($this->_data[$this->_position++]);
    }

    /**
     * Writes a byte to the end of the file.
     *
     * @param integer $byte
     */
    public function writeByte($byte)
    {
        // We do not need to check if file position points to the end of "file".
// Only append operation is supported now $this->_data .= chr($byte); $this->_position = strlen($this->_data); return 1; } /** * Read num bytes from the current position in the file * and advances the file pointer. * * @param integer $num * @return string */ public function readBytes($num) { $returnValue = substr($this->_data, $this->_position, $num); $this->_position += $num; return $returnValue; } /** * Writes num bytes of data (all, if $num===null) to the end * of the string. * * @param string $data * @param integer $num */ public function writeBytes($data, $num=null) { // We do not need to check if file position points to the end of "file". // Only append operation is supported now if ($num !== null) { $this->_data .= substr($data, 0, $num); } else { $this->_data .= $data; } $this->_position = strlen($this->_data); } /** * Reads an integer from the current position in the file * and advances the file pointer. * * @return integer */ public function readInt() { $str = substr($this->_data, $this->_position, 4); $this->_position += 4; return ord($str[0]) << 24 | ord($str[1]) << 16 | ord($str[2]) << 8 | ord($str[3]); } /** * Writes an integer to the end of file. * * @param integer $value */ public function writeInt($value) { // We do not need to check if file position points to the end of "file". // Only append operation is supported now settype($value, 'integer'); $this->_data .= chr($value>>24 & 0xFF) . chr($value>>16 & 0xFF) . chr($value>>8 & 0xFF) . chr($value & 0xFF); $this->_position = strlen($this->_data); } /** * Returns a long integer from the current position in the file * and advances the file pointer. * * @return integer */ public function readLong() { /** * Check, that we work in 64-bit mode. * fseek() uses long for offset. 
Thus, largest index segment file size in 32bit mode is 2Gb */ if (PHP_INT_SIZE > 4) { $str = substr($this->_data, $this->_position, 8); $this->_position += 8; return ord($str[0]) << 56 | ord($str[1]) << 48 | ord($str[2]) << 40 | ord($str[3]) << 32 | ord($str[4]) << 24 | ord($str[5]) << 16 | ord($str[6]) << 8 | ord($str[7]); } else { return $this->_readLong32Bit(); } } /** * Writes long integer to the end of file * * @param integer $value */ public function writeLong($value) { // We do not need to check if file position points to the end of "file". // Only append operation is supported now /** * Check, that we work in 64-bit mode. * fseek() and ftell() use long for offset. Thus, largest index segment file size in 32bit mode is 2Gb */ if (PHP_INT_SIZE > 4) { settype($value, 'integer'); $this->_data .= chr($value>>56 & 0xFF) . chr($value>>48 & 0xFF) . chr($value>>40 & 0xFF) . chr($value>>32 & 0xFF) . chr($value>>24 & 0xFF) . chr($value>>16 & 0xFF) . chr($value>>8 & 0xFF) . chr($value & 0xFF); } else { $this->_writeLong32Bit($value); } $this->_position = strlen($this->_data); } /** * Returns a long integer from the current position in the file, * advances the file pointer and return it as float (for 32-bit platforms). * * @throws \ZendSearch\Lucene\Exception\RuntimeException * @return integer|float */ protected function _readLong32Bit() { $wordHigh = $this->readInt(); $wordLow = $this->readInt(); if ($wordHigh & (int)0x80000000) { // It's a negative value since the highest bit is set if ($wordHigh == (int)0xFFFFFFFF && ($wordLow & (int)0x80000000)) { return $wordLow; } else { throw new Lucene\Exception\RuntimeException( 'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.' ); } } if ($wordLow < 0) { // Value is large than 0x7FFF FFFF. Represent low word as float. 
$wordLow &= 0x7FFFFFFF; $wordLow += (float)0x80000000; } if ($wordHigh == 0) { // Return value as integer if possible return $wordLow; } return $wordHigh*(float)0x100000000/* 0x00000001 00000000 */ + $wordLow; } /** * Writes long integer to the end of file (32-bit platforms implementation) * * @param integer|float $value * @throws \ZendSearch\Lucene\Exception\RuntimeException */ protected function _writeLong32Bit($value) { if ($value < (int)0x80000000) { throw new Lucene\Exception\RuntimeException( 'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.' ); } if ($value < 0) { $wordHigh = (int)0xFFFFFFFF; $wordLow = (int)$value; } else { $wordHigh = (int)($value/(float)0x100000000/* 0x00000001 00000000 */); $wordLow = $value - $wordHigh*(float)0x100000000/* 0x00000001 00000000 */; if ($wordLow > 0x7FFFFFFF) { // Highest bit of low word is set. Translate it to the corresponding negative integer value $wordLow -= 0x80000000; $wordLow |= 0x80000000; } } $this->writeInt($wordHigh); $this->writeInt($wordLow); } /** * Returns a variable-length integer from the current * position in the file and advances the file pointer. * * @return integer */ public function readVInt() { $nextByte = ord($this->_data[$this->_position++]); $val = $nextByte & 0x7F; for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) { $nextByte = ord($this->_data[$this->_position++]); $val |= ($nextByte & 0x7F) << $shift; } return $val; } /** * Writes a variable-length integer to the end of file. * * @param integer $value */ public function writeVInt($value) { // We do not need to check if file position points to the end of "file". // Only append operation is supported now settype($value, 'integer'); while ($value > 0x7F) { $this->_data .= chr( ($value & 0x7F)|0x80 ); $value >>= 7; } $this->_data .= chr($value); $this->_position = strlen($this->_data); } /** * Reads a string from the current position in the file * and advances the file pointer. 
*
     * The stored length prefix (a VInt) counts characters, not bytes, so
     * continuation bytes of multi-byte sequences are fetched on demand.
     * The two-byte sequence 0xC0 0x80 is decoded to a single NUL byte.
     *
     * @return string
     */
    public function readString()
    {
        $strlen = $this->readVInt();
        if ($strlen == 0) {
            return '';
        }

        /**
         * This implementation supports only Basic Multilingual Plane
         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
         * "supplementary characters" (characters whose code points are
         * greater than 0xFFFF)
         * Java 2 represents these characters as a pair of char (16-bit)
         * values, the first from the high-surrogates range (0xD800-0xDBFF),
         * the second from the low-surrogates range (0xDC00-0xDFFF). Then
         * they are encoded as usual UTF-8 characters in six bytes.
         * Standard UTF-8 representation uses four bytes for supplementary
         * characters.
         */
        $str_val = substr($this->_data, $this->_position, $strlen);
        $this->_position += $strlen;

        for ($count = 0; $count < $strlen; $count++) {
            $leadByte = ord($str_val[$count]);

            if (($leadByte & 0xC0) == 0xC0) {
                $addBytes = 1;
                if ($leadByte & 0x20) {
                    $addBytes++;

                    // Never used. Java2 doesn't encode strings in four bytes
                    if ($leadByte & 0x10) {
                        $addBytes++;
                    }
                }

                // Pull in the continuation bytes of this character and skip over them
                $str_val .= substr($this->_data, $this->_position, $addBytes);
                $this->_position += $addBytes;
                $strlen += $addBytes;
                $count += $addBytes;
            }
        }

        // Java2 encodes the null character in two bytes (0xC0 0x80). Decode it
        // only after the scan so that offsets stay valid while iterating.
        // 0xC0 is never a continuation byte, so the pair cannot straddle two
        // characters in well-formed input.
        return str_replace("\xC0\x80", "\x00", $str_val);
    }

    /**
     * Writes a string to the end of file.
*
     * The string is prefixed with its character count as a VInt. NUL bytes
     * are written as the two-byte sequence 0xC0 0x80.
     *
     * @param string $str
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     */
    public function writeString($str)
    {
        /**
         * This implementation supports only Basic Multilingual Plane
         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
         * "supplementary characters" (characters whose code points are
         * greater than 0xFFFF)
         * Java 2 represents these characters as a pair of char (16-bit)
         * values, the first from the high-surrogates range (0xD800-0xDBFF),
         * the second from the low-surrogates range (0xDC00-0xDFFF). Then
         * they are encoded as usual UTF-8 characters in six bytes.
         * Standard UTF-8 representation uses four bytes for supplementary
         * characters.
         */

        // We do not need to check if file position points to the end of "file".
        // Only append operation is supported now

        // convert input to a string before iterating string characters
        settype($str, 'string');

        $chars = $strlen = strlen($str);

        for ($count = 0; $count < $strlen; $count++) {
            /**
             * String is already in Java 2 representation.
             * We should only calculate actual string length and replace
             * \x00 by \xC0\x80
             */
            $leadByte = ord($str[$count]);

            if (($leadByte & 0xC0) == 0xC0) {
                $addBytes = 1;
                if ($leadByte & 0x20) {
                    $addBytes++;

                    // Never used. Java2 doesn't encode strings in four bytes
                    // and we don't support non-BMP characters
                    if ($leadByte & 0x10) {
                        $addBytes++;
                    }
                }
                $chars -= $addBytes;
                $count += $addBytes;
            }
        }

        if ($chars < 0) {
            throw new Lucene\Exception\InvalidArgumentException('Invalid UTF-8 string');
        }

        $this->writeVInt($chars);

        // A NUL byte is a single character, so $chars above is already correct;
        // only its on-disk representation differs. str_replace() takes
        // (search, replace, subject) and returns $str unchanged when it
        // contains no NUL bytes.
        $this->_data .= str_replace("\x00", "\xC0\x80", $str);

        $this->_position = strlen($this->_data);
    }

    /**
     * Reads binary data from the current position in the file
     * and advances the file pointer.
* * The data is prefixed with its byte length stored as a VInt. * * @return string */ public function readBinary() { $length = $this->readVInt(); $returnValue = substr($this->_data, $this->_position, $length); $this->_position += $length; return $returnValue; } } ZendSearch-release-2.0.0rc6/library/ZendSearch/Lucene/TermStreamsPriorityQueue.php000066400000000000000000000077721245775125600302560ustar00rootroot00000000000000_termStreams = $termStreams; $this->resetTermsStream(); } /** * Reset terms stream. * * Rebuilds the priority queue from every member stream that still has a * current term after being reset, then advances to the first term. */ public function resetTermsStream() { $this->_termsStreamQueue = new Index\TermsPriorityQueue(); foreach ($this->_termStreams as $termStream) { $termStream->resetTermsStream(); // Skip "empty" containers if ($termStream->currentTerm() !== null) { $this->_termsStreamQueue->put($termStream); } } $this->nextTerm(); } /** * Skip terms stream up to specified term prefix. * * Prefix contains fully specified field info and portion of searched term. * Every queued stream is skipped individually and re-queued only if it * still has a current term. * * @param \ZendSearch\Lucene\Index\Term $prefix */ public function skipTo(Index\Term $prefix) { $termStreams = array(); while (($termStream = $this->_termsStreamQueue->pop()) !== null) { $termStreams[] = $termStream; } foreach ($termStreams as $termStream) { $termStream->skipTo($prefix); if ($termStream->currentTerm() !== null) { $this->_termsStreamQueue->put($termStream); } } $this->nextTerm(); } /** * Scans term streams and returns next term * * Streams whose current term key equals the key at the top of the queue * are advanced without returning, so each distinct key is returned once. * Returns null when the queue is exhausted. * * @return \ZendSearch\Lucene\Index\Term|null */ public function nextTerm() { while (($termStream = $this->_termsStreamQueue->pop()) !== null) { if ($this->_termsStreamQueue->top() === null || $this->_termsStreamQueue->top()->currentTerm()->key() != $termStream->currentTerm()->key()) { // We got new term $this->_lastTerm = $termStream->currentTerm(); if ($termStream->nextTerm() !== null) { // Put segment back into the priority queue $this->_termsStreamQueue->put($termStream); } return $this->_lastTerm; } if ($termStream->nextTerm() !== null) { // Put segment back into the priority queue $this->_termsStreamQueue->put($termStream); } } // End of 
stream $this->_lastTerm = null; return null; } /** * Returns term in current position * * @return \ZendSearch\Lucene\Index\Term|null */ public function currentTerm() { return $this->_lastTerm; } /** * Close terms stream * * Closes every stream still in the queue, then drops the queue and the * last term. Should be used for resources clean up if stream is not read * up to the end */ public function closeTermsStream() { while (($termStream = $this->_termsStreamQueue->pop()) !== null) { $termStream->closeTermsStream(); } $this->_termsStreamQueue = null; $this->_lastTerm = null; } }