This commit is contained in:
cutemeli
2025-12-22 10:35:30 +00:00
parent 0bfc6c8425
commit 5ce7ca2c5d
38927 changed files with 0 additions and 4594700 deletions

View File

@@ -1,27 +0,0 @@
Copyright (c) 2005-2012, Zend Technologies USA, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Zend Technologies USA, Inc. nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -1,15 +0,0 @@
# ZendSearch component
> ## UNMAINTAINED
>
> This package is no longer maintained.
You can install using:
```
curl -s https://getcomposer.org/installer | php
php composer.phar install
```
At that point, follow the instructions in the documentation folder for actual
usage of the component. (Documentation is forthcoming.)

View File

@@ -1,22 +0,0 @@
{
"name": "plesk/zendsearch",
"description": "a general purpose text search engine written entirely in PHP 5",
"type": "library",
"license": "BSD-3-Clause",
"keywords": [
"zf2",
"lucene"
],
"autoload": {
"psr-0": {
"ZendSearch": "library/"
}
},
"require": {
"php": ">=7.1",
"laminas/laminas-stdlib": "^3.2.1"
},
"require-dev": {
"phpunit/phpunit": "^8"
}
}

View File

@@ -1,19 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Exception;
/**
* Empty interface declared in the ZendSearch\Exception namespace; it defines
* no methods or constants.
*
* NOTE(review): presumably implemented by the package's exception classes so
* that callers can catch any of them through this single type - confirm
* against the implementing classes, which are not visible here.
*
* @category Zend
* @package Zend_Search
*/
interface ExceptionInterface
{}

View File

@@ -1,423 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
/**
* Abstract Finite State Machine
*
* See the Wikipedia description of a state machine: http://en.wikipedia.org/wiki/Finite_state_machine
*
* Any type of transducer (Moore machine or Mealy machine) may also be implemented using this abstract FSM.
* The process() method invokes the registered actions, which may construct the FSM output.
* Actions may also be used to signal that the Accept State has been reached.
*
* @category Zend
* @package Zend_Search_Lucene
*/
abstract class AbstractFSM
{
    /**
     * Known machine states, keyed by the state value itself.
     *
     * [state] => state
     *
     * The first element added is treated as the start state.
     *
     * @var array
     */
    private $_states = [];

    /**
     * Current state; null until the first state has been added.
     *
     * @var integer|string|null
     */
    private $_currentState = null;

    /**
     * Input alphabet, keyed by the symbol itself.
     *
     * [symbol] => symbol
     *
     * @var array
     */
    private $_inputAphabet = [];

    /**
     * State transition table
     *
     * [sourceState][input] => targetState
     *
     * @var array
     */
    private $_rules = [];

    /**
     * Entry actions.
     * Run by process() after the machine has moved into a different state.
     *
     * [state] => list of actions
     *
     * @var array
     */
    private $_entryActions = [];

    /**
     * Exit actions.
     * Run by process() before the machine leaves a state for a different one.
     *
     * [state] => list of actions
     *
     * @var array
     */
    private $_exitActions = [];

    /**
     * Input actions.
     * Run by process() when the given input is processed in the given state.
     *
     * [state][input] => list of actions
     *
     * @var array
     */
    private $_inputActions = [];

    /**
     * Transition actions.
     * Run by process() when the machine moves from state1 to state2.
     *
     * [state1][state2] => list of actions
     *
     * @var array
     */
    private $_transitionActions = [];

    /**
     * Finite State machine constructor
     *
     * $states is an array of integers or strings with a list of possible machine states.
     * The constructor treats the first list element as the start state (assigns it to
     * $_currentState). It may be reassigned by a setState() call.
     * The states list may be empty and can be extended later by addState() or addStates() calls.
     *
     * $inputAphabet is the same as $states, but represents the input alphabet;
     * it may also be extended later by addInputSymbols() or addInputSymbol() calls.
     *
     * $rules parameter describes FSM transitions and has the structure:
     * array( array(sourceState, input, targetState[, inputAction]),
     *        array(sourceState, input, targetState[, inputAction]),
     *        array(sourceState, input, targetState[, inputAction]),
     *        ...
     *      )
     * Rules can also be added later by addRules() and addRule() calls.
     *
     * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(),
     * addInputAction() and addTransitionAction() calls.
     *
     * @param array $states
     * @param array $inputAphabet
     * @param array $rules
     */
    public function __construct($states = array(), $inputAphabet = array(), $rules = array())
    {
        $this->addStates($states);
        $this->addInputSymbols($inputAphabet);
        $this->addRules($rules);
    }

    /**
     * Add states to the state machine
     *
     * @param array $states
     */
    public function addStates($states)
    {
        foreach ($states as $state) {
            $this->addState($state);
        }
    }

    /**
     * Add state to the state machine.
     * The very first state added also becomes the current state.
     *
     * @param integer|string $state
     */
    public function addState($state)
    {
        $this->_states[$state] = $state;

        if ($this->_currentState === null) {
            $this->_currentState = $state;
        }
    }

    /**
     * Set FSM state.
     * No action is invoked.
     *
     * @param integer|string $state
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the state is unknown
     */
    public function setState($state)
    {
        if (!isset($this->_states[$state])) {
            throw new Exception\InvalidArgumentException('State \'' . $state . '\' is not on of the possible FSM states.');
        }

        $this->_currentState = $state;
    }

    /**
     * Get FSM state.
     *
     * @return integer|string|null current state, or null if no state has been added yet
     */
    public function getState()
    {
        return $this->_currentState;
    }

    /**
     * Add symbols to the input alphabet
     *
     * @param array $inputAphabet
     */
    public function addInputSymbols($inputAphabet)
    {
        foreach ($inputAphabet as $inputSymbol) {
            $this->addInputSymbol($inputSymbol);
        }
    }

    /**
     * Add symbol to the input alphabet
     *
     * @param integer|string $inputSymbol
     */
    public function addInputSymbol($inputSymbol)
    {
        $this->_inputAphabet[$inputSymbol] = $inputSymbol;
    }

    /**
     * Add transition rules
     *
     * array structure:
     * array( array(sourceState, input, targetState[, inputAction]),
     *        array(sourceState, input, targetState[, inputAction]),
     *        array(sourceState, input, targetState[, inputAction]),
     *        ...
     *      )
     *
     * @param array $rules
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    public function addRules($rules)
    {
        foreach ($rules as $rule) {
            // The fourth element (input action) is optional.
            $this->addRule($rule[0], $rule[1], $rule[2], $rule[3] ?? null);
        }
    }

    /**
     * Add a transition rule: in $sourceState, $input moves the machine to $targetState.
     *
     * @param integer|string $sourceState
     * @param integer|string $input
     * @param integer|string $targetState
     * @param \ZendSearch\Lucene\FSMAction|null $inputAction optional action registered for the {state, input} pair
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a state or the input symbol is unknown
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the {state, input} pair already has a rule
     */
    public function addRule($sourceState, $input, $targetState, $inputAction = null)
    {
        if (!isset($this->_states[$sourceState])) {
            throw new Exception\InvalidArgumentException('Undefined source state (' . $sourceState . ').');
        }
        if (!isset($this->_states[$targetState])) {
            throw new Exception\InvalidArgumentException('Undefined target state (' . $targetState . ').');
        }
        if (!isset($this->_inputAphabet[$input])) {
            throw new Exception\InvalidArgumentException('Undefined input symbol (' . $input . ').');
        }

        if (!isset($this->_rules[$sourceState])) {
            $this->_rules[$sourceState] = [];
        }
        if (isset($this->_rules[$sourceState][$input])) {
            throw new Exception\RuntimeException('Rule for {state,input} pair (' . $sourceState . ', '. $input . ') is already defined.');
        }

        $this->_rules[$sourceState][$input] = $targetState;

        if ($inputAction !== null) {
            $this->addInputAction($sourceState, $input, $inputAction);
        }
    }

    /**
     * Add state entry action.
     * Several entry actions are allowed.
     * Action execution order is defined by the order of addEntryAction() calls.
     *
     * @param integer|string $state
     * @param \ZendSearch\Lucene\FSMAction $action
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the state is unknown
     */
    public function addEntryAction($state, FSMAction $action)
    {
        if (!isset($this->_states[$state])) {
            throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').');
        }

        if (!isset($this->_entryActions[$state])) {
            $this->_entryActions[$state] = [];
        }

        $this->_entryActions[$state][] = $action;
    }

    /**
     * Add state exit action.
     * Several exit actions are allowed.
     * Action execution order is defined by the order of addExitAction() calls.
     *
     * @param integer|string $state
     * @param \ZendSearch\Lucene\FSMAction $action
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the state is unknown
     */
    public function addExitAction($state, FSMAction $action)
    {
        if (!isset($this->_states[$state])) {
            throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').');
        }

        if (!isset($this->_exitActions[$state])) {
            $this->_exitActions[$state] = [];
        }

        $this->_exitActions[$state][] = $action;
    }

    /**
     * Add input action (defined by {state, input} pair).
     * Several input actions are allowed.
     * Action execution order is defined by the order of addInputAction() calls.
     *
     * @param integer|string $state
     * @param integer|string $inputSymbol
     * @param \ZendSearch\Lucene\FSMAction $action
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the state or symbol is unknown
     */
    public function addInputAction($state, $inputSymbol, FSMAction $action)
    {
        if (!isset($this->_states[$state])) {
            throw new Exception\InvalidArgumentException('Undefined state (' . $state. ').');
        }
        if (!isset($this->_inputAphabet[$inputSymbol])) {
            throw new Exception\InvalidArgumentException('Undefined input symbol (' . $inputSymbol. ').');
        }

        if (!isset($this->_inputActions[$state])) {
            $this->_inputActions[$state] = [];
        }
        if (!isset($this->_inputActions[$state][$inputSymbol])) {
            $this->_inputActions[$state][$inputSymbol] = [];
        }

        $this->_inputActions[$state][$inputSymbol][] = $action;
    }

    /**
     * Add transition action (defined by {source state, target state} pair).
     * Several transition actions are allowed.
     * Action execution order is defined by the order of addTransitionAction() calls.
     *
     * @param integer|string $sourceState
     * @param integer|string $targetState
     * @param \ZendSearch\Lucene\FSMAction $action
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if either state is unknown
     */
    public function addTransitionAction($sourceState, $targetState, FSMAction $action)
    {
        if (!isset($this->_states[$sourceState])) {
            throw new Exception\InvalidArgumentException('Undefined source state (' . $sourceState. ').');
        }
        if (!isset($this->_states[$targetState])) {
            // Report the state that is actually missing (was mislabelled "source").
            throw new Exception\InvalidArgumentException('Undefined target state (' . $targetState. ').');
        }

        if (!isset($this->_transitionActions[$sourceState])) {
            $this->_transitionActions[$sourceState] = [];
        }
        if (!isset($this->_transitionActions[$sourceState][$targetState])) {
            $this->_transitionActions[$sourceState][$targetState] = [];
        }

        $this->_transitionActions[$sourceState][$targetState][] = $action;
    }

    /**
     * Process an input.
     *
     * Actions run in this order: exit actions of the source state (only when the
     * state changes), input actions for {source state, input}, then the current
     * state is switched, then transition actions for {source, target}, and finally
     * entry actions of the target state (only when the state changes).
     *
     * @param mixed $input
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the current state has no rules at all
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if there is no rule for the input in the current state
     */
    public function process($input)
    {
        if (!isset($this->_rules[$this->_currentState])) {
            throw new Exception\RuntimeException('There is no any rule for current state (' . $this->_currentState . ').');
        }
        if (!isset($this->_rules[$this->_currentState][$input])) {
            throw new Exception\InvalidArgumentException('There is no any rule for {current state, input} pair (' . $this->_currentState . ', ' . $input . ').');
        }

        $sourceState = $this->_currentState;
        $targetState = $this->_rules[$this->_currentState][$input];
        $stateChanges = ($sourceState != $targetState);

        if ($stateChanges && isset($this->_exitActions[$sourceState])) {
            foreach ($this->_exitActions[$sourceState] as $action) {
                $action->doAction();
            }
        }

        if (isset($this->_inputActions[$sourceState][$input])) {
            foreach ($this->_inputActions[$sourceState][$input] as $action) {
                $action->doAction();
            }
        }

        $this->_currentState = $targetState;

        if (isset($this->_transitionActions[$sourceState][$targetState])) {
            foreach ($this->_transitionActions[$sourceState][$targetState] as $action) {
                $action->doAction();
            }
        }

        if ($stateChanges && isset($this->_entryActions[$targetState])) {
            foreach ($this->_entryActions[$targetState] as $action) {
                $action->doAction();
            }
        }
    }

    /**
     * Return the machine to its start state (the first state that was added).
     * No action is invoked.
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if no state has been defined
     */
    public function reset()
    {
        if (count($this->_states) == 0) {
            throw new Exception\RuntimeException('There is no any state defined for FSM.');
        }

        // $_states is keyed by the state value, so index 0 only exists when some
        // state happens to be 0. The start state is the first element of the array.
        $this->_currentState = \reset($this->_states);
    }
}

View File

@@ -1,158 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
/**
* Abstract Priority Queue
*
* It implements a priority queue.
* Please go to "Data Structures and Algorithms",
* Aho, Hopcroft, and Ullman, Addison-Wesley, 1983 (corrected 1987 edition),
* for implementation details.
*
* It provides O(log(N)) time of put/pop operations, where N is a size of queue
*
* @category Zend
* @package Zend_Search_Lucene
*/
abstract class AbstractPriorityQueue
{
    /**
     * Binary heap stored in a flat, zero-based array.
     *
     * Slot 0 is the root; the children of slot n live in slots 2*n + 1 and
     * 2*n + 2. Ordering between a parent and its children is decided by _less().
     *
     * @var array
     */
    private $_heap = array();

    /**
     * Insert an element.
     *
     * The new element starts in the first free slot and parents are shifted
     * down for as long as the element is "less" than them.
     *
     * @param mixed $element
     */
    public function put($element)
    {
        $slot = count($this->_heap);

        while ($slot !== 0) {
            $parent = ($slot - 1) >> 1; // floor(($slot - 1) / 2)

            if (!$this->_less($element, $this->_heap[$parent])) {
                break;
            }

            // Parent moves one level down, the free slot moves one level up.
            $this->_heap[$slot] = $this->_heap[$parent];
            $slot = $parent;
        }

        $this->_heap[$slot] = $element;
    }

    /**
     * Peek at the least element without removing it.
     *
     * @return mixed the root element, or null when the queue is empty
     */
    public function top()
    {
        return count($this->_heap) == 0 ? null : $this->_heap[0];
    }

    /**
     * Remove and return the least element.
     *
     * The last element of the heap is re-inserted from the root: the smaller
     * child is pulled up while it is "less" than that last element.
     *
     * @return mixed the removed root element, or null when the queue is empty
     */
    public function pop()
    {
        $size = count($this->_heap);
        if ($size == 0) {
            return null;
        }

        $least = $this->_heap[0];
        $tail  = $size - 1;

        $hole  = 0; // free slot, starts at the root
        $child = 1; // candidate child to pull up

        // Prefer the second child of the root when it is the smaller one.
        if ($tail > 2 && $this->_less($this->_heap[2], $this->_heap[1])) {
            $child = 2;
        }

        while ($child < $tail) {
            if (!$this->_less($this->_heap[$child], $this->_heap[$tail])) {
                break;
            }

            // Pull the child up into the hole and descend.
            $this->_heap[$hole] = $this->_heap[$child];
            $hole  = $child;
            $child = ($hole << 1) + 1;

            // Pick the smaller of the two children, if a second one exists.
            $sibling = $child + 1;
            if ($sibling < $tail && $this->_less($this->_heap[$sibling], $this->_heap[$child])) {
                $child = $sibling;
            }
        }

        // The former last element fills the hole; its old slot is dropped.
        $this->_heap[$hole] = $this->_heap[$tail];
        unset($this->_heap[$tail]);

        return $least;
    }

    /**
     * Remove every element from the queue.
     */
    public function clear()
    {
        $this->_heap = array();
    }

    /**
     * Ordering predicate supplied by subclasses.
     *
     * @param mixed $el1
     * @param mixed $el2
     * @return boolean true if $el1 is less than $el2, false otherwise
     */
    abstract protected function _less($el1, $el2);
}

View File

@@ -1,71 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface as LuceneAnalyzer;
/**
* General analyzer implementation.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
abstract class AbstractAnalyzer implements LuceneAnalyzer
{
    /**
     * Text currently being tokenized; null until setInput() is called.
     *
     * @var string
     */
    protected $_input = null;

    /**
     * Encoding of the text held in $_input.
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize a whole text in one call.
     *
     * Feeds the text through setInput() and then drains nextToken() until it
     * returns null.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @param string $data
     * @param string $encoding encoding of $data
     * @return array list of \ZendSearch\Lucene\Analysis\Token objects
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $tokens = array();
        for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Tokenization stream API: store the input and its encoding, then call
     * reset() so the stream starts over on the new text.
     *
     * @param string $data
     * @param string $encoding encoding of $data
     */
    public function setInput($data, $encoding = '')
    {
        $this->_encoding = $encoding;
        $this->_input    = $data;

        $this->reset();
    }
}

View File

@@ -1,54 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface as LuceneAnalyzer;
/**
* AnalyzerInterface manager.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class Analyzer
{
    /**
     * Analyzer handed out by getDefault(); created lazily when still null.
     *
     * @var \ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface
     */
    private static $_defaultImpl = null;

    /**
     * Replace the default analyzer.
     *
     * @param \ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface $analyzer
     */
    public static function setDefault(LuceneAnalyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }

    /**
     * Return the default analyzer.
     *
     * When none has been set yet, a Common\Text\CaseInsensitive instance is
     * created, remembered and returned.
     *
     * @return \ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface
     */
    public static function getDefault()
    {
        if (self::$_defaultImpl !== null) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Common\Text\CaseInsensitive();

        return self::$_defaultImpl;
    }
}

View File

@@ -1,56 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer;
/**
* An AnalyzerInterface is used to analyze text.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
interface AnalyzerInterface
{
/**
* Tokenize text to terms
* Returns array of ZendSearch\Lucene\Analysis\Token objects
*
* Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
*
* @param string $data text to tokenize
* @param string $encoding encoding of $data; defaults to an empty string
* (NOTE(review): how an empty encoding is interpreted is up to the
* implementation - confirm against the concrete analyzers)
* @return array
*/
public function tokenize($data, $encoding = '');
/**
* Tokenization stream API
* Set input
*
* @param string $data text that subsequent nextToken() calls will read from
* @param string $encoding encoding of $data; defaults to an empty string
*/
public function setInput($data, $encoding = '');
/**
* Reset token stream
*/
public function reset();
/**
* Tokenization stream API
* Get next token
* Returns null at the end of stream
*
* Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
*
* @return \ZendSearch\Lucene\Analysis\Token|null
*/
public function nextToken();
}

View File

@@ -1,66 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface;
use ZendSearch\Lucene\Analysis\TokenFilter\TokenFilterInterface;
/**
* AbstractCommon implementation of the analyzer functionality.
*
* There are several standard subclasses provided
* by the Analysis subpackage.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
abstract class AbstractCommon extends Analysis\Analyzer\AbstractAnalyzer
{
    /**
     * Token filters, applied to every token in the order they were added.
     * Array of \ZendSearch\Lucene\Analysis\TokenFilter\TokenFilterInterface objects.
     *
     * @var array
     */
    private $_filters = array();

    /**
     * Append a token filter to the filter chain.
     *
     * @param \ZendSearch\Lucene\Analysis\TokenFilter\TokenFilterInterface $filter
     */
    public function addFilter(TokenFilterInterface $filter)
    {
        $this->_filters[] = $filter;
    }

    /**
     * Run a token through the filter chain.
     *
     * Each filter receives the result of the previous one. As soon as a filter
     * returns null (token removed) the chain stops and null is returned.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $token
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function normalize(Analysis\Token $token)
    {
        $current = $token;

        foreach ($this->_filters as $filter) {
            $current = $filter->normalize($current);

            if (null === $current) {
                // A filter dropped the token; later filters are not consulted.
                break;
            }
        }

        return $current;
    }
}

View File

@@ -1,80 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class Text extends AbstractCommon
{
    /**
     * Byte offset in $_input where the next search starts.
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream.
     *
     * Rewinds the position and, when input is present, transliterates it to
     * ASCII (skipped on AIX) and records ASCII as the current encoding.
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // convert input into ascii
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token: the next run of ASCII letters that survives the filters.
     * Returns null at the end of stream
     *
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Covers both "no match" (0) and "error" (false).
                return null;
            }

            list($word, $start) = $match[0];
            $end = $start + strlen($word);
            $this->_position = $end;

            $token = $this->normalize(new Analysis\Token($word, $start, $end));
            if ($token !== null) {
                return $token;
            }
            // Token was removed by a filter: look for the next one.
        }
    }
}

View File

@@ -1,28 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common\Text;
use ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis\TokenFilter;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class CaseInsensitive extends Common\Text
{
    /**
     * Registers a TokenFilter\LowerCase filter on the analyzer.
     */
    public function __construct()
    {
        $lowerCase = new TokenFilter\LowerCase();
        $this->addFilter($lowerCase);
    }
}

View File

@@ -1,79 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class TextNum extends AbstractCommon
{
    /**
     * Byte offset in $_input where the next search starts.
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream.
     *
     * Rewinds the position and, when input is present, transliterates it to
     * ASCII (skipped on AIX) and records ASCII as the current encoding.
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // convert input into ascii
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token: the next run of ASCII letters/digits that survives the filters.
     * Returns null at the end of stream
     *
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Covers both "no match" (0) and "error" (false).
                return null;
            }

            list($word, $start) = $match[0];
            $end = $start + strlen($word);
            $this->_position = $end;

            $token = $this->normalize(new Analysis\Token($word, $start, $end));
            if ($token !== null) {
                return $token;
            }
            // Token was removed by a filter: look for the next one.
        }
    }
}

View File

@@ -1,28 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common\TextNum;
use ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis\TokenFilter;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class CaseInsensitive extends Common\TextNum
{
    /**
     * Registers a TokenFilter\LowerCase filter on the analyzer.
     */
    public function __construct()
    {
        $lowerCase = new TokenFilter\LowerCase();
        $this->addFilter($lowerCase);
    }
}

View File

@@ -1,115 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis;
use ZendSearch\Lucene\Exception\RuntimeException;
use Laminas\Stdlib\ErrorHandler;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class Utf8 extends AbstractCommon
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary (byte) position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor.
     *
     * Probes PCRE with a /u pattern using \pL; the probe runs between
     * ErrorHandler::start(E_WARNING) and ErrorHandler::stop().
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the probe does not match
     */
    public function __construct()
    {
        ErrorHandler::start(E_WARNING);
        $result = preg_match('/\pL/u', 'a');
        ErrorHandler::stop();

        if ($result != 1) {
            // PCRE unicode support is turned off
            throw new RuntimeException('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream.
     *
     * Rewinds both positions and converts the input to UTF-8 unless the
     * recorded encoding already is 'utf8' / 'utf-8' (case-insensitive).
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        if ($this->_input === null) {
            // Nothing to convert. Same guard as the Text/TextNum analyzers; it
            // also avoids handing null to iconv() and keeps the "no input"
            // marker that nextToken() checks for.
            return;
        }

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8') != 0 &&
            strcasecmp($this->_encoding, 'utf-8') != 0
        ) {
            // NOTE(review): iconv() returns false on failure and the result is
            // stored as-is; nextToken() then finds no tokens.
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }
    }

    /**
     * Tokenization stream API
     * Get next token: the next run of Unicode letters that survives the filters.
     * Returns null at the end of stream
     *
     * Token positions are expressed in characters, not bytes.
     *
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                // b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string and its binary position in the input stream
            list($matchedWord, $binStartPos) = $match[0];

            // bytes skipped between the previous token and this match
            $skipped = substr($this->_input, $this->_bytePosition, $binStartPos - $this->_bytePosition);

            // character positions of the start and end of the matched word
            $startPos = $this->_position + iconv_strlen($skipped, 'UTF-8');
            $endPos   = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}

View File

@@ -1,30 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8;
use ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis\TokenFilter;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class CaseInsensitive extends Common\Utf8
{
    /**
     * Runs the parent constructor, then registers a TokenFilter\LowerCaseUtf8
     * filter on the analyzer.
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCase = new TokenFilter\LowerCaseUtf8();
        $this->addFilter($lowerCase);
    }
}
}

View File

@@ -1,115 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis;
use ZendSearch\Lucene\Exception\RuntimeException;
use Laminas\Stdlib\ErrorHandler;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
class Utf8Num extends AbstractCommon
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary (byte) position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor.
     *
     * Probes PCRE with a /u pattern using \pL; the probe runs between
     * ErrorHandler::start(E_WARNING) and ErrorHandler::stop().
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if the probe does not match
     */
    public function __construct()
    {
        ErrorHandler::start(E_WARNING);
        $result = preg_match('/\pL/u', 'a');
        ErrorHandler::stop();

        if ($result != 1) {
            // PCRE unicode support is turned off
            throw new RuntimeException('Utf8Num analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream.
     *
     * Rewinds both positions and converts the input to UTF-8 unless the
     * recorded encoding already is 'utf8' / 'utf-8' (case-insensitive).
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        if ($this->_input === null) {
            // Nothing to convert. Same guard as the Text/TextNum analyzers; it
            // also avoids handing null to iconv() and keeps the "no input"
            // marker that nextToken() checks for.
            return;
        }

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8') != 0 &&
            strcasecmp($this->_encoding, 'utf-8') != 0
        ) {
            // NOTE(review): iconv() returns false on failure and the result is
            // stored as-is; nextToken() then finds no tokens.
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }
    }

    /**
     * Tokenization stream API
     * Get next token: the next run of Unicode letters/numbers that survives the filters.
     * Returns null at the end of stream
     *
     * Token positions are expressed in characters, not bytes.
     *
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                // b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string and its binary position in the input stream
            list($matchedWord, $binStartPos) = $match[0];

            // bytes skipped between the previous token and this match
            $skipped = substr($this->_input, $this->_bytePosition, $binStartPos - $this->_bytePosition);

            // character positions of the start and end of the matched word
            $startPos = $this->_position + iconv_strlen($skipped, 'UTF-8');
            $endPos   = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}

View File

@@ -1,30 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\Analyzer\Common\Utf8Num;
use ZendSearch\Lucene\Analysis\Analyzer\Common;
use ZendSearch\Lucene\Analysis\TokenFilter;
/**
 * Utf8Num analyzer variant that registers a UTF-8 aware lower-case token filter.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class CaseInsensitive extends Common\Utf8Num
{
    /**
     * Runs the parent constructor, then registers a
     * TokenFilter\LowerCaseUtf8 instance through addFilter().
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCaseFilter = new TokenFilter\LowerCaseUtf8();
        $this->addFilter($lowerCaseFilter);
    }
}

View File

@@ -1,141 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis;
/**
 * A term text together with its start/end offsets in the source text and
 * its position increment.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class Token
{
    /**
     * The text of the term.
     *
     * @var string
     */
    private $_termText;

    /**
     * Start offset in the source text.
     *
     * @var integer
     */
    private $_startOffset;

    /**
     * End offset in the source text.
     *
     * @var integer
     */
    private $_endOffset;

    /**
     * The position of this token relative to the previous Token.
     *
     * The default value is one.
     *
     * Some common uses for this are:
     *
     * - Set it to zero to put multiple terms in the same position. This is
     *   useful if, e.g., a word has multiple stems. Searches for phrases
     *   including either stem will match. In this case, all but the first
     *   stem's increment should be set to zero: the increment of the first
     *   instance should be one. Repeating a token with an increment of zero
     *   can also be used to boost the scores of matches on that token.
     *
     * - Set it to values greater than one to inhibit exact phrase matches.
     *   If, for example, one does not want phrases to match across removed
     *   stop words, then one could build a stop word filter that removes stop
     *   words and also sets the increment to the number of stop words removed
     *   before each non-stop word. Then exact phrase queries will only match
     *   when the terms occur with no intervening stop words.
     *
     * @var integer
     */
    private $_positionIncrement;

    /**
     * Object constructor. The position increment starts at 1.
     *
     * @param string  $text  term text
     * @param integer $start start offset in the source text
     * @param integer $end   end offset in the source text
     */
    public function __construct($text, $start, $end)
    {
        $this->_positionIncrement = 1;
        $this->_termText          = $text;
        $this->_startOffset       = $start;
        $this->_endOffset         = $end;
    }

    /**
     * Returns the Token's term text.
     *
     * @return string
     */
    public function getTermText()
    {
        return $this->_termText;
    }

    /**
     * Returns this Token's starting offset, the position of the first
     * character corresponding to this token in the source text.
     *
     * Note: the difference between getEndOffset() and getStartOffset() may
     * not be equal to strlen(getTermText()), as the term text may have been
     * altered by a stemmer or some other filter.
     *
     * @return integer
     */
    public function getStartOffset()
    {
        return $this->_startOffset;
    }

    /**
     * Returns this Token's ending offset, one greater than the position of
     * the last character corresponding to this token in the source text.
     *
     * @return integer
     */
    public function getEndOffset()
    {
        return $this->_endOffset;
    }

    /**
     * Returns the position increment of this Token.
     *
     * @return integer
     */
    public function getPositionIncrement()
    {
        return $this->_positionIncrement;
    }

    /**
     * Sets the position increment of this Token.
     *
     * @param integer $positionIncrement
     */
    public function setPositionIncrement($positionIncrement)
    {
        $this->_positionIncrement = $positionIncrement;
    }
}

View File

@@ -1,41 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\TokenFilter;
use ZendSearch\Lucene\Analysis\Token;
/**
 * Token filter that lower-cases the term text with strtolower().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class LowerCase implements TokenFilterInterface
{
    /**
     * Builds a new Token carrying the lower-cased term text and the same
     * offsets and position increment as the source token.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token
     */
    public function normalize(Token $srcToken)
    {
        $loweredText = strtolower($srcToken->getTermText());

        $result = new Token($loweredText, $srcToken->getStartOffset(), $srcToken->getEndOffset());
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
}

View File

@@ -1,55 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\TokenFilter;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Token;
use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;
/**
 * Token filter that lower-cases the term text with mb_strtolower() in UTF-8.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class LowerCaseUtf8 implements TokenFilterInterface
{
    /**
     * Object constructor; refuses to work without mb_strtolower().
     *
     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
     */
    public function __construct()
    {
        if (function_exists('mb_strtolower')) {
            return;
        }

        // mbstring extension is disabled
        throw new ExtensionNotLoadedException('Utf8 compatible lower case filter needs mbstring extension to be enabled.');
    }

    /**
     * Builds a new Token carrying the lower-cased term text and the same
     * offsets and position increment as the source token.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token
     */
    public function normalize(Token $srcToken)
    {
        $loweredText = mb_strtolower($srcToken->getTermText(), 'UTF-8');

        $result = new Token($loweredText, $srcToken->getStartOffset(), $srcToken->getEndOffset());
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
}

View File

@@ -1,55 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\TokenFilter;
use ZendSearch\Lucene\Analysis\Token;
/**
 * Token filter that removes short words. What counts as a short word is
 * configured through the constructor.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class ShortWords implements TokenFilterInterface
{
    /**
     * Minimum allowed term length
     *
     * @var integer
     */
    private $length;

    /**
     * Constructs new instance of this filter.
     *
     * @param integer $length minimum allowed length of term which passes this filter (default 2)
     */
    public function __construct($length = 2)
    {
        $this->length = $length;
    }

    /**
     * Returns the token unchanged, or null when its term text is shorter
     * than the configured minimum.
     *
     * The length is measured with strlen(), i.e. in bytes.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function normalize(Token $srcToken)
    {
        if (strlen($srcToken->getTermText()) < $this->length) {
            return null;
        }

        return $srcToken;
    }
}

View File

@@ -1,90 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\TokenFilter;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Token;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\RuntimeException;
/**
 * Token filter that removes stop words.
 *
 * The constructor expects a plain list of words, for example:
 * $stopwords = array('the', 'an');
 * The list is flipped internally so that each word becomes a key of the
 * stop set (passing the words as keys would therefore NOT work).
 *
 * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class StopWords implements TokenFilterInterface
{
    /**
     * Stop words, stored as array keys for constant-time lookup.
     *
     * @var array
     */
    private $_stopSet;

    /**
     * Constructs new instance of this filter.
     *
     * @param array $stopwords list of words that will be filtered out
     */
    public function __construct($stopwords = array())
    {
        $this->_stopSet = array_flip($stopwords);
    }

    /**
     * Returns the token unchanged, or null when its term text is a stop word.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function normalize(Token $srcToken)
    {
        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
            return null;
        }

        return $srcToken;
    }

    /**
     * Fills stopwords set from a text file. Each line contains one stopword;
     * lines are trimmed first, then empty lines and lines starting with '#'
     * are ignored (as comments).
     *
     * You can call this method one or more times. New stopwords are always added to current set.
     *
     * @param string $filepath full path for text file with stopwords
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    public function loadFromFile($filepath = null)
    {
        if (! $filepath || ! file_exists($filepath)) {
            throw new InvalidArgumentException('You have to provide valid file path');
        }

        $fd = fopen($filepath, "r");
        if (! $fd) {
            throw new RuntimeException('Cannot open file ' . $filepath);
        }

        // Loop on the fgets() result rather than on feof(): when a read fails
        // without reaching end-of-file (e.g. the handle refers to a directory),
        // feof() never becomes true and a feof()-driven loop would not terminate.
        while (($line = fgets($fd)) !== false) {
            $word = trim($line);
            if ($word !== '' && $word[0] != '#') {
                $this->_stopSet[$word] = 1;
            }
        }

        if (!fclose($fd)) {
            throw new RuntimeException('Cannot close file ' . $filepath);
        }
    }
}

View File

@@ -1,31 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Analysis\TokenFilter;
use ZendSearch\Lucene\Analysis\Token;
/**
* Token filter converts (normalizes) a Token or removes it from a token stream.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
*/
interface TokenFilterInterface
{
/**
* Normalize Token or remove it (if null is returned)
*
* @param \ZendSearch\Lucene\Analysis\Token $srcToken
* @return \ZendSearch\Lucene\Analysis\Token|null
*/
public function normalize(Token $srcToken);
}

View File

@@ -1,127 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
/**
 * A Document is a set of fields. Each field has a name and a textual value.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
class Document
{
    /**
     * Associative array of \ZendSearch\Lucene\Document\Field objects where the
     * keys of the array are the names of the fields.
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Field boost factor
     * It's not stored directly in the index, but affects on normalization factor
     *
     * @var float
     */
    public $boost = 1.0;

    /**
     * Magic method for checking the existence of a field
     *
     * Uses the same key lookup as getField(), so a name is reported as
     * present exactly when getField() would find it. (A loose in_array()
     * over the key list can report names that merely compare equal to an
     * existing key, e.g. '1e1' for a field named '10'.)
     *
     * @param string $offset
     * @return boolean TRUE if the field exists else FALSE
     */
    public function __isset($offset)
    {
        return array_key_exists($offset, $this->_fields);
    }

    /**
     * Proxy method for getFieldValue(), provides more convenient access to
     * the string value of a field.
     *
     * @param string $offset
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the field does not exist
     * @return string
     */
    public function __get($offset)
    {
        return $this->getFieldValue($offset);
    }

    /**
     * Add a field object to this document. A field with the same name
     * replaces the one added earlier.
     *
     * @param \ZendSearch\Lucene\Document\Field $field
     * @return \ZendSearch\Lucene\Document
     */
    public function addField(Document\Field $field)
    {
        $this->_fields[$field->name] = $field;

        return $this;
    }

    /**
     * Return an array with the names of the fields in this document.
     *
     * @return array
     */
    public function getFieldNames()
    {
        return array_keys($this->_fields);
    }

    /**
     * Returns {@link \ZendSearch\Lucene\Document\Field} object for a named field in this document.
     *
     * @param string $fieldName
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     * @return \ZendSearch\Lucene\Document\Field
     */
    public function getField($fieldName)
    {
        if (!array_key_exists($fieldName, $this->_fields)) {
            throw new InvalidArgumentException("Field name \"$fieldName\" not found in document.");
        }

        return $this->_fields[$fieldName];
    }

    /**
     * Returns the string value of a named field in this document.
     *
     * @see __get()
     * @param string $fieldName
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the field does not exist
     * @return string
     */
    public function getFieldValue($fieldName)
    {
        return $this->getField($fieldName)->value;
    }

    /**
     * Returns the string value of a named field in UTF-8 encoding.
     *
     * @see __get()
     * @param string $fieldName
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the field does not exist
     * @return string
     */
    public function getFieldUtf8Value($fieldName)
    {
        return $this->getField($fieldName)->getUtf8Value();
    }
}

View File

@@ -1,126 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Document;
/**
 * OpenXML document.
 *
 * Base class with helpers for reading parts of an OpenXML (zip) package.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
abstract class AbstractOpenXML extends Document
{
    /**
     * Xml Schema - Relationships
     *
     * @var string
     */
    const SCHEMA_RELATIONSHIP = 'http://schemas.openxmlformats.org/package/2006/relationships';

    /**
     * Xml Schema - Office document
     *
     * @var string
     */
    const SCHEMA_OFFICEDOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument';

    /**
     * Xml Schema - Core properties (relationship type of the core properties part)
     *
     * @var string
     */
    const SCHEMA_COREPROPERTIES = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties';

    /**
     * Xml Schema - Core properties (namespace of the cp:* elements inside the part)
     *
     * @var string
     */
    const SCHEMA_COREPROPERTIES_ELEMENTS = 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties';

    /**
     * Xml Schema - Dublin Core
     *
     * @var string
     */
    const SCHEMA_DUBLINCORE = 'http://purl.org/dc/elements/1.1/';

    /**
     * Xml Schema - Dublin Core Terms
     *
     * @var string
     */
    const SCHEMA_DUBLINCORETERMS = 'http://purl.org/dc/terms/';

    /**
     * Extract metadata from document
     *
     * Looks up the core properties part through "_rels/.rels" and collects
     * the text of its child elements, keyed by element name. Parts that are
     * missing or cannot be parsed are skipped, so the result may be empty.
     *
     * @param \ZipArchive $package ZipArchive AbstractOpenXML package
     * @return array Key-value pairs containing document meta data
     */
    protected function extractMetaData(\ZipArchive $package)
    {
        // Data holders
        $coreProperties = array();

        // Read relations and search for core properties
        $relations = $this->_readXmlPart($package, '_rels/.rels');
        if ($relations === false) {
            return $coreProperties;
        }

        // Namespaces whose child elements are collected, in override order.
        // SCHEMA_COREPROPERTIES is the relationship-type URI and is kept only
        // for backward compatibility; the cp:* elements themselves live in
        // SCHEMA_COREPROPERTIES_ELEMENTS.
        $propertyNamespaces = array(
            self::SCHEMA_DUBLINCORE,
            self::SCHEMA_COREPROPERTIES,
            self::SCHEMA_COREPROPERTIES_ELEMENTS,
            self::SCHEMA_DUBLINCORETERMS,
        );

        foreach ($relations->Relationship as $rel) {
            if ($rel["Type"] != self::SCHEMA_COREPROPERTIES) {
                continue;
            }

            // Found core properties! Normalize the target (leading "/",
            // "./" and "../" segments) before looking it up in the archive.
            $target   = (string) $rel["Target"];
            $partName = $this->absoluteZipPath(dirname($target) . '/' . basename($target));

            $contents = $this->_readXmlPart($package, $partName);
            if ($contents === false) {
                continue;
            }

            foreach ($propertyNamespaces as $namespace) {
                foreach ($contents->children($namespace) as $child) {
                    $coreProperties[$child->getName()] = (string) $child;
                }
            }
        }

        return $coreProperties;
    }

    /**
     * Determine absolute zip path
     *
     * Drops empty and "." segments, resolves ".." segments and joins the
     * remainder with "/".
     *
     * @param string $path
     * @return string
     */
    protected function absoluteZipPath($path)
    {
        $path = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path);
        $parts = array_filter(explode(DIRECTORY_SEPARATOR, $path), 'strlen');
        $absolutes = array();
        foreach ($parts as $part) {
            if ('.' == $part) {
                continue;
            }
            if ('..' == $part) {
                array_pop($absolutes);
            } else {
                $absolutes[] = $part;
            }
        }

        return implode('/', $absolutes);
    }

    /**
     * Reads one part of the package and parses it as XML.
     *
     * With libxml older than 2.9.0 the entity loader is disabled for the
     * duration of the parse, and its previous state is restored afterwards.
     *
     * @param \ZipArchive $package
     * @param string      $partName name of the part inside the archive
     * @return \SimpleXMLElement|false FALSE if the part is missing or not parseable
     */
    private function _readXmlPart(\ZipArchive $package, $partName)
    {
        if ($partName === '') {
            return false;
        }

        $xml = $package->getFromName($partName);
        if ($xml === false || $xml === '') {
            return false;
        }

        $guardEntities = \LIBXML_VERSION < 20900;
        if ($guardEntities) {
            // Prevent php from loading remote resources
            $previousState = libxml_disable_entity_loader(true);
        }

        $document = simplexml_load_string($xml);

        if ($guardEntities) {
            // Restore entity loader state
            libxml_disable_entity_loader($previousState);
        }

        return $document;
    }
}

View File

@@ -1,156 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Document\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;
use ZendSearch\Lucene\Exception\RuntimeException;
/**
 * Docx document.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
class Docx extends AbstractOpenXML
{
    /**
     * Xml Schema - WordprocessingML
     *
     * @var string
     */
    const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Object constructor
     *
     * Opens the package, concatenates the text runs of the main document
     * part into the "body" field, and adds "filename", the core properties
     * and (if missing from them) "title" as text fields.
     *
     * @param string  $fileName
     * @param boolean $storeContent whether "body" is stored in the index or only indexed
     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
     * @throws \ZendSearch\Lucene\Exception\RuntimeException
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            throw new ExtensionNotLoadedException(
                'MS Office documents processing functionality requires Zip extension to be loaded'
            );
        }

        // Document data holder
        $documentBody = array();

        // Open AbstractOpenXML package; open() returns TRUE or an error code
        $package = new \ZipArchive();
        if ($package->open($fileName) !== true) {
            throw new RuntimeException('Invalid archive or corrupted .docx file.');
        }

        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false) ? false : self::_parseXml($relationsXml);
        if ($relations === false) {
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .docx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read in contents...
            $target  = (string) $rel['Target'];
            $partXml = $package->getFromName(
                $this->absoluteZipPath(dirname($target) . '/' . basename($target))
            );
            $contents = ($partXml === false) ? false : self::_parseXml($partXml);
            if ($contents === false) {
                $package->close();
                throw new RuntimeException('Invalid archive or corrupted .docx file.');
            }

            $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
            $paragraphs = $contents->xpath('//w:body/w:p');
            if (!is_array($paragraphs)) {
                $paragraphs = array();
            }

            foreach ($paragraphs as $paragraph) {
                $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
                if (!is_array($runs)) {
                    // Paragraph doesn't contain any text or breaks
                    continue;
                }

                foreach ($runs as $run) {
                    if ($run->getName() == 'br') {
                        // Break element
                        $documentBody[] = ' ';
                    } else {
                        $documentBody[] = (string) $run;
                    }
                }

                // Add space after each paragraph. So they are not bound together.
                $documentBody[] = ' ';
            }

            // Only the first office document relationship is processed.
            break;
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Field::text('filename', $fileName, 'UTF-8'));

        // Store contents
        $bodyText = implode('', $documentBody);
        if ($storeContent) {
            $this->addField(Field::text('body', $bodyText, 'UTF-8'));
        } else {
            $this->addField(Field::unStored('body', $bodyText, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Field::text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (! isset($coreProperties['title'])) {
            $this->addField(Field::text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Load Docx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @throws \ZendSearch\Lucene\Document\Exception\InvalidArgumentException
     * @return \ZendSearch\Lucene\Document\Docx
     */
    public static function loadDocxFile($fileName, $storeContent = false)
    {
        if (!is_readable($fileName)) {
            throw new InvalidArgumentException('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new self($fileName, $storeContent);
    }

    /**
     * Parses an XML string.
     *
     * With libxml older than 2.9.0 the entity loader is disabled for the
     * duration of the parse, and its previous state is restored afterwards.
     *
     * @param string $xml
     * @return \SimpleXMLElement|false FALSE if the string cannot be parsed
     */
    private static function _parseXml($xml)
    {
        $guardEntities = \LIBXML_VERSION < 20900;
        if ($guardEntities) {
            // Prevent php from loading remote resources
            $previousState = libxml_disable_entity_loader(true);
        }

        $document = simplexml_load_string($xml);

        if ($guardEntities) {
            // Restore entity loader state
            libxml_disable_entity_loader($previousState);
        }

        return $document;
    }
}

View File

@@ -1,14 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document\Exception;
/**
* Marker interface for exceptions of the Document namespace; it extends the
* ZendSearch\Lucene exception interface and declares no methods of its own.
*/
interface ExceptionInterface extends \ZendSearch\Lucene\Exception\ExceptionInterface
{}

View File

@@ -1,18 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document\Exception;
use ZendSearch\Lucene\Exception;
/**
* Invalid-argument exception of the Document namespace.
*
* Extends the ZendSearch\Lucene invalid-argument exception and additionally
* implements the Document ExceptionInterface; it adds no members.
*/
class InvalidArgumentException
extends Exception\InvalidArgumentException
implements ExceptionInterface
{}

View File

@@ -1,212 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
/**
 * A field is a section of a Document. Each field has two parts,
 * a name and a value. Values may be free text or they may be atomic
 * keywords, which are not further processed. Such keywords may
 * be used to represent dates, urls, etc. Fields are optionally
 * stored in the index, so that they may be returned with hits
 * on the document.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
class Field
{
    /**
     * Field name
     *
     * @var string
     */
    public $name;

    /**
     * Field value
     *
     * @var string
     */
    public $value;

    /**
     * Field is to be stored in the index for return with search hits.
     *
     * @var boolean
     */
    public $isStored = false;

    /**
     * Field is to be indexed, so that it may be searched on.
     *
     * @var boolean
     */
    public $isIndexed = true;

    /**
     * Field should be tokenized as text prior to indexing.
     *
     * @var boolean
     */
    public $isTokenized = true;

    /**
     * Field is stored as binary.
     *
     * @var boolean
     */
    public $isBinary = false;

    /**
     * Field is stored as a term vector
     *
     * @var boolean
     */
    public $storeTermVector = false;

    /**
     * Field boost factor
     * It's not stored directly in the index, but affects on normalization factor
     *
     * @var float
     */
    public $boost = 1.0;

    /**
     * Field value encoding.
     *
     * @var string
     */
    public $encoding;

    /**
     * Object constructor
     *
     * For binary fields the given encoding and tokenization flag are
     * ignored: the encoding becomes '' and the field is never tokenized.
     *
     * @param string  $name
     * @param string  $value
     * @param string  $encoding
     * @param boolean $isStored
     * @param boolean $isIndexed
     * @param boolean $isTokenized
     * @param boolean $isBinary
     */
    public function __construct($name, $value, $encoding, $isStored, $isIndexed, $isTokenized, $isBinary = false)
    {
        $this->name  = $name;
        $this->value = $value;

        $this->encoding    = $isBinary ? '' : $encoding;
        $this->isTokenized = $isBinary ? false : $isTokenized;

        $this->isStored  = $isStored;
        $this->isIndexed = $isIndexed;
        $this->isBinary  = $isBinary;

        $this->storeTermVector = false;
        $this->boost           = 1.0;
    }

    /**
     * Constructs a String-valued Field that is not tokenized, but is indexed
     * and stored. Useful for non-text fields, e.g. date or url.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return \ZendSearch\Lucene\Document\Field
     */
    public static function keyword($name, $value, $encoding = 'UTF-8')
    {
        $stored    = true;
        $indexed   = true;
        $tokenized = false;

        return new self($name, $value, $encoding, $stored, $indexed, $tokenized);
    }

    /**
     * Constructs a String-valued Field that is not tokenized nor indexed,
     * but is stored in the index, for return with hits.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return \ZendSearch\Lucene\Document\Field
     */
    public static function unIndexed($name, $value, $encoding = 'UTF-8')
    {
        $stored    = true;
        $indexed   = false;
        $tokenized = false;

        return new self($name, $value, $encoding, $stored, $indexed, $tokenized);
    }

    /**
     * Constructs a Binary String valued Field that is not tokenized nor indexed,
     * but is stored in the index, for return with hits.
     *
     * @param string $name
     * @param string $value
     * @return \ZendSearch\Lucene\Document\Field
     */
    public static function binary($name, $value)
    {
        $stored    = true;
        $indexed   = false;
        $tokenized = false;
        $binary    = true;

        return new self($name, $value, '', $stored, $indexed, $tokenized, $binary);
    }

    /**
     * Constructs a String-valued Field that is tokenized and indexed,
     * and is stored in the index, for return with hits. Useful for short text
     * fields, like "title" or "subject". Term vector will not be stored for this field.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return \ZendSearch\Lucene\Document\Field
     */
    public static function text($name, $value, $encoding = 'UTF-8')
    {
        $stored    = true;
        $indexed   = true;
        $tokenized = true;

        return new self($name, $value, $encoding, $stored, $indexed, $tokenized);
    }

    /**
     * Constructs a String-valued Field that is tokenized and indexed,
     * but that is not stored in the index.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return \ZendSearch\Lucene\Document\Field
     */
    public static function unStored($name, $value, $encoding = 'UTF-8')
    {
        $stored    = false;
        $indexed   = true;
        $tokenized = true;

        return new self($name, $value, $encoding, $stored, $indexed, $tokenized);
    }

    /**
     * Get field value in UTF-8 encoding
     *
     * Values already declared as UTF-8 are returned as is; anything else is
     * converted with iconv(). On AIX the declared encoding is not used and
     * the value is converted from ISO8859-1 instead.
     *
     * @return string
     */
    public function getUtf8Value()
    {
        $declaredUtf8 = strcasecmp($this->encoding, 'utf8') == 0
                     || strcasecmp($this->encoding, 'utf-8') == 0;
        if ($declaredUtf8) {
            return $this->value;
        }

        if (PHP_OS != 'AIX') {
            $sourceCharset = $this->encoding;
        } else {
            $sourceCharset = 'ISO8859-1';
        }

        return iconv($sourceCharset, 'UTF-8', $this->value);
    }
}

View File

@@ -1,470 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\RuntimeException;
use Laminas\Stdlib\ErrorHandler;
/**
* HTML document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
*/
class HTML extends Document
{
/**
* List of document links
*
* @var array
*/
private $_links = array();
/**
* List of document header links
*
* @var array
*/
private $_headerLinks = array();
/**
* Stored DOM representation
*
* @var \DOMDocument
*/
private $_doc;
/**
* Exclude nofollow links flag
*
* If true then links with rel='nofollow' attribute are not included into
* document links.
*
* @var boolean
*/
private static $_excludeNoFollowLinks = false;
/**
*
* List of inline tags
*
* @var array
*/
private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
'q', 'sub', 'sup');
/**
* Object constructor
*
* Parses the HTML into a DOMDocument and fills the document from it: a
* 'title' field, one field per <meta name="..."> element in the head, a
* 'body' field with the collected body text, plus the lists of links
* (href of <a> and <area>) and header links (href of <link> in the head).
*
* @param string $data HTML string (may be an HTML fragment), or a file name when $isFile is true
* @param boolean $isFile if true, $data is read with file_get_contents()
* @param boolean $storeContent if true the 'body' field is created with Field::Text, otherwise with Field::UnStored
* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
*/
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
{
$this->_doc = new \DOMDocument();
$this->_doc->substituteEntities = true;
if ($isFile) {
// NOTE(review): the result of file_get_contents() is not checked for false
$htmlData = file_get_contents($data);
} else {
$htmlData = $data;
}
// First parsing attempt; E_WARNING raised by the parser is captured by ErrorHandler
ErrorHandler::start(E_WARNING);
$this->_doc->loadHTML($htmlData);
ErrorHandler::stop();
if ($this->_doc->encoding === null) {
// Document encoding is not recognized
/** @todo improve HTML vs HTML fragment recognition */
if (preg_match('/<html>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
// It's an HTML document
// Add additional HEAD section and recognize document
// Byte offset just past the matched "<html>" tag
$htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
ErrorHandler::start(E_WARNING);
// Re-parse with both halves converted from $defaultEncoding to UTF-8
// and a charset META declaration inserted right after the <html> tag
$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
. '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
. iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
ErrorHandler::stop();
// Remove additional HEAD section
$xpath = new \DOMXPath($this->_doc);
// NOTE(review): assumes '/html/head' matches at least one node; item(0) is not checked for null
$head = $xpath->query('/html/head')->item(0);
$head->parentNode->removeChild($head);
} else {
// It's an HTML fragment
// Wrap the converted fragment into a complete document that declares UTF-8
ErrorHandler::start(E_WARNING);
$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
. iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
. '</body></html>');
ErrorHandler::stop();
}
}
/** @todo Add correction of wrong HTML encoding recognition processing
* The case is:
* Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
* even $this->_doc->encoding demonstrates another recognized encoding
*/
$xpath = new \DOMXPath($this->_doc);
// Title: concatenation of all /html/head/title node values, each followed by a space
$docTitle = '';
$titleNodes = $xpath->query('/html/head/title');
foreach ($titleNodes as $titleNode) {
// title should always have only one entry, but we process all nodeset entries
$docTitle .= $titleNode->nodeValue . ' ';
}
$this->addField(Field::Text('title', $docTitle, 'UTF-8'));
// One text field per named META element: field name = 'name' attribute, value = 'content' attribute
$metaNodes = $xpath->query('/html/head/meta[@name]');
foreach ($metaNodes as $metaNode) {
$this->addField(Field::Text($metaNode->getAttribute('name'),
$metaNode->getAttribute('content'),
'UTF-8'));
}
// Body text is accumulated by _retrieveNodeText()
$docBody = '';
$bodyNodes = $xpath->query('/html/body');
foreach ($bodyNodes as $bodyNode) {
// body should always have only one entry, but we process all nodeset entries
$this->_retrieveNodeText($bodyNode, $docBody);
}
if ($storeContent) {
$this->addField(Field::Text('body', $docBody, 'UTF-8'));
} else {
$this->addField(Field::UnStored('body', $docBody, 'UTF-8'));
}
// Collect non-empty href values of <a> elements; rel="nofollow" links are
// skipped when the exclude-nofollow flag is set
$linkNodes = $this->_doc->getElementsByTagName('a');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '' &&
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
) {
$this->_links[] = $href;
}
}
// Same rule for <area> elements
$linkNodes = $this->_doc->getElementsByTagName('area');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '' &&
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
) {
$this->_links[] = $href;
}
}
$this->_links = array_unique($this->_links);
// Header links: non-empty href values of /html/head/link elements
$linkNodes = $xpath->query('/html/head/link');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '') {
$this->_headerLinks[] = $href;
}
}
$this->_headerLinks = array_unique($this->_headerLinks);
}
/**
* Set exclude nofollow links flag
*
* The flag is static, i.e. shared by the whole class; the constructor reads
* it while collecting links.
*
* @param boolean $newValue
*/
public static function setExcludeNoFollowLinks($newValue)
{
self::$_excludeNoFollowLinks = $newValue;
}
/**
 * Report the current state of the "exclude nofollow links" flag.
 *
 * @return boolean The value last stored in the static flag.
 */
public static function getExcludeNoFollowLinks()
{
    $flag = self::$_excludeNoFollowLinks;

    return $flag;
}
/**
 * Append the text found below a DOM node to $text.
 *
 * Text nodes contribute their value; a single space is added after the value
 * unless the parent element's tag name is listed in $this->_inlineTags.
 * Element nodes are walked recursively, except <script> elements, which are
 * skipped together with everything inside them. Any other node type is ignored.
 *
 * @param \DOMNode $node  Node to read text from.
 * @param string   &$text Accumulator the text is appended to.
 */
private function _retrieveNodeText(\DOMNode $node, &$text)
{
    if ($node->nodeType == XML_TEXT_NODE) {
        $text .= $node->nodeValue;

        // Inline tags keep their text glued to the neighbouring text
        $parentIsInline = in_array($node->parentNode->tagName, $this->_inlineTags);
        if (!$parentIsInline) {
            $text .= ' ';
        }

        return;
    }

    $isElement = ($node->nodeType == XML_ELEMENT_NODE);
    if (!$isElement || $node->nodeName == 'script') {
        return;
    }

    foreach ($node->childNodes as $descendant) {
        $this->_retrieveNodeText($descendant, $text);
    }
}
/**
 * Return the links collected from the document.
 *
 * @return array Contents of $this->_links (HREF values).
 */
public function getLinks()
{
    $links = $this->_links;

    return $links;
}
/**
 * Return the links collected from the document header.
 *
 * @return array Contents of $this->_headerLinks (HREF values).
 */
public function getHeaderLinks()
{
    $headerLinks = $this->_headerLinks;

    return $headerLinks;
}
/**
 * Build an HTML document object from an HTML string.
 *
 * Delegates to the constructor, passing false as its second argument
 * (loadHTMLFile() passes true there).
 *
 * @param string  $data            HTML markup.
 * @param boolean $storeContent    Forwarded to the constructor unchanged.
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using
 *                                 Content-type HTTP-EQUIV meta tag.
 * @return \ZendSearch\Lucene\Document\HTML
 */
public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
{
    $document = new self($data, false, $storeContent, $defaultEncoding);

    return $document;
}
/**
 * Build an HTML document object from a file.
 *
 * Delegates to the constructor, passing true as its second argument
 * (loadHTML() passes false there).
 *
 * @param string  $file            Name of the file to load.
 * @param boolean $storeContent    Forwarded to the constructor unchanged.
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using
 *                                 Content-type HTTP-EQUIV meta tag.
 * @return \ZendSearch\Lucene\Document\HTML
 */
public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
{
    $document = new self($file, true, $storeContent, $defaultEncoding);

    return $document;
}
/**
 * Highlight matching words inside a single text node.
 *
 * The node value is tokenized with the default analyzer. Every token whose
 * term text is a key of $wordsToHighlight is cut out of the text node, passed
 * to $callback, and replaced by the DOM nodes parsed from the HTML the
 * callback returns.
 *
 * @param \DOMText $node
 * @param array    $wordsToHighlight Map whose keys are the term texts to highlight.
 * @param callback $callback         Callback used to transform (highlight) text.
 * @param array    $params           Additional callback parameters; the matched text is
 *                                   always passed as the first argument.
 * @throws \ZendSearch\Lucene\Exception\RuntimeException When the callback output cannot be loaded as HTML.
 */
protected function _highlightTextNode(\DOMText $node, $wordsToHighlight, $callback, $params)
{
    $analyzer = Analyzer\Analyzer::getDefault();
    $analyzer->setInput($node->nodeValue, 'UTF-8');

    // Collect every token that has to be highlighted
    $hits = array();
    for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
        if (isset($wordsToHighlight[$token->getTermText()])) {
            $hits[] = $token;
        }
    }

    if (!$hits) {
        return;
    }

    // Work from the last match to the first so that the offsets of the
    // earlier matches are still valid after the node has been split
    foreach (array_reverse($hits) as $token) {
        // Detach the text that follows the match, then the match itself
        $node->splitText($token->getEndOffset());
        $matchedWordNode = $node->splitText($token->getStartOffset());

        // Ask the callback for the HTML replacing the matched word
        $callbackArguments = array_merge(array($matchedWordNode->nodeValue), $params);
        $highlightedWordNodeSetHTML = call_user_func_array($callback, $callbackArguments);

        // Parse the returned HTML in a scratch document
        $fragmentDocument = new \DOMDocument('1.0', 'UTF-8');
        ErrorHandler::start(E_WARNING);
        $loaded = $fragmentDocument->loadHTML(
            '<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
            . $highlightedWordNodeSetHTML
            . '</body></html>'
        );
        ErrorHandler::stop();

        if (!$loaded) {
            throw new RuntimeException("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHTML'.");
        }

        // Copy the parsed nodes in front of the matched word ...
        $fragmentXpath    = new \DOMXPath($fragmentDocument);
        $replacementNodes = $fragmentXpath->query('/html/body')->item(0)->childNodes;
        foreach ($replacementNodes as $nodeToImport) {
            $node->parentNode->insertBefore(
                $this->_doc->importNode($nodeToImport, true /* deep copy */),
                $matchedWordNode
            );
        }

        // ... and drop the original, un-highlighted word
        $node->parentNode->removeChild($matchedWordNode);
    }
}
/**
 * Highlight words in every text node below the given node.
 *
 * Child elements are processed recursively, except <script> elements, which
 * are left alone. The direct text children are highlighted only after the
 * walk over childNodes has finished, because highlighting rewrites that list.
 *
 * @param \DOMNode $contextNode
 * @param array    $wordsToHighlight Map whose keys are the term texts to highlight.
 * @param callback $callback         Callback used to transform (highlight) text.
 * @param array    $params           Additional callback parameters; the matched text is
 *                                   always passed as the first argument.
 */
protected function _highlightNodeRecursive(\DOMNode $contextNode, $wordsToHighlight, $callback, $params)
{
    if (!$contextNode->hasChildNodes()) {
        return;
    }

    $pendingTextNodes = array();

    foreach ($contextNode->childNodes as $child) {
        if ($child->nodeType == XML_TEXT_NODE) {
            // Postponed: see the loop below
            $pendingTextNodes[] = $child;
            continue;
        }

        if ($child->nodeName != 'script') {
            $this->_highlightNodeRecursive($child, $wordsToHighlight, $callback, $params);
        }
    }

    foreach ($pendingTextNodes as $pendingTextNode) {
        $this->_highlightTextNode($pendingTextNode, $wordsToHighlight, $callback, $params);
    }
}
/**
 * Default highlighting callback: wrap a string in a coloured <b> element.
 *
 * Neither argument is escaped; both are inserted into the markup as given.
 *
 * @param string $stringToHighlight Text placed inside the <b> element.
 * @param string $colour            Value used for the CSS background-color.
 * @return string HTML fragment.
 * @internal
 */
public function applyColour($stringToHighlight, $colour)
{
    $openingTag = '<b style="color:black;background-color:' . $colour . '">';

    return $openingTag . $stringToHighlight . '</b>';
}
/**
 * Highlight words using the built-in applyColour() callback.
 *
 * @param string|array $words  Words to highlight.
 * @param string       $colour Background colour handed to applyColour().
 * @return string Whatever highlightExtended() returns for these arguments.
 */
public function highlight($words, $colour = '#66ffff')
{
    $callback       = array($this, 'applyColour');
    $callbackParams = array($colour);

    return $this->highlightExtended($words, $callback, $callbackParams);
}
/**
 * Highlight text using the specified callback.
 *
 * The words are tokenized with the default analyzer; every matching term found
 * in a text node below /html/body is replaced by the HTML the callback returns.
 * The document held by this object is modified in place and its HTML is returned.
 *
 * @param string|array $words    Words to highlight. Words could be organized using the array or string.
 * @param callback     $callback Callback method, used to transform (highlighting) text.
 * @param array        $params   Array of additional callback parameters passed through into it
 *                               (first non-optional parameter is an HTML fragment for highlighting)
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException When there is something to highlight
 *                                                               and $callback is not callable.
 * @return string HTML of the (possibly highlighted) document.
 */
public function highlightExtended($words, $callback, $params = array())
{
    if (!is_array($words)) {
        $words = array($words);
    }

    $wordsToHighlightList = array();
    $analyzer = Analyzer\Analyzer::getDefault();
    foreach ($words as $wordString) {
        $wordsToHighlightList[] = $analyzer->tokenize($wordString);
    }

    // array_merge() without arguments is an error before PHP 7.4, so an
    // empty word list is handled explicitly
    $wordsToHighlight = $wordsToHighlightList
        ? call_user_func_array('array_merge', $wordsToHighlightList)
        : array();

    if (count($wordsToHighlight) == 0) {
        return $this->_doc->saveHTML();
    }

    // Index the terms by their text for constant-time lookups
    $wordsToHighlightFlipped = array();
    foreach ($wordsToHighlight as $id => $token) {
        $wordsToHighlightFlipped[$token->getTermText()] = $id;
    }

    if (!is_callable($callback)) {
        throw new InvalidArgumentException('$callback parameter must be a valid callback.');
    }

    $xpath = new \DOMXPath($this->_doc);

    $matchedNodes = $xpath->query("/html/body");
    foreach ($matchedNodes as $matchedNode) {
        $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
    }

    // Return the highlighted markup, matching the early-return path above
    return $this->_doc->saveHTML();
}
/**
 * Serialize the whole document as HTML.
 *
 * @return string Result of saveHTML() on the underlying DOM document.
 */
public function getHTML()
{
    $html = $this->_doc->saveHTML();

    return $html;
}
/**
 * Get HTML body
 *
 * Serializes every child node of the first /html/body element and
 * concatenates the results.
 *
 * @return string Body markup, or an empty string when the document has no body element.
 */
public function getHTMLBody()
{
    $xpath = new \DOMXPath($this->_doc);
    $body  = $xpath->query('/html/body')->item(0);

    // item(0) yields null when the query matched nothing
    if ($body === null) {
        return '';
    }

    $outputFragments = array();
    foreach ($body->childNodes as $bodyChild) {
        $outputFragments[] = $this->_doc->saveXML($bodyChild);
    }

    return implode('', $outputFragments);
}
}

View File

@@ -1,197 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;
use ZendSearch\Lucene\Exception\RuntimeException;
/**
 * Pptx document.
 *
 * Reads the text of all slides and slide notes of a PresentationML package
 * and exposes it, together with the package meta data, as document fields.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
class Pptx extends AbstractOpenXML
{
    /**
     * Xml Schema - PresentationML
     *
     * @var string
     */
    const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Slide relation
     *
     * @var string
     */
    const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';

    /**
     * Xml Schema - Slide notes relation
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Adds the fields 'filename', 'body' (text of all slides and their notes,
     * joined with spaces), one field per meta data property and, when the
     * meta data has no title, a 'title' field holding the file name.
     *
     * @param string  $fileName
     * @param boolean $storeContent When true the body is added with Field::Text(),
     *                              otherwise with Field::UnStored().
     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException When the Zip extension is missing.
     * @throws \ZendSearch\Lucene\Exception\RuntimeException When the archive cannot be opened or its
     *                                                       root relations cannot be read.
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Document data holders
        $slides       = array();
        $slideNotes   = array();
        $documentBody = array();

        // Open AbstractOpenXML package; open() returns true or an error code
        $package = new \ZipArchive();
        if ($package->open($fileName) !== true) {
            throw new RuntimeException('Invalid archive or corrupted .pptx file.');
        }

        // Read relations and search for officeDocument
        $relations = $this->_loadPackageXml($package, '_rels/.rels');
        if ($relations === false) {
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .pptx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Search for slides...
            $documentTarget = (string)$rel['Target'];
            $documentDir    = dirname($documentTarget);
            $slideRelations = $this->_loadPackageXml(
                $package,
                $this->absoluteZipPath($documentDir . "/_rels/" . basename($documentTarget) . ".rels")
            );

            if ($slideRelations !== false) {
                foreach ($slideRelations->Relationship as $slideRel) {
                    if ((string)$slideRel['Type'] !== self::SCHEMA_SLIDERELATION) {
                        continue;
                    }

                    // Found slide!
                    $slideKey    = str_replace('rId', '', (string)$slideRel['Id']);
                    $slideTarget = (string)$slideRel['Target'];
                    $slideDir    = $documentDir . "/" . dirname($slideTarget);
                    $slideName   = basename($slideTarget);

                    $slide = $this->_loadPackageXml(
                        $package,
                        $this->absoluteZipPath($slideDir . "/" . $slideName)
                    );
                    if ($slide === false) {
                        // Missing or malformed slide part: nothing to index for it
                        continue;
                    }
                    $slides[$slideKey] = $slide;

                    // Search for slide notes
                    $slideNotesRelations = $this->_loadPackageXml(
                        $package,
                        $this->absoluteZipPath($slideDir . "/_rels/" . $slideName . ".rels")
                    );
                    if ($slideNotesRelations === false) {
                        continue;
                    }

                    foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                        if ((string)$slideNoteRel['Type'] !== self::SCHEMA_SLIDENOTESRELATION) {
                            continue;
                        }

                        // Found slide notes!
                        $noteTarget = (string)$slideNoteRel['Target'];
                        $notes = $this->_loadPackageXml(
                            $package,
                            $this->absoluteZipPath($slideDir . "/" . dirname($noteTarget) . "/" . basename($noteTarget))
                        );
                        if ($notes !== false) {
                            $slideNotes[$slideKey] = $notes;
                        }

                        // Only the first notes relation of a slide is used
                        break;
                    }
                }
            }

            // Only the first office document relation is processed
            break;
        }

        // Sort slides
        ksort($slides);
        ksort($slideNotes);

        // Extract contents from slides, each followed by its notes
        foreach ($slides as $slideKey => $slide) {
            $documentBody = array_merge($documentBody, $this->_collectText($slide));

            if (isset($slideNotes[$slideKey])) {
                $documentBody = array_merge($documentBody, $this->_collectText($slideNotes[$slideKey]));
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Read one XML part from the package and parse it.
     *
     * With libxml older than 2.9 the entity loader is disabled while parsing
     * and restored afterwards, so that no remote resources are loaded.
     *
     * @param \ZipArchive $package Opened package.
     * @param string      $path    Name of the part inside the archive.
     * @return \SimpleXMLElement|false False when the part is missing or is not well-formed XML.
     */
    private function _loadPackageXml(\ZipArchive $package, $path)
    {
        $xml = $package->getFromName($path);
        if ($xml === false) {
            return false;
        }

        if (\LIBXML_VERSION < 20900) {
            // Prevent php from loading remote resources
            $loadEntities = libxml_disable_entity_loader(true);
        }

        $element = simplexml_load_string($xml);

        if (\LIBXML_VERSION < 20900) {
            // Restore entity loader state
            libxml_disable_entity_loader($loadEntities);
        }

        return $element;
    }

    /**
     * Collect the content of every DrawingML text element (a:t) of a part.
     *
     * @param \SimpleXMLElement $part Parsed slide or slide notes part.
     * @return array List of strings, in document order.
     */
    private function _collectText(\SimpleXMLElement $part)
    {
        // Register namespaces
        $part->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
        $part->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);

        $texts        = array();
        $textElements = $part->xpath('//a:t');
        if (is_array($textElements)) {
            foreach ($textElements as $textElement) {
                $texts[] = (string)$textElement;
            }
        }

        return $texts;
    }

    /**
     * Load Pptx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return \ZendSearch\Lucene\Document\Pptx
     */
    public static function loadPptxFile($fileName, $storeContent = false)
    {
        return new self($fileName, $storeContent);
    }
}

View File

@@ -1,263 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Document;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;
use ZendSearch\Lucene\Exception\RuntimeException;
/**
 * Xlsx document.
 *
 * Reads the cell values of all worksheets of a SpreadsheetML package and
 * exposes them, together with the package meta data, as document fields.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 */
class Xlsx extends AbstractOpenXML
{
    /**
     * Xml Schema - SpreadsheetML
     *
     * @var string
     */
    const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Shared Strings
     *
     * @var string
     */
    const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';

    /**
     * Xml Schema - Worksheet relation
     *
     * @var string
     */
    const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';

    /**
     * Xml Schema - Slide notes relation
     *
     * Not used by this class; kept because it is part of the public constants.
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Adds the fields 'filename', 'body' (all cell values joined with spaces),
     * one field per meta data property and, when the meta data has no title,
     * a 'title' field holding the file name.
     *
     * @param string  $fileName
     * @param boolean $storeContent When true the body is added with Field::Text(),
     *                              otherwise with Field::UnStored().
     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException When the Zip extension is missing.
     * @throws \ZendSearch\Lucene\Exception\RuntimeException When the archive cannot be opened or its
     *                                                       root relations cannot be read.
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            throw new ExtensionNotLoadedException(
                'MS Office documents processing functionality requires Zip extension to be loaded'
            );
        }

        // Document data holders
        $sharedStrings = array();
        $worksheets    = array();
        $documentBody  = array();

        // Open AbstractOpenXML package; open() returns true or an error code
        $package = new \ZipArchive();
        if ($package->open($fileName) !== true) {
            throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
        }

        // Read relations and search for officeDocument
        $relations = $this->_loadPackageXml($package, '_rels/.rels');
        if ($relations === false) {
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget    = (string)$rel['Target'];
            $workbookDir       = dirname($workbookTarget);
            $workbookRelations = $this->_loadPackageXml(
                $package,
                $this->absoluteZipPath($workbookDir . "/_rels/" . basename($workbookTarget) . ".rels")
            );
            if ($workbookRelations === false) {
                // No readable workbook relations: there is nothing to extract
                break;
            }
            $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);

            // Read shared strings (the relationship is absent when the workbook has none)
            $sharedStringsRelations = $workbookRelations->xpath(
                "rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"
            );
            if (!empty($sharedStringsRelations)) {
                $sharedStringsPath = (string)$sharedStringsRelations[0]['Target'];
                $xmlStrings = $this->_loadPackageXml(
                    $package,
                    $this->absoluteZipPath($workbookDir . "/" . $sharedStringsPath)
                );

                if ($xmlStrings !== false && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $val) {
                        if (isset($val->t)) {
                            $sharedStrings[] = (string)$val->t;
                        } elseif (isset($val->r)) {
                            $sharedStrings[] = $this->_parseRichText($val);
                        } else {
                            // Keep the list aligned with the indexes used by the cells
                            $sharedStrings[] = '';
                        }
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ((string)$workbookRelation['Type'] !== self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }

                $worksheetTarget = (string)$workbookRelation['Target'];
                $worksheet = $this->_loadPackageXml(
                    $package,
                    $this->absoluteZipPath(
                        $workbookDir . "/" . dirname($worksheetTarget) . "/" . basename($worksheetTarget)
                    )
                );
                if ($worksheet !== false) {
                    $worksheets[ str_replace('rId', '', (string)$workbookRelation['Id']) ] = $worksheet;
                }
            }

            // Only the first office document relation is processed
            break;
        }

        // Sort worksheets
        ksort($worksheets);

        // Extract contents from worksheets
        foreach ($worksheets as $worksheet) {
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $c) {
                    $documentBody[] = $this->_cellValue($c, $sharedStrings);
                }
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Read one XML part from the package and parse it.
     *
     * With libxml older than 2.9 the entity loader is disabled while parsing
     * and restored afterwards, so that no remote resources are loaded.
     *
     * @param \ZipArchive $package Opened package.
     * @param string      $path    Name of the part inside the archive.
     * @return \SimpleXMLElement|false False when the part is missing or is not well-formed XML.
     */
    private function _loadPackageXml(\ZipArchive $package, $path)
    {
        $xml = $package->getFromName($path);
        if ($xml === false) {
            return false;
        }

        if (\LIBXML_VERSION < 20900) {
            // Prevent php from loading remote resources
            $loadEntities = libxml_disable_entity_loader(true);
        }

        $element = simplexml_load_string($xml);

        if (\LIBXML_VERSION < 20900) {
            // Restore entity loader state
            libxml_disable_entity_loader($loadEntities);
        }

        return $element;
    }

    /**
     * Convert one worksheet cell (<c> element) into the value to index.
     *
     * The cell's "t" attribute selects the interpretation: "s" shared string,
     * "b" boolean, "inlineStr" inline rich text, "e" error text; anything else
     * is read as a string and converted to int or float when it is numeric.
     *
     * @param \SimpleXMLElement $c             Cell element.
     * @param array             $sharedStrings Shared string table, indexed from 0.
     * @return mixed
     */
    private function _cellValue(\SimpleXMLElement $c, array $sharedStrings)
    {
        // Determine data type
        $dataType = (string)$c["t"];

        switch ($dataType) {
            case "s":
                // Value is a shared string
                if ((string)$c->v == '') {
                    return '';
                }
                $index = (int)(string)$c->v;
                // An index outside the table yields an empty value
                return isset($sharedStrings[$index]) ? $sharedStrings[$index] : '';

            case "b":
                // Value is boolean
                $value = (string)$c->v;
                if ($value == '0') {
                    return false;
                }
                if ($value == '1') {
                    return true;
                }
                return (bool)$c->v;

            case "inlineStr":
                // Value is rich text inline
                return $this->_parseRichText($c->is);

            case "e":
                // Value is an error message
                return (string)$c->v;

            default:
                // Value is a string
                $value = (string)$c->v;

                // Check for numeric values
                if (is_numeric($value)) {
                    if ($value == (int)$value) {
                        $value = (int)$value;
                    } elseif ($value == (float)$value) {
                        $value = (float)$value;
                    }
                }
                return $value;
        }
    }

    /**
     * Parse rich text XML
     *
     * Returns the content of the element's <t> child when there is one,
     * otherwise the concatenated <t> contents of its <r> children.
     *
     * @param \SimpleXMLElement $is
     * @return string Empty string when $is is null.
     */
    private function _parseRichText($is = null)
    {
        if ($is === null) {
            return '';
        }

        $value = array();

        if (isset($is->t)) {
            $value[] = (string)$is->t;
        } else {
            foreach ($is->r as $run) {
                $value[] = (string)$run->t;
            }
        }

        return implode('', $value);
    }

    /**
     * Load Xlsx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return \ZendSearch\Lucene\Document\Xlsx
     */
    public static function loadXlsxFile($fileName, $storeContent = false)
    {
        return new self($fileName, $storeContent);
    }
}

View File

@@ -1,19 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Marker interface implemented by the exceptions of the Lucene component.
 *
 * It extends \ZendSearch\Exception\ExceptionInterface and declares no methods of its own.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 */
interface ExceptionInterface extends \ZendSearch\Exception\ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \RuntimeException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class ExtensionNotLoadedException extends \RuntimeException implements ExceptionInterface
{
}

View File

@@ -1,18 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
use ZendSearch\Lucene\Exception\ExceptionInterface;
/**
 * Lucene component exception derived from \InvalidArgumentException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class InvalidArgumentException extends \InvalidArgumentException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \RuntimeException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class InvalidFileFormatException extends \RuntimeException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \OutOfBoundsException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class OutOfBoundsException extends \OutOfBoundsException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \OutOfRangeException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class OutOfRangeException extends \OutOfRangeException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \RuntimeException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class RuntimeException extends \RuntimeException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \UnexpectedValueException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class UnexpectedValueException extends \UnexpectedValueException implements ExceptionInterface
{
}

View File

@@ -1,16 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Exception;
/**
 * Lucene component exception derived from \BadMethodCallException.
 *
 * Implements ExceptionInterface, so it can also be caught through that interface.
 */
class UnsupportedMethodCallException extends \BadMethodCallException implements ExceptionInterface
{
}

View File

@@ -1,53 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
/**
 * Finite State Machine action.
 *
 * Binds an object to the name of one of its methods so that the method can be
 * invoked later, without arguments, through doAction().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 */
class FSMAction
{
    /**
     * Object the method is invoked on
     *
     * @var object
     */
    private $_object;

    /**
     * Name of the method to invoke
     *
     * @var string
     */
    private $_method;

    /**
     * Remember the target object and method name.
     *
     * @param object $object
     * @param string $method
     */
    public function __construct($object, $method)
    {
        $this->_method = $method;
        $this->_object = $object;
    }

    /**
     * Invoke the stored method on the stored object, passing no arguments.
     * The method's return value is discarded.
     */
    public function doAction()
    {
        $this->_object->{$this->_method}();
    }
}

View File

@@ -1,251 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidFileFormatException;
/**
 * Dictionary loader
 *
 * A helper class created to encapsulate code that is deliberately not well structured:
 * method calls are "inlined" by hand to speed up loading of the dictionary index,
 * which is a major bottleneck for search performance.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Index
 */
class DictionaryLoader
{
/**
 * Dictionary index loader.
 *
 * It takes a string which is actually <segment_name>.tii index file data and
 * returns two arrays - term and termInfo lists.
 *
 * Each term list entry is array(fieldNumber, termText); each termInfo list entry is
 * array(docFreq, freqPointer, proxPointer, skipDelta, indexPointer). Both lists have
 * one entry per index term, in file order.
 *
 * See Zend_Search_Lucene_Index_SegmentInfo class for details
 *
 * @param string $data Raw content of the .tii file.
 * @return array array($termDictionary, $termInfos)
 * @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException On an unknown format version, a term count
 *         below 1, a term count that does not fit into 31 bits on 32-bit PHP, or a wrong first entry.
 */
public static function load($data)
{
$termDictionary = array();
$termInfos = array();
// Current read offset (in bytes) inside $data
$pos = 0;
// Format version: 4 bytes, most significant byte first
// $tiVersion = $tiiFile->readInt();
$tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
$pos += 4;
if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
$tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) {
throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
}
// Number of index terms: 8 bytes, most significant byte first
// $indexTermCount = $tiiFile->readLong();
if (PHP_INT_SIZE > 4) {
$indexTermCount = ord($data[$pos]) << 56 |
ord($data[$pos+1]) << 48 |
ord($data[$pos+2]) << 40 |
ord($data[$pos+3]) << 32 |
ord($data[$pos+4]) << 24 |
ord($data[$pos+5]) << 16 |
ord($data[$pos+6]) << 8 |
ord($data[$pos+7]);
} else {
// 32-bit integers: the upper four bytes and the sign bit of the lower four must be zero
if ((ord($data[$pos]) != 0) ||
(ord($data[$pos+1]) != 0) ||
(ord($data[$pos+2]) != 0) ||
(ord($data[$pos+3]) != 0) ||
((ord($data[$pos+4]) & 0x80) != 0)) {
throw new InvalidFileFormatException('Largest supported segment size (for 32-bit mode) is 2Gb');
}
$indexTermCount = ord($data[$pos+4]) << 24 |
ord($data[$pos+5]) << 16 |
ord($data[$pos+6]) << 8 |
ord($data[$pos+7]);
}
$pos += 8;
// The 4-byte index interval is skipped without being read
// $tiiFile->readInt(); // IndexInterval
$pos += 4;
// $skipInterval = $tiiFile->readInt();
$skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
$pos += 4;
if ($indexTermCount < 1) {
throw new InvalidFileFormatException('Wrong number of terms in a term dictionary index');
}
if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
/* Skip MaxSkipLevels value */
$pos += 4;
}
// Term text is prefix-compressed against the previous term; the three pointers are
// stored as deltas and accumulated across entries
$prevTerm = '';
$freqPointer = 0;
$proxPointer = 0;
$indexPointer = 0;
for ($count = 0; $count < $indexTermCount; $count++) {
// Every VInt below is decoded the same way: the low 7 bits of each byte are data,
// least significant group first, and a set high bit means that another byte follows
//$termPrefixLength = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$termPrefixLength = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$termPrefixLength |= ($nbyte & 0x7F) << $shift;
}
// String: a VInt length followed by the encoded characters
// $termSuffix = $tiiFile->readString();
$nbyte = ord($data[$pos++]);
$len = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$len |= ($nbyte & 0x7F) << $shift;
}
if ($len == 0) {
$termSuffix = '';
} else {
// Start with $len bytes; whenever a byte with both top bits set (a multi-byte lead
// byte) is met, the additional bytes of that character are appended and $len grows
$termSuffix = substr($data, $pos, $len);
$pos += $len;
for ($count1 = 0; $count1 < $len; $count1++ ) {
if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
$addBytes = 1;
if (ord($termSuffix[$count1]) & 0x20 ) {
$addBytes++;
// Never used for Java Lucene created index.
// Java2 doesn't encode strings in four bytes
if (ord($termSuffix[$count1]) & 0x10 ) {
$addBytes++;
}
}
$termSuffix .= substr($data, $pos, $addBytes);
$pos += $addBytes;
$len += $addBytes;
// Check for null character. Java2 encodes null character
// in two bytes.
if (ord($termSuffix[$count1]) == 0xC0 &&
ord($termSuffix[$count1+1]) == 0x80 ) {
// NOTE(review): assigning the integer 0 to a string offset stores the character "0",
// not a NUL byte - confirm whether "\0" was intended and whether other readers of the
// index rely on the current behaviour before changing it
$termSuffix[$count1] = 0;
$termSuffix = substr($termSuffix,0,$count1+1)
. substr($termSuffix,$count1+2);
}
$count1 += $addBytes;
}
}
}
// Convert the shared prefix length, counted in characters ($pc), into a byte
// length ($pb) within the previous term
$pb = 0; $pc = 0;
while ($pb < strlen($prevTerm) && $pc < $termPrefixLength) {
$charBytes = 1;
if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
$charBytes++;
if (ord($prevTerm[$pb]) & 0x20 ) {
$charBytes++;
if (ord($prevTerm[$pb]) & 0x10 ) {
$charBytes++;
}
}
}
// NOTE(review): the bound is the length of $data although the bytes are read from
// $prevTerm - presumably strlen($prevTerm) was meant; verify before changing
if ($pb + $charBytes > strlen($data)) {
// wrong character
break;
}
$pc++;
$pb += $charBytes;
}
$termValue = substr($prevTerm, 0, $pb) . $termSuffix;
// $termFieldNum = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$termFieldNum = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$termFieldNum |= ($nbyte & 0x7F) << $shift;
}
// $docFreq = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$docFreq = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$docFreq |= ($nbyte & 0x7F) << $shift;
}
// $freqPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$freqPointer += $vint;
// $proxPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$proxPointer += $vint;
// A skip delta is stored only for terms whose document frequency reaches the skip interval
if( $docFreq >= $skipInterval ) {
// $skipDelta = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$skipDelta = $vint;
} else {
$skipDelta = 0;
}
// $indexPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$indexPointer += $vint;
$termDictionary[] = array($termFieldNum, $termValue);
$termInfos[] =
array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
$prevTerm = $termValue;
}
// Check special index entry mark: the first entry must carry field number 0xFFFFFFFF
if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
throw new InvalidFileFormatException('Wrong TermInfoIndexFile file format');
}
if (PHP_INT_SIZE > 4) {
// Treat 64-bit 0xFFFFFFFF as -1
$termDictionary[0][0] = -1;
}
return array($termDictionary, $termInfos);
}
}

View File

@@ -1,46 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
/**
 * A Zend_Search_Lucene_Index_DocsFilter is used to filter documents while searching.
 *
 * It is only a hint that an upper query limits the search result to the
 * listed documents: it may or may NOT be used for actual filtering.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class DocsFilter
{
    /**
     * Per-segment document filters.
     *
     * The outer array is keyed by segment name; each inner array is keyed by
     * document id, and the inner values are undefined (only the keys matter):
     *
     *   array(
     *       <segmentName> => array(<docId> => <undefined_value>, ...),
     *       <segmentName> => array(<docId> => <undefined_value>, ...),
     *       ...
     *   )
     *
     * @var array
     */
    public $segmentFilters = array();
}

View File

@@ -1,36 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
/**
 * Plain value holder describing a single field of an index segment.
 *
 * All members are public and are read and modified directly by the
 * segment writers.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class FieldInfo
{
    /** Field name */
    public $name;

    /** Flag: the field is indexed */
    public $isIndexed;

    /** Field number within the segment */
    public $number;

    /** Flag: term vectors are stored for the field */
    public $storeTermVector;

    /** Flag: norms are omitted for the field (false unless given) */
    public $normsOmitted;

    /** Flag: payloads are stored for the field (false unless given) */
    public $payloadsStored;

    /**
     * Object constructor. Copies every argument into the property of the same name.
     *
     * @param mixed $name
     * @param mixed $isIndexed
     * @param mixed $number
     * @param mixed $storeTermVector
     * @param mixed $normsOmitted
     * @param mixed $payloadsStored
     */
    public function __construct($name, $isIndexed, $number, $storeTermVector, $normsOmitted = false, $payloadsStored = false)
    {
        // Identity of the field
        $this->number = $number;
        $this->name   = $name;

        // Mandatory flags
        $this->storeTermVector = $storeTermVector;
        $this->isIndexed       = $isIndexed;

        // Optional flags
        $this->payloadsStored = $payloadsStored;
        $this->normsOmitted   = $normsOmitted;
    }
}

View File

@@ -1,254 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Storage\Directory;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
*/
class SegmentMerger
{
/**
* Target segment writer. Created in the constructor; merge() returns the
* result of its close() call.
*
* @var \ZendSearch\Lucene\Index\SegmentWriter\StreamWriter
*/
private $_writer;
/**
* Number of docs in a new segment.
* Reset and counted in _mergeStoredFields(); deleted documents are not counted.
*
* @var integer
*/
private $_docCount;
/**
* A set of segments to be merged: an array of
* \ZendSearch\Lucene\Index\SegmentInfo objects keyed by segment name
* (filled by addSource()).
*
* @var array
*/
private $_segmentInfos = array();
/**
* Flag to signal that the merge is already done. merge() throws if it is
* called again once this is set.
*
* @var boolean
*/
private $_mergeDone = false;
/**
* Field map
* [<segment_name>][<field_number>] => <target_field_number>
*
* Filled by _mergeFields().
*
* @var array
*/
private $_fieldsMap = array();
/**
 * Object constructor.
 *
 * Sets up a stream writer that will produce the segment called $name
 * inside $directory; the source segments are merged into that writer.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory  target directory
 * @param string $name  name of the new segment
 */
public function __construct(Directory\DirectoryInterface $directory, $name)
{
    $targetWriter = new SegmentWriter\StreamWriter($directory, $name);

    $this->_writer = $targetWriter;
}
/**
 * Add a segment to the collection of segments to be merged.
 *
 * Sources are stored under their segment name, so adding a segment whose
 * name is already registered replaces the earlier entry.
 *
 * @param \ZendSearch\Lucene\Index\SegmentInfo $segmentInfo
 */
public function addSource(SegmentInfo $segmentInfo)
{
    $segmentName = $segmentInfo->getName();

    $this->_segmentInfos[$segmentName] = $segmentInfo;
}
/**
 * Do merge.
 *
 * Merges field infos, norms, stored fields and terms of all added source
 * segments (in that order) into the target writer and then closes the writer.
 * May be called only once per merger object.
 *
 * @return \ZendSearch\Lucene\Index\SegmentInfo|null  the value returned by the writer's close()
 * @throws \ZendSearch\Lucene\Exception\RuntimeException  if the merge was already done
 *                                                        or no source segment was added
 */
public function merge()
{
    if ($this->_mergeDone) {
        throw new RuntimeException('Merge is already done.');
    }

    $sourceCount = count($this->_segmentInfos);
    if ($sourceCount < 1) {
        throw new RuntimeException('Wrong number of segments to be merged (' . $sourceCount . ').');
    }

    $this->_mergeFields();
    $this->_mergeNorms();
    $this->_mergeStoredFields();
    $this->_mergeTerms();

    $this->_mergeDone = true;

    $newSegment = $this->_writer->close();

    return $newSegment;
}
/**
 * Merge fields information.
 *
 * Registers every field of every source segment with the target writer and
 * records, per source segment, which target field number each source field
 * number was mapped to.
 */
private function _mergeFields()
{
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $sourceFields = $segmentInfo->getFieldInfos();

        foreach ($sourceFields as $sourceField) {
            $targetNumber = $this->_writer->addFieldInfo($sourceField);

            $this->_fieldsMap[$segName][$sourceField->number] = $targetNumber;
        }
    }
}
/**
 * Merge field normalization factors.
 *
 * For every indexed field known to the target writer, the norm vectors of
 * the source segments are appended in source order. Bytes belonging to
 * deleted documents are left out.
 */
private function _mergeNorms()
{
    foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
        if (!$fieldInfo->isIndexed) {
            continue;
        }

        foreach ($this->_segmentInfos as $segmentInfo) {
            if (!$segmentInfo->hasDeletions()) {
                // No deletions: the source vector can be copied as a whole
                $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
                continue;
            }

            // Copy one byte per surviving document
            $sourceNorms = $segmentInfo->normVector($fieldInfo->name);
            $keptNorms   = '';
            $docTotal    = $segmentInfo->count();

            for ($docId = 0; $docId < $docTotal; $docId++) {
                if ($segmentInfo->isDeleted($docId)) {
                    continue;
                }
                $keptNorms .= $sourceNorms[$docId];
            }

            $this->_writer->addNorm($fieldInfo->name, $keptNorms);
        }
    }
}
/**
 * Merge stored fields.
 *
 * Reads the '.fdt' data of every source segment document by document and
 * passes the stored fields of each non-deleted document to the target
 * writer. Records of deleted documents are still read, so that the file
 * position advances, but they are not written.
 */
private function _mergeStoredFields()
{
    $this->_docCount = 0;

    foreach ($this->_segmentInfos as $segmentInfo) {
        $fdtFile = $segmentInfo->openCompoundFile('.fdt');

        for ($docId = 0; $docId < $segmentInfo->count(); $docId++) {
            $storedFields = array();
            $fieldCount   = $fdtFile->readVInt();

            for ($fieldIndex = 0; $fieldIndex < $fieldCount; $fieldIndex++) {
                $fieldNum  = $fdtFile->readVInt();
                $bits      = $fdtFile->readByte();
                $fieldInfo = $segmentInfo->getField($fieldNum);

                // Bit 0x02 marks binary data, bit 0x01 is passed on unchanged
                // (AbstractSegmentWriter::addStoredFields() uses it for "tokenized")
                if ($bits & 2) {
                    $storedField = new Document\Field($fieldInfo->name,
                                                      $fdtFile->readBinary(),
                                                      '',
                                                      true,
                                                      $fieldInfo->isIndexed,
                                                      $bits & 1,
                                                      true);
                } else {
                    $storedField = new Document\Field($fieldInfo->name,
                                                      $fdtFile->readString(),
                                                      'UTF-8',
                                                      true,
                                                      $fieldInfo->isIndexed,
                                                      $bits & 1);
                }

                $storedFields[] = $storedField;
            }

            if ($segmentInfo->isDeleted($docId)) {
                continue;
            }

            $this->_docCount++;
            $this->_writer->addStoredFields($storedFields);
        }
    }
}
/**
 * Merge term dictionaries.
 *
 * All source segments with at least one term are put into a priority queue
 * ordered by their current term. Position lists of equal terms coming from
 * different segments are collected into one array and written as a single
 * term once the next queue entry carries a different term.
 */
private function _mergeTerms()
{
    $queue       = new TermsPriorityQueue();
    $nextStartId = 0;

    foreach ($this->_segmentInfos as $segmentInfo) {
        $nextStartId = $segmentInfo->resetTermsStream($nextStartId, SegmentInfo::SM_MERGE_INFO);

        // Skip "empty" segments
        if ($segmentInfo->currentTerm() !== null) {
            $queue->put($segmentInfo);
        }
    }

    $this->_writer->initializeDictionaryFiles();

    $termDocs = array();
    while (($segmentInfo = $queue->pop()) !== null) {
        // Merge positions array
        $termDocs += $segmentInfo->currentTermPositions();

        $nextSegment = $queue->top();
        $termIsComplete = $nextSegment === null
                       || $nextSegment->currentTerm()->key() != $segmentInfo->currentTerm()->key();

        if ($termIsComplete) {
            // No other segment carries the same term: flush what was collected
            ksort($termDocs, SORT_NUMERIC);

            // Add term if it's contained in any document
            if (count($termDocs) > 0) {
                $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
            }

            $termDocs = array();
        }

        $segmentInfo->nextTerm();

        // Put the segment back into the queue unless its dictionary is finished
        if ($segmentInfo->currentTerm() !== null) {
            $queue->put($segmentInfo);
        }
    }

    $this->_writer->closeDictionaryFiles();
}
}

View File

@@ -1,613 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index\SegmentWriter;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Storage\Directory;
use ZendSearch\Lucene\Storage\File;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
*/
abstract class AbstractSegmentWriter
{
/**
* Expert: The fraction of terms in the "dictionary" which should be stored
* in RAM. Smaller values use more memory, but make searching slightly
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.
*
* addTerm() writes every $indexInterval-th term to the '.tii' index file.
*
* @var integer
*/
public static $indexInterval = 128;
/**
* Expert: The fraction of TermDocs entries stored in skip tables.
* Larger values result in smaller indexes, greater acceleration, but fewer
* accelerable cases, while smaller values result in bigger indexes,
* less acceleration and more
* accelerable cases. More detailed experiments would be useful here.
*
* 0x7FFFFFFF indicates that we don't use skip data
*
* Note: not used in current implementation
*
* @var integer
*/
public static $skipInterval = 0x7FFFFFFF;
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*
* 0 indicates that we don't use skip data
*
* Note: not used in current implementation
*
* @var integer
*/
public static $maxSkipLevels = 0;
/**
* Number of docs in a segment. Incremented by addStoredFields().
*
* @var integer
*/
protected $_docCount = 0;
/**
* Segment name. Used as the base name of every file of the segment.
*
* @var string
*/
protected $_name;
/**
* File system adapter.
*
* @var \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
*/
protected $_directory;
/**
* List of the index files (file names).
* Used for automatic compound file generation
*
* @var array
*/
protected $_files = array();
/**
* Segment fields. Array of \ZendSearch\Lucene\Index\FieldInfo objects for
* this segment, keyed by field name.
*
* @var array
*/
protected $_fields = array();
/**
* Normalization factors.
* An array fieldName => normVector
* normVector is a binary string.
* Each byte corresponds to an indexed document in a segment and
* encodes normalization factor (float value, encoded by
* \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity::encodeNorm())
*
* @var array
*/
protected $_norms = array();
/**
* '.fdx' file - Stored Fields, the field index.
* Null until the stored fields files are created.
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
protected $_fdxFile = null;
/**
* '.fdt' file - Stored Fields, the field data.
* Null until the stored fields files are created.
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
protected $_fdtFile = null;
/**
 * Object constructor.
 *
 * Only remembers the target directory and the segment name; no file is
 * created here.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
 * @param string $name  segment name
 */
public function __construct(Directory\DirectoryInterface $directory, $name)
{
    $this->_name      = $name;
    $this->_directory = $directory;
}
/**
 * Add field to the segment
 *
 * An unknown field name is registered under the next free field number.
 * For a known name the 'isIndexed' and 'storeTermVector' flags of the
 * registered field are OR-ed with the flags of $field.
 *
 * @param \ZendSearch\Lucene\Document\Field $field
 * @return integer  actual field number
 */
public function addField(Document\Field $field)
{
    if (isset($this->_fields[$field->name])) {
        // Known field: a flag that was set once stays set
        $registered = $this->_fields[$field->name];

        $registered->isIndexed       |= $field->isIndexed;
        $registered->storeTermVector |= $field->storeTermVector;

        return $registered->number;
    }

    // New field: numbers are assigned sequentially
    $fieldNumber = count($this->_fields);

    $this->_fields[$field->name] = new Index\FieldInfo($field->name,
                                                       $field->isIndexed,
                                                       $fieldNumber,
                                                       $field->storeTermVector);

    return $fieldNumber;
}
/**
 * Add fieldInfo to the segment
 *
 * Works like addField(), but takes the name and flags from a FieldInfo
 * object. The given object is not stored: a new FieldInfo with this
 * segment's own field number is created, carrying only the name,
 * 'isIndexed' and 'storeTermVector'.
 *
 * @param \ZendSearch\Lucene\Index\FieldInfo $fieldInfo
 * @return integer  actual field number
 */
public function addFieldInfo(Index\FieldInfo $fieldInfo)
{
    if (isset($this->_fields[$fieldInfo->name])) {
        // Known field: a flag that was set once stays set
        $registered = $this->_fields[$fieldInfo->name];

        $registered->isIndexed       |= $fieldInfo->isIndexed;
        $registered->storeTermVector |= $fieldInfo->storeTermVector;

        return $registered->number;
    }

    // New field: numbers are assigned sequentially
    $fieldNumber = count($this->_fields);

    $this->_fields[$fieldInfo->name] = new Index\FieldInfo($fieldInfo->name,
                                                           $fieldInfo->isIndexed,
                                                           $fieldNumber,
                                                           $fieldInfo->storeTermVector);

    return $fieldNumber;
}
/**
* Returns the FieldInfo objects registered so far (by addField() or
* addFieldInfo()), keyed by field name.
*
* @return array
*/
public function getFieldInfos()
{
return $this->_fields;
}
/**
 * Add stored fields information
 *
 * Writes one document record: the current '.fdt' position goes to the
 * '.fdx' file, then the field count and every field (field number, flag
 * byte, value) go to the '.fdt' file. Both files are created on first use.
 * Each call counts as one document of the segment.
 *
 * Every field must already be registered with this writer (see addField()).
 *
 * @param array $storedFields array of \ZendSearch\Lucene\Document\Field objects
 */
public function addStoredFields($storedFields)
{
    if (!isset($this->_fdxFile)) {
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    $this->_fdxFile->writeLong($this->_fdtFile->tell());
    $this->_fdtFile->writeVInt(count($storedFields));

    foreach ($storedFields as $field) {
        $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

        // 0x01 - tokenized, 0x02 - binary; 0x04 (compressed, ZLIB) is never set
        $fieldBits = 0x00;
        if ($field->isTokenized) {
            $fieldBits |= 0x01;
        }
        if ($field->isBinary) {
            $fieldBits |= 0x02;
        }
        $this->_fdtFile->writeByte($fieldBits);

        if (!$field->isBinary) {
            $this->_fdtFile->writeString($field->getUtf8Value());
            continue;
        }

        // Binary value: byte length followed by the raw bytes
        $this->_fdtFile->writeVInt(strlen($field->value));
        $this->_fdtFile->writeBytes($field->value);
    }

    $this->_docCount++;
}
/**
* Returns the total number of documents in this segment, i.e. the number of
* document records written by addStoredFields() so far.
*
* @return integer
*/
public function count()
{
return $this->_docCount;
}
/**
* Return segment name (the name passed to the constructor)
*
* @return string
*/
public function getName()
{
return $this->_name;
}
/**
 * Dump Field Info (.fnm) and norms (.nrm) segment files
 *
 * The '.fnm' file gets the field count followed by name and flag byte of
 * every field. The '.nrm' file gets a 'NRM' header, a format byte and the
 * norm vector of every indexed field, in field order. Both file names are
 * appended to the list of segment files.
 */
protected function _dumpFNM()
{
    $fnmName = $this->_name . '.fnm';
    $nrmName = $this->_name . '.nrm';

    $fnmFile = $this->_directory->createFile($fnmName);
    $fnmFile->writeVInt(count($this->_fields));

    $nrmFile = $this->_directory->createFile($nrmName);
    // Write header
    $nrmFile->writeBytes('NRM');
    // Write format specifier
    $nrmFile->writeByte((int)0xFF);

    foreach ($this->_fields as $field) {
        // 0x01 - indexed, 0x02 - term vector stored.
        // 0x04 (term positions) and 0x08 (term offsets) are not supported yet
        $flags = 0x00;
        if ($field->isIndexed) {
            $flags |= 0x01;
        }
        if ($field->storeTermVector) {
            $flags |= 0x02;
        }

        $fnmFile->writeString($field->name);
        $fnmFile->writeByte($flags);

        if ($field->isIndexed) {
            // All norms go into the single '.nrm' file
            // (the pre-2.1 layout with one '.f<number>' file per field is not written)
            $nrmFile->writeBytes($this->_norms[$field->name]);
        }
    }

    $this->_files[] = $fnmName;
    $this->_files[] = $nrmName;
}
/**
* Term Dictionary file ('.tis'), created by initializeDictionaryFiles()
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
private $_tisFile = null;
/**
* Term Dictionary index file ('.tii'), created by initializeDictionaryFiles()
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
private $_tiiFile = null;
/**
* Frequencies file ('.frq'), created by initializeDictionaryFiles()
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
private $_frqFile = null;
/**
* Positions file ('.prx'), created by initializeDictionaryFiles()
*
* @var \ZendSearch\Lucene\Storage\File\FileInterface
*/
private $_prxFile = null;
/**
* Number of written terms
*
* @var integer
*/
private $_termCount;
/**
* Last saved term (last term written to the '.tis' file)
*
* @var \ZendSearch\Lucene\Index\Term
*/
private $_prevTerm;
/**
* Last saved term info (belongs to $_prevTerm)
*
* @var \ZendSearch\Lucene\Index\TermInfo
*/
private $_prevTermInfo;
/**
* Last saved index term (last term written to the '.tii' file)
*
* @var \ZendSearch\Lucene\Index\Term
*/
private $_prevIndexTerm;
/**
* Last saved index term info (belongs to $_prevIndexTerm)
*
* @var \ZendSearch\Lucene\Index\TermInfo
*/
private $_prevIndexTermInfo;
/**
* Last term dictionary file position: the '.tis' position recorded when the
* previous index entry was written. Used to compute the index delta.
*
* @var integer
*/
private $_lastIndexPosition;
/**
 * Create dictionary, frequency and positions files and write necessary headers
 *
 * The '.tis' and '.tii' files get the same header; the terms count in it is
 * a placeholder that closeDictionaryFiles() overwrites. The '.tii' file
 * additionally gets the special first index entry (empty term, field
 * number 0xFFFFFFFF). All "previous term" state is reset.
 */
public function initializeDictionaryFiles()
{
    $baseName = $this->_name;

    $this->_tisFile = $this->_directory->createFile($baseName . '.tis');
    $this->_writeDictionaryHeader($this->_tisFile);

    $this->_tiiFile = $this->_directory->createFile($baseName . '.tii');
    $this->_writeDictionaryHeader($this->_tiiFile);

    /** Dump dictionary header */
    $this->_tiiFile->writeVInt(0);                   // prefix length
    $this->_tiiFile->writeString('');                // suffix
    $this->_tiiFile->writeInt((int)0xFFFFFFFF);      // field number
    $this->_tiiFile->writeByte((int)0x0F);
    $this->_tiiFile->writeVInt(0);                   // DocFreq
    $this->_tiiFile->writeVInt(0);                   // FreqDelta
    $this->_tiiFile->writeVInt(0);                   // ProxDelta
    // IndexDelta; NOTE(review): 24 is presumably the byte size of the header
    // written above (int + long + 3 ints) - confirm against the storage layer
    $this->_tiiFile->writeVInt(24);

    $this->_frqFile = $this->_directory->createFile($baseName . '.frq');
    $this->_prxFile = $this->_directory->createFile($baseName . '.prx');

    foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
        $this->_files[] = $baseName . $extension;
    }

    $this->_prevTerm          = null;
    $this->_prevTermInfo      = null;
    $this->_prevIndexTerm     = null;
    $this->_prevIndexTermInfo = null;

    $this->_lastIndexPosition = 24;
    $this->_termCount         = 0;
}

/**
 * Write the header shared by the '.tis' and '.tii' files: format marker,
 * terms count placeholder, index interval, skip interval, max skip levels.
 *
 * @param \ZendSearch\Lucene\Storage\File\FileInterface $dicFile
 */
private function _writeDictionaryHeader($dicFile)
{
    $dicFile->writeInt((int)0xFFFFFFFD);
    $dicFile->writeLong(0 /* dummy data for terms count */);
    $dicFile->writeInt(self::$indexInterval);
    $dicFile->writeInt(self::$skipInterval);
    $dicFile->writeInt(self::$maxSkipLevels);
}
/**
 * Add term
 *
 * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * Writes the postings of the term to the '.frq' and '.prx' files and a
 * dictionary entry to the '.tis' file. Every self::$indexInterval-th term
 * is also written to the '.tii' file. initializeDictionaryFiles() has to be
 * called first. Document ids and positions are written as deltas to the
 * previous value, in the order the arrays are iterated.
 *
 * @param \ZendSearch\Lucene\Index\Term $termEntry  term with a field name in its 'field' property
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $freqPointer = $this->_frqFile->tell();
    $proxPointer = $this->_prxFile->tell();

    $lastDocId = 0;
    foreach ($termDocs as $docId => $positions) {
        // The doc delta is shifted left by one bit; a set low bit means
        // that no separate frequency value follows
        $docCode   = ($docId - $lastDocId)*2;
        $lastDocId = $docId;

        $frequency = count($positions);
        if ($frequency > 1) {
            $this->_frqFile->writeVInt($docCode);
            $this->_frqFile->writeVInt($frequency);
        } else {
            $this->_frqFile->writeVInt($docCode + 1);
        }

        $lastPosition = 0;
        foreach ($positions as $position) {
            $this->_prxFile->writeVInt($position - $lastPosition);
            $lastPosition = $position;
        }
    }

    $docFreq = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFreq >= self::$skipInterval)
                ? $this->_frqFile->tell() - $freqPointer
                : 0;

    // In dictionary files the term carries the field number instead of the field name
    $term     = new Index\Term($termEntry->text, $this->_fields[$termEntry->field]->number);
    $termInfo = new Index\TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

    $isIndexTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
    if ($isIndexTerm) {
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

        // The index entry ends with the '.tis' position delta
        $indexPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $indexPosition;
    }

    $this->_termCount++;
}
/**
 * Close dictionary
 *
 * Replaces the terms count placeholders in the '.tis' and '.tii' headers
 * (at offset 4) with the actual numbers. The files themselves are not
 * closed here.
 */
public function closeDictionaryFiles()
{
    $termCount = $this->_termCount;

    $this->_tisFile->seek(4);
    $this->_tisFile->writeLong($termCount);

    // + 1 is used to count an additional special index entry (empty term at the start of the list)
    $indexTermCount = ($termCount - $termCount % self::$indexInterval)/self::$indexInterval + 1;

    $this->_tiiFile->seek(4);
    $this->_tiiFile->writeLong($indexTermCount);
}
/**
 * Dump Term Dictionary segment file entry.
 * Used to write entry to .tis or .tii files
 *
 * The term text is written as the number of leading characters shared with
 * the previous term of the same field, followed by the remaining suffix.
 * Frequency and proximity pointers are written as deltas to the previous
 * term info, or as absolute values if there is none.
 *
 * $prevTerm and $prevTermInfo are passed by reference and are replaced by
 * $term and $termInfo.
 *
 * @param \ZendSearch\Lucene\Storage\File\FileInterface $dicFile
 * @param \ZendSearch\Lucene\Index\Term $prevTerm
 * @param \ZendSearch\Lucene\Index\Term $term
 * @param \ZendSearch\Lucene\Index\TermInfo $prevTermInfo
 * @param \ZendSearch\Lucene\Index\TermInfo $termInfo
 */
protected function _dumpTermDictEntry(File\FileInterface $dicFile,
                                      &$prevTerm,     Index\Term     $term,
                                      &$prevTermInfo, Index\TermInfo $termInfo)
{
    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        // Length of the common byte prefix of both texts
        $byteLimit    = min(strlen($prevTerm->text), strlen($term->text));
        $matchedBytes = 0;
        while ($matchedBytes < $byteLimit &&
               $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
            $matchedBytes++;
        }

        // Calculate actual matched UTF-8 pattern:
        // only whole characters inside the matched bytes are counted
        $prefixBytes = 0;
        $prefixChars = 0;
        while ($prefixBytes < $matchedBytes) {
            $leadByte  = ord($term->text[$prefixBytes]);
            $charBytes = 1;
            if (($leadByte & 0xC0) == 0xC0) {
                $charBytes = 2;
                if ($leadByte & 0x20) {
                    $charBytes = 3;
                    if ($leadByte & 0x10) {
                        $charBytes = 4;
                    }
                }
            }

            if ($prefixBytes + $charBytes > $matchedBytes) {
                // char crosses matched bytes boundary
                // skip char
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        $prefixLength = $prefixChars;
        $suffix       = substr($term->text, $prefixBytes);
    } else {
        // First term or a different field: nothing is shared
        $prefixLength = 0;
        $suffix       = $term->text;
    }

    // Write prefix length
    $dicFile->writeVInt($prefixLength);
    // Write suffix
    $dicFile->writeString($suffix);

    // Write field number
    $dicFile->writeVInt($term->field);

    // DocFreq (the count of documents which contain the term)
    $dicFile->writeVInt($termInfo->docFreq);

    $prevTerm = $term;

    $freqDelta = $termInfo->freqPointer;
    $proxDelta = $termInfo->proxPointer;
    if (isset($prevTermInfo)) {
        $freqDelta = $termInfo->freqPointer - $prevTermInfo->freqPointer;
        $proxDelta = $termInfo->proxPointer - $prevTermInfo->proxPointer;
    }

    // Write FreqDelta
    $dicFile->writeVInt($freqDelta);
    // Write ProxDelta
    $dicFile->writeVInt($proxDelta);

    // Write SkipOffset - addTerm() sets it only when docFreq >= self::$skipInterval
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}
/**
 * Generate compound index file
 *
 * Packs all files registered in $this->_files into '<name>.cfs'. The file
 * starts with the file count and a directory of (data offset, file name)
 * entries. The offsets are written as placeholders first and patched once
 * the actual position of each file's data is known. Every source file is
 * deleted from the directory after it has been copied.
 *
 * @throws \ZendSearch\Lucene\Exception\RuntimeException  if a source file yields no more data
 *                                                        before its reported length was copied
 */
protected function _generateCFS()
{
    $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
    $cfsFile->writeVInt(count($this->_files));

    $dataOffsetPointers = array();
    foreach ($this->_files as $fileName) {
        $dataOffsetPointers[$fileName] = $cfsFile->tell();
        $cfsFile->writeLong(0); // write dummy data
        $cfsFile->writeString($fileName);
    }

    foreach ($this->_files as $fileName) {
        // Get actual data offset
        $dataOffset = $cfsFile->tell();
        // Seek to the data offset pointer
        $cfsFile->seek($dataOffsetPointers[$fileName]);
        // Write actual data offset value
        $cfsFile->writeLong($dataOffset);
        // Seek back to the end of file
        $cfsFile->seek($dataOffset);

        $dataFile  = $this->_directory->getFileObject($fileName);
        $byteCount = $this->_directory->fileLength($fileName);

        while ($byteCount > 0) {
            $data       = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
            $readLength = strlen($data);

            if ($readLength == 0) {
                // Without this guard $byteCount would never decrease and
                // the loop would spin forever on a short read
                throw new \ZendSearch\Lucene\Exception\RuntimeException(
                    "Unexpected end of file '" . $fileName . "' while generating compound file."
                );
            }

            $byteCount -= $readLength;
            $cfsFile->writeBytes($data);
        }

        $this->_directory->deleteFile($fileName);
    }
}
/**
* Close segment, write it to disk and return segment info
*
* Note: the implementations in this package (DocumentWriter, StreamWriter)
* return null instead of a segment info if no document was added.
*
* @return \ZendSearch\Lucene\Index\SegmentInfo
*/
abstract public function close();
}

View File

@@ -1,213 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index\SegmentWriter;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Exception as LuceneException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Similarity\AbstractSimilarity;
use ZendSearch\Lucene\Storage\Directory;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
*/
class DocumentWriter extends AbstractSegmentWriter
{
/**
* Term Dictionary
* Array of \ZendSearch\Lucene\Index\Term objects, keyed by term key
* (see \ZendSearch\Lucene\Index\Term::key()).
* The documents and positions of each term are stored in $_termDocs
* under the same key.
*
* @var array
*/
protected $_termDictionary;
/**
* Documents, which contain the term:
* array( termKey => array( docId => array(pos1, pos2, ...), ... ), ... )
*
* @var array
*/
protected $_termDocs;
/**
 * Object constructor.
 *
 * Starts with an empty term dictionary and empty postings.
 *
 * @param Directory\DirectoryInterface $directory
 * @param string $name  segment name
 */
public function __construct(Directory\DirectoryInterface $directory, $name)
{
    parent::__construct($directory, $name);

    $this->_termDictionary = array();
    $this->_termDocs       = array();
}
/**
 * Adds a document to this segment.
 *
 * Indexed fields are turned into terms: tokenized fields through the
 * default analyzer, all other indexed fields as one term holding the whole
 * UTF-8 value. An indexed field that produces no term (no tokens or an
 * empty value) is handled through a copy that is marked as non-indexed and
 * non-tokenized. Afterwards one norm byte is appended for every indexed
 * field of the segment and the stored fields of the document are written.
 *
 * @param \ZendSearch\Lucene\Document $document
 * @throws LuceneException\UnsupportedMethodCallException  if a field asks for term vector storing
 */
public function addDocument(Document $document)
{
    $storedFields = array();
    $docNorms     = array();
    $similarity   = AbstractSimilarity::getDefault();

    foreach ($document->getFieldNames() as $fieldName) {
        $field = $document->getField($fieldName);

        if ($field->storeTermVector) {
            /**
             * @todo term vector storing support
             */
            throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.');
        }

        if ($field->isIndexed) {
            // Number of terms the field contributes to this document
            $termCount = 0;

            if ($field->isTokenized) {
                $analyzer = Analyzer\Analyzer::getDefault();
                $analyzer->setInput($field->value, $field->encoding);

                $position = 0;
                while (($token = $analyzer->nextToken()) !== null) {
                    $termCount++;

                    $term      = new Index\Term($token->getTermText(), $field->name);
                    $position += $token->getPositionIncrement();

                    $this->_addTermPosition($term, $position);
                }
            } else {
                $fieldUtf8Value = $field->getUtf8Value();

                if ($fieldUtf8Value != '') {
                    $termCount = 1;
                    $this->_addTermPosition(new Index\Term($fieldUtf8Value, $field->name), 0);
                }
            }

            if ($termCount == 0) {
                // Field contains empty value. Treat it as non-indexed and non-tokenized
                $field = clone($field);
                $field->isIndexed = $field->isTokenized = false;
            } else {
                $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
                                                                                                $termCount)*
                                                                       $document->boost*
                                                                       $field->boost ));
            }
        }

        if ($field->isStored) {
            $storedFields[] = $field;
        }

        $this->addField($field);
    }

    foreach ($this->_fields as $fieldName => $field) {
        if (!$field->isIndexed) {
            continue;
        }

        if (!isset($this->_norms[$fieldName])) {
            // Field is new to the segment: pad the norms of all earlier documents
            $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                                                   $this->_docCount);
        }

        // Documents without the field get the norm of a zero-length field
        $this->_norms[$fieldName] .= isset($docNorms[$fieldName])
                                   ? $docNorms[$fieldName]
                                   : chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
    }

    $this->addStoredFields($storedFields);
}

/**
 * Record one occurrence of a term in the document being added.
 *
 * @param \ZendSearch\Lucene\Index\Term $term
 * @param integer $position  position of the term within its field
 */
private function _addTermPosition(Index\Term $term, $position)
{
    $termKey = $term->key();

    if (!isset($this->_termDictionary[$termKey])) {
        // New term
        $this->_termDictionary[$termKey] = $term;
        $this->_termDocs[$termKey] = array();
        $this->_termDocs[$termKey][$this->_docCount] = array();
    } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
        // Existing term, but new term entry
        $this->_termDocs[$termKey][$this->_docCount] = array();
    }

    $this->_termDocs[$termKey][$this->_docCount][] = $position;
}
/**
 * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
 *
 * Terms are written in the string order of their keys; the '.frq' and
 * '.prx' files are written along the way by addTerm().
 */
protected function _dumpDictionary()
{
    ksort($this->_termDictionary, SORT_STRING);

    $this->initializeDictionaryFiles();

    foreach (array_keys($this->_termDictionary) as $termKey) {
        $this->addTerm($this->_termDictionary[$termKey], $this->_termDocs[$termKey]);
    }

    $this->closeDictionaryFiles();
}
/**
 * Close segment, write it to disk and return segment info
 *
 * Writes the field infos and norms, the term dictionary and finally packs
 * everything into a compound file. Nothing is written for a segment
 * without documents.
 *
 * @return \ZendSearch\Lucene\Index\SegmentInfo|null  null if no document was added
 */
public function close()
{
    $isEmpty = ($this->_docCount == 0);
    if ($isEmpty) {
        return null;
    }

    $this->_dumpFNM();
    $this->_dumpDictionary();
    $this->_generateCFS();

    $segmentInfo = new Index\SegmentInfo($this->_directory,
                                         $this->_name,
                                         $this->_docCount,
                                         -1,
                                         null,
                                         true,
                                         true);

    return $segmentInfo;
}
}

View File

@@ -1,79 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index\SegmentWriter;
use ZendSearch\Lucene\Index as LuceneIndex;
use ZendSearch\Lucene\Storage\Directory;
/**
 * Segment writer that is fed with already prepared segment data (field
 * infos, norm vectors, stored fields, terms) instead of documents.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class StreamWriter extends AbstractSegmentWriter
{
    /**
     * Object constructor.
     *
     * @param Directory\DirectoryInterface $directory
     * @param string $name  segment name
     */
    public function __construct(Directory\DirectoryInterface $directory, $name)
    {
        parent::__construct($directory, $name);
    }

    /**
     * Create stored fields files ('.fdx' and '.fdt') and register them in
     * the list of segment files
     */
    public function createStoredFieldsFiles()
    {
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    /**
     * Append a norm vector to the norms collected for a field.
     *
     * @param string $fieldName
     * @param string $normVector  binary string
     */
    public function addNorm($fieldName, $normVector)
    {
        if (!isset($this->_norms[$fieldName])) {
            // First vector for this field
            $this->_norms[$fieldName] = $normVector;
            return;
        }

        $this->_norms[$fieldName] .= $normVector;
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Writes the field infos and norms and packs the segment files into a
     * compound file. The dictionary files are not written here.
     *
     * @return \ZendSearch\Lucene\Index\SegmentInfo|null  null if no document was added
     */
    public function close()
    {
        $isEmpty = ($this->_docCount == 0);
        if ($isEmpty) {
            return null;
        }

        $this->_dumpFNM();
        $this->_generateCFS();

        $segmentInfo = new LuceneIndex\SegmentInfo($this->_directory,
                                                   $this->_name,
                                                   $this->_docCount,
                                                   -1,
                                                   null,
                                                   true,
                                                   true);

        return $segmentInfo;
    }
}

View File

@@ -1,135 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
use ZendSearch\Lucene;
/**
 * A Term represents a word from text. This is the unit of search. It is
 * composed of two elements, the text of the word, as a string, and the name of
 * the field that the text occurred in, an interned string.
 *
 * Note that terms may represent more than words from text fields, but also
 * things like dates, email addresses, urls, etc.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class Term
{
    /**
     * Field name or field number (depending from context)
     *
     * @var mixed
     */
    public $field;

    /**
     * Term value
     *
     * @var string
     */
    public $text;

    /**
     * Object constructor
     *
     * @param string $text   term value
     * @param mixed  $field  field name or number; null selects the default search field
     */
    public function __construct($text, $field = null)
    {
        $this->field = ($field === null)? Lucene\Lucene::getDefaultSearchField() : $field;
        $this->text  = $text;
    }

    /**
     * Returns term key: the field and the text joined by a NUL byte
     *
     * @return string
     */
    public function key()
    {
        return $this->field . chr(0) . $this->text;
    }

    /**
     * Get term prefix
     *
     * Returns at most $length leading UTF-8 characters of $str. A character
     * that is cut off at the end of the string is not included.
     *
     * @param string $str
     * @param integer $length  number of characters
     * @return string
     */
    public static function getPrefix($str, $length)
    {
        /**
         * @todo !!!!!!! use mb_string or iconv functions if they are available
         */
        $prefixBytes = 0;
        $prefixChars = 0;
        while (isset($str[$prefixBytes])  &&  $prefixChars < $length) {
            $charBytes = self::_charByteLength(ord($str[$prefixBytes]));

            if (! isset($str[$prefixBytes + $charBytes - 1])) {
                // wrong character
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        return substr($str, 0, $prefixBytes);
    }

    /**
     * Get UTF-8 string length
     *
     * Counts characters, not bytes. Counting stops at a character that is
     * cut off at the end of the string.
     *
     * @param string $str
     * @return integer
     */
    public static function getLength($str)
    {
        // The byte length does not change while scanning: take it once
        $totalBytes = strlen($str);

        $bytes = 0;
        $chars = 0;
        while ($bytes < $totalBytes) {
            $charBytes = self::_charByteLength(ord($str[$bytes]));

            if ($bytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $chars++;
            $bytes += $charBytes;
        }

        return $chars;
    }

    /**
     * Number of bytes (1 to 4) a UTF-8 character occupies, judged by its first byte.
     *
     * A byte with both top bits set opens a multi-byte character, and the
     * 0x20 and 0x10 bits each add one more byte. Any other byte is
     * treated as a one-byte character.
     *
     * @param integer $leadByte  byte value (0..255)
     * @return integer
     */
    private static function _charByteLength($leadByte)
    {
        if (($leadByte & 0xC0) != 0xC0) {
            return 1;
        }
        if (!($leadByte & 0x20)) {
            return 2;
        }

        return ($leadByte & 0x10) ? 4 : 3;
    }
}

View File

@@ -1,67 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
/**
 * A TermInfo is the record of information stored for a term in the term
 * dictionary.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class TermInfo
{
    /**
     * The number of documents which contain the term.
     *
     * @var integer
     */
    public $docFreq;

    /**
     * Data offset in a Frequencies file.
     *
     * @var integer
     */
    public $freqPointer;

    /**
     * Data offset in a Positions file.
     *
     * @var integer
     */
    public $proxPointer;

    /**
     * SkipData offset in a Frequencies file.
     *
     * @var integer
     */
    public $skipOffset;

    /**
     * Term offset of the _next_ term in a TermDictionary file.
     * Used only for Term Index; null unless given.
     *
     * @var integer
     */
    public $indexPointer;

    /**
     * Object constructor. Copies every argument into the property of the same name.
     *
     * @param integer $docFreq
     * @param integer $freqPointer
     * @param integer $proxPointer
     * @param integer $skipOffset
     * @param integer $indexPointer
     */
    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
    {
        // Optional index data
        $this->indexPointer = $indexPointer;

        // File offsets
        $this->skipOffset  = $skipOffset;
        $this->proxPointer = $proxPointer;
        $this->freqPointer = $freqPointer;

        // Statistics
        $this->docFreq = $docFreq;
    }
}

View File

@@ -1,37 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
use ZendSearch\Lucene;
/** @todo !!!!!! convert to SPL class usage */
/**
 * Priority queue of terms streams, ordered by the key of each stream's current term.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 */
class TermsPriorityQueue extends Lucene\AbstractPriorityQueue
{
    /**
     * Compare two terms streams.
     *
     * Returns true if the key of the current term of $termsStream1 sorts
     * strictly before the key of the current term of $termsStream2
     * (byte-wise strcmp() comparison); returns false otherwise.
     *
     * @param mixed $termsStream1
     * @param mixed $termsStream2
     * @return boolean
     */
    protected function _less($termsStream1, $termsStream2)
    {
        $firstKey  = $termsStream1->currentTerm()->key();
        $secondKey = $termsStream2->currentTerm()->key();

        return strcmp($firstKey, $secondKey) < 0;
    }
}

View File

@@ -1,54 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
/**
* Contract for objects that expose index terms as a sequential stream.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
*/
interface TermsStreamInterface
{
/**
* Reset terms stream.
*/
public function resetTermsStream();
/**
* Skip terms stream up to the specified term prefix.
*
* Prefix contains fully specified field info and a portion of the searched term
*
* @param \ZendSearch\Lucene\Index\Term $prefix
*/
public function skipTo(Term $prefix);
/**
* Scans terms dictionary and returns next term
*
* @return \ZendSearch\Lucene\Index\Term|null
*/
public function nextTerm();
/**
* Returns term in current position
*
* @return \ZendSearch\Lucene\Index\Term|null
*/
public function currentTerm();
/**
* Close terms stream
*
* Should be used for resource clean-up if the stream is not read up to the end
*/
public function closeTermsStream();
}

View File

@@ -1,823 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Index;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Exception\ExceptionInterface;
use ZendSearch\Lucene\Exception\InvalidFileFormatException;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Storage\Directory;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
*/
class Writer
{
/**
* @todo Implement AnalyzerInterface substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
* @todo DirectoryInterface lock processing
*/
/**
* Number of documents required before the buffered in-memory
* documents are written into a new Segment
*
* addDocument() commits as soon as the current segment holds at least this
* many documents. The value is also the initial size threshold used when
* grouping segments for merging.
*
* Default value is 10
*
* @var integer
*/
public $maxBufferedDocs = 10;
/**
* Largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @var integer
*/
public $maxMergeDocs = PHP_INT_MAX;
/**
* Determines how often segment indices are merged by addDocument().
*
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
*
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
*
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* The merge size threshold is multiplied by this factor each time a segment
* falls into the next merging block.
*
* Default value is 10
*
* @var integer
*/
public $mergeFactor = 10;
/**
* File system adapter.
*
* @var \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
*/
private $_directory = null;
/**
* Changes counter.
*
* Incremented by addDocument(); added to the index version and reset to 0
* when the segments file is rewritten.
*
* @var integer
*/
private $_versionUpdate = 0;
/**
* List of the segments created by the index writer and not yet written to the segments file.
* Array of SegmentInfo objects, keyed by segment name
*
* @var array
*/
private $_newSegments = array();
/**
* List of segments to be deleted on commit
*
* Segment names, stored as both key and value
*
* @var array
*/
private $_segmentsToDelete = array();
/**
* Current segment to add documents
*
* Null when no segment is open.
*
* @var \ZendSearch\Lucene\Index\SegmentWriter\DocumentWriter
*/
private $_currentSegment = null;
/**
* Array of SegmentInfo objects for this index, keyed by segment name.
*
* It's a reference to the array passed to the constructor
*
* @var array|\ZendSearch\Lucene\Index\SegmentInfo
*/
private $_segmentInfos;
/**
* Index target format version
*
* @var integer
*/
private $_targetFormatVersion;
/**
* List of index file extensions
*
* Keys equal values so that a four-character file name suffix can be
* checked with isset().
*
* @var array
*/
private static $_indexExtensions = array('.cfs' => '.cfs',
'.cfx' => '.cfx',
'.fnm' => '.fnm',
'.fdx' => '.fdx',
'.fdt' => '.fdt',
'.tis' => '.tis',
'.tii' => '.tii',
'.frq' => '.frq',
'.prx' => '.prx',
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
'.del' => '.del',
'.sti' => '.sti' );
/**
 * Create an empty index in the given directory.
 *
 * A generation of 0 selects the pre-2.1 layout: existing index files are
 * removed first, then 'segments' and 'deletable' files are written. Any other
 * generation writes 'segments.gen' plus the segments file of that generation.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
 * @param integer $generation
 * @param integer $nameCount
 */
public static function createIndex(Directory\DirectoryInterface $directory, $generation, $nameCount)
{
    $isPre21 = ($generation == 0);

    if ($isPre21) {
        // Create index in pre-2.1 mode: wipe everything that looks like an index file
        foreach ($directory->fileList() as $fileName) {
            $isIndexFile = $fileName == 'deletable'
                || $fileName == 'segments'
                || isset(self::$_indexExtensions[substr($fileName, strlen($fileName) - 4)])
                // matches <segment_name>.f<decimal_number> file names
                || preg_match('/\.f\d+$/i', $fileName);

            if ($isIndexFile) {
                $directory->deleteFile($fileName);
            }
        }

        $segmentsFile = $directory->createFile('segments');
        $formatMarker = (int)0xFFFFFFFF;
    } else {
        $genFile = $directory->createFile('segments.gen');
        $genFile->writeInt((int)0xFFFFFFFE);
        // Write generation two times
        $genFile->writeLong($generation);
        $genFile->writeLong($generation);

        $segmentsFile = $directory->createFile(Lucene\Index::getSegmentFileName($generation));
        $formatMarker = (int)0xFFFFFFFD;
    }

    // Segments file header, identical in both layouts apart from the marker
    $segmentsFile->writeInt($formatMarker);
    // version (initialized by current time)
    $segmentsFile->writeLong(round(microtime(true)));
    // name counter
    $segmentsFile->writeInt($nameCount);
    // segment counter
    $segmentsFile->writeInt(0);

    if ($isPre21) {
        $deletableFile = $directory->createFile('deletable');
        // write counter
        $deletableFile->writeInt(0);
    }
}
/**
 * Open the index for writing.
 *
 * The segment list is taken by reference, so changes made by the writer are
 * visible through the caller's array.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
 * @param array   $segmentInfos
 * @param integer $targetFormatVersion
 */
public function __construct(Directory\DirectoryInterface $directory, &$segmentInfos, $targetFormatVersion)
{
    $this->_targetFormatVersion = $targetFormatVersion;
    $this->_segmentInfos        = &$segmentInfos;
    $this->_directory           = $directory;
}
/**
 * Adds a document to this index.
 *
 * A new in-memory segment is opened on demand. Once it holds at least
 * $maxBufferedDocs documents the writer commits. An auto-merge is attempted
 * after every document and the changes counter is incremented.
 *
 * @param \ZendSearch\Lucene\Document $document
 */
public function addDocument(Document $document)
{
    if (null === $this->_currentSegment) {
        $segmentName = $this->_newSegmentName();
        $this->_currentSegment = new SegmentWriter\DocumentWriter($this->_directory, $segmentName);
    }

    $this->_currentSegment->addDocument($document);

    $bufferIsFull = $this->_currentSegment->count() >= $this->maxBufferedDocs;
    if ($bufferIsFull) {
        $this->commit();
    }

    $this->_maybeMergeSegments();

    $this->_versionUpdate++;
}
/**
 * Check if we have anything to merge
 *
 * Segments are visited in ascending size order and grouped into blocks. A
 * block starts with a threshold of $maxBufferedDocs, and the threshold is
 * multiplied by $mergeFactor each time a segment is too large for the current
 * block. A merge is due as soon as the accumulated size of a block reaches
 * its threshold. No merge is reported once the threshold exceeds $maxMergeDocs.
 *
 * NOTE(review): with $mergeFactor equal to 1 the threshold never grows, so the
 * inner loop would not terminate for a segment holding at least
 * $maxBufferedDocs documents - confirm that callers never configure that.
 *
 * @return boolean
 */
private function _hasAnythingToMerge()
{
    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }
    asort($segmentSizes, SORT_NUMERIC);

    // Only the accumulated size matters for the decision; the segments
    // themselves do not need to be collected here.
    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;

    foreach ($segmentSizes as $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Previous block is large enough to be merged
            if ($poolSize >= $sizeToMerge) {
                return true;
            }

            $poolSize     = 0;
            $sizeToMerge *= $this->mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                return false;
            }
        }

        $poolSize += $size;
    }

    return $poolSize >= $sizeToMerge;
}
/**
 * Merge segments if necessary
 *
 * Does nothing when the optimization lock cannot be obtained. Otherwise the
 * lock is held for the whole check-and-merge sequence and is released on
 * every exit path, including when the merge throws.
 */
private function _maybeMergeSegments()
{
    if (Lucene\LockManager::obtainOptimizationLock($this->_directory) === false) {
        return;
    }

    try {
        if ($this->_hasAnythingToMerge()) {
            $this->_mergeSegmentPools();
        }
    } catch (\Exception $e) {
        // Do not keep the optimization lock when merging fails
        Lucene\LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Lucene\LockManager::releaseOptimizationLock($this->_directory);
}

/**
 * Run the standard auto-optimization procedure.
 *
 * Must be called with the optimization lock held.
 */
private function _mergeSegmentPools()
{
    // Update segments list to be sure all segments are not merged yet by another process
    //
    // Segment merging functionality is concentrated in this class and surrounded
    // by optimization lock obtaining/releasing.
    // _updateSegments() refreshes segments list from the latest index generation.
    // So only new segments can be added to the index while we are merging some already existing
    // segments.
    // Newly added segments will be also included into the index by the _updateSegments() call
    // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
    // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
    $this->_updateSegments();

    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }

    $mergePool   = array();
    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;
    asort($segmentSizes, SORT_NUMERIC);

    foreach ($segmentSizes as $segName => $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Merge previous block if it's large enough
            if ($poolSize >= $sizeToMerge) {
                $this->_mergeSegments($mergePool);
            }
            $mergePool = array();
            $poolSize  = 0;

            $sizeToMerge *= $this->mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                // Remaining segments are too large to be merged automatically
                return;
            }
        }

        $mergePool[] = $this->_segmentInfos[$segName];
        $poolSize   += $size;
    }

    if ($poolSize >= $sizeToMerge) {
        $this->_mergeSegments($mergePool);
    }
}
/**
 * Merge the given segments into one newly named segment and commit.
 *
 * Every source segment is queued for deletion. The merged segment is
 * registered as a new segment unless the merger returns null.
 *
 * @param array $segments array of SegmentInfo objects
 */
private function _mergeSegments($segments)
{
    $merger = new SegmentMerger($this->_directory, $this->_newSegmentName());

    foreach ($segments as $source) {
        $merger->addSource($source);
        $this->_segmentsToDelete[$source->getName()] = $source->getName();
    }

    $merged = $merger->merge();
    if (null !== $merged) {
        $this->_newSegments[$merged->getName()] = $merged;
    }

    $this->commit();
}
/**
* Write the next generation of the segments file and clean up obsolete index files.
*
* Steps, all performed under the exclusive write lock:
* - pending changes of every known segment are written;
* - the current segments file is copied into a new generation file, skipping
*   segments queued in $_segmentsToDelete and appending segments from $_newSegments;
* - 'segments.gen' is updated with the new generation number;
* - if the read lock can be escalated, files no longer referenced are deleted,
*   otherwise the files of deleted segments are only purged from the directory object;
* - $_segmentInfos is reduced to the segments listed in the new segments file.
*
* If writing the new segments file fails, the previous generation number is
* restored in 'segments.gen', the write lock is released and the failure is
* rethrown as a RuntimeException.
*
* @throws \ZendSearch\Lucene\Exception\RuntimeException
* @throws \ZendSearch\Lucene\Exception\InvalidFileFormatException
*/
private function _updateSegments()
{
// Get an exclusive index lock
Lucene\LockManager::obtainWriteLock($this->_directory);
// Write down changes for the segments
foreach ($this->_segmentInfos as $segInfo) {
$segInfo->writeChanges();
}
// Open the current generation for reading and create the next one for writing
$generation = Lucene\Index::getActualGeneration($this->_directory);
$segmentsFile = $this->_directory->getFileObject(Lucene\Index::getSegmentFileName($generation), false);
$newSegmentFile = $this->_directory->createFile(Lucene\Index::getSegmentFileName(++$generation), false);
try {
$genFile = $this->_directory->getFileObject('segments.gen', false);
} catch (ExceptionInterface $e) {
// A missing 'segments.gen' is recognised by the exception message and the file is created
if (strpos($e->getMessage(), 'is not readable') !== false) {
$genFile = $this->_directory->createFile('segments.gen');
} else {
throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
}
}
$genFile->writeInt((int)0xFFFFFFFE);
// Write generation (first copy)
$genFile->writeLong($generation);
try {
// Write format marker
// NOTE(review): no marker is written when the target format is neither FORMAT_2_1 nor FORMAT_2_3
if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_1) {
$newSegmentFile->writeInt((int)0xFFFFFFFD);
} elseif ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
$newSegmentFile->writeInt((int)0xFFFFFFFC);
}
// Read src file format identifier
$format = $segmentsFile->readInt();
if ($format == (int)0xFFFFFFFF) {
$srcFormat = Lucene\Index::FORMAT_PRE_2_1;
} elseif ($format == (int)0xFFFFFFFD) {
$srcFormat = Lucene\Index::FORMAT_2_1;
} elseif ($format == (int)0xFFFFFFFC) {
$srcFormat = Lucene\Index::FORMAT_2_3;
} else {
throw new InvalidFileFormatException('Unsupported segments file format');
}
// New index version = stored version + number of changes made through this writer
$version = $segmentsFile->readLong() + $this->_versionUpdate;
$this->_versionUpdate = 0;
$newSegmentFile->writeLong($version);
// Write segment name counter
$newSegmentFile->writeInt($segmentsFile->readInt());
// Get number of segments offset
$numOfSegmentsOffset = $newSegmentFile->tell();
// Write dummy data (segment counter); the real value is written once it is known
$newSegmentFile->writeInt(0);
// Read number of segments
$segmentsCount = $segmentsFile->readInt();
// Segments kept in the new generation: name => size
$segments = array();
for ($count = 0; $count < $segmentsCount; $count++) {
$segName = $segmentsFile->readString();
$segSize = $segmentsFile->readInt();
if ($srcFormat == Lucene\Index::FORMAT_PRE_2_1) {
// pre-2.1 index format: the record holds only name and size, the rest gets defaults
$delGen = 0;
$hasSingleNormFile = false;
$numField = (int)0xFFFFFFFF;
$isCompoundByte = 0;
$docStoreOptions = null;
} else {
$delGen = $segmentsFile->readLong();
if ($srcFormat == Lucene\Index::FORMAT_2_3) {
// Shared doc store data is present only in the 2.3 format
$docStoreOffset = $segmentsFile->readInt();
if ($docStoreOffset != (int)0xFFFFFFFF) {
$docStoreSegment = $segmentsFile->readString();
$docStoreIsCompoundFile = $segmentsFile->readByte();
$docStoreOptions = array('offset' => $docStoreOffset,
'segment' => $docStoreSegment,
'isCompound' => ($docStoreIsCompoundFile == 1));
} else {
$docStoreOptions = null;
}
} else {
$docStoreOptions = null;
}
$hasSingleNormFile = $segmentsFile->readByte();
$numField = $segmentsFile->readInt();
$normGens = array();
if ($numField != (int)0xFFFFFFFF) {
for ($count1 = 0; $count1 < $numField; $count1++) {
$normGens[] = $segmentsFile->readLong();
}
}
$isCompoundByte = $segmentsFile->readByte();
}
// Segments queued for deletion are dropped from the new generation
if (!in_array($segName, $this->_segmentsToDelete)) {
// Load segment if necessary
if (!isset($this->_segmentInfos[$segName])) {
// NOTE(review): $isCompound is not assigned here when the byte is none of
// 0xFF/0x00/0x01; it would then be undefined or carry over from a previous iteration
if ($isCompoundByte == 0xFF) {
// The segment is not a compound file
$isCompound = false;
} elseif ($isCompoundByte == 0x00) {
// The status is unknown
$isCompound = null;
} elseif ($isCompoundByte == 0x01) {
// The segment is a compound file
$isCompound = true;
}
$this->_segmentInfos[$segName] =
new SegmentInfo($this->_directory,
$segName,
$segSize,
$delGen,
$docStoreOptions,
$hasSingleNormFile,
$isCompound);
} else {
// Retrieve actual deletions file generation number
$delGen = $this->_segmentInfos[$segName]->getDelGen();
}
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segSize);
$newSegmentFile->writeLong($delGen);
if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
if ($docStoreOptions !== null) {
$newSegmentFile->writeInt($docStoreOffset);
$newSegmentFile->writeString($docStoreSegment);
$newSegmentFile->writeByte($docStoreIsCompoundFile);
} else {
// Set DocStoreOffset to -1
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
} elseif ($docStoreOptions !== null) {
// Release index write lock
// NOTE(review): this throw happens inside the try block, so the catch
// below releases the write lock a second time
Lucene\LockManager::releaseWriteLock($this->_directory);
throw new RuntimeException('Index conversion to lower format version is not supported.');
}
$newSegmentFile->writeByte($hasSingleNormFile);
$newSegmentFile->writeInt($numField);
if ($numField != (int)0xFFFFFFFF) {
foreach ($normGens as $normGen) {
$newSegmentFile->writeLong($normGen);
}
}
$newSegmentFile->writeByte($isCompoundByte);
$segments[$segName] = $segSize;
}
}
$segmentsFile->close();
// Append the segments created by this writer
$segmentsCount = count($segments) + count($this->_newSegments);
foreach ($this->_newSegments as $segName => $segmentInfo) {
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segmentInfo->count());
// delete file generation: -1 (there is no delete file yet)
// written as two ints in the slot where existing segments get a long
$newSegmentFile->writeInt((int)0xFFFFFFFF);$newSegmentFile->writeInt((int)0xFFFFFFFF);
if ($this->_targetFormatVersion == Lucene\Index::FORMAT_2_3) {
// docStoreOffset: -1 (segment doesn't use shared doc store)
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
// HasSingleNormFile
$newSegmentFile->writeByte($segmentInfo->hasSingleNormFile());
// NumField
$newSegmentFile->writeInt((int)0xFFFFFFFF);
// IsCompoundFile
$newSegmentFile->writeByte($segmentInfo->isCompound() ? 1 : -1);
$segments[$segmentInfo->getName()] = $segmentInfo->count();
$this->_segmentInfos[$segName] = $segmentInfo;
}
$this->_newSegments = array();
// Go back to the reserved slot and store the real number of segments
$newSegmentFile->seek($numOfSegmentsOffset);
$newSegmentFile->writeInt($segmentsCount); // Update segments count
$newSegmentFile->close();
} catch (\Exception $e) {
/** Restore previous index generation */
$generation--;
// Position right after the 4-byte marker written at the start of 'segments.gen'
$genFile->seek(4, SEEK_SET);
// Write generation number twice
$genFile->writeLong($generation); $genFile->writeLong($generation);
// Release index write lock
Lucene\LockManager::releaseWriteLock($this->_directory);
// Throw the exception
throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
}
// Write generation (second copy)
$genFile->writeLong($generation);
// Check if another update or read process is not running now
// If yes, skip clean-up procedure
if (Lucene\LockManager::escalateReadLock($this->_directory)) {
/**
* Clean-up directory
*/
// Three parallel lists: file name, deletion group, order inside the group
$filesToDelete = array();
$filesTypes = array();
$filesNumbers = array();
// list of .del files of currently used segments
// each segment can have several generations of .del files
// only last should not be deleted
$delFiles = array();
foreach ($this->_directory->fileList() as $file) {
if ($file == 'deletable') {
// 'deletable' file
$filesToDelete[] = $file;
$filesTypes[] = 0; // delete this file first, since it's not used starting from Lucene v2.1
$filesNumbers[] = 0;
} elseif ($file == 'segments') {
// 'segments' file
$filesToDelete[] = $file;
$filesTypes[] = 1; // second file to be deleted "zero" version of segments file (Lucene pre-2.1)
$filesNumbers[] = 0;
} elseif (preg_match('/^segments_[a-zA-Z0-9]+$/i', $file)) {
// 'segments_xxx' file
// Check if it's not a just created generation file
if ($file != Lucene\Index::getSegmentFileName($generation)) {
$filesToDelete[] = $file;
$filesTypes[] = 2; // first group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 9), 36, 10); // ordered by segment generation numbers
}
} elseif (preg_match('/(^_([a-zA-Z0-9]+))\.f\d+$/i', $file, $matches)) {
// one of per segment files ('<segment_name>.f<decimal_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
}
} elseif (preg_match('/(^_([a-zA-Z0-9]+))(_([a-zA-Z0-9]+))\.del$/i', $file, $matches)) {
// one of per segment files ('<segment_name>_<del_generation>.del' where <segment_name> is '_<segment_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
} else {
// .del file of a segment still in use: collect it per segment and generation
$segmentNumber = (int)base_convert($matches[2], 36, 10);
$delGeneration = (int)base_convert($matches[4], 36, 10);
if (!isset($delFiles[$segmentNumber])) {
$delFiles[$segmentNumber] = array();
}
$delFiles[$segmentNumber][$delGeneration] = $file;
}
} elseif (isset(self::$_indexExtensions[substr($file, strlen($file)-4)])) {
// one of per segment files ('<segment_name>.<ext>')
$segmentName = substr($file, 0, strlen($file) - 4);
// Check if it's not one of the segments in the current segments set
// and not the segment currently being filled by this writer
if (!isset($segments[$segmentName]) &&
($this->_currentSegment === null || $this->_currentSegment->getName() != $segmentName)) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 1 /* skip '_' */, strlen($file)-5), 36, 10); // order by segment number
}
}
}
// Highest generation number among the .del files that remain deletion candidates
$maxGenNumber = 0;
// process .del files of currently used segments
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
ksort($delFiles[$segmentNumber], SORT_NUMERIC);
array_pop($delFiles[$segmentNumber]); // remove last delete file generation from candidates for deleting
end($delFiles[$segmentNumber]);
$lastGenNumber = key($delFiles[$segmentNumber]);
if ($lastGenNumber > $maxGenNumber) {
$maxGenNumber = $lastGenNumber;
}
}
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
foreach ($segmentDelFiles as $delGeneration => $file) {
$filesToDelete[] = $file;
$filesTypes[] = 4; // third group of files for deletions
$filesNumbers[] = $segmentNumber*$maxGenNumber + $delGeneration; // order by <segment_number>,<del_generation> pair
}
}
// Reorder files for deleting
array_multisort($filesTypes, SORT_ASC, SORT_NUMERIC,
$filesNumbers, SORT_ASC, SORT_NUMERIC,
$filesToDelete, SORT_ASC, SORT_STRING);
foreach ($filesToDelete as $file) {
try {
/** Skip shared docstore segments deleting */
/** @todo Process '.cfx' files to check if they are already unused */
if (substr($file, strlen($file)-4) != '.cfx') {
$this->_directory->deleteFile($file);
}
} catch (ExceptionInterface $e) {
if (strpos($e->getMessage(), 'Can\'t delete file') === false) {
// That's not "file is under processing or already deleted" exception
// Pass it through
throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
}
}
}
// Return read lock into the previous state
Lucene\LockManager::deEscalateReadLock($this->_directory);
} else {
// Only release resources if another index reader is running now
foreach ($this->_segmentsToDelete as $segName) {
foreach (self::$_indexExtensions as $ext) {
$this->_directory->purgeFile($segName . $ext);
}
}
}
// Clean-up _segmentsToDelete container
$this->_segmentsToDelete = array();
// Release index write lock
Lucene\LockManager::releaseWriteLock($this->_directory);
// Remove unused segments from segments list
foreach ($this->_segmentInfos as $segName => $segmentInfo) {
if (!isset($segments[$segName])) {
unset($this->_segmentInfos[$segName]);
}
}
}
/**
 * Commit current changes.
 *
 * Closes the segment currently being filled (if any), registers the segment
 * it produced as a new segment, and rewrites the segments file.
 */
public function commit()
{
    $openSegment = $this->_currentSegment;

    if (null !== $openSegment) {
        $closedSegment = $openSegment->close();
        if (null !== $closedSegment) {
            $this->_newSegments[$closedSegment->getName()] = $closedSegment;
        }

        $this->_currentSegment = null;
    }

    $this->_updateSegments();
}
/**
* Merges the provided indexes into this index.
*
* Not implemented yet: the body is empty, so calling this method currently
* has no effect.
*
* @param array $readers
* @return void
*/
public function addIndexes($readers)
{
/**
* @todo implementation
*/
}
/**
 * Merges all segments together into new one
 *
 * Returns true on success and false if another optimization or auto-optimization process
 * is running now (the optimization lock could not be obtained).
 *
 * The optimization lock is released on every exit path, including when the
 * merge throws. The exception is rethrown unchanged.
 *
 * @return boolean
 */
public function optimize()
{
    if (Lucene\LockManager::obtainOptimizationLock($this->_directory) === false) {
        return false;
    }

    try {
        // Update segments list to be sure all segments are not merged yet by another process
        //
        // Segment merging functionality is concentrated in this class and surrounded
        // by optimization lock obtaining/releasing.
        // _updateSegments() refreshes segments list from the latest index generation.
        // So only new segments can be added to the index while we are merging some already existing
        // segments.
        // Newly added segments will be also included into the index by the _updateSegments() call
        // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
        // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
        $this->_updateSegments();

        $this->_mergeSegments($this->_segmentInfos);
    } catch (\Exception $e) {
        // Do not keep the optimization lock when merging fails
        Lucene\LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Lucene\LockManager::releaseOptimizationLock($this->_directory);

    return true;
}
/**
 * Reserve and return the name for a new segment.
 *
 * Under the write lock, the segment name counter stored in the current
 * segments file is read and written back incremented by one. The name is
 * '_' followed by the previous counter value in base 36.
 *
 * @return string
 */
private function _newSegmentName()
{
    // 12 = 4 (int, file format marker) + 8 (long, index version)
    $counterOffset = 12;

    Lucene\LockManager::obtainWriteLock($this->_directory);

    $currentGeneration = Lucene\Index::getActualGeneration($this->_directory);
    $segmentsFileName  = Lucene\Index::getSegmentFileName($currentGeneration);
    $segmentsFile      = $this->_directory->getFileObject($segmentsFileName, false);

    $segmentsFile->seek($counterOffset);
    $reservedCounter = $segmentsFile->readInt();

    $segmentsFile->seek($counterOffset);
    $segmentsFile->writeInt($reservedCounter + 1);

    // Flush output to guarantee that wrong value will not be loaded between unlock and
    // return (which calls $segmentsFile destructor)
    $segmentsFile->flush();

    Lucene\LockManager::releaseWriteLock($this->_directory);

    return '_' . base_convert($reservedCounter, 10, 36);
}
}

View File

@@ -1,218 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Storage\Directory\DirectoryInterface as Directory;
/**
* This is an utility class which provides index locks processing functionality
*
* @category Zend
* @package Zend_Search_Lucene
*/
class LockManager
{
/**
* Names of the files used to hold each kind of lock.
*
* The files are created in, or fetched from, the lock directory passed to
* the methods of this class.
*/
const WRITE_LOCK_FILE = 'write.lock.file';
const READ_LOCK_FILE = 'read.lock.file';
const READ_LOCK_PROCESSING_LOCK_FILE = 'read-lock-processing.lock.file';
const OPTIMIZATION_LOCK_FILE = 'optimization.lock.file';
/**
 * Obtain exclusive write lock on the index
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 * @return \ZendSearch\Lucene\Storage\File\FileInterface the locked write-lock file
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the exclusive lock is refused
 */
public static function obtainWriteLock(Directory $lockDirectory)
{
    $writeLock = $lockDirectory->createFile(self::WRITE_LOCK_FILE);

    if ($writeLock->lock(LOCK_EX)) {
        return $writeLock;
    }

    throw new RuntimeException('Can\'t obtain exclusive index lock');
}
/**
 * Release exclusive write lock
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 */
public static function releaseWriteLock(Directory $lockDirectory)
{
    $lockDirectory->getFileObject(self::WRITE_LOCK_FILE)->unlock();
}
/**
 * Obtain the exclusive "read escalation/de-escalation" lock
 *
 * Required to protect the escalate/de-escalate read lock process
 * on GFS (and potentially other) mounted filesystems.
 *
 * Why we need this:
 * GFS supports cluster-wide locking via flock(), but the locking semantics
 * that work consistently on a local filesystem tend to fail there:
 *  - conditional promotion of a shared lock to exclusive always fails;
 *  - lock release requests are honored but not immediately processed,
 *    causing erratic failures of subsequent conditional requests;
 *  - on demotion the exclusive lock is released before the shared lock is
 *    set, which opens a window for another process to gain an exclusive
 *    lock when it shouldn't be allowed to.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 * @return \ZendSearch\Lucene\Storage\File\FileInterface
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the exclusive lock is refused
 */
private static function _startReadLockProcessing(Directory $lockDirectory)
{
    $processingLock = $lockDirectory->createFile(self::READ_LOCK_PROCESSING_LOCK_FILE);

    if ($processingLock->lock(LOCK_EX)) {
        return $processingLock;
    }

    throw new RuntimeException('Can\'t obtain exclusive lock for the read lock processing file');
}
/**
 * Release the exclusive "read escalation/de-escalation" lock
 *
 * Required to protect the escalate/de-escalate read lock process
 * on GFS (and potentially other) mounted filesystems.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 */
private static function _stopReadLockProcessing(Directory $lockDirectory)
{
    $lockDirectory->getFileObject(self::READ_LOCK_PROCESSING_LOCK_FILE)->unlock();
}
/**
 * Obtain shared read lock on the index
 *
 * It doesn't block other read or update processes, but prevents the index
 * from being cleaned up prematurely.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 * @return \ZendSearch\Lucene\Storage\File\FileInterface the locked read-lock file
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the shared lock is refused
 */
public static function obtainReadLock(Directory $lockDirectory)
{
    $readLock = $lockDirectory->createFile(self::READ_LOCK_FILE);

    if ($readLock->lock(LOCK_SH)) {
        return $readLock;
    }

    throw new RuntimeException('Can\'t obtain shared reading index lock');
}
/**
 * Release shared read lock
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 */
public static function releaseReadLock(Directory $lockDirectory)
{
    $lockDirectory->getFileObject(self::READ_LOCK_FILE)->unlock();
}
/**
 * Escalate Read lock to exclusive level
 *
 * The shared lock is dropped first and a non-blocking exclusive lock is then
 * attempted up to 10 times. On failure the shared lock is taken again. The
 * whole sequence runs under the read-lock-processing lock.
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 * @return boolean true if the exclusive lock was obtained, false otherwise
 */
public static function escalateReadLock(Directory $lockDirectory)
{
    self::_startReadLockProcessing($lockDirectory);

    $readLock = $lockDirectory->getFileObject(self::READ_LOCK_FILE);

    // First, release the shared lock for the benefit of GFS since
    // it will fail the conditional request to promote the lock to
    // "exclusive" while the shared lock is held (even when we are
    // the only holder).
    $readLock->unlock();

    // GFS is really poor. While the above "unlock" returns, GFS
    // doesn't clean up its tables right away (which will potentially
    // cause the conditional locking for the "exclusive" lock to fail).
    // We will retry the conditional lock request several times on a
    // failure to get past this. The performance hit is negligible
    // in the grand scheme of things and only will occur with GFS
    // filesystems or if another local process has the shared lock
    // on local filesystems.
    $attemptsLeft = 10;
    while ($attemptsLeft-- > 0) {
        if ($readLock->lock(LOCK_EX, true)) {
            // Exclusive lock is obtained!
            self::_stopReadLockProcessing($lockDirectory);
            return true;
        }

        // wait 1 microsecond
        usleep(1);
    }

    // Restore lock state
    $readLock->lock(LOCK_SH);

    self::_stopReadLockProcessing($lockDirectory);
    return false;
}
/**
 * De-escalate Read lock to shared level
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 */
public static function deEscalateReadLock(Directory $lockDirectory)
{
    $lockDirectory->getFileObject(self::READ_LOCK_FILE)->lock(LOCK_SH);
}
/**
 * Obtain exclusive optimization lock on the index
 *
 * Returns lock object on success and false otherwise (doesn't block execution)
 *
 * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $lockDirectory
 * @return mixed
 */
public static function obtainOptimizationLock(Directory $lockDirectory)
{
    $optimizationLock = $lockDirectory->createFile(self::OPTIMIZATION_LOCK_FILE);

    if ($optimizationLock->lock(LOCK_EX, true)) {
        return $optimizationLock;
    }

    return false;
}
/**
 * Give up the exclusive optimization lock.
 *
 * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory
 */
public static function releaseOptimizationLock(Directory $lockDirectory)
{
    // Fetch the optimization-lock file object and unlock it.
    $lockDirectory->getFileObject(self::OPTIMIZATION_LOCK_FILE)->unlock();
}
}

View File

@@ -1,151 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
/**
 * Static entry point of the component: index factories plus global search settings.
 *
 * The class is never instantiated; its constructor always throws.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 */
class Lucene
{
    /**
     * Name of the field searched by default; null means "search through all fields".
     *
     * @var string|null
     */
    private static $_defaultSearchField = null;

    /**
     * Maximum size of a result set; 0 means no limit.
     *
     * @var integer
     */
    private static $_resultSetLimit = 0;

    /**
     * Maximum number of terms per query; 0 means no limit.
     *
     * @var integer
     */
    private static $_termsPerQueryLimit = 1024;

    /**
     * Create a new index in the given directory.
     *
     * @param mixed $directory
     * @return \ZendSearch\Lucene\SearchIndexInterface
     */
    public static function create($directory)
    {
        $index = new Index($directory, true);

        return $index;
    }

    /**
     * Open an existing index located in the given directory.
     *
     * @param mixed $directory
     * @return \ZendSearch\Lucene\SearchIndexInterface
     */
    public static function open($directory)
    {
        $index = new Index($directory, false);

        return $index;
    }

    /**
     * Instantiation is not supported; use the static factories instead.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
     */
    public function __construct()
    {
        $message = '\ZendSearch\Lucene class is the only container for static methods. Use Lucene::open() or Lucene::create() methods.';

        throw new UnsupportedMethodCallException($message);
    }

    /**
     * Choose the field that is searched by default.
     *
     * Passing null (the initial value) makes searches go through all fields.
     *
     * @param string|null $fieldName
     */
    public static function setDefaultSearchField($fieldName)
    {
        self::$_defaultSearchField = $fieldName;
    }

    /**
     * Tell which field is searched by default.
     *
     * @return string|null null means that all fields are searched
     */
    public static function getDefaultSearchField()
    {
        return self::$_defaultSearchField;
    }

    /**
     * Change the result set limit.
     *
     * @param integer $limit 0 (the initial value) means no limit
     */
    public static function setResultSetLimit($limit)
    {
        self::$_resultSetLimit = $limit;
    }

    /**
     * Read the result set limit.
     *
     * @return integer 0 means no limit
     */
    public static function getResultSetLimit()
    {
        return self::$_resultSetLimit;
    }

    /**
     * Change the terms-per-query limit.
     *
     * @param integer $limit 0 means no limit
     */
    public static function setTermsPerQueryLimit($limit)
    {
        self::$_termsPerQueryLimit = $limit;
    }

    /**
     * Read the terms-per-query limit (initially 1024).
     *
     * @return integer 0 means no limit
     */
    public static function getTermsPerQueryLimit()
    {
        return self::$_termsPerQueryLimit;
    }
}

View File

@@ -1,832 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\OutOfRangeException;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
use ZendSearch\Lucene\Storage\Directory;
/**
 * Multisearcher allows searching through several independent indexes.
 *
 * Document ids exposed by this class are "global": the ids of each sub-index
 * are shifted by the sum of count() of all sub-indices that precede it in the
 * list. Methods that take an id map it back to the owning sub-index the same way.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 */
class MultiSearcher implements SearchIndexInterface
{
    /**
     * List of indices for searching.
     * Array of \ZendSearch\Lucene\SearchIndexInterface objects
     *
     * @var array
     */
    protected $_indices;

    /**
     * Object constructor.
     *
     * @param array $indices   Array of indices for search
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if an element is not a SearchIndexInterface
     */
    public function __construct($indices = array())
    {
        $this->_indices = $indices;

        foreach ($this->_indices as $index) {
            if (!$index instanceof SearchIndexInterface) {
                throw new InvalidArgumentException('sub-index objects have to implement ZendSearch\Lucene\SearchIndexInterface.');
            }
        }
    }

    /**
     * Add index for searching.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     */
    public function addIndex(SearchIndexInterface $index)
    {
        $this->_indices[] = $index;
    }

    /**
     * Get current generation number
     *
     * Not supported for the multi-searcher: always throws.
     *
     * @param Storage\Directory\DirectoryInterface $directory
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public static function getActualGeneration(Storage\Directory\DirectoryInterface $directory)
    {
        throw new UnsupportedMethodCallException("Generation number can't be retrieved for multi-searcher");
    }

    /**
     * Get segments file name
     *
     * Delegates to Index::getSegmentFileName().
     *
     * @param integer $generation
     * @return string
     */
    public static function getSegmentFileName($generation)
    {
        return Index::getSegmentFileName($generation);
    }

    /**
     * Get index format version
     *
     * Not supported for the multi-searcher: always throws.
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function getFormatVersion()
    {
        throw new UnsupportedMethodCallException("Format version can't be retrieved for multi-searcher");
    }

    /**
     * Set index format version on every sub-index.
     * Index is converted to this format at the nearest update time
     *
     * @param int $formatVersion
     */
    public function setFormatVersion($formatVersion)
    {
        foreach ($this->_indices as $index) {
            $index->setFormatVersion($formatVersion);
        }
    }

    /**
     * Returns the directory instance for this index.
     *
     * Not supported for the multi-searcher: always throws.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
     */
    public function getDirectory()
    {
        throw new UnsupportedMethodCallException("Index directory can't be retrieved for multi-searcher");
    }

    /**
     * Returns the total number of documents in all sub-indices (including deleted documents).
     *
     * @return integer
     */
    public function count()
    {
        $count = 0;

        foreach ($this->_indices as $index) {
            // Sum the per-index counts ($this->_indices is a plain array and has no count() method)
            $count += $index->count();
        }

        return $count;
    }

    /**
     * Returns one greater than the largest possible document number.
     * This may be used to, e.g., determine how big to allocate a structure which will have
     * an element for every document number in an index.
     *
     * @return integer
     */
    public function maxDoc()
    {
        return $this->count();
    }

    /**
     * Returns the total number of non-deleted documents in all sub-indices.
     *
     * @return integer
     */
    public function numDocs()
    {
        $docs = 0;

        foreach ($this->_indices as $index) {
            $docs += $index->numDocs();
        }

        return $docs;
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer $id  global document id
     * @return boolean
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function isDeleted($id)
    {
        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                return $index->isDeleted($id);
            }

            // Not in this sub-index: make the id relative to the next one
            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Retrieve index maxBufferedDocs option
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * All sub-indices must agree on the value.
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if there are no sub-indices or their values differ
     */
    public function getMaxBufferedDocs()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $maxBufferedDocs = reset($this->_indices)->getMaxBufferedDocs();

        foreach ($this->_indices as $index) {
            if ($index->getMaxBufferedDocs() !== $maxBufferedDocs) {
                throw new RuntimeException('Indices have different maxBufferedDocs option.');
            }
        }

        return $maxBufferedDocs;
    }

    /**
     * Set index maxBufferedDocs option on every sub-index
     *
     * maxBufferedDocs is a minimal number of documents required before
     * the buffered in-memory documents are written into a new Segment
     *
     * @param integer $maxBufferedDocs
     */
    public function setMaxBufferedDocs($maxBufferedDocs)
    {
        foreach ($this->_indices as $index) {
            $index->setMaxBufferedDocs($maxBufferedDocs);
        }
    }

    /**
     * Retrieve index maxMergeDocs option
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
     * Larger values are best for batched indexing and speedier searches.
     *
     * All sub-indices must agree on the value.
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if there are no sub-indices or their values differ
     */
    public function getMaxMergeDocs()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $maxMergeDocs = reset($this->_indices)->getMaxMergeDocs();

        foreach ($this->_indices as $index) {
            if ($index->getMaxMergeDocs() !== $maxMergeDocs) {
                throw new RuntimeException('Indices have different maxMergeDocs option.');
            }
        }

        return $maxMergeDocs;
    }

    /**
     * Set index maxMergeDocs option on every sub-index
     *
     * maxMergeDocs is a largest number of documents ever merged by addDocument().
     * Small values (e.g., less than 10,000) are best for interactive indexing,
     * as this limits the length of pauses while indexing to a few seconds.
     * Larger values are best for batched indexing and speedier searches.
     *
     * @param integer $maxMergeDocs
     */
    public function setMaxMergeDocs($maxMergeDocs)
    {
        foreach ($this->_indices as $index) {
            $index->setMaxMergeDocs($maxMergeDocs);
        }
    }

    /**
     * Retrieve index mergeFactor option
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * All sub-indices must agree on the value.
     *
     * @return integer
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if there are no sub-indices or their values differ
     */
    public function getMergeFactor()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $mergeFactor = reset($this->_indices)->getMergeFactor();

        foreach ($this->_indices as $index) {
            if ($index->getMergeFactor() !== $mergeFactor) {
                throw new RuntimeException('Indices have different mergeFactor option.');
            }
        }

        return $mergeFactor;
    }

    /**
     * Set index mergeFactor option on every sub-index
     *
     * mergeFactor determines how often segment indices are merged by addDocument().
     * With smaller values, less RAM is used while indexing,
     * and searches on unoptimized indices are faster,
     * but indexing speed is slower.
     * With larger values, more RAM is used during indexing,
     * and while searches on unoptimized indices are slower,
     * indexing is faster.
     * Thus larger values (> 10) are best for batch index creation,
     * and smaller values (< 10) for indices that are interactively maintained.
     *
     * @param integer $mergeFactor
     */
    public function setMergeFactor($mergeFactor)
    {
        foreach ($this->_indices as $index) {
            // Must set the merge factor itself, not maxMergeDocs
            $index->setMergeFactor($mergeFactor);
        }
    }

    /**
     * Performs a query against every sub-index and returns an array
     * of QueryHit objects with ids converted to global ids.
     * Input is a string or a query object.
     *
     * @param mixed $query
     * @return array|\ZendSearch\Lucene\Search\QueryHit
     */
    public function find($query)
    {
        if (count($this->_indices) == 0) {
            return array();
        }

        $hitsList   = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            $hits = $index->find($query);

            if ($indexShift != 0) {
                foreach ($hits as $hit) {
                    $hit->id += $indexShift;
                }
            }

            $indexShift += $index->count();
            $hitsList[]  = $hits;
        }

        /** @todo Implement advanced sorting */

        return call_user_func_array('array_merge', $hitsList);
    }

    /**
     * Returns a list of all unique field names that exist in the sub-indices.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        $fieldNamesList = array();

        foreach ($this->_indices as $index) {
            $fieldNamesList[] = $index->getFieldNames($indexed);
        }

        // array_merge() without arguments is not usable on older PHP versions
        if (count($fieldNamesList) == 0) {
            return array();
        }

        return array_unique(call_user_func_array('array_merge', $fieldNamesList));
    }

    /**
     * Returns a Document object for the document number $id.
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id  global document id or a hit carrying it
     * @return \ZendSearch\Lucene\Document
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function getDocument($id)
    {
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                return $index->getDocument($id);
            }

            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Returns true if any sub-index contains documents with specified term.
     *
     * Is used for query optimization.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return boolean
     */
    public function hasTerm(Index\Term $term)
    {
        foreach ($this->_indices as $index) {
            if ($index->hasTerm($term)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns global IDs of all the documents containing term.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter  must be null
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is passed
     */
    public function termDocs(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $docsList   = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            $docs = $index->termDocs($term);

            if ($indexShift != 0) {
                foreach ($docs as $id => $docId) {
                    $docs[$id] += $indexShift;
                }
            }

            $indexShift += $index->count();
            $docsList[]  = $docs;
        }

        // array_merge() without arguments is not usable on older PHP versions
        if (count($docsList) == 0) {
            return array();
        }

        return call_user_func_array('array_merge', $docsList);
    }

    /**
     * Returns documents filter for all documents containing term.
     *
     * Not supported for the multi-searcher: always throws.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @return \ZendSearch\Lucene\Index\DocsFilter
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function termDocsFilter(Index\Term $term, $docsFilter = null)
    {
        throw new UnsupportedMethodCallException('Document filters could not used with multi-searcher');
    }

    /**
     * Returns an array of all term freqs.
     * Return array structure: array( docId => freq, ...) where docId is a global id
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter  must be null
     * @return array
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is passed
     */
    public function termFreqs(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $freqs      = array();
        $indexShift = 0;

        foreach ($this->_indices as $index) {
            // Copy entry by entry: array_merge() would renumber the integer
            // docId keys and destroy the docId => freq mapping.
            foreach ($index->termFreqs($term) as $docId => $freq) {
                $freqs[$docId + $indexShift] = $freq;
            }

            $indexShift += $index->count();
        }

        return $freqs;
    }

    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...) where docId is a global id
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter  must be null
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if a documents filter is passed
     * @return array
     */
    public function termPositions(Index\Term $term, $docsFilter = null)
    {
        if ($docsFilter != null) {
            throw new InvalidArgumentException('Document filters could not used with multi-searcher');
        }

        $termPositions = array();
        $indexShift    = 0;

        foreach ($this->_indices as $index) {
            // Copy entry by entry so that the (shifted) docId keys are preserved
            foreach ($index->termPositions($term) as $docId => $positions) {
                $termPositions[$docId + $indexShift] = $positions;
            }

            $indexShift += $index->count();
        }

        return $termPositions;
    }

    /**
     * Returns the number of documents in all sub-indices containing the $term.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @return integer
     */
    public function docFreq(Index\Term $term)
    {
        $docFreq = 0;

        foreach ($this->_indices as $index) {
            $docFreq += $index->docFreq($term);
        }

        return $docFreq;
    }

    /**
     * Retrieve similarity used by index reader
     *
     * All sub-indices must share the same similarity object.
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if there are no sub-indices or their similarities differ
     * @return \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity
     */
    public function getSimilarity()
    {
        if (count($this->_indices) == 0) {
            throw new RuntimeException('Indices list is empty');
        }

        $similarity = reset($this->_indices)->getSimilarity();

        foreach ($this->_indices as $index) {
            if ($index->getSimilarity() !== $similarity) {
                throw new RuntimeException('Indices have different similarity.');
            }
        }

        return $similarity;
    }

    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id  global document id
     * @param string $fieldName
     * @return float|null null if $id does not belong to any sub-index
     */
    public function norm($id, $fieldName)
    {
        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                return $index->norm($id, $fieldName);
            }

            $id -= $indexCount;
        }

        return null;
    }

    /**
     * Returns true if any documents have been deleted from any sub-index.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        foreach ($this->_indices as $index) {
            if ($index->hasDeletions()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Deletes a document from the index.
     * $id is a global document id
     *
     * @param integer|\ZendSearch\Lucene\Search\QueryHit $id
     * @throws \ZendSearch\Lucene\Exception\OutOfRangeException is thrown if $id is out of the range
     */
    public function delete($id)
    {
        // Accept a hit object as documented, the same way getDocument() does
        if ($id instanceof Search\QueryHit) {
            /* @var $id \ZendSearch\Lucene\Search\QueryHit */
            $id = $id->id;
        }

        foreach ($this->_indices as $index) {
            $indexCount = $index->count();

            if ($indexCount > $id) {
                $index->delete($id);
                return;
            }

            $id -= $indexCount;
        }

        throw new OutOfRangeException('Document id is out of the range.');
    }

    /**
     * Callback used to choose target index for new documents
     *
     * Function/method signature:
     *    SearchIndexInterface callbackFunction(Document $document, array $indices);
     *
     * null means "default documents distributing algorithm"
     *
     * @var callback
     */
    protected $_documentDistributorCallBack = null;

    /**
     * Set callback for choosing target index.
     *
     * @param callback $callback  a valid callback, or null to use the default algorithm
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if $callback is neither null nor callable
     */
    public function setDocumentDistributorCallback($callback)
    {
        if ($callback !== null  &&  !is_callable($callback)) {
            throw new InvalidArgumentException('$callback parameter must be a valid callback.');
        }

        $this->_documentDistributorCallBack = $callback;
    }

    /**
     * Get callback for choosing target index.
     *
     * @return callback
     */
    public function getDocumentDistributorCallback()
    {
        return $this->_documentDistributorCallBack;
    }

    /**
     * Adds a document to one of the sub-indices.
     *
     * The target is chosen by the distributor callback if one is set,
     * otherwise a sub-index is picked at random.
     *
     * @param \ZendSearch\Lucene\Document $document
     * @throws \ZendSearch\Lucene\Exception\RuntimeException if no callback is set and there are no sub-indices
     */
    public function addDocument(Document $document)
    {
        if ($this->_documentDistributorCallBack !== null) {
            $index = call_user_func($this->_documentDistributorCallBack, $document, $this->_indices);
        } else {
            // array_rand() cannot pick from an empty array
            if (count($this->_indices) == 0) {
                throw new RuntimeException('Indices list is empty');
            }

            $index = $this->_indices[array_rand($this->_indices)];
        }

        $index->addDocument($document);
    }

    /**
     * Commit changes resulting from delete() or undeleteAll() operations.
     */
    public function commit()
    {
        foreach ($this->_indices as $index) {
            $index->commit();
        }
    }

    /**
     * Optimize every sub-index.
     *
     * Merges all segments into one
     */
    public function optimize()
    {
        foreach ($this->_indices as $index) {
            // The interface method is spelled optimize()
            $index->optimize();
        }
    }

    /**
     * Returns an array of all terms in the sub-indices.
     *
     * @return array
     */
    public function terms()
    {
        $termsList = array();

        foreach ($this->_indices as $index) {
            $termsList[] = $index->terms();
        }

        // array_merge() without arguments is not usable on older PHP versions
        if (count($termsList) == 0) {
            return array();
        }

        // NOTE(review): array_unique() compares elements as strings by default;
        // confirm that the elements returned by terms() can be cast to string.
        return array_unique(call_user_func_array('array_merge', $termsList));
    }

    /**
     * Terms stream priority queue object
     *
     * @var \ZendSearch\Lucene\TermStreamsPriorityQueue
     */
    private $_termsStream = null;

    /**
     * Reset terms stream.
     *
     * Creates the priority queue over all sub-indices on first use.
     */
    public function resetTermsStream()
    {
        if ($this->_termsStream === null) {
            $this->_termsStream = new TermStreamsPriorityQueue($this->_indices);
        } else {
            $this->_termsStream->resetTermsStream();
        }
    }

    /**
     * Skip terms stream up to specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * The stream must have been created with resetTermsStream() first.
     *
     * @param \ZendSearch\Lucene\Index\Term $prefix
     */
    public function skipTo(Index\Term $prefix)
    {
        $this->_termsStream->skipTo($prefix);
    }

    /**
     * Scans terms dictionary and returns next term
     *
     * The stream must have been created with resetTermsStream() first.
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function nextTerm()
    {
        return $this->_termsStream->nextTerm();
    }

    /**
     * Returns term in current position
     *
     * The stream must have been created with resetTermsStream() first.
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function currentTerm()
    {
        return $this->_termsStream->currentTerm();
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     */
    public function closeTermsStream()
    {
        $this->_termsStream->closeTermsStream();
        $this->_termsStream = null;
    }

    /**
     * Undeletes all documents currently marked as deleted in the sub-indices.
     */
    public function undeleteAll()
    {
        foreach ($this->_indices as $index) {
            $index->undeleteAll();
        }
    }
}

View File

@@ -1,257 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
/**
 * Finite state machine that turns a flat sequence of literals and boolean
 * operators (NOT, AND, OR) into a list of conjunctions.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class BooleanExpressionRecognizer extends Lucene\AbstractFSM
{
    /** State Machine states */
    const ST_START        = 0;
    const ST_LITERAL      = 1;
    const ST_NOT_OPERATOR = 2;
    const ST_AND_OPERATOR = 3;
    const ST_OR_OPERATOR  = 4;

    /** Input symbols */
    const IN_LITERAL      = 0;
    const IN_NOT_OPERATOR = 1;
    const IN_AND_OPERATOR = 2;
    const IN_OR_OPERATOR  = 3;

    /**
     * Set when a NOT operator has been seen and not yet consumed by a literal
     *
     * @var boolean
     */
    private $_negativeLiteral = false;

    /**
     * Literal being processed
     *
     * @var mixed
     */
    private $_literal;

    /**
     * Conjunctions collected so far
     *
     * Each conjunction is an array of elements, and each element is a pair
     *      array(<literal>, <flag>)
     * where <flag> is false if the literal was preceded by NOT and true otherwise.
     *
     * Overall structure:
     * array( array( array(<literal>, <flag>), // first literal of first conjunction
     *               array(<literal>, <flag>), // second literal of first conjunction
     *               ...
     *             ),                          // end of first conjunction
     *        array( array(<literal>, <flag>), // first literal of second conjunction
     *               ...
     *             ),                          // end of second conjunction
     *        ...
     *      )                                  // end of structure
     *
     * @var array
     */
    private $_conjunctions = array();

    /**
     * Conjunction that is currently being filled
     *
     * @var array
     */
    private $_currentConjunction = array();

    /**
     * Object constructor
     *
     * Registers states, input symbols, transition rules and entry actions.
     */
    public function __construct()
    {
        $states = array(
            self::ST_START,
            self::ST_LITERAL,
            self::ST_NOT_OPERATOR,
            self::ST_AND_OPERATOR,
            self::ST_OR_OPERATOR,
        );
        $inputs = array(
            self::IN_LITERAL,
            self::IN_NOT_OPERATOR,
            self::IN_AND_OPERATOR,
            self::IN_OR_OPERATOR,
        );
        parent::__construct($states, $inputs);

        // Transition actions for an omitted operator between two operands
        $onImplicitOperator    = new Lucene\FSMAction($this, 'emptyOperatorAction');
        $onImplicitNotOperator = new Lucene\FSMAction($this, 'emptyNotOperatorAction');

        // Each rule: array(source state, input symbol, target state[, transition action])
        $rules = array(
            array(self::ST_START,        self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_START,        self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            array(self::ST_LITERAL,      self::IN_AND_OPERATOR, self::ST_AND_OPERATOR),
            array(self::ST_LITERAL,      self::IN_OR_OPERATOR,  self::ST_OR_OPERATOR),
            array(self::ST_LITERAL,      self::IN_LITERAL,      self::ST_LITERAL,      $onImplicitOperator),
            array(self::ST_LITERAL,      self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR, $onImplicitNotOperator),

            array(self::ST_NOT_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),

            array(self::ST_AND_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_AND_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            array(self::ST_OR_OPERATOR,  self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_OR_OPERATOR,  self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),
        );
        $this->addRules($rules);

        // Entry actions registered for the NOT, OR and LITERAL states
        $onNot     = new Lucene\FSMAction($this, 'notOperatorAction');
        $onOr      = new Lucene\FSMAction($this, 'orOperatorAction');
        $onLiteral = new Lucene\FSMAction($this, 'literalAction');

        $this->addEntryAction(self::ST_NOT_OPERATOR, $onNot);
        $this->addEntryAction(self::ST_OR_OPERATOR,  $onOr);
        $this->addEntryAction(self::ST_LITERAL,      $onLiteral);
    }

    /**
     * Feed the next operator into the state machine.
     *
     * Operators are given by class constants: IN_AND_OPERATOR, IN_OR_OPERATOR and IN_NOT_OPERATOR
     *
     * @param integer $operator
     */
    public function processOperator($operator)
    {
        $this->process($operator);
    }

    /**
     * Feed the next expression literal into the state machine.
     *
     * @param mixed $literal
     */
    public function processLiteral($literal)
    {
        // Remember the literal so that literalAction() can pick it up
        $this->_literal = $literal;

        $this->process(self::IN_LITERAL);
    }

    /**
     * Finish an expression and return the collected conjunctions
     *
     * The current conjunction is appended to the result first. Each
     * conjunction is an array of pairs
     *      array(<literal>, <flag>)
     * where <flag> is false if the literal was preceded by NOT and true otherwise:
     *
     * array( array( array(<literal>, <flag>), // first literal of first conjunction
     *               array(<literal>, <flag>), // second literal of first conjunction
     *               ...
     *             ),                          // end of first conjunction
     *        array( array(<literal>, <flag>), // first literal of second conjunction
     *               ...
     *             ),                          // end of second conjunction
     *        ...
     *      )                                  // end of structure
     *
     * @throws \ZendSearch\Lucene\Exception\UnexpectedValueException if the expression does not end with a literal
     * @return array
     */
    public function finishExpression()
    {
        $endsWithLiteral = ($this->getState() == self::ST_LITERAL);
        if (!$endsWithLiteral) {
            throw new Lucene\Exception\UnexpectedValueException('Literal expected.');
        }

        $this->_conjunctions[] = $this->_currentConjunction;

        return $this->_conjunctions;
    }

    /*********************************************************************
     * Actions implementation
     *********************************************************************/

    /**
     * Omitted operator between two literals
     *
     * Acts as OR unless the query parser's default operator is AND,
     * then processes the literal.
     */
    public function emptyOperatorAction()
    {
        if (QueryParser::getDefaultOperator() != QueryParser::B_AND) {
            $this->orOperatorAction();
        }

        $this->literalAction();
    }

    /**
     * Omitted operator followed by NOT
     *
     * Acts as OR unless the query parser's default operator is AND,
     * then processes the NOT operator.
     */
    public function emptyNotOperatorAction()
    {
        if (QueryParser::getDefaultOperator() != QueryParser::B_AND) {
            $this->orOperatorAction();
        }

        $this->notOperatorAction();
    }

    /**
     * NOT operator: mark the next literal as negated
     */
    public function notOperatorAction()
    {
        $this->_negativeLiteral = true;
    }

    /**
     * OR operator: close the current conjunction and start a new one
     */
    public function orOperatorAction()
    {
        $this->_conjunctions[]     = $this->_currentConjunction;
        $this->_currentConjunction = array();
    }

    /**
     * Literal: append it to the current conjunction
     */
    public function literalAction()
    {
        // The stored flag is true for a plain literal and false for a negated one
        $isPositive = !$this->_negativeLiteral;
        $this->_currentConjunction[] = array($this->_literal, $isPositive);

        // A NOT applies to a single literal only
        $this->_negativeLiteral = false;
    }
}

View File

@@ -1,15 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Exception;
/**
* Common interface for exceptions declared in the
* ZendSearch\Lucene\Search\Exception namespace.
*
* It declares no methods of its own and extends the Lucene-level
* exception interface.
*/
interface ExceptionInterface
extends \ZendSearch\Lucene\Exception\ExceptionInterface
{}

View File

@@ -1,26 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Exception;
use ZendSearch\Lucene\Exception;
/**
* Special exception type, which may be used to intercept wrong user input.
*
* It adds no members of its own: it extends UnexpectedValueException and
* implements the Search exception interface of this namespace.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class QueryParserException
extends Exception\UnexpectedValueException
implements ExceptionInterface
{}

View File

@@ -1,81 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Highlighter;
use ZendSearch\Lucene\Document;
/**
 * Highlighter that cycles through a fixed list of colors, one per highlight() call.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class DefaultHighlighter implements HighlighterInterface
{
    /**
     * Colors used for text highlighting, in the order they are handed out
     *
     * @var array
     */
    protected $_highlightColors = array(
        '#66ffff', '#ff66ff', '#ffff66',
        '#ff8888', '#88ff88', '#8888ff',
        '#88dddd', '#dd88dd', '#dddd88',
        '#aaddff', '#aaffdd', '#ddaaff',
        '#ddffaa', '#ffaadd', '#ffddaa',
    );

    /**
     * Position in the color list of the color to use next
     *
     * It advances on every highlight() call, so terms matching different
     * queries are highlighted using different colors.
     *
     * @var integer
     */
    protected $_currentColorIndex = 0;

    /**
     * HTML document to highlight
     *
     * @var \ZendSearch\Lucene\Document\HTML
     */
    protected $_doc;

    /**
     * Assign the document that highlight() will work on.
     *
     * @param \ZendSearch\Lucene\Document\HTML $document
     */
    public function setDocument(Document\HTML $document)
    {
        $this->_doc = $document;
    }

    /**
     * Return the document assigned with setDocument().
     *
     * @return \ZendSearch\Lucene\Document\HTML $document
     */
    public function getDocument()
    {
        return $this->_doc;
    }

    /**
     * Highlight the given words in the document with the next color.
     *
     * @param string|array $words  Words to highlight. They could be organized using the array or string.
     */
    public function highlight($words)
    {
        $palette = $this->_highlightColors;
        $color   = $palette[$this->_currentColorIndex];

        // Advance to the next color, wrapping around at the end of the list
        $this->_currentColorIndex = ($this->_currentColorIndex + 1) % count($palette);

        $this->_doc->highlight($words, $color);
    }
}

View File

@@ -1,42 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Highlighter;
use ZendSearch\Lucene\Document;
/**
* Contract for objects that highlight words in an HTML document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
interface HighlighterInterface
{
/**
* Set document for highlighting.
*
* @param \ZendSearch\Lucene\Document\HTML $document HTML document that highlight() is expected to work on
*/
public function setDocument(Document\HTML $document);
/**
* Get document for highlighting.
*
* @return \ZendSearch\Lucene\Document\HTML $document
*/
public function getDocument();
/**
* Highlight specified words (method is invoked once per subquery)
*
* @param string|array $words Words to highlight, given either as a string or as an array.
*/
public function highlight($words);
}

View File

@@ -1,209 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Document;
use ZendSearch\Lucene\Search\Highlighter\DefaultHighlighter;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
/**
 * Base class of all search queries.
 *
 * Provides boost handling, lazy initialisation of the query weight and the public
 * entry points for match highlighting. Concrete queries implement scoring, execution,
 * rewriting/optimization and their own highlighting logic.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
abstract class AbstractQuery
{
    /**
     * query boost factor
     *
     * @var float
     */
    private $_boost = 1;

    /**
     * AbstractQuery weight
     *
     * @var \ZendSearch\Lucene\Search\Weight\AbstractWeight
     */
    protected $_weight = null;

    /**
     * Gets the boost for this clause.  Documents matching
     * this clause will (in addition to the normal weightings) have their score
     * multiplied by boost.   The boost is 1.0 by default.
     *
     * @return float
     */
    public function getBoost()
    {
        return $this->_boost;
    }

    /**
     * Sets the boost for this query clause to $boost.
     *
     * @param float $boost
     */
    public function setBoost($boost)
    {
        $this->_boost = $boost;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float
     */
    abstract public function score($docId, Lucene\SearchIndexInterface $reader);

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    abstract public function matchedDocs();

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * AbstractQuery specific implementation
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     */
    abstract public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null);

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight
     */
    abstract public function createWeight(Lucene\SearchIndexInterface $reader);

    /**
     * Constructs and initializes a Weight for a _top-level_query_.
     *
     * If a weight is already present nothing is recalculated. Otherwise the weight is
     * created, normalized using the similarity's query norm and stored in $_weight.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight
     */
    protected function _initWeight(Lucene\SearchIndexInterface $reader)
    {
        // Check, that it's a top-level query and query weight is not initialized yet.
        if ($this->_weight !== null) {
            return $this->_weight;
        }

        $weight = $this->createWeight($reader);
        if ($this->_weight === null) {
            // createWeight() implementations are only required to *return* the weight;
            // keep it here so the calls below never dereference null.
            $this->_weight = $weight;
        }

        $sum       = $this->_weight->sumOfSquaredWeights();
        $queryNorm = $reader->getSimilarity()->queryNorm($sum);
        $this->_weight->normalize($queryNorm);

        return $this->_weight;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    abstract public function rewrite(Lucene\SearchIndexInterface $index);

    /**
     * Optimize query in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    abstract public function optimize(Lucene\SearchIndexInterface $index);

    /**
     * Reset query, so it can be reused within other queries or
     * with other indices
     */
    public function reset()
    {
        $this->_weight = null;
    }

    /**
     * Print a query
     *
     * @return string
     */
    abstract public function __toString();

    /**
     * Return query terms
     *
     * @return array
     */
    abstract public function getQueryTerms();

    /**
     * AbstractQuery specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    abstract protected function _highlightMatches(Highlighter $highlighter);

    /**
     * Highlight matches in $inputHTML
     *
     * @param string $inputHTML
     * @param string $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @param Highlighter|null $highlighter  Highlighter to use; a DefaultHighlighter is created when omitted
     * @return string Complete HTML of the highlighted document
     */
    public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null)
    {
        if ($highlighter === null) {
            $highlighter = new DefaultHighlighter();
        }

        $doc = Document\HTML::loadHTML($inputHTML, false, $defaultEncoding);
        $highlighter->setDocument($doc);

        $this->_highlightMatches($highlighter);

        return $doc->getHTML();
    }

    /**
     * Highlight matches in $inputHTMLFragment and return it (without HTML header and body tag)
     *
     * The fragment is converted to UTF-8 and wrapped into a minimal HTML document before
     * it is loaded; only the body content is returned.
     *
     * @param string $inputHTMLFragment
     * @param string $encoding   Input HTML string encoding
     * @param Highlighter|null $highlighter  Highlighter to use; a DefaultHighlighter is created when omitted
     * @return string
     */
    public function htmlFragmentHighlightMatches($inputHTMLFragment, $encoding = 'UTF-8', $highlighter = null)
    {
        if ($highlighter === null) {
            $highlighter = new DefaultHighlighter();
        }

        // '//IGNORE' asks iconv to drop characters that can't be represented in UTF-8
        $inputHTML = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                   . iconv($encoding, 'UTF-8//IGNORE', $inputHTMLFragment) . '</body></html>';

        $doc = Document\HTML::loadHTML($inputHTML);
        $highlighter->setDocument($doc);

        $this->_highlightMatches($highlighter);

        return $doc->getHTMLBody();
    }
}

View File

@@ -1,792 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Boolean extends AbstractQuery
{
/**
* Subqueries
* Array of Zend_Search_Lucene_Search_Query
*
* @var array
*/
private $_subqueries = array();
/**
* Subqueries signs.
* If true then subquery is required.
* If false then subquery is prohibited.
* If null then subquery is neither prohibited, nor required
*
* If array is null then all subqueries are required
*
* @var array
*/
private $_signs = array();
/**
* Result vector.
*
* @var array
*/
private $_resVector = null;
/**
* A score factor based on the fraction of all query subqueries
* that a document contains.
* float for conjunction queries
* array of float for non conjunction queries
*
* @var mixed
*/
private $_coord = null;
/**
 * Class constructor.  Create a new Boolean query object.
 *
 * When $subqueries is not an array the object keeps its defaults (no subqueries,
 * empty signs array). When it is an array, the signs are stored only if at least one
 * of them is not TRUE; otherwise the signs stay NULL, which means "all subqueries
 * are required". This differs from addSubquery() behavior.
 *
 * @param array $subqueries    Array of query objects
 * @param array $signs    Array of signs.  Sign is boolean|null.
 */
public function __construct($subqueries = null, $signs = null)
{
    if (!is_array($subqueries)) {
        return;
    }

    $this->_subqueries = $subqueries;
    $this->_signs      = null;

    if (!is_array($signs)) {
        return;
    }

    // Keep the signs only if some subquery is optional or prohibited
    foreach ($signs as $candidate) {
        if ($candidate !== true) {
            $this->_signs = $signs;
            return;
        }
    }
}
/**
 * Add a $subquery to this query.
 *
 * The sign is specified as:
 *     TRUE  - subquery is required
 *     FALSE - subquery is prohibited
 *     NULL  - subquery is neither prohibited, nor required
 *
 * While the signs are NULL (all subqueries required) adding another required subquery
 * keeps them NULL. The first non-required subquery materialises the signs array.
 * Signs are always stored under the same key as the corresponding subquery, so both
 * arrays stay aligned even if the subqueries array has gaps in its keys.
 *
 * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $subquery
 * @param boolean|null $sign
 * @return void
 */
public function addSubquery(AbstractQuery $subquery, $sign=null)
{
    // Append first and read back the key PHP actually assigned to the new entry
    $this->_subqueries[] = $subquery;
    end($this->_subqueries);
    $subqueryId = key($this->_subqueries);

    if ($this->_signs === null) {
        if ($sign === true) {
            // Still a pure conjunction, nothing to record
            return;
        }

        // All previous subqueries were required; mirror the subquery keys exactly
        $this->_signs = array_fill_keys(array_keys($this->_subqueries), true);
    }

    $this->_signs[$subqueryId] = $sign;
}
/**
 * Re-write queries into primitive queries
 *
 * Builds a new Boolean query with the same boost whose clauses are the rewritten
 * subqueries of this one, each keeping its sign (TRUE when all are required).
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function rewrite(Lucene\SearchIndexInterface $index)
{
    $rewritten = new self();
    $rewritten->setBoost($this->getBoost());

    $allRequired = ($this->_signs === null);

    foreach ($this->_subqueries as $position => $clause) {
        $rewrittenClause = $clause->rewrite($index);
        $clauseSign      = $allRequired ? true : $this->_signs[$position];

        $rewritten->addSubquery($rewrittenClause, $clauseSign);
    }

    return $rewritten;
}
/**
* Optimize query in the context of specified index
*
* Steps, in order:
*  1. optimize every subquery and drop Insignificant ones;
*  2. drop EmptyResult subqueries (a required empty subquery makes the whole query empty);
*  3. collapse a single remaining clause into that clause (boost is multiplied in);
*  4. decompose Term / MultiTerm subqueries into flat term lists and regroup them into
*     as few Term / MultiTerm clauses as possible.
*
* The returned object may be an Insignificant, EmptyResult, Term, MultiTerm or Boolean
* query, or one of the optimized subqueries itself.
*
* @param \ZendSearch\Lucene\SearchIndexInterface $index
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function optimize(Lucene\SearchIndexInterface $index)
{
$subqueries = array();
$signs = array();
// Optimize all subqueries
// ($subqueries and $signs are re-indexed 0..n-1 here and kept parallel from now on)
foreach ($this->_subqueries as $id => $subquery) {
$subqueries[] = $subquery->optimize($index);
$signs[] = ($this->_signs === null)? true : $this->_signs[$id];
}
// Remove insignificant subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Insignificant) {
// Insignificant subquery has to be removed anyway
unset($subqueries[$id]);
unset($signs[$id]);
}
}
if (count($subqueries) == 0) {
// Boolean query doesn't have any non-insignificant subqueries
return new Insignificant();
}
// Check if all non-insignificant subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new Insignificant();
}
// Check for empty subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof EmptyResult) {
if ($signs[$id] === true) {
// Matching is required, but is actually empty
return new EmptyResult();
} else {
// Matching is optional or prohibited, but is empty
// Remove it from subqueries and signs list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if reduced subqueries list is empty
if (count($subqueries) == 0) {
return new EmptyResult();
}
// Check if all non-empty subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new EmptyResult();
}
// Check, if reduced subqueries list has only one entry
if (count($subqueries) == 1) {
// It's a query with only one required or optional clause
// (it's already checked, that it's not a prohibited clause)
if ($this->getBoost() == 1) {
return reset($subqueries);
}
// Clone so that the boost change doesn't leak into the shared subquery object
$optimizedQuery = clone reset($subqueries);
$optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost());
return $optimizedQuery;
}
// Prepare first candidate for optimized query
// (built before the decomposition below, so it still holds every remaining subquery)
$optimizedQuery = new self($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
// Flat lists of decomposed terms; the three arrays are parallel
$terms = array();
$tsigns = array();
$boostFactors = array();
// Try to decompose term and multi-term subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Term) {
$terms[] = $subquery->getTerm();
$tsigns[] = $signs[$id];
$boostFactors[] = $subquery->getBoost();
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} elseif ($subquery instanceof MultiTerm) {
$subTerms = $subquery->getTerms();
$subSigns = $subquery->getSigns();
if ($signs[$id] === true) {
// It's a required multi-term subquery.
// Something like '... +(+term1 -term2 term3 ...) ...'
// Multi-term required subquery can be decomposed only if it contains
// required terms and doesn't contain prohibited terms:
// ... +(+term1 term2 ...) ... => ... +term1 term2 ...
//
// Check this
$hasRequired = false;
$hasProhibited = false;
if ($subSigns === null) {
// All subterms are required
$hasRequired = true;
} else {
foreach ($subSigns as $sign) {
if ($sign === true) {
$hasRequired = true;
} elseif ($sign === false) {
$hasProhibited = true;
break;
}
}
}
// Continue if subquery has prohibited terms or doesn't have required terms
if ($hasProhibited || !$hasRequired) {
continue;
}
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($subSigns === null)? true : $subSigns[$termId];
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else { // $signs[$id] === null || $signs[$id] === false
// It's an optional or prohibited multi-term subquery.
// Something like '... (+term1 -term2 term3 ...) ...'
// or
// something like '... -(+term1 -term2 term3 ...) ...'
// Multi-term optional and prohibited subqueries can be decomposed
// only if all terms are optional.
//
// Check if all terms are optional.
$onlyOptional = true;
if ($subSigns === null) {
// All subterms are required
$onlyOptional = false;
} else {
foreach ($subSigns as $sign) {
if ($sign !== null) {
$onlyOptional = false;
break;
}
}
}
// Continue if non-optional terms are present in this multi-term subquery
if (!$onlyOptional) {
continue;
}
// Every term inherits the sign of the enclosing clause
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($signs[$id] === null)? null /* optional */ :
false /* prohibited */;
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if there are no decomposed subqueries
if (count($terms) == 0 ) {
// return prepared candidate
return $optimizedQuery;
}
// Check, if all subqueries have been decomposed and all terms have the same boost factor
if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) {
$optimizedQuery = new MultiTerm($terms, $tsigns);
$optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost());
return $optimizedQuery;
}
// This boolean query can't be transformed to Term/MultiTerm query and still contains
// several subqueries
// Separate prohibited terms
$prohibitedTerms = array();
foreach ($terms as $id => $term) {
if ($tsigns[$id] === false) {
$prohibitedTerms[] = $term;
unset($terms[$id]);
unset($tsigns[$id]);
unset($boostFactors[$id]);
}
}
// Regroup the remaining (required/optional) terms into a single clause where possible
if (count($terms) == 1) {
$clause = new Term(reset($terms));
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
$signs[] = reset($tsigns);
// Clear terms list
$terms = array();
} elseif (count($terms) > 1 && count(array_unique($boostFactors)) == 1) {
$clause = new MultiTerm($terms, $tsigns);
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
// Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
$signs[] = (in_array(true, $tsigns))? true : null;
// Clear terms list
$terms = array();
}
// (terms with differing boost factors are left in $terms, see the final check below)
if (count($prohibitedTerms) == 1) {
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new Term(reset($prohibitedTerms));
$signs[] = false;
// Clear prohibited terms list
$prohibitedTerms = array();
} elseif (count($prohibitedTerms) > 1) {
// prepare signs array
$prohibitedSigns = array();
foreach ($prohibitedTerms as $id => $term) {
// all prohibited terms are grouped as optional into multi-term query
$prohibitedSigns[$id] = null;
}
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new MultiTerm($prohibitedTerms, $prohibitedSigns);
// Clause sign is 'prohibited'
$signs[] = false;
// Clear terms list
$prohibitedTerms = array();
}
/** @todo Group terms with the same boost factors together */
// Check, that all terms are processed
// Replace candidate for optimized query
// (otherwise the candidate prepared before decomposition is returned unchanged)
if (count($terms) == 0 && count($prohibitedTerms) == 0) {
$optimizedQuery = new self($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
}
return $optimizedQuery;
}
/**
* Returns subqueries
*
* The array is returned as stored; its keys correspond to the keys of getSigns()
* when signs are present.
*
* @return array
*/
public function getSubqueries()
{
return $this->_subqueries;
}
/**
* Return subqueries signs
*
* Each sign is TRUE (required), FALSE (prohibited) or NULL (optional).
* NULL is returned instead of an array when all subqueries are required.
*
* @return array|null
*/
public function getSigns()
{
return $this->_signs;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * The created weight is also remembered in $_weight.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return \ZendSearch\Lucene\Search\Weight\Boolean
 */
public function createWeight(Lucene\SearchIndexInterface $reader)
{
    $booleanWeight = new Weight\Boolean($this, $reader);
    $this->_weight = $booleanWeight;

    return $booleanWeight;
}
/**
 * Calculate result vector for Conjunction query
 * (like '<subquery1> AND <subquery2> AND <subquery3>')
 *
 * The result is the intersection (by document id key) of the matchedDocs() vectors
 * of all subqueries. Vectors are processed from the smallest to the largest and the
 * loop stops as soon as the intersection becomes empty.
 */
private function _calculateConjunctionResult()
{
    // No subqueries => empty result; otherwise "not started yet"
    $this->_resVector = (count($this->_subqueries) == 0) ? array() : null;

    $candidateSets  = array();
    $candidateSizes = array();
    $candidateOrder = array(); // tie-breaker, keeps array_multisort() from comparing the vectors

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $matched = $subquery->matchedDocs();

        $candidateSets[]  = $matched;
        $candidateSizes[] = count($matched);
        $candidateOrder[] = $subqueryId;
    }

    // Smallest vectors first
    array_multisort($candidateSizes, SORT_ASC, SORT_NUMERIC,
                    $candidateOrder, SORT_ASC, SORT_NUMERIC,
                    $candidateSets);

    foreach ($candidateSets as $candidate) {
        if ($this->_resVector === null) {
            $this->_resVector = $candidate;
        } else {
            // Manual key intersection (historic workaround for array_intersect_key() slowness)
            $survivors = array();
            foreach ($this->_resVector as $docId => $payload) {
                if (isset($candidate[$docId])) {
                    $survivors[$docId] = $payload;
                }
            }
            $this->_resVector = $survivors;
        }

        if (count($this->_resVector) == 0) {
            // Nothing left to intersect with
            break;
        }
    }

    // No ksort() needed: the intersection keeps the element order of the first vector
}
/**
 * Calculate result vector for non Conjunction query
 * (like '<subquery1> AND <subquery2> AND NOT <subquery3> OR <subquery4>')
 *
 * If there is at least one required subquery the result is the intersection of the
 * required vectors; otherwise it is the union of the optional vectors. Prohibited
 * subqueries are ignored here (they are handled during scoring). The result is sorted
 * by document id.
 */
private function _calculateNonConjunctionResult()
{
    $mustSets   = array();
    $mustSizes  = array();
    $mustOrder  = array(); // tie-breaker, keeps array_multisort() from comparing the vectors
    $optional   = array();

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $sign = $this->_signs[$subqueryId];

        if ($sign === false) {
            // Prohibited: matchedDocs() may include non-matching ids and calculating
            // the vector is expensive without affecting the result, so skip it.
            continue;
        }

        if ($sign === true) {
            $matched     = $subquery->matchedDocs();
            $mustSets[]  = $matched;
            $mustSizes[] = count($matched);
            $mustOrder[] = $subqueryId;
        } else {
            // Neither required nor prohibited: array union
            $optional += $subquery->matchedDocs();
        }
    }

    // Smallest required vectors first
    array_multisort($mustSizes, SORT_ASC, SORT_NUMERIC,
                    $mustOrder, SORT_ASC, SORT_NUMERIC,
                    $mustSets);

    $required = null;
    foreach ($mustSets as $candidate) {
        if ($required === null) {
            $required = $candidate;
        } else {
            // Manual key intersection (historic workaround for array_intersect_key() slowness)
            $survivors = array();
            foreach ($required as $docId => $payload) {
                if (isset($candidate[$docId])) {
                    $survivors[$docId] = $payload;
                }
            }
            $required = $survivors;
        }

        if (count($required) == 0) {
            // Nothing left to intersect with
            break;
        }
    }

    $this->_resVector = ($required !== null) ? $required : $optional;

    ksort($this->_resVector, SORT_NUMERIC);
}
/**
 * Score calculator for conjunction queries (all subqueries are required)
 *
 * The coord factor is computed once, lazily, as coord(n, n) for n subqueries and cached
 * in $_coord. A document scores 0 as soon as any subquery scores 0 for it.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function _conjunctionScore($docId, Lucene\SearchIndexInterface $reader)
{
    if ($this->_coord === null) {
        $subqueriesCount = count($this->_subqueries);
        $this->_coord = $reader->getSimilarity()->coord($subqueriesCount, $subqueriesCount);
    }

    $score = 0;

    foreach ($this->_subqueries as $subquery) {
        $subscore = $subquery->score($docId, $reader);

        if ($subscore == 0) {
            return 0;
        }

        // Reuse the value computed above instead of scoring the subquery a second time
        $score += $subscore * $this->_coord;
    }

    // NOTE(review): coord is applied both per subquery and to the sum, as in the
    // original implementation - confirm that this is intended.
    return $score * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all subqueries are required)
 *
 * On first use a table of coord factors is built, indexed by the number of matched
 * subqueries (0 .. number of non-prohibited subqueries). The document scores 0 if a
 * prohibited subquery matches or a required subquery doesn't match; otherwise the sum
 * of the non-zero subscores is multiplied by the coord factor and the boost.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, Lucene\SearchIndexInterface $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        // Count the subqueries that are not prohibited
        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false) {
                $maxCoord++;
            }
        }

        for ($matched = 0; $matched <= $maxCoord; $matched++) {
            $this->_coord[$matched] = $reader->getSimilarity()->coord($matched, $maxCoord);
        }
    }

    $total = 0;
    $hits  = 0;

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $subscore = $subquery->score($docId, $reader);
        $isMatch  = ($subscore != 0);
        $sign     = $this->_signs[$subqueryId];

        // Prohibited but matching, or required but not matching
        if (($sign === false && $isMatch) || ($sign === true && !$isMatch)) {
            return 0;
        }

        if ($isMatch) {
            $hits++;
            $total += $subscore;
        }
    }

    return $total * $this->_coord[$hits] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Required subqueries are executed with the (shared) documents filter, all other
 * subqueries without one. Afterwards the result vector is calculated using the
 * conjunction algorithm when all subqueries are required (signs are NULL) and the
 * non-conjunction algorithm otherwise.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
 */
public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
{
    // Initialize weight if it's not done yet
    $this->_initWeight($reader);

    if ($docsFilter === null) {
        // Create local documents filter if it's not provided by upper query
        $docsFilter = new Index\DocsFilter();
    }

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        // Strict check: only NULL means "all required" (same test as everywhere else
        // in this class); a loose comparison would also accept an empty array.
        if ($this->_signs === null || $this->_signs[$subqueryId] === true) {
            // Subquery is required
            $subquery->execute($reader, $docsFilter);
        } else {
            $subquery->execute($reader);
        }
    }

    if ($this->_signs === null) {
        $this->_calculateConjunctionResult();
    } else {
        $this->_calculateNonConjunctionResult();
    }
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* Returns the result vector as last calculated by execute(); the property is
* initialised to NULL, so NULL is returned if no result has been calculated yet.
*
* @return array
*/
public function matchedDocs()
{
return $this->_resVector;
}
/**
 * Score specified document
 *
 * Documents that are not part of the result vector score 0. Otherwise the conjunction
 * or non-conjunction calculator is used, depending on whether signs are NULL.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function score($docId, Lucene\SearchIndexInterface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    return ($this->_signs === null)
        ? $this->_conjunctionScore($docId, $reader)
        : $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms
 *
 * Merges the terms of every subquery that is not prohibited.
 *
 * @return array
 */
public function getQueryTerms()
{
    $collected   = array();
    $allRequired = ($this->_signs === null);

    foreach ($this->_subqueries as $position => $clause) {
        if (!$allRequired && $this->_signs[$position] === false) {
            // Prohibited clause: its terms don't belong to the query terms
            continue;
        }

        $collected = array_merge($collected, $clause->getQueryTerms());
    }

    return $collected;
}
/**
 * Query specific matches highlighting
 *
 * Delegates to every subquery that is not prohibited.
 *
 * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $allRequired = ($this->_signs === null);

    foreach ($this->_subqueries as $position => $clause) {
        if (!$allRequired && $this->_signs[$position] === false) {
            // Matches of prohibited clauses are not highlighted
            continue;
        }

        $clause->_highlightMatches($highlighter);
    }
}
/**
 * Print a query
 *
 * Every subquery is rendered as '(<subquery>)', prefixed with '+' when required and
 * '-' when prohibited; clauses are separated by a single space. A boost other than 1
 * is appended as ')^<boost>' around the whole expression.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $clauses = array();

    foreach ($this->_subqueries as $id => $subquery) {
        if ($this->_signs === null || $this->_signs[$id] === true) {
            $prefix = '+';
        } elseif ($this->_signs[$id] === false) {
            $prefix = '-';
        } else {
            $prefix = '';
        }

        $clauses[] = $prefix . '(' . $subquery->__toString() . ')';
    }

    // Join by position rather than by key: subquery keys are not guaranteed to start
    // at 0 (optimize() may remove clauses), which used to produce a leading space.
    $query = implode(' ', $clauses);

    if ($this->getBoost() != 1) {
        $query = '(' . $query . ')^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@@ -1,123 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
 * Query that never matches any document.
 *
 * It is a primitive query: rewriting and optimizing return the object itself,
 * executing it does nothing, and it reports no matched documents and no terms.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class EmptyResult extends AbstractQuery
{
    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        return $this;
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        // "EmptyResult" query is a primitive query and doesn't need to be optimized
        return $this;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * The weight is also stored in $_weight (as Boolean::createWeight() does), so code
     * relying on the inherited property after calling createWeight() finds it there.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\EmptyResultWeight
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        $this->_weight = new Weight\EmptyResultWeight();

        return $this->_weight;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     */
    public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
    {
        // Do nothing
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array Always an empty array
     */
    public function matchedDocs()
    {
        return array();
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float Always 0
     */
    public function score($docId, Lucene\SearchIndexInterface $reader)
    {
        return 0;
    }

    /**
     * Return query terms
     *
     * @return array Always an empty array
     */
    public function getQueryTerms()
    {
        return array();
    }

    /**
     * Query specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        // Do nothing
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        return '<EmptyQuery>';
    }
}

View File

@@ -1,472 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\OutOfBoundsException;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Fuzzy extends AbstractQuery
{
/** Default minimum similarity */
const DEFAULT_MIN_SIMILARITY = 0.5;
/**
* Maximum number of matched terms.
* Apache Lucene defines this limitation as boolean query maximum number of clauses:
* org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
*/
const MAX_CLAUSE_COUNT = 1024;
/**
* Array of precalculated max distances
*
* keys are integers representing a word size
*/
private $_maxDistances = array();
/**
* Base searching term.
*
* @var \ZendSearch\Lucene\Index\Term
*/
private $_term;
/**
* A value between 0 and 1 to set the required similarity
* between the query term and the matching terms. For example, for a
* _minimumSimilarity of 0.5 a term of the same length
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than length(term)*0.5
*
* @var float
*/
private $_minimumSimilarity;
/**
* The length of common (non-fuzzy) prefix
*
* @var integer
*/
private $_prefixLength;
/**
* Matched terms.
*
* Matched terms list.
* It's filled during the search (rewrite operation) and may be used for search result
* post-processing
*
* Array of Zend_Search_Lucene_Index_Term objects
*
* @var array
*/
private $_matches = null;
/**
* Matched terms scores
*
* @var array
*/
private $_scores = null;
/**
* Array of the term keys.
* Used to sort terms in alphabetical order if terms have the same scores
*
* @var array
*/
private $_termKeys = null;
/**
* Default non-fuzzy prefix length
*
* @var integer
*/
private static $_defaultPrefixLength = 3;
/**
 * Fuzzy query constructor.
 *
 * @param \ZendSearch\Lucene\Index\Term $term  Base term to search similar terms for
 * @param float   $minimumSimilarity  Required similarity, 0 <= value < 1
 * @param integer $prefixLength       Length of the common non-fuzzy prefix;
 *                                    the class default is used when NULL
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
 */
public function __construct(Index\Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = null)
{
    // Validate the similarity range first, then the prefix length
    if ($minimumSimilarity < 0) {
        throw new InvalidArgumentException('minimumSimilarity cannot be less than 0');
    } elseif ($minimumSimilarity >= 1) {
        throw new InvalidArgumentException('minimumSimilarity cannot be greater than or equal to 1');
    }

    if ($prefixLength < 0) {
        throw new InvalidArgumentException('prefixLength cannot be less than 0');
    }

    $this->_term              = $term;
    $this->_minimumSimilarity = $minimumSimilarity;

    if ($prefixLength === null) {
        $prefixLength = self::$_defaultPrefixLength;
    }
    $this->_prefixLength = $prefixLength;
}
/**
* Get default non-fuzzy prefix length
*
* This is the value the constructor falls back to when no prefix length is given.
*
* @return integer
*/
public static function getDefaultPrefixLength()
{
return self::$_defaultPrefixLength;
}
/**
* Set default non-fuzzy prefix length
*
* The value is stored in a static property, so it applies to every Fuzzy query
* constructed afterwards without an explicit prefix length. It is not validated here.
*
* @param integer $defaultPrefixLength
*/
public static function setDefaultPrefixLength($defaultPrefixLength)
{
self::$_defaultPrefixLength = $defaultPrefixLength;
}
/**
 * Calculate maximum distance for specified word length
 *
 * The result is (1 - minimumSimilarity) * (min($termLength, $length) + $prefixLength),
 * truncated to an integer. It is cached in $_maxDistances under the key $length.
 *
 * @param integer $prefixLength
 * @param integer $termLength
 * @param integer $length
 * @return integer
 */
private function _calculateMaxDistance($prefixLength, $termLength, $length)
{
    $comparedLength = min($termLength, $length) + $prefixLength;
    $maxDistance    = (int) ((1 - $this->_minimumSimilarity)*$comparedLength);

    $this->_maxDistances[$length] = $maxDistance;

    return $maxDistance;
}
/**
* Re-write query into primitive queries in the context of specified index
*
* @param \ZendSearch\Lucene\SearchIndexInterface $index
* @throws \ZendSearch\Lucene\Exception\OutOfBoundsException
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function rewrite(Lucene\SearchIndexInterface $index)
{
    $this->_matches  = array();
    $this->_scores   = array();
    $this->_termKeys = array();

    if ($this->_term->field === null) {
        // Search through all fields
        $fields = $index->getFieldNames(true /* indexed fields list */);
    } else {
        $fields = array($this->_term->field);
    }

    $prefix           = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
    $prefixByteLength = strlen($prefix);
    // An empty prefix contributes nothing to the similarity denominator
    $prefixUtf8Length = ($prefixByteLength == 0) ? 0 : Index\Term::getLength($prefix);
    $termRest         = substr($this->_term->text, $prefixByteLength);
    // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
    $termRestLength   = strlen($termRest);
    $scaleFactor      = 1/(1 - $this->_minimumSimilarity);
    $maxTerms         = Lucene\Lucene::getTermsPerQueryLimit();

    foreach ($fields as $field) {
        $index->resetTermsStream();
        // With an empty prefix this positions the stream at the first term of the field
        $index->skipTo(new Index\Term($prefix, $field));

        while (($current = $index->currentTerm()) !== null
            && $current->field == $field
            && ($prefixByteLength == 0 || substr($current->text, 0, $prefixByteLength) === $prefix)
        ) {
            // Calculate similarity
            $target       = substr($current->text, $prefixByteLength);
            $targetLength = strlen($target);

            $maxDistance = isset($this->_maxDistances[$targetLength])
                ? $this->_maxDistances[$targetLength]
                : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

            if ($termRestLength == 0 || $targetLength == 0) {
                // One side has nothing left to compare, so the edit distance is simply
                // the length of the other side. Handling this here (for both the
                // prefixed and the prefix-less search) avoids dividing by zero below.
                $similarity = ($prefixUtf8Length == 0)
                    ? 0
                    : 1 - max($termRestLength, $targetLength)/$prefixUtf8Length;
            } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                // Just adding the characters of term to target or vice-versa results in
                // too many edits. For example "pre" has length 3 and "prefixes" has
                // length 8, so the edit distance cannot be less than abs(3-8) = 5.
                // If the maximum edit distance is 4 the word can be discarded
                // without looking at it.
                $similarity = 0;
            } else {
                $similarity = 1 - levenshtein($termRest, $target)
                                  /($prefixUtf8Length + min($termRestLength, $targetLength));
            }

            if ($similarity > $this->_minimumSimilarity) {
                $this->_matches[]  = $current;
                $this->_termKeys[] = $current->key();
                $this->_scores[]   = ($similarity - $this->_minimumSimilarity)*$scaleFactor;

                // A limit of 0 means "no limit"
                if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                    throw new OutOfBoundsException('Terms per query limit is reached.');
                }
            }

            $index->nextTerm();
        }

        $index->closeTermsStream();
    }

    if (count($this->_matches) == 0) {
        return new EmptyResult();
    } elseif (count($this->_matches) == 1) {
        return new Term(reset($this->_matches));
    } else {
        $rewrittenQuery = new Boolean();

        // Best scores first; ties are ordered by term key
        array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                        $this->_termKeys, SORT_ASC,  SORT_STRING,
                        $this->_matches);

        $termCount = 0;
        foreach ($this->_matches as $id => $matchedTerm) {
            $subquery = new Term($matchedTerm);
            $subquery->setBoost($this->_scores[$id]);

            $rewrittenQuery->addSubquery($subquery);

            $termCount++;
            if ($termCount >= self::MAX_CLAUSE_COUNT) {
                break;
            }
        }

        return $rewrittenQuery;
    }
}
/**
 * Optimize query in the context of specified index.
 *
 * Not supported for this query: it unconditionally throws, the query
 * has to be rewritten first.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function optimize(Lucene\SearchIndexInterface $index)
{
    $message = 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)';

    throw new UnsupportedMethodCallException($message);
}
/**
 * Return the terms matched by the last rewrite() call.
 *
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if no matches have been collected yet
 * @return array
 */
public function getQueryTerms()
{
    if ($this->_matches !== null) {
        return $this->_matches;
    }

    throw new RuntimeException('Search or rewrite operations have to be performed before.');
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * Not supported for this query: it unconditionally throws, the query
 * has to be rewritten first.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
 * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight
 */
public function createWeight(Lucene\SearchIndexInterface $reader)
{
    $message = 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)';

    throw new UnsupportedMethodCallException($message);
}
/**
 * Execute query in context of index reader.
 *
 * Not supported for this query: it unconditionally throws, the query
 * has to be rewritten first.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
 * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
 */
public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
{
    $message = 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)';

    throw new UnsupportedMethodCallException($message);
}
/**
 * Get document ids likely matching the query.
 *
 * Not supported for this query: it unconditionally throws, the query
 * has to be rewritten first.
 *
 * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
 * @return array
 */
public function matchedDocs()
{
    $message = 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)';

    throw new UnsupportedMethodCallException($message);
}
/**
 * Score specified document.
 *
 * Not supported for this query: it unconditionally throws, the query
 * has to be rewritten first.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException always
 * @return float
 */
public function score($docId, Lucene\SearchIndexInterface $reader)
{
    $message = 'Fuzzy query should not be directly used for search. Use $query->rewrite($index)';

    throw new UnsupportedMethodCallException($message);
}
/**
 * Query specific matches highlighting
 *
 * Tokenizes the 'body' field of the highlighter's document with the default
 * analyzer and highlights every token that starts with the query prefix and
 * whose similarity to the query term exceeds the minimum similarity.
 *
 * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $words = array();

    $prefix           = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
    $prefixByteLength = strlen($prefix);
    $prefixUtf8Length = Index\Term::getLength($prefix);
    $termRest         = substr($this->_term->text, $prefixByteLength);
    // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
    $termRestLength   = strlen($termRest);

    $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
    $tokens  = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

    foreach ($tokens as $token) {
        $termText = $token->getTermText();

        if (substr($termText, 0, $prefixByteLength) == $prefix) {
            // Calculate similarity
            $target       = substr($termText, $prefixByteLength);
            $targetLength = strlen($target);

            $maxDistance = isset($this->_maxDistances[$targetLength])
                ? $this->_maxDistances[$targetLength]
                : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

            if ($termRestLength == 0) {
                // we don't have anything to compare. That means if we just add
                // the letters for current term we get the new word
                $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $targetLength/$prefixUtf8Length);
            } elseif ($targetLength == 0) {
                $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length);
            } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                // Just adding the characters of term to target or vice-versa results in
                // too many edits. For example "pre" has length 3 and "prefixes" has
                // length 8, so the edit distance cannot be less than abs(3-8) = 5.
                // If the maximum edit distance is 4 the word can be discarded
                // without looking at it.
                $similarity = 0;
            } else {
                $similarity = 1 - levenshtein($termRest, $target)
                                  /($prefixUtf8Length + min($termRestLength, $targetLength));
            }

            if ($similarity > $this->_minimumSimilarity) {
                $words[] = $termText;
            }
        }
    }

    $highlighter->highlight($words);
}
/**
 * Print a query
 *
 * Format: [field:]text~[minimumSimilarity][^boost]. The similarity is
 * printed only when it differs from the default, the boost only when it
 * differs from 1.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $result = '';

    if ($this->_term->field !== null) {
        $result .= $this->_term->field . ':';
    }

    $result .= $this->_term->text . '~';

    if ($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY) {
        $result .= round($this->_minimumSimilarity, 4);
    }

    if ($this->getBoost() != 1) {
        $result .= '^' . round($this->getBoost(), 4);
    }

    return $result;
}
}

View File

@@ -1,124 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
 * The insignificant query returns empty result, but doesn't limit result set as a part of other queries
 *
 * Every operation is a no-op: rewriting and optimizing return the query
 * itself, execution does nothing, no documents match and no terms are exposed.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class Insignificant extends AbstractQuery
{
    /**
     * Re-write query into primitive queries in the context of specified index.
     * The query is returned unchanged.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        return $this;
    }

    /**
     * Optimize query in the context of specified index.
     * The query is returned unchanged.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        return $this;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\EmptyResultWeight
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        $weight = new Weight\EmptyResultWeight();

        return $weight;
    }

    /**
     * Execute query in context of index reader.
     * Nothing has to be prepared for this query.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     */
    public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
    {
        // Do nothing
    }

    /**
     * Get document ids likely matching the query.
     *
     * It's an array with document ids as keys (performance considerations);
     * for this query it is always empty.
     *
     * @return array
     */
    public function matchedDocs()
    {
        return array();
    }

    /**
     * Score specified document. Always zero for this query.
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float
     */
    public function score($docId, Lucene\SearchIndexInterface $reader)
    {
        return 0;
    }

    /**
     * Return query terms. This query has none.
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return array();
    }

    /**
     * Query specific matches highlighting; there is nothing to highlight.
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        // Do nothing
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        return '<InsignificantQuery>';
    }
}

View File

@@ -1,649 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class MultiTerm extends AbstractQuery
{
/**
 * Terms to find.
 * Array of \ZendSearch\Lucene\Index\Term objects.
 *
 * @var array
 */
private $_terms = array();
/**
 * Term signs, indexed by the same ids as $_terms.
 * If true then term is required.
 * If false then term is prohibited.
 * If null then term is neither prohibited, nor required
 *
 * If the property itself is null then all terms are required
 *
 * @var array|null
 */
private $_signs;
/**
 * Result vector: matched document ids are the array keys.
 *
 * @var array|null
 */
private $_resVector = null;
/**
 * Terms frequency vectors.
 * Array of Arrays:
 * term1Id => (docId => freq, ...)
 * term2Id => (docId => freq, ...)
 *
 * @var array
 */
private $_termsFreqs = array();
/**
 * A score factor based on the fraction of all query terms
 * that a document contains.
 * float for conjunction queries
 * array of float (indexed by number of matched terms) for non conjunction queries
 *
 * @var mixed
 */
private $_coord = null;
/**
 * Terms weights, indexed by term id (filled through setWeight()).
 * Array of \ZendSearch\Lucene\Search\Weight\Term objects.
 *
 * @var array
 */
private $_weights = array();
/**
 * Class constructor.  Create a new multi-term query object.
 *
 * if $signs array is omitted then all terms are required
 * it differs from addTerm() behavior, but should never be used
 *
 * A signs array consisting only of TRUE values is normalized to "no signs"
 * (all terms required). Nothing is initialized if $terms is not an array.
 *
 * @param array $terms    Array of \ZendSearch\Lucene\Index\Term objects
 * @param array $signs    Array of signs.  Sign is boolean|null.
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the terms per query limit is exceeded
 */
public function __construct($terms = null, $signs = null)
{
    if (!is_array($terms)) {
        return;
    }

    // A limit of 0 means "no limit", as elsewhere in the package where this
    // setting is checked; without the guard every non-empty list would be rejected.
    $limit = Lucene\Lucene::getTermsPerQueryLimit();
    if ($limit != 0 && count($terms) > $limit) {
        throw new InvalidArgumentException('Terms per query limit is reached.');
    }

    $this->_terms = $terms;
    $this->_signs = null;

    // Keep the signs only if at least one term is not required
    if (is_array($signs)) {
        foreach ($signs as $sign) {
            if ($sign !== true) {
                $this->_signs = $signs;
                break;
            }
        }
    }
}
/**
 * Add a $term (\ZendSearch\Lucene\Index\Term) to this query.
 *
 * The sign is specified as:
 *     TRUE  - term is required
 *     FALSE - term is prohibited
 *     NULL  - term is neither prohibited, nor required
 *
 * While every term is required no signs array is kept at all; it is created
 * lazily as soon as the first non-required term is added.
 *
 * @param  \ZendSearch\Lucene\Index\Term $term
 * @param  boolean|null $sign
 * @return void
 */
public function addTerm(Index\Term $term, $sign = null)
{
    if ($sign !== true || $this->_signs !== null) { // Skip, if all terms are required
        if ($this->_signs === null) { // All previous terms are required
            $this->_signs = array();
            // Key the back-filled signs by term id so that both arrays stay
            // aligned even if the terms array is not a zero-based list.
            foreach (array_keys($this->_terms) as $termId) {
                $this->_signs[$termId] = true;
            }
        }
        $this->_signs[] = $sign;
    }

    $this->_terms[] = $term;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * A query without terms becomes an empty result. If every term names its
 * field the query is returned as is; otherwise it is turned into a boolean
 * query of rewritten single-term subqueries carrying the same signs and boost.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function rewrite(Lucene\SearchIndexInterface $index)
{
    if (count($this->_terms) == 0) {
        return new EmptyResult();
    }

    // Look for a term which is not bound to a field
    $hasUnqualifiedTerm = false;
    foreach ($this->_terms as $candidate) {
        if ($candidate->field === null) {
            $hasUnqualifiedTerm = true;
            break;
        }
    }

    if (!$hasUnqualifiedTerm) {
        return $this;
    }

    // transform multiterm query to boolean and apply rewrite() method to subqueries.
    $booleanQuery = new Boolean();
    $booleanQuery->setBoost($this->getBoost());

    foreach ($this->_terms as $termId => $term) {
        $termQuery = new Term($term);
        $rewritten = $termQuery->rewrite($index);
        $sign      = ($this->_signs === null) ? true : $this->_signs[$termId];

        $booleanQuery->addSubquery($rewritten, $sign);
    }

    return $booleanQuery;
}
/**
 * Optimize query in the context of specified index
 *
 * Terms missing from the index are dropped when they are optional or
 * prohibited; a missing required term makes the whole query an empty result,
 * as does a query consisting only of prohibited terms. A single remaining
 * term is returned as a plain term query.
 *
 * The surviving terms and signs are collected into fresh zero-based lists so
 * that the ids of the optimized query stay contiguous and aligned.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function optimize(Lucene\SearchIndexInterface $index)
{
    $hasSigns = ($this->_signs !== null);
    $terms    = array();
    $signs    = $hasSigns ? array() : null;

    foreach ($this->_terms as $id => $term) {
        if ($hasSigns) {
            $sign = isset($this->_signs[$id]) ? $this->_signs[$id] : null;
        } else {
            // No signs array means that every term is required
            $sign = true;
        }

        if (!$index->hasTerm($term)) {
            if ($sign === true) {
                // Term is required
                return new EmptyResult();
            }

            // Term is optional or prohibited, leave it out
            continue;
        }

        $terms[] = $term;
        if ($hasSigns) {
            $signs[] = $sign;
        }
    }

    // Check if all presented terms are prohibited
    if ($hasSigns) {
        $allProhibited = true;
        foreach ($signs as $sign) {
            if ($sign !== false) {
                $allProhibited = false;
                break;
            }
        }

        if ($allProhibited) {
            return new EmptyResult();
        }
    }

    /**
     * @todo make an optimization for repeated terms
     * (they may have different signs)
     */

    if (count($terms) == 1) {
        // It's already checked, that it's not a prohibited term

        // It's one term query with one required or optional element
        $optimizedQuery = new Term(reset($terms));
        $optimizedQuery->setBoost($this->getBoost());

        return $optimizedQuery;
    }

    if (count($terms) == 0) {
        return new EmptyResult();
    }

    $optimizedQuery = new MultiTerm($terms, $signs);
    $optimizedQuery->setBoost($this->getBoost());
    return $optimizedQuery;
}
/**
 * Returns the query terms
 *
 * @return array Array of term objects, indexed by term id
 */
public function getTerms()
{
    $terms = $this->_terms;

    return $terms;
}
/**
 * Return terms signs
 *
 * @return array|null Signs indexed by term id, or null if all terms are required
 */
public function getSigns()
{
    $signs = $this->_signs;

    return $signs;
}
/**
 * Set weight for specified term
 *
 * @param integer $num Term id the weight belongs to
 * @param \ZendSearch\Lucene\Search\Weight\Term $weight
 */
public function setWeight($num, $weight)
{
    $termId = $num;

    $this->_weights[$termId] = $weight;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 * The created weight is also stored in the query.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return \ZendSearch\Lucene\Search\Weight\MultiTerm
 */
public function createWeight(Lucene\SearchIndexInterface $reader)
{
    $weight = new Weight\MultiTerm($this, $reader);
    $this->_weight = $weight;

    return $weight;
}
/**
 * Calculate result vector for Conjunction query
 * (like '+something +another')
 *
 * Note: the terms are re-ordered (and re-indexed) by ascending document
 * frequency as a side effect.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 */
private function _calculateConjunctionResult(Lucene\SearchIndexInterface $reader)
{
    if (count($this->_terms) == 0) {
        // Nothing to intersect. Return right away instead of falling through
        // to array_flip() on a docs list that was never retrieved.
        $this->_resVector = array();
        return;
    }

    $this->_resVector = null;

    // Order terms by selectivity
    $docFreqs = array();
    $ids      = array();
    foreach ($this->_terms as $id => $term) {
        $docFreqs[] = $reader->docFreq($term);
        $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
    }
    array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                    $ids,      SORT_ASC, SORT_NUMERIC,
                    $this->_terms);

    // The same filter object is handed to every call; only the docs list
    // returned for the last term is used.
    $docsFilter = new Lucene\Index\DocsFilter();
    $termDocs   = array();
    foreach ($this->_terms as $term) {
        $termDocs = $reader->termDocs($term, $docsFilter);
    }

    // Treat last retrieved docs vector as a result set
    // (filter collects data for other terms)
    $this->_resVector = array_flip($termDocs);

    foreach ($this->_terms as $termId => $term) {
        $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
    }

    // ksort($this->_resVector, SORT_NUMERIC);
    // Docs are returned ordered. Used algorithms doesn't change elements order.
}
/**
 * Calculate result vector for non Conjunction query
 * (like '+something -another')
 *
 * The result is the intersection of the docs of all required terms (or the
 * union of the docs of the optional terms if nothing is required) minus the
 * docs of every prohibited term, sorted by document id.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 */
private function _calculateNonConjunctionResult(Lucene\SearchIndexInterface $reader)
{
    $requiredSets   = array();
    $requiredSizes  = array();
    $requiredIds    = array(); // is used to prevent arrays comparison
    $optionalDocs   = array();
    $prohibitedDocs = array();

    foreach ($this->_terms as $termId => $term) {
        $docs = array_flip($reader->termDocs($term));
        $sign = $this->_signs[$termId];

        if ($sign === true) {
            // required
            $requiredSets[]  = $docs;
            $requiredSizes[] = count($docs);
            $requiredIds[]   = $termId;
        } elseif ($sign === false) {
            // prohibited: array union
            $prohibitedDocs += $docs;
        } else {
            // neither required, nor prohibited: array union
            $optionalDocs += $docs;
        }

        $this->_termsFreqs[$termId] = $reader->termFreqs($term);
    }

    // sort required doc sets in order of cardinality increasing
    array_multisort($requiredSizes, SORT_ASC, SORT_NUMERIC,
                    $requiredIds,   SORT_ASC, SORT_NUMERIC,
                    $requiredSets);

    $required = null;
    foreach ($requiredSets as $docSet) {
        if ($required === null) {
            $required = $docSet;
        } else {
            // Hand-written key intersection, used as workaround for
            // array_intersect_key() slowness problem.
            $common = array();
            foreach ($required as $docId => $value) {
                if (isset($docSet[$docId])) {
                    $common[$docId] = $value;
                }
            }
            $required = $common;
        }

        if (count($required) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    $result = ($required !== null) ? $required : $optionalDocs;

    if (count($prohibitedDocs) != 0) {
        // Hand-written key difference, used as workaround for
        // array_diff_key() slowness problem: walk the smaller array.
        if (count($result) < count($prohibitedDocs)) {
            $remaining = $result;
            foreach ($result as $docId => $value) {
                if (isset($prohibitedDocs[$docId])) {
                    unset($remaining[$docId]);
                }
            }
            $result = $remaining;
        } else {
            foreach ($prohibitedDocs as $docId => $value) {
                unset($result[$docId]);
            }
        }
    }

    ksort($result, SORT_NUMERIC);
    $this->_resVector = $result;
}
/**
 * Score calculator for conjunction queries (all terms are required)
 *
 * Sums tf * term weight * field norm over all terms and scales the sum by
 * the coordination factor and the query boost.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function _conjunctionScore($docId, Lucene\SearchIndexInterface $reader)
{
    if ($this->_coord === null) {
        // All terms match in a conjunction, so overlap equals the terms count
        $termCount    = count($this->_terms);
        $this->_coord = $reader->getSimilarity()->coord($termCount, count($this->_terms));
    }

    $total = 0.0;

    foreach ($this->_terms as $termId => $term) {
        // A frequency missing for this document is treated as 0
        if (isset($this->_termsFreqs[$termId][$docId])) {
            $freq = $this->_termsFreqs[$termId][$docId];
        } else {
            $freq = 0;
        }

        $total += $reader->getSimilarity()->tf($freq) *
                  $this->_weights[$termId]->getValue() *
                  $reader->norm($docId, $term->field);
    }

    return $total * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all terms are required)
 *
 * Only non-prohibited terms with a frequency entry for the document
 * contribute; the sum is scaled by the coordination factor for the number
 * of such terms and by the query boost.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        // Lazily build coordination factors for every possible overlap
        $this->_coord = array();

        $scorableTerms = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false /* not prohibited */) {
                $scorableTerms++;
            }
        }

        for ($overlap = 0; $overlap <= $scorableTerms; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $scorableTerms);
        }
    }

    $total   = 0.0;
    $overlap = 0;

    foreach ($this->_terms as $termId => $term) {
        if ($this->_signs[$termId] === false) {
            // prohibited
            continue;
        }
        if (!isset($this->_termsFreqs[$termId][$docId])) {
            // not matched
            continue;
        }

        $overlap++;

        /**
         * We don't need to check that term freq is not 0
         * Score calculation is performed only for matched docs
         */
        $total +=
              $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
              $this->_weights[$termId]->getValue() *
              $reader->norm($docId, $term->field);
    }

    return $total * $this->_coord[$overlap] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * The conjunction algorithm is used when no signs are kept (all terms are
 * required). The $docsFilter argument is not used here.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
 */
public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
{
    if ($this->_signs !== null) {
        $this->_calculateNonConjunctionResult($reader);
    } else {
        $this->_calculateConjunctionResult($reader);
    }

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations).
 * The value is null until one of the result calculations has run.
 *
 * @return array
 */
public function matchedDocs()
{
    $matched = $this->_resVector;

    return $matched;
}
/**
 * Score specified document
 *
 * Documents outside the result vector score 0.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function score($docId, Lucene\SearchIndexInterface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    if ($this->_signs === null) {
        return $this->_conjunctionScore($docId, $reader);
    }

    return $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms
 *
 * Prohibited terms are left out. When signs are kept the result is a
 * re-indexed list, otherwise the terms array is returned as is.
 *
 * @return array
 */
public function getQueryTerms()
{
    if ($this->_signs === null) {
        return $this->_terms;
    }

    $queryTerms = array();

    foreach ($this->_signs as $id => $sign) {
        if ($sign === false) {
            // prohibited
            continue;
        }
        $queryTerms[] = $this->_terms[$id];
    }

    return $queryTerms;
}
/**
 * Query specific matches highlighting
 *
 * Highlights the text of every term which is not prohibited.
 *
 * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $words = array();

    if ($this->_signs !== null) {
        foreach ($this->_signs as $id => $sign) {
            if ($sign === false) {
                // prohibited
                continue;
            }
            $words[] = $this->_terms[$id]->text;
        }
    } else {
        foreach ($this->_terms as $term) {
            $words[] = $term->text;
        }
    }

    $highlighter->highlight($words);
}
/**
 * Print a query
 *
 * Terms are separated by single spaces; required terms are prefixed with
 * '+', prohibited ones with '-'. A boost other than 1 is appended as
 * "(query)^boost".
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping

    $parts = array();

    foreach ($this->_terms as $id => $term) {
        $part = '';

        if ($this->_signs === null || $this->_signs[$id] === true) {
            $part .= '+';
        } elseif ($this->_signs[$id] === false) {
            $part .= '-';
        }

        if ($term->field !== null) {
            $part .= $term->field . ':';
        }

        $parts[] = $part . $term->text;
    }

    // Joining the parts (instead of testing for term id 0) keeps the output
    // free of a leading space when the first term id is not 0.
    $query = implode(' ', $parts);

    if ($this->getBoost() != 1) {
        $query = '(' . $query . ')^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@@ -1,554 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
* A Query that matches documents containing a particular sequence of terms.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Phrase extends AbstractQuery
{
/**
 * Terms to find.
 * Array of \ZendSearch\Lucene\Index\Term objects.
 *
 * @var array
 */
private $_terms;
/**
 * Term positions (relative positions of terms within the phrase),
 * indexed by the same ids as $_terms.
 * Array of integers
 *
 * @var array
 */
private $_offsets;
/**
 * Sets the number of other words permitted between words in query phrase.
 * If zero, then this is an exact phrase search.  For larger values this works
 * like a WITHIN or NEAR operator.
 *
 * The slop is in fact an edit-distance, where the units correspond to
 * moves of terms in the query phrase out of position.  For example, to switch
 * the order of two words requires two moves (the first move places the words
 * atop one another), so to permit re-orderings of phrases, the slop must be
 * at least two.
 * More exact matches are scored higher than sloppier matches, thus search
 * results are sorted by exactness.
 *
 * The slop is zero by default, requiring exact matches.
 *
 * @var integer
 */
private $_slop;
/**
 * Result vector.
 *
 * @var array|null
 */
private $_resVector = null;
/**
 * Terms positions vectors.
 * Array of Arrays:
 * term1Id => (docId => array( pos1, pos2, ... ), ...)
 * term2Id => (docId => array( pos1, pos2, ... ), ...)
 *
 * @var array
 */
private $_termsPositions = array();
/**
 * Class constructor.  Create a new phrase query.
 *
 * The keys of $terms are preserved as term ids. If $offsets is omitted the
 * terms get consecutive positions 0, 1, 2, ... in iteration order.
 *
 * @param array  $terms    Terms to search. Array of strings.
 * @param array  $offsets  Relative term positions. Array of integers.
 * @param string $field    Field to search.
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
 */
public function __construct($terms = null, $offsets = null, $field = null)
{
    $this->_slop = 0;

    if (is_array($terms)) {
        $this->_terms = array();
        foreach ($terms as $termId => $termText) {
            $this->_terms[$termId] = ($field !== null)? new Index\Term($termText, $field):
                                                        new Index\Term($termText);
        }
    } elseif ($terms === null) {
        $this->_terms = array();
    } else {
        throw new InvalidArgumentException('terms argument must be array of strings or null');
    }

    if (is_array($offsets)) {
        if (count($this->_terms) != count($offsets)) {
            throw new InvalidArgumentException('terms and offsets arguments must have the same size.');
        }
        $this->_offsets = $offsets;
    } elseif ($offsets === null) {
        // Default to consecutive positions
        $this->_offsets = array();
        foreach ($this->_terms as $termId => $term) {
            $position = count($this->_offsets);
            $this->_offsets[$termId] = $position;
        }
    } else {
        // Offsets are integer positions, not strings
        throw new InvalidArgumentException('offsets argument must be array of integers or null');
    }
}
/**
 * Set slop (number of other words permitted between the phrase words)
 *
 * @param integer $slop
 */
public function setSlop($slop)
{
    $newSlop = $slop;

    $this->_slop = $newSlop;
}
/**
 * Get slop (number of other words permitted between the phrase words)
 *
 * @return integer
 */
public function getSlop()
{
    $slop = $this->_slop;

    return $slop;
}
/**
 * Adds a term to the end of the query phrase.
 * The relative position of the term is specified explicitly or the one immediately
 * after the last term added.
 *
 * The very first term gets position 0 unless a position is given.
 *
 * @param \ZendSearch\Lucene\Index\Term $term
 * @param integer $position
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the field differs from the last added term's field
 */
public function addTerm(Index\Term $term, $position = null)
{
    if (count($this->_terms) != 0) {
        $previousTerm = end($this->_terms);
        if ($previousTerm->field != $term->field) {
            throw new InvalidArgumentException('All phrase terms must be in the same field: ' .
                                               $term->field . ':' . $term->text);
        }
    }

    $this->_terms[] = $term;

    if ($position !== null) {
        $nextOffset = $position;
    } elseif (count($this->_offsets) != 0) {
        $nextOffset = end($this->_offsets) + 1;
    } else {
        $nextOffset = 0;
    }

    $this->_offsets[] = $nextOffset;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * A phrase without terms becomes an empty result. A phrase whose first term
 * names a field is returned as is; otherwise a boolean query is built with
 * one copy of the phrase per indexed field.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function rewrite(Lucene\SearchIndexInterface $index)
{
    if (count($this->_terms) == 0) {
        return new EmptyResult();
    }

    // Term ids are taken over from the constructor argument, so the first
    // term is not guaranteed to be stored under key 0.
    $firstTerm = reset($this->_terms);
    if ($firstTerm->field !== null) {
        return $this;
    }

    $query = new Boolean();
    $query->setBoost($this->getBoost());

    foreach ($index->getFieldNames(true) as $fieldName) {
        $subquery = new self();
        $subquery->setSlop($this->getSlop());

        foreach ($this->_terms as $termId => $term) {
            $qualifiedTerm = new Index\Term($term->text, $fieldName);

            $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
        }

        $query->addSubquery($subquery);
    }

    return $query;
}
/**
 * Optimize query in the context of specified index
 *
 * The result is empty if any phrase term is missing from the index or the
 * phrase has no terms; a one-term phrase becomes a plain term query with
 * the same boost.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function optimize(Lucene\SearchIndexInterface $index)
{
    // Check, that index contains all phrase terms
    foreach ($this->_terms as $phraseTerm) {
        if ($index->hasTerm($phraseTerm)) {
            continue;
        }
        return new EmptyResult();
    }

    $termCount = count($this->_terms);

    if ($termCount == 1) {
        // It's one term query
        $termQuery = new Term(reset($this->_terms));
        $termQuery->setBoost($this->getBoost());

        return $termQuery;
    }

    if ($termCount == 0) {
        return new EmptyResult();
    }

    return $this;
}
/**
 * Returns the query terms
 *
 * @return array Array of term objects, indexed by term id
 */
public function getTerms()
{
    $terms = $this->_terms;

    return $terms;
}
/**
 * Set weight for specified term
 *
 * NOTE(review): no $_weights property is declared among this class's own
 * properties above; presumably it is inherited or created on first use -
 * confirm against the parent class.
 *
 * @param integer $num Term id the weight belongs to
 * @param \ZendSearch\Lucene\Search\Weight\Term $weight
 */
public function setWeight($num, $weight)
{
    $termId = $num;

    $this->_weights[$termId] = $weight;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 * The created weight is also stored in the query.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return \ZendSearch\Lucene\Search\Weight\Phrase
 */
public function createWeight(Lucene\SearchIndexInterface $reader)
{
    $weight = new Weight\Phrase($this, $reader);
    $this->_weight = $weight;

    return $weight;
}
/**
 * Score calculator for exact phrase queries (terms sequence is fixed)
 *
 * Counts the positions of the term with the fewest occurrences in the
 * document for which every other term is found at its expected relative
 * position.
 *
 * @param integer $docId
 * @return float
 */
public function _exactPhraseFreq($docId)
{
    // Find the term with the lowest cardinality; the first one wins on ties
    $anchorId = null;
    foreach ($this->_terms as $termId => $term) {
        if ($anchorId === null ||
            count($this->_termsPositions[$termId][$docId]) <
            count($this->_termsPositions[$anchorId][$docId]) ) {
            $anchorId = $termId;
        }
    }

    $phraseCount = 0;

    // Walk through positions of the term with lowest cardinality
    foreach ($this->_termsPositions[$anchorId][$docId] as $anchorPos) {
        $phraseFound = true;

        // Every other term has to sit at its offset relative to the anchor
        foreach ($this->_terms as $termId => $term) {
            if ($termId == $anchorId) {
                continue;
            }

            $expectedPosition = $anchorPos +
                                    ($this->_offsets[$termId] -
                                     $this->_offsets[$anchorId]);

            if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                $phraseFound = false;
                break;
            }
        }

        if ($phraseFound) {
            $phraseCount++;
        }
    }

    return $phraseCount;
}
/**
 * Score calculator for sloppy phrase queries (terms may be out of position
 * by up to the slop)
 *
 * Builds candidate phrases (one position per term) from the term positions
 * in the document, computes for each candidate the smallest total
 * displacement from the query offsets, and adds the similarity's
 * sloppyFreq() of that distance for every candidate within the slop.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function _sloppyPhraseFreq($docId, Lucene\SearchIndexInterface $reader)
{
$freq = 0;
// Each queue entry is a candidate phrase: termId => position in the document
$phraseQueue = array();
$phraseQueue[0] = array(); // empty phrase
$lastTerm = null;
// Walk through the terms to create phrases.
foreach ($this->_terms as $termId => $term) {
// Only the phrases which existed before this term are extended
$queueSize = count($phraseQueue);
$firstPass = true;
// Walk through the term positions.
// Each term position produces a set of phrases.
foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) {
if ($firstPass) {
// The first position is written into the existing phrases in place
// (no slop check is applied here)
for ($count = 0; $count < $queueSize; $count++) {
$phraseQueue[$count][$termId] = $termPosition;
}
} else {
// Further positions append copies of the existing phrases, skipping
// those where the gap to the previous term deviates from the query
// offsets by more than the slop
for ($count = 0; $count < $queueSize; $count++) {
if ($lastTerm !== null &&
abs( $termPosition - $phraseQueue[$count][$lastTerm] -
($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
continue;
}
$newPhraseId = count($phraseQueue);
$phraseQueue[$newPhraseId] = $phraseQueue[$count];
$phraseQueue[$newPhraseId][$termId] = $termPosition;
}
}
$firstPass = false;
}
$lastTerm = $termId;
}
foreach ($phraseQueue as $phrasePos) {
$minDistance = null;
// Try every phrase start shifted by -slop..+slop and keep the smallest distance
for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
$distance = 0;
$start = reset($phrasePos) - reset($this->_offsets) + $shift;
foreach ($this->_terms as $termId => $term) {
$distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);
if($distance > $this->_slop) {
// Already over the slop; the partial sum is enough to reject this shift
break;
}
}
if ($minDistance === null || $distance < $minDistance) {
$minDistance = $distance;
}
}
if ($minDistance <= $this->_slop) {
$freq += $reader->getSimilarity()->sloppyFreq($minDistance);
}
}
return $freq;
}
/**
 * Execute the query against an index reader.
 *
 * Collects the documents and positions of every term, intersects the per-term
 * document sets into $this->_resVector (document ids as keys) and finally
 * initializes the query weight.
 *
 * Note: $docsFilter is accepted for interface compatibility but is not used
 * by this implementation.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
 */
public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
{
    // null means "no term processed yet"; a query without terms has an empty result
    $this->_resVector = (count($this->_terms) == 0) ? array() : null;

    $docSets     = array();
    $docSetSizes = array();
    $docSetIds   = array(); // secondary sort key, keeps array_multisort from comparing the doc sets

    foreach ($this->_terms as $termId => $term) {
        $docSet = array_flip($reader->termDocs($term));

        $docSets[]     = $docSet;
        $docSetSizes[] = count($docSet);
        $docSetIds[]   = $termId;

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // Smallest document sets first, so the running intersection shrinks as early as possible
    array_multisort($docSetSizes, SORT_ASC, SORT_NUMERIC,
                    $docSetIds,   SORT_ASC, SORT_NUMERIC,
                    $docSets);

    foreach ($docSets as $docSet) {
        if ($this->_resVector === null) {
            $this->_resVector = $docSet;
        } else {
            // Hand-written key intersection, kept as a workaround for
            // array_intersect_key() slowness
            $intersection = array();
            foreach ($this->_resVector as $id => $value) {
                if (isset($docSet[$id])) {
                    $intersection[$id] = $value;
                }
            }
            $this->_resVector = $intersection;
        }

        if (count($this->_resVector) == 0) {
            // Nothing left to match, remaining terms cannot add documents
            break;
        }
    }

    // No ksort() needed: docs are returned ordered and the intersection keeps element order.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get the ids of documents that may match the query.
 *
 * Document ids are the array keys (chosen for performance); the array is
 * the result vector filled by execute().
 *
 * @return array
 */
public function matchedDocs()
{
    return $this->_resVector;
}
/**
 * Score the specified document.
 *
 * Documents outside the result vector, and documents whose calculated phrase
 * frequency is zero, score 0. Otherwise the score is
 * tf(freq) * weight * field norm * query boost.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function score($docId, Lucene\SearchIndexInterface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    // Zero slop means exact phrase matching, anything else is sloppy matching
    $freq = ($this->_slop == 0)
          ? $this->_exactPhraseFreq($docId)
          : $this->_sloppyPhraseFreq($docId, $reader);

    if ($freq == 0) {
        // Included in result, but calculated freq is zero
        return 0;
    }

    $tf     = $reader->getSimilarity()->tf($freq);
    $weight = $this->_weight->getValue();
    // The norm is taken for the field of the first term of the phrase
    $norm   = $reader->norm($docId, reset($this->_terms)->field);

    return $tf * $weight * $norm * $this->getBoost();
}
/**
 * Get the terms used by this query.
 *
 * @return array The internal terms list, exactly as stored
 */
public function getQueryTerms()
{
    return $this->_terms;
}
/**
 * Highlight the matches of this query: passes the text of every phrase term
 * to the highlighter as one list.
 *
 * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Highlighter $highlighter)
{
    $termTexts = array_map(
        function ($term) {
            return $term->text;
        },
        $this->_terms
    );

    // array_values() gives the same sequentially indexed list the term-by-term append produced
    $highlighter->highlight(array_values($termTexts));
}
/**
 * Render the query as a string: [field:]"term term ..."[~slop][^boost].
 *
 * Intended for query visualisation only, so characters are not escaped.
 *
 * @return string
 */
public function __toString()
{
    // The field prefix is taken from the term stored at index 0, if there is one
    $hasField = isset($this->_terms[0]) && $this->_terms[0]->field !== null;
    $query    = $hasField ? $this->_terms[0]->field . ':' : '';

    $query .= '"';
    foreach ($this->_terms as $id => $term) {
        $query .= ($id != 0 ? ' ' : '') . $term->text;
    }
    $query .= '"';

    if ($this->_slop != 0) {
        $query .= '~' . $this->_slop;
    }

    $boost = $this->getBoost();
    if ($boost != 1) {
        $query .= '^' . round($boost, 4);
    }

    return $query;
}
}

View File

@@ -1,112 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query\Preprocessing;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
use ZendSearch\Lucene\Search\Query;
/**
 * Internal base class for preprocessing queries: intermediate query objects
 * used to finalize query processing after query parsing.
 *
 * A query of this type is not actually involved in query execution, so every
 * execution-related method declared here throws UnsupportedMethodCallException.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 */
abstract class AbstractPreprocessing extends Query\AbstractQuery
{
    /**
     * Matched terms list.
     *
     * It's filled during the rewrite operation and may be used for search
     * result highlighting.
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    protected $_matches = null;

    /**
     * Optimize query in the context of specified index.
     *
     * Not supported for preprocessing queries; always throws.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        throw new UnsupportedMethodCallException('This query is not intended to be executed.');
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported for preprocessing queries; always throws.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        throw new UnsupportedMethodCallException('This query is not intended to be executed.');
    }

    /**
     * Execute query in context of index reader.
     *
     * Not supported for preprocessing queries; always throws.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
    {
        throw new UnsupportedMethodCallException('This query is not intended to be executed.');
    }

    /**
     * Get document ids likely matching the query.
     *
     * Not supported for preprocessing queries; always throws.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return array
     */
    public function matchedDocs()
    {
        throw new UnsupportedMethodCallException('This query is not intended to be executed.');
    }

    /**
     * Score specified document.
     *
     * Not supported for preprocessing queries; always throws.
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return float
     */
    public function score($docId, Lucene\SearchIndexInterface $reader)
    {
        throw new UnsupportedMethodCallException('This query is not intended to be executed.');
    }

    /**
     * Return query terms.
     *
     * Always throws here: the message states that the rewrite operation has
     * to be done before query terms can be retrieved.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return array
     */
    public function getQueryTerms()
    {
        throw new UnsupportedMethodCallException('Rewrite operation has to be done before retrieving query terms.');
    }
}

View File

@@ -1,274 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query\Preprocessing;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search;
use ZendSearch\Lucene\Search\Exception\QueryParserException;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Query;
use Laminas\Stdlib\ErrorHandler;
/**
 * Preprocessing query for a fuzzy search lexeme produced by the query parser.
 *
 * It finalizes query processing after query parsing: rewrite() turns it into
 * primitive queries. This type of query is not actually involved in query execution.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 */
class Fuzzy extends AbstractPreprocessing
{
    /**
     * Word (query parser lexeme) to find.
     *
     * @var string
     */
    private $_word;

    /**
     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_encoding;

    /**
     * Field name (null means the field was not specified in the query).
     *
     * @var string
     */
    private $_field;

    /**
     * A value between 0 and 1 to set the required similarity
     * between the query term and the matching terms. For example, for a
     * _minimumSimilarity of 0.5 a term of the same length
     * as the query term is considered similar to the query term if the edit distance
     * between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * Class constructor. Create a new preprocessing object for a fuzzy query.
     *
     * @param string $word               Non-tokenized word (query parser lexeme) to search.
     * @param string $encoding           Word encoding.
     * @param string $fieldName          Field name.
     * @param float  $minimumSimilarity  Minimum similarity.
     */
    public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
    {
        $this->_word              = $word;
        $this->_encoding          = $encoding;
        $this->_field             = $fieldName;
        $this->_minimumSimilarity = $minimumSimilarity;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     *         if the word contains wildcards or is tokenized into several tokens
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        if ($this->_field === null) {
            // No field given: rewrite once per search field and combine the significant results
            $query = new Search\Query\Boolean();

            $hasInsignificantSubqueries = false;

            if (Lucene\Lucene::getDefaultSearchField() === null) {
                $searchFields = $index->getFieldNames(true);
            } else {
                $searchFields = array(Lucene\Lucene::getDefaultSearchField());
            }

            foreach ($searchFields as $fieldName) {
                $subquery = new self($this->_word,
                                     $this->_encoding,
                                     $fieldName,
                                     $this->_minimumSimilarity);

                $rewrittenSubquery = $subquery->rewrite($index);

                if ( !($rewrittenSubquery instanceof Query\Insignificant  ||
                       $rewrittenSubquery instanceof Query\EmptyResult) ) {
                    $query->addSubquery($rewrittenSubquery);
                }

                if ($rewrittenSubquery instanceof Query\Insignificant) {
                    $hasInsignificantSubqueries = true;
                }
            }

            $subqueries = $query->getSubqueries();

            if (count($subqueries) == 0) {
                $this->_matches = array();
                if ($hasInsignificantSubqueries) {
                    return new Query\Insignificant();
                } else {
                    return new Query\EmptyResult();
                }
            }

            if (count($subqueries) == 1) {
                // A single subquery does not need the boolean wrapper
                $query = reset($subqueries);
            }

            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // -------------------------------------
        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        $term = new Index\Term($this->_word, $this->_field);
        if ($index->hasTerm($term)) {
            return $this->_rewriteAsFuzzy($term, $index);
        }

        // -------------------------------------
        // Recognize wildcard queries
        if ($this->_containsWildcards()) {
            throw new QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

        if (count($tokens) == 0) {
            $this->_matches = array();
            return new Query\Insignificant();
        }

        if (count($tokens) == 1) {
            $term = new Index\Term($tokens[0]->getTermText(), $this->_field);
            return $this->_rewriteAsFuzzy($term, $index);
        }

        // Word is tokenized into several tokens
        throw new QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    /**
     * Query specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        // -------------------------------------
        // Recognize wildcard queries
        if ($this->_containsWildcards()) {
            // Do nothing
            return;
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
        if (count($tokens) == 0) {
            // Do nothing
            return;
        }
        if (count($tokens) == 1) {
            $term  = new Index\Term($tokens[0]->getTermText(), $this->_field);
            $query = new Query\Fuzzy($term, $this->_minimumSimilarity);

            $query->_highlightMatches($highlighter);
            return;
        }

        // Word is tokenized into several tokens
        // But fuzzy search is supported only for non-multiple word terms
        // Do nothing
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_field !== null) {
            $query = $this->_field . ':';
        } else {
            $query = '';
        }

        $query .= $this->_word;

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }

    /**
     * Check whether the word contains a wildcard character ('*' or '?').
     *
     * Shared by rewrite() and _highlightMatches(), which both have to
     * recognize wildcard queries in the same way.
     *
     * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
     *
     * @return boolean
     */
    private function _containsWildcards()
    {
        // Probe PCRE unicode support; warnings raised by the probe are captured by the error handler
        ErrorHandler::start(E_WARNING);
        $result = preg_match('/\pL/u', 'a');
        ErrorHandler::stop();

        if ($result == 1) {
            $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
        } else {
            $subPatterns = preg_split('/[*?]/', $this->_word);
        }

        // More than one piece means at least one wildcard was found
        return count($subPatterns) > 1;
    }

    /**
     * Build a fuzzy query for the term, rewrite it and record its matches.
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private function _rewriteAsFuzzy(Index\Term $term, Lucene\SearchIndexInterface $index)
    {
        $query = new Query\Fuzzy($term, $this->_minimumSimilarity);
        $query->setBoost($this->getBoost());

        // Get rewritten query. Important! It also fills terms matching container.
        $rewrittenQuery = $query->rewrite($index);
        $this->_matches = $query->getQueryTerms();

        return $rewrittenQuery;
    }
}

View File

@@ -1,250 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query\Preprocessing;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer\Analyzer;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Query;
/**
 * Preprocessing query for a phrase lexeme produced by the query parser.
 *
 * It finalizes query processing after query parsing: rewrite() turns it into
 * primitive queries. This type of query is not actually involved in query execution.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 */
class Phrase extends AbstractPreprocessing
{
    /**
     * Phrase to find.
     *
     * @var string
     */
    private $_phrase;

    /**
     * Phrase encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_phraseEncoding;

    /**
     * Field name (null means the field was not specified in the query).
     *
     * @var string
     */
    private $_field;

    /**
     * Sets the number of other words permitted between words in query phrase.
     * If zero, then this is an exact phrase search.  For larger values this works
     * like a WITHIN or NEAR operator.
     *
     * The slop is in fact an edit-distance, where the units correspond to
     * moves of terms in the query phrase out of position.  For example, to switch
     * the order of two words requires two moves (the first move places the words
     * atop one another), so to permit re-orderings of phrases, the slop must be
     * at least two.
     * More exact matches are scored higher than sloppier matches, thus search
     * results are sorted by exactness.
     *
     * The slop is zero by default, requiring exact matches.
     *
     * @var integer
     */
    private $_slop = 0; // explicit default, so getSlop() returns the documented 0 rather than null

    /**
     * Class constructor. Create a new preprocessing object for a phrase query.
     *
     * @param string $phrase          Phrase to search.
     * @param string $phraseEncoding  Phrase encoding.
     * @param string $fieldName       Field name.
     */
    public function __construct($phrase, $phraseEncoding, $fieldName)
    {
        $this->_phrase         = $phrase;
        $this->_phraseEncoding = $phraseEncoding;
        $this->_field          = $fieldName;
    }

    /**
     * Set slop
     *
     * @param integer $slop
     */
    public function setSlop($slop)
    {
        $this->_slop = $slop;
    }

    /**
     * Get slop
     *
     * @return integer
     */
    public function getSlop()
    {
        return $this->_slop;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        // Wildcards are allowed within phrases: they are either removed by the
        // text analyzer or used as a part of keyword for keyword fields.

        // Split query into subqueries if field name is not specified
        if ($this->_field === null) {
            $query = new Query\Boolean();
            $query->setBoost($this->getBoost());

            if (Lucene\Lucene::getDefaultSearchField() === null) {
                $searchFields = $index->getFieldNames(true);
            } else {
                $searchFields = array(Lucene\Lucene::getDefaultSearchField());
            }

            foreach ($searchFields as $fieldName) {
                // Per-field subqueries keep the default boost; the boost lives on the wrapper
                $subquery = new Phrase($this->_phrase,
                                       $this->_phraseEncoding,
                                       $fieldName);
                $subquery->setSlop($this->getSlop());

                $query->addSubquery($subquery->rewrite($index));
            }

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        $term = new Index\Term($this->_phrase, $this->_field);
        if ($index->hasTerm($term)) {
            $query = new Query\Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // tokenize phrase using current analyzer and process it as a phrase query
        $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

        if (count($tokens) == 0) {
            $this->_matches = array();
            return new Query\Insignificant();
        }

        if (count($tokens) == 1) {
            $term  = new Index\Term($tokens[0]->getTermText(), $this->_field);
            $query = new Query\Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // It's non-trivial phrase query
        $position = -1;
        $query = new Query\Phrase();
        foreach ($tokens as $token) {
            // Position increments let the analyzer leave gaps between tokens
            $position += $token->getPositionIncrement();
            $term = new Index\Term($token->getTermText(), $this->_field);
            $query->addTerm($term, $position);
        }
        // Slop does not depend on the token, so it is set once
        $query->setSlop($this->getSlop());
        // Propagate the boost like every other branch does; without this a
        // boosted multi-word phrase on an explicit field silently lost its boost
        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        /** Skip wildcard queries recognition. Supported wildcards are removed by text analyzer */

        // tokenize phrase using current analyzer and process it as a phrase query
        $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

        if (count($tokens) == 0) {
            // Do nothing
            return;
        }

        if (count($tokens) == 1) {
            $highlighter->highlight($tokens[0]->getTermText());
            return;
        }

        // It's non-trivial phrase query
        $words = array();
        foreach ($tokens as $token) {
            $words[] = $token->getTermText();
        }
        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_field !== null) {
            $query = $this->_field . ':';
        } else {
            $query = '';
        }

        $query .= '"' . $this->_phrase . '"';

        if ($this->_slop != 0) {
            $query .= '~' . $this->_slop;
        }

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }
}

View File

@@ -1,322 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query\Preprocessing;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Exception\QueryParserException;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Query;
use Laminas\Stdlib\ErrorHandler;
/**
 * Preprocessing query for a single word lexeme produced by the query parser.
 *
 * It finalizes query processing after query parsing: rewrite() turns it into
 * primitive queries. This type of query is not actually involved in query execution.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 */
class Term extends AbstractPreprocessing
{
    /**
     * Word (query parser lexeme) to find.
     *
     * @var string
     */
    private $_word;

    /**
     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_encoding;

    /**
     * Field name (null means the field was not specified in the query).
     *
     * @var string
     */
    private $_field;

    /**
     * Class constructor. Create a new preprocessing object for a term query.
     *
     * @param string $word       Non-tokenized word (query parser lexeme) to search.
     * @param string $encoding   Word encoding.
     * @param string $fieldName  Field name.
     */
    public function __construct($word, $encoding, $fieldName)
    {
        $this->_word     = $word;
        $this->_encoding = $encoding;
        $this->_field    = $fieldName;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     *         if a wildcard sub-pattern is tokenized into several tokens
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        if ($this->_field === null) {
            // No field given: rewrite once per search field and merge the resulting terms
            $query = new Query\MultiTerm();
            $query->setBoost($this->getBoost());

            $hasInsignificantSubqueries = false;

            if (Lucene\Lucene::getDefaultSearchField() === null) {
                $searchFields = $index->getFieldNames(true);
            } else {
                $searchFields = array(Lucene\Lucene::getDefaultSearchField());
            }

            foreach ($searchFields as $fieldName) {
                $subquery = new self($this->_word,
                                     $this->_encoding,
                                     $fieldName);
                $rewrittenSubquery = $subquery->rewrite($index);
                foreach ($rewrittenSubquery->getQueryTerms() as $term) {
                    $query->addTerm($term);
                }

                if ($rewrittenSubquery instanceof Query\Insignificant) {
                    $hasInsignificantSubqueries = true;
                }
            }

            if (count($query->getTerms()) == 0) {
                $this->_matches = array();
                if ($hasInsignificantSubqueries) {
                    return new Query\Insignificant();
                } else {
                    return new Query\EmptyResult();
                }
            }

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // -------------------------------------
        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        $term = new Index\Term($this->_word, $this->_field);
        if ($index->hasTerm($term)) {
            $query = new Query\Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // -------------------------------------
        // Recognize wildcard queries
        list($word, $subPatterns, $subPatternsEncoding) = $this->_splitOnWildcards();

        if (count($subPatterns) > 1) {
            // Wildcard query is recognized

            $pattern = '';

            foreach ($subPatterns as $id => $subPattern) {
                // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
                if ($id != 0) {
                    // The wildcard is the single character right before the sub-pattern's offset
                    $pattern .= $word[ $subPattern[1] - 1 ];
                }

                // Check if each sub-pattern is a single word in terms of current analyzer
                $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
                if (count($tokens) > 1) {
                    throw new QueryParserException('Wildcard search is supported only for non-multiple word terms');
                }
                foreach ($tokens as $token) {
                    $pattern .= $token->getTermText();
                }
            }

            $term  = new Index\Term($pattern, $this->_field);
            $query = new Query\Wildcard($term);
            $query->setBoost($this->getBoost());

            // Get rewritten query. Important! It also fills terms matching container.
            $rewrittenQuery = $query->rewrite($index);
            $this->_matches = $query->getQueryTerms();

            return $rewrittenQuery;
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

        if (count($tokens) == 0) {
            $this->_matches = array();
            return new Query\Insignificant();
        }

        if (count($tokens) == 1) {
            $term  = new Index\Term($tokens[0]->getTermText(), $this->_field);
            $query = new Query\Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // It's not insignificant or one term query
        $query = new Query\MultiTerm();

        /**
         * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
         * analyzer design features
         */
        foreach ($tokens as $token) {
            $term = new Index\Term($token->getTermText(), $this->_field);
            $query->addTerm($term, true); // all subterms are required
        }

        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        // -------------------------------------
        // Recognize wildcard queries
        list($word, $subPatterns, $subPatternsEncoding) = $this->_splitOnWildcards();

        if (count($subPatterns) > 1) {
            // Wildcard query is recognized

            $pattern = '';

            foreach ($subPatterns as $id => $subPattern) {
                // Append corresponding wildcard character to the pattern before each sub-pattern (except first)
                if ($id != 0) {
                    $pattern .= $word[ $subPattern[1] - 1 ];
                }

                // Check if each sub-pattern is a single word in terms of current analyzer
                $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
                if (count($tokens) > 1) {
                    // Do nothing (nothing is highlighted)
                    return;
                }
                foreach ($tokens as $token) {
                    $pattern .= $token->getTermText();
                }
            }

            $term  = new Index\Term($pattern, $this->_field);
            $query = new Query\Wildcard($term);

            $query->_highlightMatches($highlighter);
            return;
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

        if (count($tokens) == 0) {
            // Do nothing
            return;
        }

        if (count($tokens) == 1) {
            $highlighter->highlight($tokens[0]->getTermText());
            return;
        }

        // It's not insignificant or one term query
        $words = array();
        foreach ($tokens as $token) {
            $words[] = $token->getTermText();
        }
        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_field !== null) {
            $query = $this->_field . ':';
        } else {
            $query = '';
        }

        $query .= $this->_word;

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }

    /**
     * Split the word on wildcard characters ('*' and '?').
     *
     * Shared by rewrite() and _highlightMatches(), which both have to
     * recognize wildcard queries in the same way. When PCRE unicode support
     * is available the word is converted to UTF-8 first, otherwise it is
     * split as is.
     *
     * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
     *
     * @return array Three elements: the word that was actually split, the list of
     *               sub-patterns (each an array of text and offset, as produced by
     *               PREG_SPLIT_OFFSET_CAPTURE), and the encoding of the sub-patterns
     */
    private function _splitOnWildcards()
    {
        // Probe PCRE unicode support; warnings raised by the probe are captured by the error handler
        ErrorHandler::start(E_WARNING);
        $result = preg_match('/\pL/u', 'a');
        ErrorHandler::stop();

        if ($result == 1) {
            $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
            $wildcardsPattern    = '/[*?]/u';
            $subPatternsEncoding = 'UTF-8';
        } else {
            $word                = $this->_word;
            $wildcardsPattern    = '/[*?]/';
            $subPatternsEncoding = $this->_encoding;
        }

        $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

        return array($word, $subPatterns, $subPatternsEncoding);
    }
}

View File

@@ -1,362 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\InvalidArgumentException;
use ZendSearch\Lucene\Exception\OutOfBoundsException;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Range extends AbstractQuery
{
/**
* Lower term.
*
* @var \ZendSearch\Lucene\Index\Term
*/
private $_lowerTerm;
/**
* Upper term.
*
* @var \ZendSearch\Lucene\Index\Term
*/
private $_upperTerm;
/**
* Search field
*
* @var string
*/
private $_field;
/**
* Inclusive
*
* @var boolean
*/
private $_inclusive;
/**
* Matched terms.
*
* Matched terms list.
* It's filled during the search (rewrite operation) and may be used for search result
* post-processing
*
* Array of Zend_Search_Lucene_Index_Term objects
*
* @var array
*/
private $_matches = null;
/**
 * Range query constructor.
 *
 * Either bound may be null (open range), but not both. When both bounds
 * are given they must refer to the same field; the query field is taken
 * from the lower term when present, otherwise from the upper term.
 *
 * @param \ZendSearch\Lucene\Index\Term|null $lowerTerm
 * @param \ZendSearch\Lucene\Index\Term|null $upperTerm
 * @param boolean $inclusive
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
 *         if both terms are null or the terms belong to different fields
 */
public function __construct($lowerTerm, $upperTerm, $inclusive)
{
    $hasLower = ($lowerTerm !== null);
    $hasUpper = ($upperTerm !== null);

    if (!$hasLower && !$hasUpper) {
        throw new InvalidArgumentException('At least one term must be non-null');
    }
    if ($hasLower && $hasUpper && $lowerTerm->field != $upperTerm->field) {
        throw new InvalidArgumentException('Both terms must be for the same field');
    }

    $this->_field     = $hasLower ? $lowerTerm->field : $upperTerm->field;
    $this->_lowerTerm = $lowerTerm;
    $this->_upperTerm = $upperTerm;
    $this->_inclusive = $inclusive;
}
/**
* Get query field name
*
* @return string|null
*/
public function getField()
{
return $this->_field;
}
/**
* Get lower term
*
* @return \ZendSearch\Lucene\Index\Term|null
*/
public function getLowerTerm()
{
return $this->_lowerTerm;
}
/**
* Get upper term
*
* @return \ZendSearch\Lucene\Index\Term|null
*/
public function getUpperTerm()
{
return $this->_upperTerm;
}
/**
* Get upper term
*
* @return boolean
*/
public function isInclusive()
{
return $this->_inclusive;
}
/**
* Re-write query into primitive queries in the context of specified index
*
* @param \ZendSearch\Lucene\SearchIndexInterface $index
* @throws \ZendSearch\Lucene\Exception\OutOfBoundsException
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function rewrite(Lucene\SearchIndexInterface $index)
{
$this->_matches = array();
if ($this->_field === null) {
// Search through all fields
$fields = $index->getFieldNames(true /* indexed fields list */);
} else {
$fields = array($this->_field);
}
$maxTerms = Lucene\Lucene::getTermsPerQueryLimit();
foreach ($fields as $field) {
$index->resetTermsStream();
if ($this->_lowerTerm !== null) {
$lowerTerm = new Index\Term($this->_lowerTerm->text, $field);
$index->skipTo($lowerTerm);
if (!$this->_inclusive &&
$index->currentTerm() == $lowerTerm) {
// Skip lower term
$index->nextTerm();
}
} else {
$index->skipTo(new Index\Term('', $field));
}
if ($this->_upperTerm !== null) {
// Walk up to the upper term
$upperTerm = new Index\Term($this->_upperTerm->text, $field);
while ($index->currentTerm() !== null &&
$index->currentTerm()->field == $field &&
$index->currentTerm()->text < $upperTerm->text) {
$this->_matches[] = $index->currentTerm();
if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
throw new OutOfBoundsException('Terms per query limit is reached.');
}
$index->nextTerm();
}
if ($this->_inclusive && $index->currentTerm() == $upperTerm) {
// Include upper term into result
$this->_matches[] = $upperTerm;
}
} else {
// Walk up to the end of field data
while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
$this->_matches[] = $index->currentTerm();
if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
throw new OutOfBoundsException('Terms per query limit is reached.');
}
$index->nextTerm();
}
}
$index->closeTermsStream();
}
if (count($this->_matches) == 0) {
return new EmptyResult();
} elseif (count($this->_matches) == 1) {
return new Term(reset($this->_matches));
} else {
$rewrittenQuery = new MultiTerm();
foreach ($this->_matches as $matchedTerm) {
$rewrittenQuery->addTerm($matchedTerm);
}
return $rewrittenQuery;
}
}
/**
* Optimize query in the context of specified index
*
* @param \ZendSearch\Lucene\SearchIndexInterface $index
* @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function optimize(Lucene\SearchIndexInterface $index)
{
throw new UnsupportedMethodCallException(
'Range query should not be directly used for search. Use $query->rewrite($index)'
);
}
/**
* Return query terms
*
* @return array
* @throws \ZendSearch\Lucene\Exception\RuntimeException
*/
public function getQueryTerms()
{
if ($this->_matches === null) {
throw new RuntimeException('Search or rewrite operations have to be performed before.');
}
return $this->_matches;
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* @param \ZendSearch\Lucene\SearchIndexInterface $reader
* @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
*/
public function createWeight(Lucene\SearchIndexInterface $reader)
{
throw new UnsupportedMethodCallException(
'Range query should not be directly used for search. Use $query->rewrite($index)'
);
}
/**
* Execute query in context of index reader
* It also initializes necessary internal structures
*
* @param \ZendSearch\Lucene\SearchIndexInterface $reader
* @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
* @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
*/
public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
{
throw new UnsupportedMethodCallException(
'Range query should not be directly used for search. Use $query->rewrite($index)'
);
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
* @return array
*/
public function matchedDocs()
{
throw new UnsupportedMethodCallException(
'Range query should not be directly used for search. Use $query->rewrite($index)'
);
}
/**
* Score specified document
*
* @param integer $docId
* @param \ZendSearch\Lucene\SearchIndexInterface $reader
* @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
* @return float
*/
public function score($docId, Lucene\SearchIndexInterface $reader)
{
throw new UnsupportedMethodCallException(
'Range query should not be directly used for search. Use $query->rewrite($index)'
);
}
/**
* Query specific matches highlighting
*
* @param Highlighter $highlighter Highlighter object (also contains doc for highlighting)
*/
protected function _highlightMatches(Highlighter $highlighter)
{
$words = array();
$docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
$tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
$lowerTermText = ($this->_lowerTerm !== null)? $this->_lowerTerm->text : null;
$upperTermText = ($this->_upperTerm !== null)? $this->_upperTerm->text : null;
if ($this->_inclusive) {
foreach ($tokens as $token) {
$termText = $token->getTermText();
if (($lowerTermText == null || $lowerTermText <= $termText) &&
($upperTermText == null || $termText <= $upperTermText)) {
$words[] = $termText;
}
}
} else {
foreach ($tokens as $token) {
$termText = $token->getTermText();
if (($lowerTermText == null || $lowerTermText < $termText) &&
($upperTermText == null || $termText < $upperTermText)) {
$words[] = $termText;
}
}
}
$highlighter->highlight($words);
}
/**
* Print a query
*
* @return string
*/
public function __toString()
{
// It's used only for query visualisation, so we don't care about characters escaping
return (($this->_field === null)? '' : $this->_field . ':')
. (($this->_inclusive)? '[' : '{')
. (($this->_lowerTerm !== null)? $this->_lowerTerm->text : 'null')
. ' TO '
. (($this->_upperTerm !== null)? $this->_upperTerm->text : 'null')
. (($this->_inclusive)? ']' : '}')
. (($this->getBoost() != 1)? '^' . round($this->getBoost(), 4) : '');
}
}

View File

@@ -1,212 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use ZendSearch\Lucene\Search\Weight;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Term extends AbstractQuery
{
    /**
     * The index term this query searches for.
     *
     * @var \ZendSearch\Lucene\Index\Term
     */
    private $_term;

    /**
     * Documents vector: the reader's termDocs() result flipped with
     * array_flip(). Null until execute() has been called.
     *
     * @var array|null
     */
    private $_docVector = null;

    /**
     * Term freqs vector, as returned by the reader's termFreqs().
     * array(docId => freq, ...)
     *
     * @var array
     */
    private $_termFreqs;

    /**
     * Term query constructor
     *
     * @param \ZendSearch\Lucene\Index\Term $term
     */
    public function __construct(Index\Term $term)
    {
        $this->_term = $term;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * A term bound to a field is already primitive. A field-less term is
     * expanded into a MultiTerm query holding one term per indexed field.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        if ($this->_term->field != null) {
            return $this;
        }

        $expanded = new MultiTerm();
        $expanded->setBoost($this->getBoost());

        foreach ($index->getFieldNames(true) as $fieldName) {
            $expanded->addTerm(new Index\Term($this->_term->text, $fieldName));
        }

        return $expanded->rewrite($index);
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        // A term missing from the index cannot match anything
        return $index->hasTerm($this->_term) ? $this : new EmptyResult();
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return \ZendSearch\Lucene\Search\Weight\Term
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        $weight = new Weight\Term($this->_term, $this, $reader);
        $this->_weight = $weight;

        return $weight;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     */
    public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
    {
        $docIds = $reader->termDocs($this->_term, $docsFilter);
        $this->_docVector = array_flip($docIds);
        $this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter);

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_docVector;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float
     */
    public function score($docId, Lucene\SearchIndexInterface $reader)
    {
        // Documents outside the matched set contribute nothing
        if (!isset($this->_docVector[$docId])) {
            return 0;
        }

        return $reader->getSimilarity()->tf($this->_termFreqs[$docId]) *
               $this->_weight->getValue() *
               $reader->norm($docId, $this->_term->field) *
               $this->getBoost();
    }

    /**
     * Return query terms
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return array($this->_term);
    }

    /**
     * Return query term
     *
     * @return \ZendSearch\Lucene\Index\Term
     */
    public function getTerm()
    {
        return $this->_term;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        $highlighter->highlight($this->_term->text);
    }

    /**
     * Print a query
     *
     * Used only for query visualisation, so characters are not escaped.
     *
     * @return string
     */
    public function __toString()
    {
        $fieldPart = ($this->_term->field !== null) ? $this->_term->field . ':' : '';
        $boostPart = ($this->getBoost() != 1) ? '^' . round($this->getBoost(), 4) : '';

        return $fieldPart . $this->_term->text . $boostPart;
    }
}

View File

@@ -1,355 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Query;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer\Analyzer;
use ZendSearch\Lucene\Analysis\Analyzer\AnalyzerInterface;
use ZendSearch\Lucene\Exception\OutOfBoundsException;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Exception\UnsupportedMethodCallException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Highlighter\HighlighterInterface as Highlighter;
use Laminas\Stdlib\ErrorHandler;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Wildcard extends AbstractQuery
{
    /**
     * Search pattern.
     *
     * Field has to be fully specified or has to be null
     * Text may contain '*' or '?' symbols
     *
     * @var \ZendSearch\Lucene\Index\Term
     */
    private $_pattern;

    /**
     * Matched terms.
     *
     * Null until rewrite() has been called; afterwards an array of
     * \ZendSearch\Lucene\Index\Term objects which may be used for search
     * result post-processing.
     *
     * @var array|null
     */
    private $_matches = null;

    /**
     * Minimum term prefix length (number of minimum non-wildcard characters)
     *
     * @var integer
     */
    private static $_minPrefixLength = 3;

    /**
     * Wildcard query constructor.
     *
     * @param \ZendSearch\Lucene\Index\Term $pattern
     */
    public function __construct(Index\Term $pattern)
    {
        $this->_pattern = $pattern;
    }

    /**
     * Get minimum prefix length
     *
     * @return integer
     */
    public static function getMinPrefixLength()
    {
        return self::$_minPrefixLength;
    }

    /**
     * Set minimum prefix length
     *
     * @param integer $minPrefixLength
     */
    public static function setMinPrefixLength($minPrefixLength)
    {
        self::$_minPrefixLength = $minPrefixLength;
    }

    /**
     * Get terms prefix: the part of the word preceding the first '?' or '*'.
     *
     * @param string $word
     * @return string  the whole word when it contains no wildcard symbol
     */
    private static function _getPrefix($word)
    {
        // strcspn() gives the length of the leading run free of wildcard symbols.
        // The cast guards against substr() returning false on old PHP versions.
        return (string) substr($word, 0, strcspn($word, '?*'));
    }

    /**
     * Build the anchored regular expression equivalent to the search pattern.
     *
     * '?' becomes '.', '*' becomes '.*' and every other character is quoted.
     * The 'u' modifier is appended when PCRE was built with Unicode support.
     *
     * @return string
     */
    private function _getMatchExpression()
    {
        $matchExpression = '/^'
            . str_replace(array('\\?', '\\*'), array('.', '.*'), preg_quote($this->_pattern->text, '/'))
            . '$/';

        /**
         * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
         */
        ErrorHandler::start(E_WARNING);
        $result = preg_match('/\pL/u', 'a');
        ErrorHandler::stop();

        if ($result == 1) {
            // PCRE unicode support is turned on
            // add Unicode modifier to the match expression
            $matchExpression .= 'u';
        }

        return $matchExpression;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Scans the index terms stream of every searched field, collects the
     * terms matching the pattern and returns an EmptyResult, Term or
     * MultiTerm query depending on how many terms were collected.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @throws \ZendSearch\Lucene\Exception\RuntimeException when the pattern prefix is too short
     * @throws \ZendSearch\Lucene\Exception\OutOfBoundsException when the terms per query limit is exceeded
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function rewrite(Lucene\SearchIndexInterface $index)
    {
        $this->_matches = array();

        if ($this->_pattern->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_pattern->field);
        }

        $prefix       = self::_getPrefix($this->_pattern->text);
        $prefixLength = strlen($prefix);

        // Validate before doing any regular expression work
        if ($prefixLength < self::$_minPrefixLength) {
            throw new RuntimeException(
                'At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.'
            );
        }

        $matchExpression = $this->_getMatchExpression();
        $maxTerms        = Lucene\Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();

            // With an empty prefix this asks for the term ('', $field),
            // exactly as the prefix-less scan did before.
            $index->skipTo(new Index\Term($prefix, $field));

            while (($currentTerm = $index->currentTerm()) !== null && $currentTerm->field == $field) {
                // Stop at the first term that no longer starts with the prefix.
                // Strict comparison: a loose "==" compares numeric-looking
                // strings ("100" and "1e2") as numbers.
                if ($prefixLength > 0 && substr($currentTerm->text, 0, $prefixLength) !== $prefix) {
                    break;
                }

                if (preg_match($matchExpression, $currentTerm->text) === 1) {
                    $this->_matches[] = $currentTerm;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        throw new OutOfBoundsException('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }

            $index->closeTermsStream();
        }

        $matchCount = count($this->_matches);
        if ($matchCount == 0) {
            return new EmptyResult();
        }
        if ($matchCount == 1) {
            return new Term(reset($this->_matches));
        }

        $rewrittenQuery = new MultiTerm();
        foreach ($this->_matches as $matchedTerm) {
            $rewrittenQuery->addTerm($matchedTerm);
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported: the query has to be rewritten first.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function optimize(Lucene\SearchIndexInterface $index)
    {
        throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Returns query pattern
     *
     * @return \ZendSearch\Lucene\Index\Term
     */
    public function getPattern()
    {
        return $this->_pattern;
    }

    /**
     * Return query terms
     *
     * @throws \ZendSearch\Lucene\Exception\RuntimeException when rewrite() has not run yet
     * @return array
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new RuntimeException('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: the query has to be rewritten first.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function createWeight(Lucene\SearchIndexInterface $reader)
    {
        throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     *
     * Not supported: the query has to be rewritten first.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     */
    public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
    {
        throw new UnsupportedMethodCallException('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * Not supported: the query has to be rewritten first.
     *
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return array
     */
    public function matchedDocs()
    {
        throw new UnsupportedMethodCallException(
            'Wildcard query should not be directly used for search. Use $query->rewrite($index)'
        );
    }

    /**
     * Score specified document
     *
     * Not supported: the query has to be rewritten first.
     *
     * @param integer $docId
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @throws \ZendSearch\Lucene\Exception\UnsupportedMethodCallException
     * @return float
     */
    public function score($docId, Lucene\SearchIndexInterface $reader)
    {
        throw new UnsupportedMethodCallException(
            'Wildcard query should not be directly used for search. Use $query->rewrite($index)'
        );
    }

    /**
     * Query specific matches highlighting
     *
     * Tokenizes the document's 'body' field with the default analyzer and
     * highlights every token matching the pattern.
     *
     * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Highlighter $highlighter)
    {
        $words = array();
        $matchExpression = $this->_getMatchExpression();

        $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
        $tokens  = Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

        foreach ($tokens as $token) {
            $termText = $token->getTermText();
            if (preg_match($matchExpression, $termText) === 1) {
                $words[] = $termText;
            }
        }

        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_pattern->field !== null) {
            $query = $this->_pattern->field . ':';
        } else {
            $query = '';
        }

        $query .= $this->_pattern->text;

        if ($this->getBoost() != 1) {
            $query = $query . '^' . round($this->getBoost(), 4);
        }

        return $query;
    }
}

View File

@@ -1,53 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\QueryEntry;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
abstract class AbstractQueryEntry
{
    /**
     * Boost factor accumulated for this query entry; starts at the neutral 1.0.
     *
     * @var float
     */
    protected $_boost = 1.0;

    /**
     * Handle the '~' modifier applied to this entry.
     *
     * @param mixed $parameter
     */
    abstract public function processFuzzyProximityModifier($parameter = null);

    /**
     * Convert this entry into a subquery.
     *
     * @param string $encoding
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    abstract public function getQuery($encoding);

    /**
     * Multiply the entry's current boost by the given factor.
     *
     * @param float $boostFactor
     */
    public function boost($boostFactor)
    {
        $this->_boost = $this->_boost * $boostFactor;
    }
}

View File

@@ -1,99 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\QueryEntry;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Phrase extends AbstractQueryEntry
{
    /**
     * Phrase value
     *
     * @var string
     */
    private $_phrase;

    /**
     * Field the phrase is bound to, or null
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set once the '~' modifier has been applied to the phrase
     *
     * @var boolean
     */
    private $_proximityQuery = false;

    /**
     * Words distance, used for proximity queries
     *
     * @var integer
     */
    private $_wordsDistance = 0;

    /**
     * Object constructor
     *
     * @param string $phrase
     * @param string $field
     */
    public function __construct($phrase, $field)
    {
        $this->_field  = $field;
        $this->_phrase = $phrase;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the phrase as a proximity query. The words distance is only
     * replaced when a parameter is supplied.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_proximityQuery = true;

        if ($parameter === null) {
            return;
        }
        $this->_wordsDistance = $parameter;
    }

    /**
     * Transform entry to a subquery
     *
     * @param string $encoding
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function getQuery($encoding)
    {
        // The field name is converted to UTF-8; the phrase itself is handed
        // over together with its original encoding.
        $utf8Field = null;
        if ($this->_field !== null) {
            $utf8Field = iconv($encoding, 'UTF-8', $this->_field);
        }

        $phraseQuery = new \ZendSearch\Lucene\Search\Query\Preprocessing\Phrase(
            $this->_phrase,
            $encoding,
            $utf8Field
        );

        if ($this->_proximityQuery) {
            $phraseQuery->setSlop($this->_wordsDistance);
        }
        $phraseQuery->setBoost($this->_boost);

        return $phraseQuery;
    }
}

View File

@@ -1,63 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\QueryEntry;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Subquery extends AbstractQueryEntry
{
    /**
     * The wrapped query
     *
     * @var \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private $_query;

    /**
     * Object constructor
     *
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $query
     */
    public function __construct(\ZendSearch\Lucene\Search\Query\AbstractQuery $query)
    {
        $this->_query = $query;
    }

    /**
     * Process modifier ('~')
     *
     * Always fails: the modifier is only valid after a term or a phrase.
     *
     * @param mixed $parameter
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $message = '\'~\' sign must follow term or phrase';

        throw new \ZendSearch\Lucene\Search\Exception\QueryParserException($message);
    }

    /**
     * Transform entry to a subquery
     *
     * Applies the entry's boost to the wrapped query and returns that query.
     *
     * @param string $encoding
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    public function getQuery($encoding)
    {
        $subquery = $this->_query;
        $subquery->setBoost($this->_boost);

        return $subquery;
    }
}

View File

@@ -1,109 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\QueryEntry;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Term extends AbstractQueryEntry
{
    /**
     * Term value
     *
     * @var string
     */
    private $_term;

    /**
     * Field the term is bound to, or null
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set once the '~' modifier has been applied to the term
     *
     * @var boolean
     */
    private $_fuzzyQuery = false;

    /**
     * Similarity passed to the fuzzy query
     *
     * @var float
     */
    private $_similarity = 1.;

    /**
     * Object constructor
     *
     * @param string $term
     * @param string $field
     */
    public function __construct($term, $field)
    {
        $this->_field = $field;
        $this->_term  = $term;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the term as a fuzzy query. Without a parameter the similarity
     * falls back to the Fuzzy query's DEFAULT_MIN_SIMILARITY.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_fuzzyQuery = true;
        $this->_similarity = ($parameter !== null)
            ? $parameter
            : \ZendSearch\Lucene\Search\Query\Fuzzy::DEFAULT_MIN_SIMILARITY;
    }

    /**
     * Transform entry to a subquery
     *
     * Produces a preprocessing Fuzzy query when the '~' modifier was applied,
     * otherwise a preprocessing Term query. Either one receives the entry's boost.
     *
     * @param string $encoding
     * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
     * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
     */
    public function getQuery($encoding)
    {
        // The field name is converted to UTF-8; the term itself is handed
        // over together with its original encoding.
        $utf8Field = ($this->_field !== null)
            ? iconv($encoding, 'UTF-8', $this->_field)
            : null;

        if ($this->_fuzzyQuery) {
            $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Fuzzy(
                $this->_term,
                $encoding,
                $utf8Field,
                $this->_similarity
            );
        } else {
            $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Term(
                $this->_term,
                $encoding,
                $utf8Field
            );
        }

        $query->setBoost($this->_boost);

        return $query;
    }
}

View File

@@ -1,115 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Document;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class QueryHit
{
    /**
     * Index that produced this hit
     * @var \ZendSearch\Lucene\SearchIndexInterface
     */
    protected $_index = null;

    /**
     * Document associated with this hit; fetched from the index on first use
     * @var \ZendSearch\Lucene\Document
     */
    protected $_document = null;

    /**
     * Unique hit id
     * @var integer
     */
    public $id;

    /**
     * Number of the document in the index
     * @var integer
     */
    public $document_id;

    /**
     * Score of the hit
     * @var float
     */
    public $score;

    /**
     * Constructor - keeps a handle of the index that produced the hit so the
     * document can be retrieved easily from the hit.
     *
     * @param \ZendSearch\Lucene\SearchIndexInterface $index
     */
    public function __construct(Lucene\SearchIndexInterface $index)
    {
        $this->_index = $index;
    }

    /**
     * Magic method for checking the existence of a field
     *
     * @param string $offset
     * @return boolean TRUE if the field exists else FALSE
     */
    public function __isset($offset)
    {
        $document = $this->getDocument();

        return isset($document->$offset);
    }

    /**
     * Convenience accessor for fields of the document associated with this hit.
     *
     * @param string $offset
     * @return string
     */
    public function __get($offset)
    {
        $document = $this->getDocument();

        return $document->getFieldValue($offset);
    }

    /**
     * Return the document object for this hit
     *
     * The document is loaded from the index by its document_id the first
     * time it is requested and reused afterwards.
     *
     * @return \ZendSearch\Lucene\Document
     */
    public function getDocument()
    {
        if ($this->_document instanceof Document) {
            return $this->_document;
        }

        $this->_document = $this->_index->getDocument($this->document_id);

        return $this->_document;
    }

    /**
     * Return the index object for this hit
     *
     * @return \ZendSearch\Lucene\SearchIndexInterface
     */
    public function getIndex()
    {
        return $this->_index;
    }
}

View File

@@ -1,484 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Exception\QueryParserException;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class QueryLexer extends Lucene\AbstractFSM
{
/** State Machine states */
const ST_WHITE_SPACE = 0;
const ST_SYNT_LEXEME = 1;
const ST_LEXEME = 2;
const ST_QUOTED_LEXEME = 3;
const ST_ESCAPED_CHAR = 4;
const ST_ESCAPED_QCHAR = 5;
const ST_LEXEME_MODIFIER = 6;
const ST_NUMBER = 7;
const ST_MANTISSA = 8;
const ST_ERROR = 9;
/** Input symbols */
const IN_WHITE_SPACE = 0;
const IN_SYNT_CHAR = 1;
const IN_LEXEME_MODIFIER = 2;
const IN_ESCAPE_CHAR = 3;
const IN_QUOTE = 4;
const IN_DECIMAL_POINT = 5;
const IN_ASCII_DIGIT = 6;
const IN_CHAR = 7;
const IN_MUTABLE_CHAR = 8;
const QUERY_WHITE_SPACE_CHARS = " \n\r\t";
const QUERY_SYNT_CHARS = ':()[]{}!|&';
const QUERY_MUTABLE_CHARS = '+-';
const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
const QUERY_LEXEMEMODIFIER_CHARS = '~^';
const QUERY_ASCIIDIGITS_CHARS = '0123456789';
/**
* List of recognized lexemes
*
* @var array
*/
private $_lexemes;
/**
* Query string (array of single- or non single-byte characters)
*
* @var array
*/
private $_queryString;
/**
* Current position within a query string
* Used to create appropriate error messages
*
* @var integer
*/
private $_queryStringPosition;
/**
* Recognized part of current lexeme
*
* @var string
*/
private $_currentLexeme;
public function __construct()
{
parent::__construct( array(self::ST_WHITE_SPACE,
self::ST_SYNT_LEXEME,
self::ST_LEXEME,
self::ST_QUOTED_LEXEME,
self::ST_ESCAPED_CHAR,
self::ST_ESCAPED_QCHAR,
self::ST_LEXEME_MODIFIER,
self::ST_NUMBER,
self::ST_MANTISSA,
self::ST_ERROR),
array(self::IN_WHITE_SPACE,
self::IN_SYNT_CHAR,
self::IN_MUTABLE_CHAR,
self::IN_LEXEME_MODIFIER,
self::IN_ESCAPE_CHAR,
self::IN_QUOTE,
self::IN_DECIMAL_POINT,
self::IN_ASCII_DIGIT,
self::IN_CHAR));
$lexemeModifierErrorAction = new Lucene\FSMAction($this, 'lexModifierErrException');
$quoteWithinLexemeErrorAction = new Lucene\FSMAction($this, 'quoteWithinLexemeErrException');
$wrongNumberErrorAction = new Lucene\FSMAction($this, 'wrongNumberErrException');
$this->addRules(array( array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
array(self::ST_WHITE_SPACE, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_CHAR, self::ST_LEXEME)
));
$this->addRules(array( array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
array(self::ST_SYNT_LEXEME, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_CHAR, self::ST_LEXEME)
));
$this->addRules(array( array(self::ST_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
// IN_QUOTE not allowed
array(self::ST_LEXEME, self::IN_QUOTE, self::ST_ERROR, $quoteWithinLexemeErrorAction),
array(self::ST_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_CHAR, self::ST_LEXEME)
));
$this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_QCHAR),
array(self::ST_QUOTED_LEXEME, self::IN_QUOTE, self::ST_WHITE_SPACE),
array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_CHAR, self::ST_QUOTED_LEXEME)
));
$this->addRules(array( array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_QUOTE, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_CHAR, self::ST_LEXEME)
));
$this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_CHAR, self::ST_QUOTED_LEXEME)
));
$this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
// IN_QUOTE not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE, self::ST_ERROR, $lexemeModifierErrorAction),
array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
// IN_CHAR not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
));
$this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_NUMBER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_NUMBER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_NUMBER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
// IN_QUOTE not allowed
array(self::ST_NUMBER, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
array(self::ST_NUMBER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
array(self::ST_NUMBER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
// IN_CHAR not allowed
array(self::ST_NUMBER, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
));
$this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_MANTISSA, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
// IN_QUOTE not allowed
array(self::ST_MANTISSA, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
// IN_DECIMAL_POINT not allowed
array(self::ST_MANTISSA, self::IN_DECIMAL_POINT, self::ST_ERROR, $wrongNumberErrorAction),
array(self::ST_MANTISSA, self::IN_ASCII_DIGIT, self::ST_MANTISSA),
// IN_CHAR not allowed
array(self::ST_MANTISSA, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
));
/** Actions */
$syntaxLexemeAction = new Lucene\FSMAction($this, 'addQuerySyntaxLexeme');
$lexemeModifierAction = new Lucene\FSMAction($this, 'addLexemeModifier');
$addLexemeAction = new Lucene\FSMAction($this, 'addLexeme');
$addQuotedLexemeAction = new Lucene\FSMAction($this, 'addQuotedLexeme');
$addNumberLexemeAction = new Lucene\FSMAction($this, 'addNumberLexeme');
$addLexemeCharAction = new Lucene\FSMAction($this, 'addLexemeChar');
/** Syntax lexeme */
$this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction);
// Two lexemes in succession
$this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);
/** Lexeme */
$this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
// ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action
$this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA, $addLexemeAction);
/** Quoted lexeme */
// We don't need entry action (skip quote)
$this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
$this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
// Closing quote changes state to the ST_WHITE_SPACE other states are not used
$this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction);
/** Lexeme modifier */
$this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);
/** Number */
$this->addEntryAction(self::ST_NUMBER, $addLexemeCharAction);
$this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction);
// ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
$this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_WHITE_SPACE, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
}
/**
 * Classify a single query character as a lexer state machine input symbol.
 *
 * The character-class constants are probed first (white space, syntax,
 * mutable, lexeme modifier, ASCII digit - in that order), then the three
 * special single characters. Anything left over is an ordinary character.
 *
 * @param string $char
 * @return integer one of the self::IN_* constants
 */
private function _translateInput($char)
{
    $charClasses = array(
        array(self::QUERY_WHITE_SPACE_CHARS,    self::IN_WHITE_SPACE),
        array(self::QUERY_SYNT_CHARS,           self::IN_SYNT_CHAR),
        array(self::QUERY_MUTABLE_CHARS,        self::IN_MUTABLE_CHAR),
        array(self::QUERY_LEXEMEMODIFIER_CHARS, self::IN_LEXEME_MODIFIER),
        array(self::QUERY_ASCIIDIGITS_CHARS,    self::IN_ASCII_DIGIT),
    );

    foreach ($charClasses as $charClass) {
        if (strpos($charClass[0], $char) !== false) {
            return $charClass[1];
        }
    }

    if ($char === '"') {
        return self::IN_QUOTE;
    }
    if ($char === '.') {
        return self::IN_DECIMAL_POINT;
    }
    if ($char === '\\') {
        return self::IN_ESCAPE_CHAR;
    }

    return self::IN_CHAR;
}
/**
 * This method is used to tokenize query string into lexemes
 *
 * The string is split into characters with iconv (using $encoding), every
 * character is translated into an input symbol and fed to the state machine,
 * whose actions fill the lexeme list. A trailing white space symbol is fed at
 * the end so that a pending lexeme gets flushed.
 *
 * @param string $inputString
 * @param string $encoding
 * @return array list of recognized lexemes (QueryToken objects)
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if the
 *         machine does not end up in the white space state
 */
public function tokenize($inputString, $encoding)
{
    $this->reset();
    $this->_lexemes     = array();
    $this->_queryString = array();
    // A previous run that was aborted by an exception leaves its partially
    // collected characters behind (the buffer is only cleared when a lexeme is
    // completed). Without this reset they would be glued in front of the first
    // lexeme of the current query.
    $this->_currentLexeme = '';

    if (PHP_OS == 'AIX' && $encoding == '') {
        $encoding = 'ISO8859-1';
    }
    $strLength = iconv_strlen($inputString, $encoding);
    // Workaround for iconv_substr bug
    $inputString .= ' ';
    for ($count = 0; $count < $strLength; $count++) {
        $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
    }

    // The position is an object property because actions read it and
    // addQuerySyntaxLexeme() may advance it to consume a two-char lexeme.
    for ($this->_queryStringPosition = 0;
         $this->_queryStringPosition < count($this->_queryString);
         $this->_queryStringPosition++) {
        $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
    }

    // Flush whatever lexeme is still being collected
    $this->process(self::IN_WHITE_SPACE);
    if ($this->getState() != self::ST_WHITE_SPACE) {
        throw new QueryParserException('Unexpected end of query');
    }

    $this->_queryString = null;
    return $this->_lexemes;
}
/*********************************************************************
* Actions implementation
*
* Actions affect the list of recognized lexemes
*********************************************************************/
/**
 * Add query syntax lexeme
 *
 * Builds a syntax-element token from the current character. Characters listed
 * in QUERY_DOUBLECHARLEXEME_CHARS must be immediately repeated; the second
 * occurrence is consumed here. A field indicator token is not stored itself:
 * instead the previously stored word token is re-typed as a field.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
 */
public function addQuerySyntaxLexeme()
{
    $symbol = $this->_queryString[$this->_queryStringPosition];

    if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $symbol) !== false) {
        // Step onto the expected second character before validating it
        $this->_queryStringPosition++;

        $isDoubled = $this->_queryStringPosition != count($this->_queryString)
                  && $this->_queryString[$this->_queryStringPosition] == $symbol;
        if (!$isDoubled) {
            throw new QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
        }

        $symbol .= $symbol;
    }

    $syntaxToken = new QueryToken(QueryToken::TC_SYNTAX_ELEMENT, $symbol, $this->_queryStringPosition);

    if ($syntaxToken->type != QueryToken::TT_FIELD_INDICATOR) {
        $this->_lexemes[] = $syntaxToken;
        return;
    }

    // ':' is dropped; the lexeme before it becomes the field name
    $fieldToken = array_pop($this->_lexemes);
    if ($fieldToken === null || $fieldToken->type != QueryToken::TT_WORD) {
        throw new QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
    }
    $fieldToken->type = QueryToken::TT_FIELD;
    $this->_lexemes[] = $fieldToken;
}
/**
 * Add lexeme modifier
 *
 * Stores the current character as a syntax-element token at the current position.
 */
public function addLexemeModifier()
{
    $position      = $this->_queryStringPosition;
    $modifierToken = new QueryToken(QueryToken::TC_SYNTAX_ELEMENT, $this->_queryString[$position], $position);

    $this->_lexemes[] = $modifierToken;
}
/**
 * Add lexeme
 *
 * Turns the collected characters into a word token (positioned one character
 * before the current one) and empties the collection buffer.
 */
public function addLexeme()
{
    $wordToken = new QueryToken(QueryToken::TC_WORD, $this->_currentLexeme, $this->_queryStringPosition - 1);

    $this->_lexemes[]     = $wordToken;
    $this->_currentLexeme = '';
}
/**
 * Add quoted lexeme
 *
 * Turns the collected characters into a phrase token at the current position
 * and empties the collection buffer.
 */
public function addQuotedLexeme()
{
    $phraseToken = new QueryToken(QueryToken::TC_PHRASE, $this->_currentLexeme, $this->_queryStringPosition);

    $this->_lexemes[]     = $phraseToken;
    $this->_currentLexeme = '';
}
/**
 * Add number lexeme
 *
 * Turns the collected characters into a number token (positioned one character
 * before the current one) and empties the collection buffer.
 */
public function addNumberLexeme()
{
    $numberToken = new QueryToken(QueryToken::TC_NUMBER, $this->_currentLexeme, $this->_queryStringPosition - 1);

    $this->_lexemes[]     = $numberToken;
    $this->_currentLexeme = '';
}
/**
 * Extend lexeme by one char
 *
 * Appends the character at the current query position to the collection buffer.
 */
public function addLexemeChar()
{
    $currentChar = $this->_queryString[$this->_queryStringPosition];

    $this->_currentLexeme .= $currentChar;
}
/**
 * Position message
 *
 * Formats the current query string position for use in error messages.
 *
 * @return string
 */
private function _positionMsg()
{
    return sprintf('Position is %s.', $this->_queryStringPosition);
}
/*********************************************************************
* Syntax errors actions
*********************************************************************/
/**
 * Error action: a lexeme modifier is followed by something it cannot take.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException always
 */
public function lexModifierErrException()
{
    $message = 'Lexeme modifier character can be followed only by number, white space or query syntax element. '
             . $this->_positionMsg();

    throw new QueryParserException($message);
}
/**
 * Error action: an unescaped quote was found inside a lexeme.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException always
 */
public function quoteWithinLexemeErrException()
{
    $message = 'Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg();

    throw new QueryParserException($message);
}
/**
 * Error action: a number lexeme is malformed.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException always
 */
public function wrongNumberErrException()
{
    // Trailing space keeps the position sentence separated from the message,
    // matching the other lexer error messages.
    throw new QueryParserException('Wrong number syntax. ' . $this->_positionMsg());
}
}

View File

@@ -1,585 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Analysis\Analyzer;
use ZendSearch\Lucene\Exception\RuntimeException;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Exception\QueryParserException;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class QueryParser extends Lucene\AbstractFSM
{
/**
* Parser instance
*
* @var \ZendSearch\Lucene\Search\QueryParser
*/
private static $_instance = null;
/**
* Query lexer
*
* @var \ZendSearch\Lucene\Search\QueryLexer
*/
private $_lexer;
/**
* Tokens list
* Array of Zend_Search_Lucene_Search_QueryToken objects
*
* @var array
*/
private $_tokens;
/**
* Current token
*
* @var integer|string
*/
private $_currentToken;
/**
* Last token
*
* It can be processed within FSM states, but this additional state simplifies FSM
*
* @var \ZendSearch\Lucene\Search\QueryToken
*/
private $_lastToken = null;
/**
* Range query first term
*
* @var string
*/
private $_rqFirstTerm = null;
/**
* Current query parser context
*
* @var \ZendSearch\Lucene\Search\QueryParserContext
*/
private $_context;
/**
* Context stack
*
* @var array
*/
private $_contextStack;
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Query string default encoding
*
* @var string
*/
private $_defaultEncoding = '';
/**
* Defines query parsing mode.
*
* If this option is turned on, then the query parser suppresses query parser exceptions
* and constructs multi-term query using all words from a query.
*
* That helps to avoid exceptions caused by queries, which don't conform to query language,
* but limits possibilities to check, that query entered by user has some inconsistencies.
*
*
* Default is true.
*
* Use {@link Zend_Search_Lucene::suppressQueryParsingExceptions()},
* {@link Zend_Search_Lucene::dontSuppressQueryParsingExceptions()} and
* {@link Zend_Search_Lucene::checkQueryParsingExceptionsSuppressMode()} to operate
* with this setting.
*
* @var boolean
*/
private $_suppressQueryParsingExceptions = true;
/**
* Boolean operators constants
*/
const B_OR = 0;
const B_AND = 1;
/**
* Default boolean queries operator
*
* @var integer
*/
private $_defaultOperator = self::B_OR;
/** Query parser State Machine states */
const ST_COMMON_QUERY_ELEMENT = 0; // Terms, phrases, operators
const ST_CLOSEDINT_RQ_START = 1; // Range query start (closed interval) - '['
const ST_CLOSEDINT_RQ_FIRST_TERM = 2; // First term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_TO_TERM = 3; // 'TO' lexeme in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_LAST_TERM = 4; // Second term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_END = 5; // Range query end (closed interval) - ']'
const ST_OPENEDINT_RQ_START = 6; // Range query start (opened interval) - '{'
const ST_OPENEDINT_RQ_FIRST_TERM = 7; // First term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_TO_TERM = 8; // 'TO' lexeme in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_LAST_TERM = 9; // Second term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_END = 10; // Range query end (opened interval) - '}'
/**
 * Parser constructor
 *
 * Registers the parser states and the token types (as input symbols) with the
 * parent state machine, declares the transition rules, binds the action
 * methods to inputs/state entries and creates the query lexer.
 */
public function __construct()
{
    $states = array(
        self::ST_COMMON_QUERY_ELEMENT,
        self::ST_CLOSEDINT_RQ_START,
        self::ST_CLOSEDINT_RQ_FIRST_TERM,
        self::ST_CLOSEDINT_RQ_TO_TERM,
        self::ST_CLOSEDINT_RQ_LAST_TERM,
        self::ST_CLOSEDINT_RQ_END,
        self::ST_OPENEDINT_RQ_START,
        self::ST_OPENEDINT_RQ_FIRST_TERM,
        self::ST_OPENEDINT_RQ_TO_TERM,
        self::ST_OPENEDINT_RQ_LAST_TERM,
        self::ST_OPENEDINT_RQ_END
    );
    parent::__construct($states, QueryToken::getTypes());

    $common = self::ST_COMMON_QUERY_ELEMENT;

    // (token type, target state) pairs for tokens accepted in the common state
    $commonTransitions = array(
        array(QueryToken::TT_WORD,             $common),
        array(QueryToken::TT_PHRASE,           $common),
        array(QueryToken::TT_FIELD,            $common),
        array(QueryToken::TT_REQUIRED,         $common),
        array(QueryToken::TT_PROHIBITED,       $common),
        array(QueryToken::TT_FUZZY_PROX_MARK,  $common),
        array(QueryToken::TT_BOOSTING_MARK,    $common),
        array(QueryToken::TT_RANGE_INCL_START, self::ST_CLOSEDINT_RQ_START),
        array(QueryToken::TT_RANGE_EXCL_START, self::ST_OPENEDINT_RQ_START),
        array(QueryToken::TT_SUBQUERY_START,   $common),
        array(QueryToken::TT_SUBQUERY_END,     $common),
        array(QueryToken::TT_AND_LEXEME,       $common),
        array(QueryToken::TT_OR_LEXEME,        $common),
        array(QueryToken::TT_NOT_LEXEME,       $common),
        array(QueryToken::TT_NUMBER,           $common),
    );
    $commonRules = array();
    foreach ($commonTransitions as $transition) {
        $commonRules[] = array($common, $transition[0], $transition[1]);
    }
    $this->addRules($commonRules);

    // '[term1 to term2]' chain
    $this->addRules(array(
        array(self::ST_CLOSEDINT_RQ_START,      QueryToken::TT_WORD,           self::ST_CLOSEDINT_RQ_FIRST_TERM),
        array(self::ST_CLOSEDINT_RQ_FIRST_TERM, QueryToken::TT_TO_LEXEME,      self::ST_CLOSEDINT_RQ_TO_TERM),
        array(self::ST_CLOSEDINT_RQ_TO_TERM,    QueryToken::TT_WORD,           self::ST_CLOSEDINT_RQ_LAST_TERM),
        array(self::ST_CLOSEDINT_RQ_LAST_TERM,  QueryToken::TT_RANGE_INCL_END, $common),
    ));

    // '{term1 to term2}' chain
    $this->addRules(array(
        array(self::ST_OPENEDINT_RQ_START,      QueryToken::TT_WORD,           self::ST_OPENEDINT_RQ_FIRST_TERM),
        array(self::ST_OPENEDINT_RQ_FIRST_TERM, QueryToken::TT_TO_LEXEME,      self::ST_OPENEDINT_RQ_TO_TERM),
        array(self::ST_OPENEDINT_RQ_TO_TERM,    QueryToken::TT_WORD,           self::ST_OPENEDINT_RQ_LAST_TERM),
        array(self::ST_OPENEDINT_RQ_LAST_TERM,  QueryToken::TT_RANGE_EXCL_END, $common),
    ));

    // One action object per handler method; handlers bound to several inputs
    // share the same object.
    $handlerMethods = array(
        'addTermEntry',
        'addPhraseEntry',
        'setField',
        'setSign',
        'processFuzzyProximityModifier',
        'processModifierParameter',
        'subqueryStart',
        'subqueryEnd',
        'logicalOperator',
        'openedRQFirstTerm',
        'openedRQLastTerm',
        'closedRQFirstTerm',
        'closedRQLastTerm',
    );
    $actions = array();
    foreach ($handlerMethods as $method) {
        $actions[$method] = new Lucene\FSMAction($this, $method);
    }

    // (token type, handler method) pairs fired on input in the common state
    $inputBindings = array(
        array(QueryToken::TT_WORD,            'addTermEntry'),
        array(QueryToken::TT_PHRASE,          'addPhraseEntry'),
        array(QueryToken::TT_FIELD,           'setField'),
        array(QueryToken::TT_REQUIRED,        'setSign'),
        array(QueryToken::TT_PROHIBITED,      'setSign'),
        array(QueryToken::TT_FUZZY_PROX_MARK, 'processFuzzyProximityModifier'),
        array(QueryToken::TT_NUMBER,          'processModifierParameter'),
        array(QueryToken::TT_SUBQUERY_START,  'subqueryStart'),
        array(QueryToken::TT_SUBQUERY_END,    'subqueryEnd'),
        array(QueryToken::TT_AND_LEXEME,      'logicalOperator'),
        array(QueryToken::TT_OR_LEXEME,       'logicalOperator'),
        array(QueryToken::TT_NOT_LEXEME,      'logicalOperator'),
    );
    foreach ($inputBindings as $binding) {
        $this->addInputAction($common, $binding[0], $actions[$binding[1]]);
    }

    // Range query terms are handled when their state is entered
    $this->addEntryAction(self::ST_OPENEDINT_RQ_FIRST_TERM, $actions['openedRQFirstTerm']);
    $this->addEntryAction(self::ST_OPENEDINT_RQ_LAST_TERM,  $actions['openedRQLastTerm']);
    $this->addEntryAction(self::ST_CLOSEDINT_RQ_FIRST_TERM, $actions['closedRQFirstTerm']);
    $this->addEntryAction(self::ST_CLOSEDINT_RQ_LAST_TERM,  $actions['closedRQLastTerm']);

    $this->_lexer = new QueryLexer();
}
/**
 * Get query parser instance
 *
 * The shared instance is created on first use and reused afterwards.
 *
 * @return \ZendSearch\Lucene\Search\QueryParser
 */
private static function _getInstance()
{
    if (self::$_instance !== null) {
        return self::$_instance;
    }

    self::$_instance = new self();

    return self::$_instance;
}
/**
 * Set query string default encoding
 *
 * Stored on the shared parser instance.
 *
 * @param string $encoding
 */
public static function setDefaultEncoding($encoding)
{
    $parser = self::_getInstance();
    $parser->_defaultEncoding = $encoding;
}
/**
 * Get query string default encoding
 *
 * Read from the shared parser instance.
 *
 * @return string
 */
public static function getDefaultEncoding()
{
    $parser = self::_getInstance();

    return $parser->_defaultEncoding;
}
/**
 * Set default boolean operator
 *
 * Stored on the shared parser instance.
 *
 * @param integer $operator
 */
public static function setDefaultOperator($operator)
{
    $parser = self::_getInstance();
    $parser->_defaultOperator = $operator;
}
/**
 * Get default boolean operator
 *
 * Read from the shared parser instance.
 *
 * @return integer
 */
public static function getDefaultOperator()
{
    $parser = self::_getInstance();

    return $parser->_defaultOperator;
}
/**
 * Turn on 'suppress query parser exceptions' mode.
 */
public static function suppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = true;
}
/**
 * Turn off 'suppress query parser exceptions' mode.
 */
public static function dontSuppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = false;
}
/**
 * Check 'suppress query parser exceptions' mode.
 *
 * @return boolean current value of the flag on the shared parser instance
 */
public static function queryParsingExceptionsSuppressed()
{
    $parser = self::_getInstance();

    return $parser->_suppressQueryParsingExceptions;
}
/**
 * Escape keyword to force it to be parsed as one term
 *
 * Every character of the keyword is prefixed with a backslash. Valid UTF-8
 * input is split on character boundaries, so a multibyte character is never
 * torn apart by an inserted backslash; input that is not valid UTF-8 is split
 * byte by byte. An empty keyword yields an empty string rather than a lone
 * backslash.
 *
 * @param string $keyword
 * @return string
 */
public static function escape($keyword)
{
    $keyword = (string) $keyword;
    if ($keyword === '') {
        return '';
    }

    // With the /u modifier preg_split() cuts between UTF-8 characters and
    // returns false when the subject is not valid UTF-8.
    $chars = preg_split('//u', $keyword, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        $chars = str_split($keyword);
    }

    return '\\' . implode('\\', $chars);
}
/**
 * Parses a query string
 *
 * The string is tokenized by the lexer and the tokens are fed to the parser
 * state machine, whose actions build the query in the current context.
 *
 * If a QueryParserException occurs and 'suppress query parser exceptions' mode
 * is on, the query string is instead tokenized with the default analyzer and a
 * multi-term query is built from the resulting terms. If the mode is off, the
 * exception is rethrown as a RuntimeException.
 *
 * @param string $strQuery
 * @param string $encoding  query encoding; the default encoding is used when null
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
 * @throws \ZendSearch\Lucene\Exception\RuntimeException
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public static function parse($strQuery, $encoding = null)
{
    $parser = self::_getInstance();

    // Reset FSM if previous parse operation didn't return it into a correct state
    $parser->reset();

    try {
        $parser->_encoding     = ($encoding !== null) ? $encoding : $parser->_defaultEncoding;
        $parser->_lastToken    = null;
        $parser->_context      = new QueryParserContext($parser->_encoding);
        $parser->_contextStack = array();
        $parser->_tokens       = $parser->_lexer->tokenize($strQuery, $parser->_encoding);

        // Empty query
        if (count($parser->_tokens) == 0) {
            return new Query\Insignificant();
        }

        foreach ($parser->_tokens as $token) {
            try {
                $parser->_currentToken = $token;
                $parser->process($token->type);
                $parser->_lastToken = $token;
            } catch (QueryParserException $e) {
                // Query syntax errors raised by the actions must reach the
                // outer handler unchanged; wrapping them here would make them
                // escape the 'suppress query parser exceptions' mode.
                throw $e;
            } catch (\Exception $e) {
                // The state machine has no transition for this token type
                if (strpos($e->getMessage(), 'There is no any rule for') !== false) {
                    throw new QueryParserException('Syntax error at char position ' . $token->position . '.', 0, $e);
                }

                throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
            }
        }

        if (count($parser->_contextStack) != 0) {
            throw new QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.');
        }

        return $parser->_context->getQuery();
    } catch (QueryParserException $e) {
        if (!$parser->_suppressQueryParsingExceptions) {
            throw new RuntimeException($e->getMessage(), $e->getCode(), $e);
        }

        // Fallback: search for all words of the query string
        $queryTokens = Analyzer\Analyzer::getDefault()->tokenize($strQuery, $parser->_encoding);
        $query       = new Query\MultiTerm();
        $termsSign   = ($parser->_defaultOperator == self::B_AND) ? true /* required term */ :
                                                                    null /* optional term */;

        foreach ($queryTokens as $token) {
            $query->addTerm(new Index\Term($token->getTermText()), $termsSign);
        }

        return $query;
    }
}
/*********************************************************************
* Actions implementation
*
* Actions affect the list of recognized lexemes
*********************************************************************/
/**
 * Add term to a query
 *
 * Wraps the current token text in a term entry for the context's active field.
 */
public function addTermEntry()
{
    $context = $this->_context;

    $context->addEntry(new QueryEntry\Term($this->_currentToken->text, $context->getField()));
}
/**
 * Add phrase to a query
 *
 * Wraps the current token text in a phrase entry for the context's active field.
 */
public function addPhraseEntry()
{
    $context = $this->_context;

    $context->addEntry(new QueryEntry\Phrase($this->_currentToken->text, $context->getField()));
}
/**
 * Set entry field
 *
 * The current token text becomes the field of the next entry.
 */
public function setField()
{
    $fieldName = $this->_currentToken->text;

    $this->_context->setNextEntryField($fieldName);
}
/**
 * Set entry sign
 *
 * The current token type is passed to the context as the sign of the next entry.
 */
public function setSign()
{
    $signType = $this->_currentToken->type;

    $this->_context->setNextEntrySign($signType);
}
/**
 * Process fuzzy search/proximity modifier - '~'
 *
 * Delegates to the current context without a parameter.
 */
public function processFuzzyProximityModifier()
{
    $context = $this->_context;

    $context->processFuzzyProximityModifier();
}
/**
 * Process modifier parameter
 *
 * The current (number) token is applied to the modifier given by the previous
 * token: as fuzzy/proximity parameter after '~' mark, as boost factor after a
 * boosting mark.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if there is no previous token
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the previous token is not a modifier
 */
public function processModifierParameter()
{
    if ($this->_lastToken === null) {
        throw new QueryParserException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
    }

    $modifierType = $this->_lastToken->type;
    $parameter    = $this->_currentToken->text;

    if ($modifierType == QueryToken::TT_FUZZY_PROX_MARK) {
        $this->_context->processFuzzyProximityModifier($parameter);
        return;
    }

    if ($modifierType == QueryToken::TT_BOOSTING_MARK) {
        $this->_context->boost($parameter);
        return;
    }

    // It's not a user input exception
    throw new RuntimeException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
}
/**
 * Start subquery
 *
 * Pushes the current context on the stack and continues with a fresh context
 * whose default field is the active field of the enclosing one.
 */
public function subqueryStart()
{
    $outerContext = $this->_context;

    $this->_contextStack[] = $outerContext;
    $this->_context        = new QueryParserContext($this->_encoding, $outerContext->getField());
}
/**
 * End subquery
 *
 * Builds the query of the current context, restores the enclosing context from
 * the stack and adds the built query to it as a subquery entry.
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if there is no enclosing context
 */
public function subqueryEnd()
{
    if (count($this->_contextStack) === 0) {
        throw new QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing. Char position ' . $this->_currentToken->position . '.' );
    }

    $subquery      = $this->_context->getQuery();
    $outerContext  = array_pop($this->_contextStack);
    $subqueryEntry = new QueryEntry\Subquery($subquery);

    $this->_context = $outerContext;
    $outerContext->addEntry($subqueryEntry);
}
/**
 * Process logical operator
 *
 * The current token type is passed to the context as a logical operator.
 */
public function logicalOperator()
{
    $operatorType = $this->_currentToken->type;

    $this->_context->addLogicalOperator($operatorType);
}
/**
 * Process first range query term (opened interval)
 *
 * The term text is remembered until the last term is reached.
 */
public function openedRQFirstTerm()
{
    $this->_rqFirstTerm = $this->_currentToken->text;
}

/**
 * Process last range query term (opened interval)
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
 */
public function openedRQLastTerm()
{
    $this->_addRangeQueryEntry(false);
}

/**
 * Process first range query term (closed interval)
 *
 * The term text is remembered until the last term is reached.
 */
public function closedRQFirstTerm()
{
    $this->_rqFirstTerm = $this->_currentToken->text;
}

/**
 * Process last range query term (closed interval)
 *
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
 */
public function closedRQLastTerm()
{
    $this->_addRangeQueryEntry(true);
}

/**
 * Convert a range boundary text into an index term of the active field.
 *
 * The text is run through the default analyzer.
 *
 * @param string $boundaryText
 * @return \ZendSearch\Lucene\Index\Term|null null if the analyzer produced no tokens
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if the
 *         analyzer produced more than one token
 */
private function _rangeBoundaryTerm($boundaryText)
{
    $tokens = Analyzer\Analyzer::getDefault()->tokenize($boundaryText, $this->_encoding);

    if (count($tokens) > 1) {
        throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
    }
    if (count($tokens) == 1) {
        return new Index\Term(reset($tokens)->getTermText(), $this->_context->getField());
    }

    return null;
}

/**
 * Build a range query from the remembered first term and the current token
 * and add it to the current context as a subquery entry.
 *
 * @param boolean $inclusive value passed as third argument to the range query
 *                           (true for the closed interval, false for the opened one)
 * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException if a
 *         boundary has several words or both boundaries are empty
 */
private function _addRangeQueryEntry($inclusive)
{
    $from = $this->_rangeBoundaryTerm($this->_rqFirstTerm);
    $to   = $this->_rangeBoundaryTerm($this->_currentToken->text);

    if ($from === null && $to === null) {
        throw new QueryParserException('At least one range query boundary term must be non-empty term');
    }

    $rangeQuery = new Query\Range($from, $to, $inclusive);
    $this->_context->addEntry(new QueryEntry\Subquery($rangeQuery));
}
}

View File

@@ -1,375 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Exception\ExceptionInterface;
use ZendSearch\Lucene\Exception\UnexpectedValueException;
use ZendSearch\Lucene\Search\Exception\QueryParserException;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class QueryParserContext
{
/**
* Default field for the context.
*
* null means, that term should be searched through all fields
* Zend_Search_Lucene_Search_Query::rewriteQuery($index) translates such queries to several
*
* @var string|null
*/
private $_defaultField;
/**
* Field specified for next entry
*
* @var string
*/
private $_nextEntryField = null;
/**
* True means, that term is required.
* False means, that term is prohibited.
* null means, that term is neither prohibited, nor required
*
* @var boolean
*/
private $_nextEntrySign = null;
/**
* Entries grouping mode
*/
const GM_SIGNS = 0; // Signs mode: '+term1 term2 -term3 +(subquery1) -(subquery2)'
const GM_BOOLEAN = 1; // Boolean operators mode: 'term1 and term2 or (subquery1) and not (subquery2)'
/**
* Grouping mode
*
* @var integer
*/
private $_mode = null;
/**
* Entries signs.
* Used in GM_SIGNS grouping mode
*
* @var array
*/
private $_signs = array();
/**
* Query entries
* Each entry is a Zend_Search_Lucene_Search_QueryEntry object or
* boolean operator (Zend_Search_Lucene_Search_QueryToken class constant)
*
* @var array
*/
private $_entries = array();
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Context object constructor
*
* @param string $encoding
* @param string|null $defaultField
*/
public function __construct($encoding, $defaultField = null)
{
$this->_encoding = $encoding;
$this->_defaultField = $defaultField;
}
/**
* Get context default field
*
* @return string|null
*/
public function getField()
{
return ($this->_nextEntryField !== null) ? $this->_nextEntryField : $this->_defaultField;
}
/**
* Set field for next entry
*
* @param string $field
*/
public function setNextEntryField($field)
{
$this->_nextEntryField = $field;
}
/**
* Set sign for next entry
*
* @param integer $sign
* @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
* @throws \ZendSearch\Lucene\Exception\UnexpectedValueException
*/
public function setNextEntrySign($sign)
{
if ($this->_mode === self::GM_BOOLEAN) {
throw new QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_SIGNS;
if ($sign == QueryToken::TT_REQUIRED) {
$this->_nextEntrySign = true;
} elseif ($sign == QueryToken::TT_PROHIBITED) {
$this->_nextEntrySign = false;
} else {
throw new UnexpectedValueException('Unrecognized sign type.');
}
}
/**
* Add entry to a query
*
* @param \ZendSearch\Lucene\Search\QueryEntry\AbstractQueryEntry $entry
*/
public function addEntry(QueryEntry\AbstractQueryEntry $entry)
{
if ($this->_mode !== self::GM_BOOLEAN) {
$this->_signs[] = $this->_nextEntrySign;
}
$this->_entries[] = $entry;
$this->_nextEntryField = null;
$this->_nextEntrySign = null;
}
/**
* Process fuzzy search or proximity search modifier
*
* @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
*/
public function processFuzzyProximityModifier($parameter = null)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
throw new QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof QueryEntry\AbstractQueryEntry) {
// there are no entries or last entry is boolean operator
throw new QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry->processFuzzyProximityModifier($parameter);
$this->_entries[] = $lastEntry;
}
/**
* Set boost factor to the entry
*
* @param float $boostFactor
* @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
*/
public function boost($boostFactor)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
throw new QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof QueryEntry\AbstractQueryEntry) {
// there are no entries or last entry is boolean operator
throw new QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry->boost($boostFactor);
$this->_entries[] = $lastEntry;
}
/**
* Process logical operator
*
* @param integer $operator
* @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
*/
public function addLogicalOperator($operator)
{
if ($this->_mode === self::GM_SIGNS) {
throw new QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_BOOLEAN;
$this->_entries[] = $operator;
}
/**
* Generate 'signs style' query from the context
* '+term1 term2 -term3 +(<subquery1>) ...'
*
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function _signStyleExpressionQuery()
{
$query = new Query\Boolean();
if (QueryParser::getDefaultOperator() == QueryParser::B_AND) {
$defaultSign = true; // required
} else {
$defaultSign = null; // optional
}
foreach ($this->_entries as $entryId => $entry) {
$sign = ($this->_signs[$entryId] !== null) ? $this->_signs[$entryId] : $defaultSign;
$query->addSubquery($entry->getQuery($this->_encoding), $sign);
}
return $query;
}
/**
* Generate 'boolean style' query from the context
* 'term1 and term2 or term3 and (<subquery1>) and not (<subquery2>)'
*
* @throws \ZendSearch\Lucene\Search\Exception\QueryParserException
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
private function _booleanExpressionQuery()
{
/**
* We treat each level of an expression as a boolean expression in
* a Disjunctive Normal Form
*
* AND operator has higher precedence than OR
*
* Thus logical query is a disjunction of one or more conjunctions of
* one or more query entries
*/
$expressionRecognizer = new BooleanExpressionRecognizer();
try {
foreach ($this->_entries as $entry) {
if ($entry instanceof QueryEntry\AbstractQueryEntry) {
$expressionRecognizer->processLiteral($entry);
} else {
switch ($entry) {
case QueryToken::TT_AND_LEXEME:
$expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_AND_OPERATOR);
break;
case QueryToken::TT_OR_LEXEME:
$expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_OR_OPERATOR);
break;
case QueryToken::TT_NOT_LEXEME:
$expressionRecognizer->processOperator(BooleanExpressionRecognizer::IN_NOT_OPERATOR);
break;
default:
throw new UnexpectedValueException('Boolean expression error. Unknown operator type.');
}
}
}
$conjuctions = $expressionRecognizer->finishExpression();
} catch (ExceptionInterface $e) {
// It's query syntax error message and it should be user friendly. So FSM message is omitted
throw new QueryParserException('Boolean expression error.', 0, $e);
}
// Remove 'only negative' conjunctions
foreach ($conjuctions as $conjuctionId => $conjuction) {
$nonNegativeEntryFound = false;
foreach ($conjuction as $conjuctionEntry) {
if ($conjuctionEntry[1]) {
$nonNegativeEntryFound = true;
break;
}
}
if (!$nonNegativeEntryFound) {
unset($conjuctions[$conjuctionId]);
}
}
$subqueries = array();
foreach ($conjuctions as $conjuction) {
// Check, if it's a one term conjuction
if (count($conjuction) == 1) {
$subqueries[] = $conjuction[0][0]->getQuery($this->_encoding);
} else {
$subquery = new Query\Boolean();
foreach ($conjuction as $conjuctionEntry) {
$subquery->addSubquery($conjuctionEntry[0]->getQuery($this->_encoding), $conjuctionEntry[1]);
}
$subqueries[] = $subquery;
}
}
if (count($subqueries) == 0) {
return new Query\Insignificant();
}
if (count($subqueries) == 1) {
return $subqueries[0];
}
$query = new Query\Boolean();
foreach ($subqueries as $subquery) {
// Non-requirered entry/subquery
$query->addSubquery($subquery);
}
return $query;
}
/**
* Generate query from current context
*
* @return \ZendSearch\Lucene\Search\Query\AbstractQuery
*/
public function getQuery()
{
if ($this->_mode === self::GM_BOOLEAN) {
return $this->_booleanExpressionQuery();
} else {
return $this->_signStyleExpressionQuery();
}
}
}

View File

@@ -1,218 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search;
use ZendSearch\Lucene;
/**
 * A single lexeme of a query string: its type, its text and its position.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class QueryToken
{
    /**
     * Token types.
     */
    const TT_WORD                 = 0;  // Word
    const TT_PHRASE               = 1;  // Phrase (one or several quoted words)
    const TT_FIELD                = 2;  // Field name in 'field:word', field:<phrase> or field:(<subquery>) pairs
    const TT_FIELD_INDICATOR      = 3;  // ':'
    const TT_REQUIRED             = 4;  // '+'
    const TT_PROHIBITED           = 5;  // '-'
    const TT_FUZZY_PROX_MARK      = 6;  // '~'
    const TT_BOOSTING_MARK        = 7;  // '^'
    const TT_RANGE_INCL_START     = 8;  // '['
    const TT_RANGE_INCL_END       = 9;  // ']'
    const TT_RANGE_EXCL_START     = 10; // '{'
    const TT_RANGE_EXCL_END       = 11; // '}'
    const TT_SUBQUERY_START       = 12; // '('
    const TT_SUBQUERY_END         = 13; // ')'
    const TT_AND_LEXEME           = 14; // 'AND' or 'and'
    const TT_OR_LEXEME            = 15; // 'OR'  or 'or'
    const TT_NOT_LEXEME           = 16; // 'NOT' or 'not'
    const TT_TO_LEXEME            = 17; // 'TO'  or 'to'
    const TT_NUMBER               = 18; // Number, like: 10, 0.8, .64, ....

    /**
     * Token categories (the $tokenCategory constructor argument).
     */
    const TC_WORD           = 0;  // Word
    const TC_PHRASE         = 1;  // Phrase (one or several quoted words)
    const TC_NUMBER         = 2;  // Numbers, which are used with syntax elements. Ex. roam~0.8
    const TC_SYNTAX_ELEMENT = 3;  // + -  ( )  [ ]  { }  !  ||  && ~ ^

    /**
     * Token type (one of the TT_* constants).
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;

    /**
     * Token position within query (1-based).
     *
     * @var integer
     */
    public $position;

    /**
     * Returns all possible lexeme types.
     * It's used for syntax analyzer state machine initialization
     *
     * @return array
     */
    public static function getTypes()
    {
        return array(
            self::TT_WORD,
            self::TT_PHRASE,
            self::TT_FIELD,
            self::TT_FIELD_INDICATOR,
            self::TT_REQUIRED,
            self::TT_PROHIBITED,
            self::TT_FUZZY_PROX_MARK,
            self::TT_BOOSTING_MARK,
            self::TT_RANGE_INCL_START,
            self::TT_RANGE_INCL_END,
            self::TT_RANGE_EXCL_START,
            self::TT_RANGE_EXCL_END,
            self::TT_SUBQUERY_START,
            self::TT_SUBQUERY_END,
            self::TT_AND_LEXEME,
            self::TT_OR_LEXEME,
            self::TT_NOT_LEXEME,
            self::TT_TO_LEXEME,
            self::TT_NUMBER
        );
    }

    /**
     * Builds a token from its category and text and derives the token type.
     *
     * Words are checked case-insensitively against the 'and', 'or', 'not'
     * and 'to' lexemes; syntax elements are mapped by their exact text.
     *
     * @param integer $tokenCategory one of the TC_* constants
     * @param string  $tokenText
     * @param integer $position      0-based offset; stored 1-based
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException
     *         on an unknown category or an unknown syntax element
     */
    public function __construct($tokenCategory, $tokenText, $position)
    {
        $this->text     = $tokenText;
        $this->position = $position + 1; // Start from 1

        switch ($tokenCategory) {
            case self::TC_WORD:
                // Lower-case once instead of once per compared keyword
                switch (strtolower($tokenText)) {
                    case 'and':
                        $this->type = self::TT_AND_LEXEME;
                        break;
                    case 'or':
                        $this->type = self::TT_OR_LEXEME;
                        break;
                    case 'not':
                        $this->type = self::TT_NOT_LEXEME;
                        break;
                    case 'to':
                        $this->type = self::TT_TO_LEXEME;
                        break;
                    default:
                        $this->type = self::TT_WORD;
                }
                break;

            case self::TC_PHRASE:
                $this->type = self::TT_PHRASE;
                break;

            case self::TC_NUMBER:
                $this->type = self::TT_NUMBER;
                break;

            case self::TC_SYNTAX_ELEMENT:
                switch ($tokenText) {
                    case ':':
                        $this->type = self::TT_FIELD_INDICATOR;
                        break;
                    case '+':
                        $this->type = self::TT_REQUIRED;
                        break;
                    case '-':
                        $this->type = self::TT_PROHIBITED;
                        break;
                    case '~':
                        $this->type = self::TT_FUZZY_PROX_MARK;
                        break;
                    case '^':
                        $this->type = self::TT_BOOSTING_MARK;
                        break;
                    case '[':
                        $this->type = self::TT_RANGE_INCL_START;
                        break;
                    case ']':
                        $this->type = self::TT_RANGE_INCL_END;
                        break;
                    case '{':
                        $this->type = self::TT_RANGE_EXCL_START;
                        break;
                    case '}':
                        $this->type = self::TT_RANGE_EXCL_END;
                        break;
                    case '(':
                        $this->type = self::TT_SUBQUERY_START;
                        break;
                    case ')':
                        $this->type = self::TT_SUBQUERY_END;
                        break;
                    case '!':
                        $this->type = self::TT_NOT_LEXEME;
                        break;
                    case '&&':
                        $this->type = self::TT_AND_LEXEME;
                        break;
                    case '||':
                        $this->type = self::TT_OR_LEXEME;
                        break;
                    default:
                        throw new Lucene\Exception\InvalidArgumentException(
                            'Unrecognized query syntax lexeme: \'' . $tokenText . '\''
                        );
                }
                break;

            default:
                throw new Lucene\Exception\InvalidArgumentException(
                    'Unrecognized lexeme type: \'' . $tokenCategory . '\''
                );
        }
    }
}

View File

@@ -1,540 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Similarity;
/**
* @todo !!!!!!! This class is actually used as singleton. It has to be redesigned.
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
abstract class AbstractSimilarity
{
/**
* The Similarity implementation used by default.
*
* @var AbstractSimilarity
*/
private static $_defaultImpl;
/**
* Cache of decoded bytes.
* Array of floats
*
* @var array
*/
private static $_normTable = array( 0 => 0.0,
1 => 5.820766E-10,
2 => 6.9849193E-10,
3 => 8.1490725E-10,
4 => 9.313226E-10,
5 => 1.1641532E-9,
6 => 1.3969839E-9,
7 => 1.6298145E-9,
8 => 1.8626451E-9,
9 => 2.3283064E-9,
10 => 2.7939677E-9,
11 => 3.259629E-9,
12 => 3.7252903E-9,
13 => 4.656613E-9,
14 => 5.5879354E-9,
15 => 6.519258E-9,
16 => 7.4505806E-9,
17 => 9.313226E-9,
18 => 1.1175871E-8,
19 => 1.3038516E-8,
20 => 1.4901161E-8,
21 => 1.8626451E-8,
22 => 2.2351742E-8,
23 => 2.6077032E-8,
24 => 2.9802322E-8,
25 => 3.7252903E-8,
26 => 4.4703484E-8,
27 => 5.2154064E-8,
28 => 5.9604645E-8,
29 => 7.4505806E-8,
30 => 8.940697E-8,
31 => 1.0430813E-7,
32 => 1.1920929E-7,
33 => 1.4901161E-7,
34 => 1.7881393E-7,
35 => 2.0861626E-7,
36 => 2.3841858E-7,
37 => 2.9802322E-7,
38 => 3.5762787E-7,
39 => 4.172325E-7,
40 => 4.7683716E-7,
41 => 5.9604645E-7,
42 => 7.1525574E-7,
43 => 8.34465E-7,
44 => 9.536743E-7,
45 => 1.1920929E-6,
46 => 1.4305115E-6,
47 => 1.66893E-6,
48 => 1.9073486E-6,
49 => 2.3841858E-6,
50 => 2.861023E-6,
51 => 3.33786E-6,
52 => 3.8146973E-6,
53 => 4.7683716E-6,
54 => 5.722046E-6,
55 => 6.67572E-6,
56 => 7.6293945E-6,
57 => 9.536743E-6,
58 => 1.1444092E-5,
59 => 1.335144E-5,
60 => 1.5258789E-5,
61 => 1.9073486E-5,
62 => 2.2888184E-5,
63 => 2.670288E-5,
64 => 3.0517578E-5,
65 => 3.8146973E-5,
66 => 4.5776367E-5,
67 => 5.340576E-5,
68 => 6.1035156E-5,
69 => 7.6293945E-5,
70 => 9.1552734E-5,
71 => 1.0681152E-4,
72 => 1.2207031E-4,
73 => 1.5258789E-4,
74 => 1.8310547E-4,
75 => 2.1362305E-4,
76 => 2.4414062E-4,
77 => 3.0517578E-4,
78 => 3.6621094E-4,
79 => 4.272461E-4,
80 => 4.8828125E-4,
81 => 6.1035156E-4,
82 => 7.324219E-4,
83 => 8.544922E-4,
84 => 9.765625E-4,
85 => 0.0012207031,
86 => 0.0014648438,
87 => 0.0017089844,
88 => 0.001953125,
89 => 0.0024414062,
90 => 0.0029296875,
91 => 0.0034179688,
92 => 0.00390625,
93 => 0.0048828125,
94 => 0.005859375,
95 => 0.0068359375,
96 => 0.0078125,
97 => 0.009765625,
98 => 0.01171875,
99 => 0.013671875,
100 => 0.015625,
101 => 0.01953125,
102 => 0.0234375,
103 => 0.02734375,
104 => 0.03125,
105 => 0.0390625,
106 => 0.046875,
107 => 0.0546875,
108 => 0.0625,
109 => 0.078125,
110 => 0.09375,
111 => 0.109375,
112 => 0.125,
113 => 0.15625,
114 => 0.1875,
115 => 0.21875,
116 => 0.25,
117 => 0.3125,
118 => 0.375,
119 => 0.4375,
120 => 0.5,
121 => 0.625,
122 => 0.75,
123 => 0.875,
124 => 1.0,
125 => 1.25,
126 => 1.5,
127 => 1.75,
128 => 2.0,
129 => 2.5,
130 => 3.0,
131 => 3.5,
132 => 4.0,
133 => 5.0,
134 => 6.0,
135 => 7.0,
136 => 8.0,
137 => 10.0,
138 => 12.0,
139 => 14.0,
140 => 16.0,
141 => 20.0,
142 => 24.0,
143 => 28.0,
144 => 32.0,
145 => 40.0,
146 => 48.0,
147 => 56.0,
148 => 64.0,
149 => 80.0,
150 => 96.0,
151 => 112.0,
152 => 128.0,
153 => 160.0,
154 => 192.0,
155 => 224.0,
156 => 256.0,
157 => 320.0,
158 => 384.0,
159 => 448.0,
160 => 512.0,
161 => 640.0,
162 => 768.0,
163 => 896.0,
164 => 1024.0,
165 => 1280.0,
166 => 1536.0,
167 => 1792.0,
168 => 2048.0,
169 => 2560.0,
170 => 3072.0,
171 => 3584.0,
172 => 4096.0,
173 => 5120.0,
174 => 6144.0,
175 => 7168.0,
176 => 8192.0,
177 => 10240.0,
178 => 12288.0,
179 => 14336.0,
180 => 16384.0,
181 => 20480.0,
182 => 24576.0,
183 => 28672.0,
184 => 32768.0,
185 => 40960.0,
186 => 49152.0,
187 => 57344.0,
188 => 65536.0,
189 => 81920.0,
190 => 98304.0,
191 => 114688.0,
192 => 131072.0,
193 => 163840.0,
194 => 196608.0,
195 => 229376.0,
196 => 262144.0,
197 => 327680.0,
198 => 393216.0,
199 => 458752.0,
200 => 524288.0,
201 => 655360.0,
202 => 786432.0,
203 => 917504.0,
204 => 1048576.0,
205 => 1310720.0,
206 => 1572864.0,
207 => 1835008.0,
208 => 2097152.0,
209 => 2621440.0,
210 => 3145728.0,
211 => 3670016.0,
212 => 4194304.0,
213 => 5242880.0,
214 => 6291456.0,
215 => 7340032.0,
216 => 8388608.0,
217 => 1.048576E7,
218 => 1.2582912E7,
219 => 1.4680064E7,
220 => 1.6777216E7,
221 => 2.097152E7,
222 => 2.5165824E7,
223 => 2.9360128E7,
224 => 3.3554432E7,
225 => 4.194304E7,
226 => 5.0331648E7,
227 => 5.8720256E7,
228 => 6.7108864E7,
229 => 8.388608E7,
230 => 1.00663296E8,
231 => 1.17440512E8,
232 => 1.34217728E8,
233 => 1.6777216E8,
234 => 2.01326592E8,
235 => 2.34881024E8,
236 => 2.68435456E8,
237 => 3.3554432E8,
238 => 4.02653184E8,
239 => 4.69762048E8,
240 => 5.3687091E8,
241 => 6.7108864E8,
242 => 8.0530637E8,
243 => 9.395241E8,
244 => 1.07374182E9,
245 => 1.34217728E9,
246 => 1.61061274E9,
247 => 1.87904819E9,
248 => 2.14748365E9,
249 => 2.68435456E9,
250 => 3.22122547E9,
251 => 3.75809638E9,
252 => 4.2949673E9,
253 => 5.3687091E9,
254 => 6.4424509E9,
255 => 7.5161928E9 );
/**
 * Replace the Similarity implementation that getDefault() hands out.
 *
 * The instance is kept in a static property, so it is shared by every
 * caller within the current process.
 *
 * @param AbstractSimilarity $similarity implementation to use from now on
 */
public static function setDefault(AbstractSimilarity $similarity)
{
    self::$_defaultImpl = $similarity;
}
/**
 * Return the default Similarity implementation.
 *
 * When no implementation has been set yet, a DefaultSimilarity instance is
 * created, remembered and returned.
 *
 * @return AbstractSimilarity
 */
public static function getDefault()
{
    if (self::$_defaultImpl instanceof AbstractSimilarity) {
        return self::$_defaultImpl;
    }

    // Nothing configured yet: fall back to the stock implementation
    self::$_defaultImpl = new DefaultSimilarity();

    return self::$_defaultImpl;
}
/**
* Computes the normalization value for a field given the total number of
* terms contained in a field. These values, together with field boosts, are
* stored in an index and multiplied into scores for hits on each field by the
* search code.
*
* Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when 'numTokens' is large,
* and larger values when 'numTokens' is small.
*
* That these values are computed under
* IndexWriter::addDocument(Document) and stored then using
* encodeNorm(float). Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* fieldName - name of field
* numTokens - the total number of tokens contained in fields named
* 'fieldName' of 'doc'.
* Returns a normalization factor for hits on this field of this document
*
* @param string $fieldName
* @param integer $numTokens
* @return float
*/
abstract public function lengthNorm($fieldName, $numTokens);
/**
* Computes the normalization value for a query given the sum of the squared
* weights of each of the query terms. This value is then multiplied into the
* weight of each query term.
*
* This does not affect ranking, but rather just attempts to make scores
* from different queries comparable.
*
* sumOfSquaredWeights - the sum of the squares of query term weights
* Returns a normalization factor for query weights
*
* @param float $sumOfSquaredWeights
* @return float
*/
abstract public function queryNorm($sumOfSquaredWeights);
/**
 * Decodes a normalization factor stored in an index.
 *
 * Only the low eight bits of the argument are used; they select an entry
 * of the static norm table.
 *
 * @param integer $byte encoded norm
 * @return float decoded normalization factor
 */
public static function decodeNorm($byte)
{
    $tableIndex = $byte & 0xFF;

    return self::$_normTable[$tableIndex];
}
/**
 * Encodes a normalization factor for storage in an index.
 *
 * The result is an index (0..255) into the static norm table, whose
 * positive entries range from about 5.8x10^-10 to about 7.5x10^9 with
 * roughly one significant decimal digit of accuracy. Zero is also
 * represented.
 *
 * Zero and negative numbers are encoded as 0. Values too large to
 * represent are rounded down to the largest representable value. Positive
 * values too small to represent are rounded up to the smallest positive
 * representable value. Everything else is rounded to the closest
 * representable value.
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    return self::_floatToByte($f);
}

/**
 * Float to byte conversion: finds the norm table index that represents $f.
 *
 * @param float $f
 * @return integer table index in the range 0..255
 */
private static function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // binary search for an exact match in the ascending norm table
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid        = ($highIndex + $lowIndex) >> 1;
        $tableValue = self::$_normTable[$mid];

        if ($f < $tableValue) {
            $highIndex = $mid - 1;
        } elseif ($f > $tableValue) {
            $lowIndex = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // No exact match: $highIndex now points to the largest entry below $f.
    // It can't be negative because $f > 0 and entry 0 is 0.0.

    // Positive underflow: never collapse a positive norm to the zero entry
    if ($highIndex == 0) {
        return 1;
    }

    // round to closest value
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        return $highIndex + 1;
    }

    return $highIndex;
}
/**
* Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the idf(Term, Searcher)
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when 'freq' is large, and smaller values when 'freq'
* is small.
*
* freq - the frequency of a term within a document
* Returns a score factor based on a term's within-document frequency
*
* @param float $freq
* @return float
*/
abstract public function tf($freq);
/**
* Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to tf(float).
*
* A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* distance - the edit distance of this sloppy phrase match
* Returns the frequency increment for this match
*
* @param integer $distance
* @return float
*/
abstract public function sloppyFreq($distance);
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * A single term is scored as idfFreq(docFreq(term), count()) using the
 * given reader. For an array of terms the per-term factors are summed
 * (an empty array yields 0.0).
 *
 * @param mixed $input the term in question or an array of terms
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader reader of the document collection being searched
 * @return float a score factor for the term(s)
 */
public function idf($input, \ZendSearch\Lucene\SearchIndexInterface $reader)
{
    if (is_array($input)) {
        $total = 0.0;

        foreach ($input as $singleTerm) {
            $total += $this->idfFreq($reader->docFreq($singleTerm), $reader->count());
        }

        return $total;
    }

    return $this->idfFreq($reader->docFreq($input), $reader->count());
}
/**
* Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* tf(int) factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* docFreq - the number of documents which contain the term
* numDocs - the total number of documents in the collection
* Returns a score factor based on the term's document frequency
*
* @param integer $docFreq
* @param integer $numDocs
* @return float
*/
abstract public function idfFreq($docFreq, $numDocs);
/**
* Computes a score factor based on the fraction of all query terms that a
* document contains. This value is multiplied into scores.
*
* The presence of a large portion of the query terms indicates a better
* match with the query, so implementations of this method usually return
* larger values when the ratio between these parameters is large and smaller
* values when the ratio between them is small.
*
* overlap - the number of query terms matched in the document
* maxOverlap - the total number of terms in the query
* Returns a score factor based on term overlap with the query
*
* @param integer $overlap
* @param integer $maxOverlap
* @return float
*/
abstract public function coord($overlap, $maxOverlap);
}

View File

@@ -1,95 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Similarity;
use ZendSearch\Lucene\Search\Similarity\AbstractSimilarity;
/**
 * Stock scoring formulas built on square roots and logarithms.
 *
 * None of the formulas guards against a zero divisor except lengthNorm().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class DefaultSimilarity extends AbstractSimilarity
{
    /**
     * Implemented as '1/sqrt(numTerms)'; an empty field (0 terms) yields 1E10.
     *
     * @param string  $fieldName not used by this implementation
     * @param integer $numTerms
     * @return float
     */
    public function lengthNorm($fieldName, $numTerms)
    {
        return ($numTerms == 0) ? 1E10 : 1.0/sqrt($numTerms);
    }

    /**
     * Implemented as '1/sqrt(sumOfSquaredWeights)'.
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    public function queryNorm($sumOfSquaredWeights)
    {
        $root = sqrt($sumOfSquaredWeights);

        return 1.0/$root;
    }

    /**
     * Implemented as 'sqrt(freq)'.
     *
     * @param float $freq
     * @return float
     */
    public function tf($freq)
    {
        return sqrt($freq);
    }

    /**
     * Implemented as '1/(distance + 1)'.
     *
     * @param integer $distance
     * @return float
     */
    public function sloppyFreq($distance)
    {
        $divisor = $distance + 1;

        return 1.0/$divisor;
    }

    /**
     * Implemented as 'log(numDocs/(docFreq+1)) + 1'.
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    public function idfFreq($docFreq, $numDocs)
    {
        $ratio = $numDocs/(float)($docFreq+1);

        return log($ratio) + 1.0;
    }

    /**
     * Implemented as 'overlap/maxOverlap'.
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    public function coord($overlap, $maxOverlap)
    {
        $total = (float)$maxOverlap;

        return $overlap/$total;
    }
}

View File

@@ -1,71 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
/**
 * Calculate query weights and build query scorers.
 *
 * An AbstractWeight is constructed by a query Query->createWeight().
 * The sumOfSquaredWeights() method is then called on the top-level
 * query to compute the query normalization factor Similarity->queryNorm(float).
 * This factor is then passed to normalize(float). At this point the weighting
 * is complete.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
abstract class AbstractWeight
{
    /**
     * Normalization factor.
     * This value is stored only for query explanation purposes and is not used in any other place
     *
     * @var float
     */
    protected $_queryNorm;

    /**
     * Weight value
     *
     * The value may be initialized in sumOfSquaredWeights() or normalize()
     * because they both are invoked either in Query::_initWeight (for top-level query) or
     * in corresponding methods of parent query's weights
     *
     * @var float
     */
    protected $_value;

    /**
     * The weight for this query.
     *
     * Simply reports the stored weight value; nothing is computed here.
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_value;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * @return float
     */
    abstract public function sumOfSquaredWeights();

    /**
     * Assigns the query normalization factor to this.
     *
     * @param float $norm
     */
    abstract public function normalize($norm);
}

View File

@@ -1,120 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Query;
/**
 * Weight of a boolean query: aggregates the weights of its non-prohibited
 * subqueries.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class Boolean extends AbstractWeight
{
    /**
     * IndexReader.
     *
     * @var \ZendSearch\Lucene\SearchIndexInterface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private $_query;

    /**
     * Subquery weights, keyed by subquery number.
     * Array of AbstractWeight objects
     *
     * @var array
     */
    private $_weights;

    /**
     * Creates a weight for every subquery that is not prohibited.
     *
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $query  the query that this concerns
     * @param \ZendSearch\Lucene\SearchIndexInterface       $reader index reader
     */
    public function __construct(Query\AbstractQuery $query, Lucene\SearchIndexInterface $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getSubqueries() as $num => $subquery) {
            // A sign that is set and false marks a prohibited subquery: no weight for it
            if ($signs !== null && $signs[$num] !== null && !$signs[$num]) {
                continue;
            }

            $this->_weights[$num] = $subquery->createWeight($reader);
        }
    }

    /**
     * The weight for this query
     * Standard Weight::$_value is not used for boolean queries
     *
     * @return float the boost of the query
     */
    public function getValue()
    {
        return $this->_query->getBoost();
    }

    /**
     * The sum of squared weights of contained query clauses, scaled by the
     * squared query boost.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $sum = 0;

        // sum sub weights
        foreach ($this->_weights as $subWeight) {
            $sum += $subWeight->sumOfSquaredWeights();
        }

        // boost each sub-weight
        $squaredBoost = $this->_query->getBoost() * $this->_query->getBoost();
        $sum *= $squaredBoost;

        // check for empty query (like '-something -another')
        return ($sum == 0) ? 1.0 : $sum;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * The factor is multiplied by the query boost and passed down to
     * every subquery weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        // incorporate boost
        $queryNorm *= $this->_query->getBoost();

        foreach ($this->_weights as $subWeight) {
            $subWeight->normalize($queryNorm);
        }
    }
}

View File

@@ -1,39 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
/**
 * Weight with a constant sum of squared weights and a no-op normalization.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class EmptyResultWeight extends AbstractWeight
{
    /**
     * The sum of squared weights of contained query clauses.
     *
     * @return integer always 1
     */
    public function sumOfSquaredWeights()
    {
        return 1;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * Intentionally does nothing: the factor is ignored.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
    }
}

View File

@@ -1,121 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Query;
/**
 * Weight of a multi-term query: aggregates the weights of its
 * non-prohibited terms.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
class MultiTerm extends AbstractWeight
{
    /**
     * IndexReader.
     *
     * @var \ZendSearch\Lucene\SearchIndexInterface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private $_query;

    /**
     * Query terms weights, keyed by term id.
     * Array of Term weight objects
     *
     * @var array
     */
    private $_weights;

    /**
     * Creates a Term weight for every term that is not prohibited and
     * registers it on the query via setWeight().
     *
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $query  the query that this concerns
     * @param \ZendSearch\Lucene\SearchIndexInterface       $reader index reader
     */
    public function __construct(Query\AbstractQuery $query, Lucene\SearchIndexInterface $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getTerms() as $id => $term) {
            // A sign that is set and false marks a prohibited term: no weight for it
            if ($signs !== null && $signs[$id] !== null && !$signs[$id]) {
                continue;
            }

            $this->_weights[$id] = new Term($term, $query, $reader);
            $query->setWeight($id, $this->_weights[$id]);
        }
    }

    /**
     * The weight for this query
     * Standard Weight::$_value is not used for multi-term queries
     *
     * @return float the boost of the query
     */
    public function getValue()
    {
        return $this->_query->getBoost();
    }

    /**
     * The sum of squared weights of contained query clauses, scaled by the
     * squared query boost.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $sum = 0;

        // sum sub weights
        foreach ($this->_weights as $termWeight) {
            $sum += $termWeight->sumOfSquaredWeights();
        }

        // boost each sub-weight
        $squaredBoost = $this->_query->getBoost() * $this->_query->getBoost();
        $sum *= $squaredBoost;

        // check for empty query (like '-something -another')
        return ($sum == 0) ? 1.0 : $sum;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * The factor is multiplied by the query boost and passed down to
     * every term weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        // incorporate boost
        $queryNorm *= $this->_query->getBoost();

        foreach ($this->_weights as $termWeight) {
            $termWeight->normalize($queryNorm);
        }
    }
}

View File

@@ -1,89 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Search\Query;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Phrase extends AbstractWeight
{
    /**
     * Index the weight is computed against.
     *
     * @var \ZendSearch\Lucene\SearchIndexInterface
     */
    private $_reader;

    /**
     * The phrase query that this weight concerns.
     *
     * @var \ZendSearch\Lucene\Search\Query\Phrase
     */
    private $_query;

    /**
     * Score factor (inverse document frequency of the phrase terms).
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight (idf * boost, later multiplied by the query norm).
     *
     * Declared explicitly, as the sibling Term weight does, so that
     * sumOfSquaredWeights() does not create a dynamic property.
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Stores the query and the index reader; nothing is computed until
     * sumOfSquaredWeights() is called.
     *
     * @param \ZendSearch\Lucene\Search\Query\Phrase  $query
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     */
    public function __construct(Query\Phrase $query, Lucene\SearchIndexInterface $reader)
    {
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * Computes the idf of the phrase terms and the query weight
     * (idf * boost), and returns the squared query weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // compute idf
        $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader);

        // compute query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // square it
        return $this->_queryWeight * $this->_queryWeight;
    }

    /**
     * Assigns the query normalization factor to this weight and derives the
     * final value (normalized query weight * idf).
     *
     * Relies on $_idf and $_queryWeight set by sumOfSquaredWeights().
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalize query weight
        $this->_queryWeight *= $queryNorm;

        // idf for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

View File

@@ -1,111 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Search\Weight;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Search\Query;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
*/
class Term extends AbstractWeight
{
    /**
     * Index the weight is computed against.
     *
     * @var \ZendSearch\Lucene\SearchIndexInterface
     */
    private $_reader;

    /**
     * Term being weighted.
     *
     * @var \ZendSearch\Lucene\Index\Term
     */
    private $_term;

    /**
     * Query that owns the term.
     *
     * @var \ZendSearch\Lucene\Search\Query\AbstractQuery
     */
    private $_query;

    /**
     * Score factor (inverse document frequency of the term).
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight (idf * boost, later multiplied by the query norm).
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Stores the term, its owning query and the index reader; nothing is
     * computed until sumOfSquaredWeights() is called.
     *
     * @param \ZendSearch\Lucene\Index\Term                 $term
     * @param \ZendSearch\Lucene\Search\Query\AbstractQuery $query
     * @param \ZendSearch\Lucene\SearchIndexInterface       $reader
     */
    public function __construct(
        Index\Term $term,
        Query\AbstractQuery $query,
        Lucene\SearchIndexInterface $reader
    ) {
        $this->_reader = $reader;
        $this->_query  = $query;
        $this->_term   = $term;
    }

    /**
     * Computes the term idf and the query weight (idf * boost), and returns
     * the squared query weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $similarity = $this->_reader->getSimilarity();

        $this->_idf         = $similarity->idf($this->_term, $this->_reader);
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        $weight = $this->_queryWeight;

        return $weight * $weight;
    }

    /**
     * Records the normalization factor, applies it to the query weight and
     * derives the final value (normalized query weight * idf).
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm   = $queryNorm;
        $this->_queryWeight = $this->_queryWeight * $queryNorm;
        $this->_value       = $this->_queryWeight * $this->_idf;
    }
}

View File

@@ -1,322 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene;
/**
* @category Zend
* @package Zend_Search_Lucene
*/
interface SearchIndexInterface extends Index\TermsStreamInterface
{
// Contract for a Lucene index: format/generation accessors, document counts,
// indexing options, query execution, term lookups and document add/delete.
/**
* Get current generation number
*
* Returns generation number
* 0 means pre-2.1 index format
* -1 means there are no segments files.
*
* @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory
* @return integer
* @throws \ZendSearch\Lucene\Exception\ExceptionInterface
*/
public static function getActualGeneration(Storage\Directory\DirectoryInterface $directory);
/**
* Get segments file name
*
* @param integer $generation
* @return string
*/
public static function getSegmentFileName($generation);
/**
* Get index format version
*
* @return integer
*/
public function getFormatVersion();
/**
* Set index format version.
* Index is converted to this format at the nearest update time
*
* @param int $formatVersion
* @throws \ZendSearch\Lucene\Exception\ExceptionInterface
*/
public function setFormatVersion($formatVersion);
/**
* Returns the ZendSearch\Lucene\Storage\Directory\DirectoryInterface instance for this index.
*
* @return \ZendSearch\Lucene\Storage\Directory\DirectoryInterface
*/
public function getDirectory();
/**
* Returns the total number of documents in this index (including deleted documents).
*
* @return integer
*/
public function count();
/**
* Returns one greater than the largest possible document number.
* This may be used to, e.g., determine how big to allocate a structure which will have
* an element for every document number in an index.
*
* @return integer
*/
public function maxDoc();
/**
* Returns the total number of non-deleted documents in this index.
*
* @return integer
*/
public function numDocs();
/**
* Checks whether a document is deleted
*
* @param integer $id
* @return boolean
* @throws \ZendSearch\Lucene\Exception\ExceptionInterface Exception is thrown if $id is out of the range
*/
public function isDeleted($id);
/**
* Retrieve index maxBufferedDocs option
*
* maxBufferedDocs is the minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* @return integer
*/
public function getMaxBufferedDocs();
/**
* Set index maxBufferedDocs option
*
* maxBufferedDocs is the minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* @param integer $maxBufferedDocs
*/
public function setMaxBufferedDocs($maxBufferedDocs);
/**
* Retrieve index maxMergeDocs option
*
* maxMergeDocs is the largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* @return integer
*/
public function getMaxMergeDocs();
/**
* Set index maxMergeDocs option
*
* maxMergeDocs is the largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* @param integer $maxMergeDocs
*/
public function setMaxMergeDocs($maxMergeDocs);
/**
* Retrieve index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* @return integer
*/
public function getMergeFactor();
/**
* Set index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* @param integer $mergeFactor
*/
public function setMergeFactor($mergeFactor);
/**
* Performs a query against the index and returns an array
* of \ZendSearch\Lucene\Search\QueryHit objects.
* Input is a query string or a query object.
*
* @param mixed $query
* @return \ZendSearch\Lucene\Search\QueryHit[]
* @throws \ZendSearch\Lucene\Exception\ExceptionInterface
*/
public function find($query);
/**
* Returns a list of all unique field names that exist in this index.
*
* @param boolean $indexed
* @return array
*/
public function getFieldNames($indexed = false);
/**
* Returns a \ZendSearch\Lucene\Document object for the document
* number $id in this index.
*
* @param integer|\ZendSearch\Lucene\Search\QueryHit $id
* @return \ZendSearch\Lucene\Document
*/
public function getDocument($id);
/**
* Returns true if the index contains documents with the specified term.
*
* Is used for query optimization.
*
* @param \ZendSearch\Lucene\Index\Term $term
* @return boolean
*/
public function hasTerm(Index\Term $term);
/**
* Returns IDs of all the documents containing term.
*
* @param \ZendSearch\Lucene\Index\Term $term
* @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
* @return array
*/
public function termDocs(Index\Term $term, $docsFilter = null);
/**
* Returns documents filter for all documents containing term.
*
* It performs the same operation as termDocs, but returns the result as
* a \ZendSearch\Lucene\Index\DocsFilter object
*
* @param \ZendSearch\Lucene\Index\Term $term
* @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
* @return \ZendSearch\Lucene\Index\DocsFilter
*/
public function termDocsFilter(Index\Term $term, $docsFilter = null);
/**
* Returns an array of all term freqs.
* Return array structure: array( docId => freq, ...)
*
* @param \ZendSearch\Lucene\Index\Term $term
* @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
* @return array
*/
public function termFreqs(Index\Term $term, $docsFilter = null);
/**
* Returns an array of all term positions in the documents.
* Return array structure: array( docId => array( pos1, pos2, ...), ...)
*
* @param \ZendSearch\Lucene\Index\Term $term
* @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter
* @return array
*/
public function termPositions(Index\Term $term, $docsFilter = null);
/**
* Returns the number of documents in this index containing the $term.
*
* @param \ZendSearch\Lucene\Index\Term $term
* @return integer
*/
public function docFreq(Index\Term $term);
/**
* Retrieve similarity used by index reader
*
* @return \ZendSearch\Lucene\Search\Similarity\AbstractSimilarity
*/
public function getSimilarity();
/**
* Returns a normalization factor for "field, document" pair.
*
* @param integer $id
* @param string $fieldName
* @return float
*/
public function norm($id, $fieldName);
/**
* Returns true if any documents have been deleted from this index.
*
* @return boolean
*/
public function hasDeletions();
/**
* Deletes a document from the index.
* $id is an internal document id
*
* @param integer|\ZendSearch\Lucene\Search\QueryHit $id
* @throws \ZendSearch\Lucene\Exception\ExceptionInterface
*/
public function delete($id);
/**
* Adds a document to this index.
*
* @param \ZendSearch\Lucene\Document $document
*/
public function addDocument(Document $document);
/**
* Commit changes resulting from delete() or undeleteAll() operations.
*/
public function commit();
/**
* Optimize index.
*
* Merges all segments into one
*/
public function optimize();
/**
* Returns an array of all terms in this index.
*
* @return array
*/
public function terms();
/**
* Undeletes all documents currently marked as deleted in this index.
*/
public function undeleteAll();
}

View File

@@ -1,121 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Storage\Directory;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
*/
interface DirectoryInterface
{
// Storage abstraction for the set of files that make up an index:
// listing, creating, deleting, renaming and opening files by name.
/**
* Closes the store.
*
* @return void
*/
public function close();
/**
* Returns an array of strings, one for each file in the directory.
*
* @return array
*/
public function fileList();
/**
* Creates a new, empty file in the directory with the given $filename.
*
* @param string $filename
* @return \ZendSearch\Lucene\Storage\File\FileInterface
*/
public function createFile($filename);
/**
* Removes an existing $filename in the directory.
*
* @param string $filename
* @return void
*/
public function deleteFile($filename);
/**
* Purges the file if it is cached by the directory object.
*
* This method is used to prevent the 'too many open files' error.
*
* @param string $filename
* @return void
*/
public function purgeFile($filename);
/**
* Returns true if a file with the given $filename exists.
*
* @param string $filename
* @return boolean
*/
public function fileExists($filename);
/**
* Returns the length of a $filename in the directory.
*
* @param string $filename
* @return integer
*/
public function fileLength($filename);
/**
* Returns the UNIX timestamp $filename was last modified.
*
* @param string $filename
* @return integer
*/
public function fileModified($filename);
/**
* Renames an existing file in the directory.
*
* @param string $from
* @param string $to
* @return void
*/
public function renameFile($from, $to);
/**
* Sets the modified time of $filename to now.
*
* @param string $filename
* @return void
*/
public function touchFile($filename);
/**
* Returns a \ZendSearch\Lucene\Storage\File\FileInterface object for a given $filename in the directory.
*
* If the $shareHandler option is true, the file handler can be shared between File object
* requests. This speeds things up, but causes problems with the file position.
* Shared handlers are good for short atomic requests.
* Non-shared handlers are useful for stream file reading (especially for compound files).
*
* @param string $filename
* @param boolean $shareHandler
* @return \ZendSearch\Lucene\Storage\File\FileInterface
*/
public function getFileObject($filename, $shareHandler = true);
}

View File

@@ -1,341 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Storage\Directory;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Storage\Directory;
use ZendSearch\Lucene\Storage\File;
use Laminas\Stdlib\ErrorHandler;
/**
* FileSystem implementation of DirectoryInterface abstraction.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
*/
class Filesystem implements DirectoryInterface
{
/**
* Filesystem path to the directory
*
* @var string
*/
protected $_dirPath = null;
/**
* Cache for Zend_Search_Lucene_Storage_File_Filesystem objects
* Array: filename => Zend_Search_Lucene_Storage_File object
*
* @var array
*/
protected $_fileHandlers;
/**
* Default file permissions
*
* @var integer
*/
protected static $_defaultFilePermissions = 0666;
/**
 * Returns the permission mode applied to files created by createFile().
 *
 * @return integer
 */
public static function getDefaultFilePermissions()
{
    $mode = self::$_defaultFilePermissions;

    return $mode;
}
/**
 * Changes the permission mode applied to files created by createFile().
 *
 * The value is stored statically, so it affects every Filesystem instance.
 *
 * @param integer $mode
 */
public static function setDefaultFilePermissions($mode)
{
    self::$_defaultFilePermissions = $mode;
}
/**
 * Creates a directory together with any missing parent directories.
 *
 * @param string  $dir       Directory to create
 * @param integer $mode      Mode passed to mkdir() for every created level
 * @param boolean $recursive Accepted for backward compatibility; parents
 *                           are always created regardless of its value
 * @return boolean true if the directory exists when the call returns
 */
public static function mkdirs($dir, $mode = 0777, $recursive = true)
{
    if (($dir === null) || $dir === '') {
        return false;
    }
    if (is_dir($dir) || $dir === '/') {
        return true;
    }

    $parent = dirname($dir);
    if ($parent === $dir) {
        // dirname() made no progress (e.g. a non-existent drive root);
        // recursing again would never terminate.
        return false;
    }

    if (!self::mkdirs($parent, $mode, $recursive)) {
        return false;
    }

    // Another process may create the directory between the is_dir() check
    // and mkdir(); treat an already existing directory as success.
    return mkdir($dir, $mode) || is_dir($dir);
}
/**
 * Binds the object to $path, creating the directory when it is missing.
 *
 * @param string $path
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if $path
 *         exists but is not a directory, or cannot be created
 */
public function __construct($path)
{
    $isDirectory = is_dir($path);

    if (!$isDirectory && file_exists($path)) {
        throw new Lucene\Exception\InvalidArgumentException(
            'Path exists, but it\'s not a directory'
        );
    }

    if (!$isDirectory && !self::mkdirs($path)) {
        throw new Lucene\Exception\InvalidArgumentException(
            "Can't create directory '$path'."
        );
    }

    $this->_fileHandlers = array();
    $this->_dirPath      = $path;
}
/**
 * Closes every cached file object and empties the handler cache.
 *
 * @return void
 */
public function close()
{
    $handlers = $this->_fileHandlers;

    foreach ($handlers as $handler) {
        $handler->close();
    }

    $this->_fileHandlers = array();
}
/**
 * Returns an array of strings, one for each file in the directory.
 *
 * Sub-directories and the '.' / '..' entries are not reported.
 *
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the directory
 *         cannot be opened for reading
 * @return array
 */
public function fileList()
{
    $dirContent = opendir($this->_dirPath);
    if ($dirContent === false) {
        // Without this check `false` would be handed to readdir()/closedir().
        throw new Lucene\Exception\RuntimeException(
            "Can't read directory '" . $this->_dirPath . "'."
        );
    }

    $result = array();
    try {
        while (($file = readdir($dirContent)) !== false) {
            if ($file === '.' || $file === '..') {
                continue;
            }
            if (!is_dir($this->_dirPath . '/' . $file)) {
                $result[] = $file;
            }
        }
    } finally {
        // Release the directory handle even if the loop is interrupted
        closedir($dirContent);
    }

    return $result;
}
/**
 * Creates (or truncates) $filename in the directory and returns its file
 * object, which is also stored in the handler cache.
 *
 * @param string $filename
 * @return \ZendSearch\Lucene\Storage\File\FileInterface
 */
public function createFile($filename)
{
    $fullPath = $this->_dirPath . '/' . $filename;

    // Drop any previously cached handler for this name
    if (isset($this->_fileHandlers[$filename])) {
        $this->_fileHandlers[$filename]->close();
    }
    unset($this->_fileHandlers[$filename]);

    $fileObject = new File\Filesystem($fullPath, 'w+b');
    $this->_fileHandlers[$filename] = $fileObject;

    // Permission changes are best effort: the file may already have been
    // created by another user who is responsible for its permissions.
    ErrorHandler::start(E_WARNING);
    chmod($fullPath, self::$_defaultFilePermissions);
    ErrorHandler::stop();

    return $fileObject;
}
/**
 * Closes the cached handler for $filename (if any) and deletes the file.
 *
 * @param string $filename
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if unlink() fails
 * @return void
 */
public function deleteFile($filename)
{
    if (isset($this->_fileHandlers[$filename])) {
        $this->_fileHandlers[$filename]->close();
    }
    unset($this->_fileHandlers[$filename]);

    $deleted = @unlink($this->_dirPath . '/' . $filename);
    if ($deleted) {
        return;
    }

    $lastError = error_get_last();
    $reason    = $lastError['message'] ?? 'Unknown error';

    throw new Lucene\Exception\RuntimeException('Can\'t delete file: ' . $reason);
}
/**
 * Closes and forgets the cached handler for $filename, if there is one.
 *
 * Used to avoid the 'too many open files' error.
 *
 * @param string $filename
 * @return void
 */
public function purgeFile($filename)
{
    $isCached = isset($this->_fileHandlers[$filename]);

    if ($isCached) {
        $this->_fileHandlers[$filename]->close();
    }

    unset($this->_fileHandlers[$filename]);
}
/**
 * Tells whether $filename is known to the handler cache or exists on disk.
 *
 * @param string $filename
 * @return boolean
 */
public function fileExists($filename)
{
    if (isset($this->_fileHandlers[$filename])) {
        return true;
    }

    return file_exists($this->_dirPath . '/' . $filename);
}
/**
 * Returns the size of $filename, asking the cached file object when one
 * exists and the filesystem otherwise.
 *
 * @param string $filename
 * @return integer
 */
public function fileLength($filename)
{
    return isset($this->_fileHandlers[$filename])
        ? $this->_fileHandlers[$filename]->size()
        : filesize($this->_dirPath . '/' . $filename);
}
/**
 * Returns the last modification time of $filename as reported by filemtime().
 *
 * @param string $filename
 * @return integer
 */
public function fileModified($filename)
{
    $fullPath = $this->_dirPath . '/' . $filename;

    return filemtime($fullPath);
}
/**
 * Renames an existing file in the directory, replacing $to if it exists.
 *
 * Cached handlers for both names are closed and dropped first.
 *
 * @param string $from
 * @param string $to
 * @throws \ZendSearch\Lucene\Exception\RuntimeException if the existing
 *         target cannot be removed or the rename fails
 * @return boolean always true; failures are reported by exception
 */
public function renameFile($from, $to)
{
    if (isset($this->_fileHandlers[$from])) {
        $this->_fileHandlers[$from]->close();
    }
    unset($this->_fileHandlers[$from]);

    if (isset($this->_fileHandlers[$to])) {
        $this->_fileHandlers[$to]->close();
    }
    unset($this->_fileHandlers[$to]);

    $source = $this->_dirPath . '/' . $from;
    $target = $this->_dirPath . '/' . $to;

    if (file_exists($target)) {
        if (!unlink($target)) {
            throw new Lucene\Exception\RuntimeException(
                'Delete operation failed'
            );
        }
    }

    ErrorHandler::start(E_WARNING);
    $success = rename($source, $target);
    // The warning is intercepted by ErrorHandler, so it is not visible via
    // error_get_last(); stop() hands back the captured ErrorException.
    $error = ErrorHandler::stop();

    if (!$success) {
        $message = ($error !== null) ? $error->getMessage() : 'Unknown error';
        throw new Lucene\Exception\RuntimeException($message, 0, $error);
    }

    return $success;
}
/**
 * Sets the modification time of $filename to now via touch() and passes
 * touch()'s result back to the caller.
 *
 * @param string $filename
 * @return boolean
 */
public function touchFile($filename)
{
    $fullPath = $this->_dirPath . '/' . $filename;

    return touch($fullPath);
}
/**
 * Returns a file object for $filename in the directory.
 *
 * With $shareHandler enabled the object is cached and reused by later
 * calls (rewound to offset 0 on reuse); this is faster but the file
 * position is shared. With it disabled a fresh, uncached object is
 * returned on every call, which suits streamed reads.
 *
 * @param string  $filename
 * @param boolean $shareHandler
 * @return \ZendSearch\Lucene\Storage\File\FileInterface
 */
public function getFileObject($filename, $shareHandler = true)
{
    $fullFilename = $this->_dirPath . '/' . $filename;

    if (!$shareHandler) {
        return new File\Filesystem($fullFilename);
    }

    if (!isset($this->_fileHandlers[$filename])) {
        // First shared request: open and remember the handler
        $this->_fileHandlers[$filename] = new File\Filesystem($fullFilename);

        return $this->_fileHandlers[$filename];
    }

    // Reused handler: rewind so the caller starts at the beginning
    $shared = $this->_fileHandlers[$filename];
    $shared->seek(0);

    return $shared;
}
}

View File

@@ -1,399 +0,0 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/
namespace ZendSearch\Lucene\Storage\File;
use ZendSearch\Lucene;
use ZendSearch\Lucene\Storage\File;
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
*/
abstract class AbstractFile implements FileInterface
{
/**
 * Reads one byte at the current file position, advances the pointer and
 * returns the byte's ordinal value.
 *
 * @return integer
 */
public function readByte()
{
    $char = $this->_fread(1);

    return ord($char);
}
/**
 * Writes a single byte, given as its ordinal value, through _fwrite().
 *
 * @param integer $byte
 * @return mixed whatever _fwrite() returns
 */
public function writeByte($byte)
{
    $char = chr($byte);

    return $this->_fwrite($char, 1);
}
/**
 * Reads $num bytes at the current file position and advances the pointer.
 *
 * @param integer $num
 * @return string
 */
public function readBytes($num)
{
    $data = $this->_fread($num);

    return $data;
}
/**
 * Writes $num bytes of $data to the file; $num is handed to _fwrite()
 * unchanged, with null meaning "all of $data" per the original contract.
 *
 * @param string  $data
 * @param integer $num
 */
public function writeBytes($data, $num = null)
{
    $this->_fwrite($data, $num);
}
/**
 * Reads a 4-byte big-endian integer at the current file position and
 * advances the pointer. Returns 0 when nothing could be read.
 *
 * @return integer
 */
public function readInt()
{
    $bytes = $this->_fread(4);

    if ('' === $bytes) {
        return 0;
    }

    // Fold the bytes most-significant first
    $value = 0;
    for ($i = 0; $i < 4; $i++) {
        $value = ($value << 8) | ord($bytes[$i]);
    }

    return $value;
}
/**
 * Writes the low 32 bits of $value as a 4-byte big-endian integer.
 *
 * @param integer $value
 */
public function writeInt($value)
{
    settype($value, 'integer');

    $encoded = '';
    foreach (array(24, 16, 8, 0) as $shift) {
        $encoded .= chr($value >> $shift & 0xFF);
    }

    $this->_fwrite($encoded, 4);
}
/**
 * Reads an 8-byte big-endian integer at the current file position and
 * advances the pointer.
 *
 * On platforms whose integers are not wider than 4 bytes the work is
 * delegated to _readLong32Bit(), which may return a float.
 *
 * @return integer|float
 */
public function readLong()
{
    if (PHP_INT_SIZE <= 4) {
        return $this->_readLong32Bit();
    }

    $bytes = $this->_fread(8);

    // Fold the bytes most-significant first
    $value = 0;
    for ($i = 0; $i < 8; $i++) {
        $value = ($value << 8) | ord($bytes[$i]);
    }

    return $value;
}
/**
 * Writes $value as an 8-byte big-endian integer.
 *
 * On platforms whose integers are not wider than 4 bytes the work is
 * delegated to _writeLong32Bit().
 *
 * @param integer $value
 */
public function writeLong($value)
{
    if (PHP_INT_SIZE <= 4) {
        $this->_writeLong32Bit($value);
        return;
    }

    settype($value, 'integer');

    $encoded = '';
    foreach (array(56, 48, 40, 32, 24, 16, 8, 0) as $shift) {
        $encoded .= chr($value >> $shift & 0xFF);
    }

    $this->_fwrite($encoded, 8);
}
/**
* Returns a long integer from the current position in the file,
* advances the file pointer and returns it as float when it does not fit
* an integer (implementation for 32-bit platforms).
*
* The value is read as two consecutive readInt() words, high word first.
*
* @throws \ZendSearch\Lucene\Exception\RuntimeException if the high word has
*         its top bit set and the pair is not (0xFFFFFFFF, low word with top bit set)
* @return integer|float
*/
protected function _readLong32Bit()
{
$wordHigh = $this->readInt();
$wordLow = $this->readInt();
// NOTE(review): the (int)0x80000000 / (int)0xFFFFFFFF casts presumably rely on
// 32-bit integer wrap-around to produce the sign-bit mask and -1 - confirm.
if ($wordHigh & (int)0x80000000) {
// It's a negative value since the highest bit is set
if ($wordHigh == (int)0xFFFFFFFF && ($wordLow & (int)0x80000000)) {
// High word is only sign extension: the low word alone is the value
return $wordLow;
} else {
throw new Lucene\Exception\RuntimeException(
'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.'
);
}
}
if ($wordLow < 0) {
// Value is larger than 0x7FFF FFFF. Represent low word as float:
// clear the sign bit, then add 0x80000000 back as a float.
$wordLow &= 0x7FFFFFFF;
$wordLow += (float)0x80000000;
}
if ($wordHigh == 0) {
// Return value as integer if possible
return $wordLow;
}
// Combine both words in floating point: high * 2^32 + low
return $wordHigh*(float)0x100000000/* 0x00000001 00000000 */ + $wordLow;
}
/**
* Writes long integer to the end of file (32-bit platforms implementation)
*
* The value is split into a high and a low word, each emitted with writeInt(),
* high word first.
*
* @param integer|float $value
* @throws \ZendSearch\Lucene\Exception\RuntimeException if $value is lower than (int)0x80000000
*/
protected function _writeLong32Bit($value)
{
// NOTE(review): (int)0x80000000 is presumably -2147483648 on the 32-bit
// platforms this method targets - confirm.
if ($value < (int)0x80000000) {
throw new Lucene\Exception\RuntimeException(
'Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.'
);
}
if ($value < 0) {
// Negative value: high word is (int)0xFFFFFFFF, low word carries the value itself
$wordHigh = (int)0xFFFFFFFF;
$wordLow = (int)$value;
} else {
// Non-negative value: split with float arithmetic (divide by / subtract multiples of 2^32)
$wordHigh = (int)($value/(float)0x100000000/* 0x00000001 00000000 */);
$wordLow = $value - $wordHigh*(float)0x100000000/* 0x00000001 00000000 */;
if ($wordLow > 0x7FFFFFFF) {
// Highest bit of low word is set. Translate it to the corresponding negative integer value
$wordLow -= 0x80000000;
$wordLow |= 0x80000000;
}
}
$this->writeInt($wordHigh);
$this->writeInt($wordLow);
}
/**
 * Reads a variable-length integer at the current file position and
 * advances the pointer.
 *
 * Each byte contributes its low 7 bits, least-significant group first;
 * a set high bit means another byte follows.
 *
 * @return integer
 */
public function readVInt()
{
    $byte  = ord($this->_fread(1));
    $value = $byte & 0x7F;
    $shift = 7;

    while (($byte & 0x80) != 0) {
        $byte   = ord($this->_fread(1));
        $value |= ($byte & 0x7F) << $shift;
        $shift += 7;
    }

    return $value;
}
/**
 * Writes $value as a variable-length integer: 7 bits per byte,
 * least-significant group first, with the high bit set on every byte
 * except the last.
 *
 * @param integer $value
 */
public function writeVInt($value)
{
    settype($value, 'integer');

    for (; $value > 0x7F; $value >>= 7) {
        $continuationByte = ($value & 0x7F) | 0x80;
        $this->_fwrite(chr($continuationByte));
    }

    $this->_fwrite(chr($value));
}
/**
 * Reads a string from the current position in the file
 * and advances the file pointer.
 *
 * The stored length prefix counts characters, not bytes, so the method
 * first reads one byte per character and then fetches the continuation
 * bytes of every multi-byte sequence it encounters.
 *
 * This implementation supports only Basic Multilingual Plane (BMP)
 * characters (0x0000 to 0xFFFF). Java 2 represents supplementary
 * characters as surrogate pairs that are each encoded as three UTF-8
 * bytes, whereas standard UTF-8 uses four bytes for them.
 *
 * The two-byte sequence 0xC0 0x80 (Java 2 encoding of the null character)
 * is decoded to a single "\x00" byte.
 *
 * @return string
 */
public function readString()
{
    $strlen = $this->readVInt();
    if ($strlen == 0) {
        return '';
    }

    $str_val = $this->_fread($strlen);

    // $strlen tracks the expected byte length of $str_val from here on
    for ($count = 0; $count < $strlen; $count++) {
        $lead = ord($str_val[$count]);
        if (($lead & 0xC0) != 0xC0) {
            // Single-byte character (or stray continuation byte)
            continue;
        }

        $addBytes = 1;
        if ($lead & 0x20) {
            $addBytes++;
            // Never used. Java2 doesn't encode strings in four bytes
            if ($lead & 0x10) {
                $addBytes++;
            }
        }
        $str_val .= $this->_fread($addBytes);
        $strlen  += $addBytes;

        // Check for null character. Java2 encodes null character in two bytes.
        if ($lead == 0xC0 && ord($str_val[$count + 1]) == 0x80) {
            // Assigning the integer 0 to a string offset would store the
            // digit "0"; splice in a real NUL byte instead.
            $str_val = substr($str_val, 0, $count) . "\x00" . substr($str_val, $count + 2);
            // Two bytes collapsed into one: keep the length in sync and let
            // the loop increment move on to the byte that follows.
            $strlen--;
            continue;
        }

        $count += $addBytes;
    }

    return $str_val;
}

/**
 * Writes a string to the end of file.
 *
 * The string is expected to be UTF-8 already; the method only computes
 * the character count for the length prefix and replaces every "\x00"
 * byte with the two-byte Java 2 form 0xC0 0x80.
 *
 * This implementation supports only Basic Multilingual Plane (BMP)
 * characters (0x0000 to 0xFFFF); supplementary characters are not
 * supported.
 *
 * @param string $str
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException if the
 *         computed character count is negative (malformed UTF-8)
 */
public function writeString($str)
{
    // convert input to a string before iterating string characters
    settype($str, 'string');

    $chars = $strlen = strlen($str);

    for ($count = 0; $count < $strlen; $count++) {
        $lead = ord($str[$count]);
        if (($lead & 0xC0) != 0xC0) {
            continue;
        }

        // Lead byte of a multi-byte sequence: its continuation bytes do not
        // count as characters of their own.
        $addBytes = 1;
        if ($lead & 0x20) {
            $addBytes++;
            // Never used. Java2 doesn't encode strings in four bytes
            // and we don't support non-BMP characters
            if ($lead & 0x10) {
                $addBytes++;
            }
        }
        $chars -= $addBytes;
        $count += $addBytes;
    }

    if ($chars < 0) {
        throw new Lucene\Exception\InvalidArgumentException('Invalid UTF-8 string');
    }

    $this->writeVInt($chars);

    // A NUL byte is one character either way, so the length prefix written
    // above stays valid. str_replace() is a no-op when there is no NUL.
    $this->_fwrite(str_replace("\x00", "\xC0\x80", $str));
}
/**
 * Reads a binary blob at the current file position and advances the
 * pointer: a variable-length integer gives the byte count, followed by
 * that many raw bytes.
 *
 * @return string
 */
public function readBinary()
{
    $length = $this->readVInt();

    return $this->_fread($length);
}
}

Some files were not shown because too many files have changed in this diff Show More