Subversion-Projekte lars-tiefland.prado

Revision

Blame | Letzte Änderung | Log anzeigen | RSS feed

<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';


/**
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_SegmentInfo
{
    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * Term Dictionary Index
     * Array of the Zend_Search_Lucene_Index_Term objects
     * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Term Dictionary Index TermInfos
     * Array of the Zend_Search_Lucene_Index_TermInfo objects
     *
     * @var array
     */
    private $_termDictionaryInfos;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    private $_fields;

    /**
     * Field positions in a dictionary.
     * (Term dictionary contains filelds ordered by names)
     *
     * @var array
     */
    private $_fieldsDicPositions;


    /**
     * Associative array where the key is the file name and the value is data offset
     * in a compound segment file (.csf).
     *
     * @var array
     */
    private $_segFiles;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory_Filesystem
     */
    private $_directory;

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    private $_norms = array();

    /**
     * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
     * Documents count and Directory as a parameter.
     *
     * @param string $name
     * @param integer $docCount
     * @param Zend_Search_Lucene_Storage_Directory $directory
     */
    public function __construct($name, $docCount, $directory)
    {
        $this->_name = $name;
        $this->_docCount = $docCount;
        $this->_directory = $directory;
        $this->_termDictionary = null;

        $this->_segFiles = array();
        $cfsFile = $this->_directory->getFileObject($name . '.cfs');
        $segFilesCount = $cfsFile->readVInt();

        for ($count = 0; $count < $segFilesCount; $count++) {
            $dataOffset = $cfsFile->readLong();
            $fileName = $cfsFile->readString();
            $this->_segFiles[$fileName] = $dataOffset;
        }

        $fnmFile = $this->openCompoundFile('.fnm');
        $fieldsCount = $fnmFile->readVInt();
        $fieldNames = array();
        $fieldNums  = array();
        $this->_fields = array();
        for ($count=0; $count < $fieldsCount; $count++) {
            $fieldName = $fnmFile->readString();
            $fieldBits = $fnmFile->readByte();
            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $fieldBits & 1,
                                                                            $count,
                                                                            $fieldBits & 2 );
            if ($fieldBits & 0x10) {
                // norms are omitted for the indexed field
                $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
            }

            $fieldNums[$count]  = $count;
            $fieldNames[$count] = $fieldName;
        }
        array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
        $this->_fieldsDicPositions = array_flip($fieldNums);
    }

    /**
     * Opens index file stoted within compound index file
     *
     * @param string $extension
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Storage_File
     */
    public function openCompoundFile($extension)
    {
        $filename = $this->_name . $extension;

        if( !isset($this->_segFiles[ $filename ]) ) {
            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                       . $filename . ' file.' );
        }

        $file = $this->_directory->getFileObject( $this->_name.".cfs" );
        $file->seek( $this->_segFiles[ $filename ] );
        return $file;
    }

    /**
     * Returns field index or -1 if field is not found
     *
     * @param string $fieldName
     * @return integer
     */
    public function getFieldNum($fieldName)
    {
        foreach( $this->_fields as $field ) {
            if( $field->name == $fieldName ) {
                return $field->number;
            }
        }

        return -1;
    }

    /**
     * Returns field info for specified field
     *
     * @param integer $fieldNum
     * @return ZSearchFieldInfo
     */
    public function getField($fieldNum)
    {
        return $this->_fields[$fieldNum];
    }

    /**
     * Returns array of fields.
     * if $indexed parameter is true, then returns only indexed fields.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFields($indexed = false)
    {
        $result = array();
        foreach( $this->_fields as $field ) {
            if( (!$indexed) || $field->isIndexed ) {
                $result[ $field->name ] = $field->name;
            }
        }
        return $result;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }


    /**
     * Loads Term dictionary from TermInfoIndex file
     */
    protected function _loadDictionary()
    {
        if ($this->_termDictionary !== null) {
            return;
        }

        $this->_termDictionary = array();
        $this->_termDictionaryInfos = array();

        $tiiFile = $this->openCompoundFile('.tii');
        $tiVersion = $tiiFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        $indexTermCount = $tiiFile->readLong();
                          $tiiFile->readInt();  // IndexInterval
        $skipInterval   = $tiiFile->readInt();

        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            $termPrefixLength = $tiiFile->readVInt();
            $termSuffix       = $tiiFile->readString();
            $termValue        = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix;

            $termFieldNum     = $tiiFile->readVInt();
            $docFreq          = $tiiFile->readVInt();
            $freqPointer     += $tiiFile->readVInt();
            $proxPointer     += $tiiFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipDelta = $tiiFile->readVInt();
            } else {
                $skipDelta = 0;
            }

            $indexPointer += $tiiFile->readVInt();

            $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum);
            $this->_termDictionaryInfos[] =
                new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
            $prevTerm = $termValue;
        }
    }


    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }


    /**
     * Scans terms dictionary and returns term info
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return Zend_Search_Lucene_Index_TermInfo
     */
    public function getTermInfo($term)
    {
        $this->_loadDictionary();

        $searchField = $this->getFieldNum($term->field);

        if ($searchField == -1) {
            return null;
        }
        $searchDicField = $this->_fieldsDicPositions[$searchField];

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            // $mid = ($highIndex - $lowIndex)/2;
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field];
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm->text);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                return $this->_termDictionaryInfos[$mid]; // We got it!
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        $prevPosition = $highIndex;
        $prevTerm = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ];

        $tisFile = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();

        $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm->text;
        $termFieldNum = $prevTerm->field;
        $freqPointer = $prevTermInfo->freqPointer;
        $proxPointer = $prevTermInfo->proxPointer;
        for ($count = $prevPosition*$indexInterval + 1;
             $count < $termCount &&
             ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField ||
              ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = substr( $termValue, 0, $termPrefixLength ) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        if ($termFieldNum == $searchField && $termValue == $term->text) {
            return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            return null;
        }
    }

    /**
     * Returns normalization factor for specified documents
     *
     * @param integer $id
     * @param string $fieldName
     * @return string
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ( !($this->_fields[$fieldNum]->isIndexed) ) {
            return null;
        }

        if ( !isset( $this->_norms[$fieldNum] )) {
            $fFile = $this->openCompoundFile('.f' . $fieldNum);
            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
        }

        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) );
    }
}