Subversion-Projekte lars-tiefland.prado

Revision

Blame | Letzte Änderung | Log anzeigen | RSS feed

<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';

/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';


/**
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * Every $indexInterval-th term of the .tis file gets an entry in the
     * .tii file (see _dumpDictionary()).
     *
     * @var integer
     */
    static public $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more
     * accelerable cases. More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data; that is the value
     * set here, because skip data is not written yet (see the todo in
     * _dumpDictionary()).
     * NOTE(review): the original comment also said "Default value is 16" -
     * presumably a reference to the Java Lucene default; confirm.
     *
     * @var integer
     */
    static public $skipInterval = 0x7FFFFFFF;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory;

    /**
     * List of the index files (file names).
     * Used for automatic compound file generation
     *
     * @var array
     */
    private $_files;

    /**
     * Term Dictionary
     * Array of the Zend_Search_Lucene_Index_Term objects, indexed by term key
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Documents, which contain the term
     * Array termKey => array(docId => array of term positions)
     *
     * @var array
     */
    private $_termDocs;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    private $_fields;

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * Only initialized by this class for now; norms are never written
     * (see the 0x10 flag in _dumpFNM()).
     *
     * @var array
     */
    private $_norms;


    /**
     * '.fdx'  file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdxFile;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdtFile;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
        $this->_docCount  = 0;

        $this->_fields   = array();
        $this->_termDocs = array();
        $this->_files    = array();
        $this->_norms    = array();

        // Must be an array even if no indexed field is ever added:
        // _dumpDictionary() calls count() and array_keys() on it.
        $this->_termDictionary = array();

        $this->_fdxFile = null;
        $this->_fdtFile = null;
    }


    /**
     * Add field to the segment
     *
     * A new field gets the next free field number (the current field count).
     * For an already known field the "indexed" and "store term vector"
     * flags are OR-ed with the flags of the given field.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    private function _addFieldInfo(Zend_Search_Lucene_Field $field)
    {
        if (!isset($this->_fields[$field->name])) {
            $this->_fields[$field->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                                       $field->isIndexed,
                                                                       count($this->_fields),
                                                                       $field->storeTermVector);
        } else {
            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
        }
    }


    /**
     * Adds a document to this segment.
     *
     * Indexed fields are tokenized (or taken as a single token) and their
     * terms and positions are collected in memory; stored fields are written
     * to the .fdx/.fdt files immediately.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $storedFields = array();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);
            $this->_addFieldInfo($field);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
                } else {
                    // Untokenized field: the whole value is a single token
                    $tokenList = array();
                    $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
                }

                $position = 0;
                foreach ($tokenList as $token) {
                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                    $termKey = $term->key();

                    if (!isset($this->_termDictionary[$termKey])) {
                        // New term
                        $this->_termDictionary[$termKey] = $term;
                        $this->_termDocs[$termKey] = array();
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                        // Existing term, but new term entry
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    }
                    $position += $token->getPositionIncrement();
                    $this->_termDocs[$termKey][$this->_docCount][] = $position;
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }
        }

        if (count($storedFields) != 0) {
            if (!isset($this->_fdxFile)) {
                // Stored field files are created lazily, on the first stored field
                $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
                $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

                $this->_files[] = $this->_name . '.fdx';
                $this->_files[] = $this->_name . '.fdt';
            }

            // .fdx entry: offset of this document's data within .fdt
            $this->_fdxFile->writeLong($this->_fdtFile->tell());

            $this->_fdtFile->writeVInt(count($storedFields));
            foreach ($storedFields as $field) {
                $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

                // Each ternary must be parenthesized: '|' binds tighter than
                // '?:', so the unparenthesized form produced a wrong flag byte
                // (and is a compile error in PHP 8).
                $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                             ($field->isBinary    ? 0x02 : 0x00);
                             /* 0x04 - third bit, compressed (ZLIB), is never set */
                $this->_fdtFile->writeByte($fieldBits);

                if ($field->isBinary) {
                    $this->_fdtFile->writeVInt(strlen($field->stringValue));
                    $this->_fdtFile->writeBytes($field->stringValue);
                } else {
                    $this->_fdtFile->writeString($field->stringValue);
                }
            }
        }

        $this->_docCount++;
    }


    /**
     * Dump Field Info (.fnm) segment file
     *
     * Writes the field count followed by name and flag byte of each field.
     */
    private function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            // 0x04 (term positions are stored with the term vectors) and
            // 0x08 (term offsets are stored with the term vectors) are not
            // supported yet and never set.
            // 0x10 (norms are omitted for the indexed field) is always set:
            // this class does not write norms.
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00) |
                                0x10
                               );
        }

        $this->_files[] = $this->_name . '.fnm';
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is written as (shared prefix length, suffix) relative to
     * the previous term of the same field; file pointers are written as
     * deltas relative to the previous term info. $prevTerm and $prevTermInfo
     * are passed by reference and updated to $term and $termInfo.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term|null $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo|null $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Length (in bytes) of the prefix shared with the previous term.
            // Square brackets are used for string offsets: the '{}' form was
            // removed in PHP 8.
            $maxPrefixLength = min(strlen($prevTerm->text), strlen($term->text));
            $prefixLength = 0;
            while ($prefixLength < $maxPrefixLength &&
                   $prevTerm->text[$prefixLength] === $term->text[$prefixLength]
                  ) {
                $prefixLength++;
            }
            // Write prefix length
            $dicFile->writeVInt($prefixLength);
            // Write suffix
            $dicFile->writeString( substr($term->text, $prefixLength) );
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - written only when it's not 0; _dumpDictionary()
        // sets it when $termInfo->docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }

    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Also writes the term frequencies (.frq) and term positions (.prx)
     * files, since the dictionary entries point into them.
     */
    private function _dumpDictionary()
    {
        $dictionarySize = count($this->_termDictionary);

        $tisFile = $this->_directory->createFile($this->_name . '.tis');
        $tisFile->writeInt((int)0xFFFFFFFE);
        $tisFile->writeLong($dictionarySize);
        $tisFile->writeInt(self::$indexInterval);
        $tisFile->writeInt(self::$skipInterval);

        // One index entry is written for terms 0, $indexInterval,
        // 2*$indexInterval, ... - i.e. ceil(size / interval) entries, which
        // is 0 (not 1) for an empty dictionary.
        $tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $tiiFile->writeInt((int)0xFFFFFFFE);
        $tiiFile->writeLong((int)ceil($dictionarySize / self::$indexInterval));
        $tiiFile->writeInt(self::$indexInterval);
        $tiiFile->writeInt(self::$skipInterval);

        $frqFile = $this->_directory->createFile($this->_name . '.frq');
        $prxFile = $this->_directory->createFile($this->_name . '.prx');

        $termKeys = array_keys($this->_termDictionary);
        sort($termKeys, SORT_STRING);

        $termCount = 0;

        $prevTerm     = null;
        $prevTermInfo = null;
        $prevIndexTerm     = null;
        $prevIndexTermInfo = null;
        $prevIndexPosition = 0;

        foreach ($termKeys as $termId) {
            $freqPointer = $frqFile->tell();
            $proxPointer = $prxFile->tell();

            $prevDoc = 0;
            foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
                // Doc delta is shifted left by one bit; the low bit set means
                // "frequency is 1", otherwise the frequency follows.
                $docDelta = ($docId - $prevDoc)*2;
                $prevDoc = $docId;
                if (count($termPositions) > 1) {
                    $frqFile->writeVInt($docDelta);
                    $frqFile->writeVInt(count($termPositions));
                } else {
                    $frqFile->writeVInt($docDelta + 1);
                }

                // Positions are delta-encoded within each document
                $prevPosition = 0;
                foreach ($termPositions as $position) {
                    $prxFile->writeVInt($position - $prevPosition);
                    $prevPosition = $position;
                }
            }

            if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
                /**
                 * @todo Write Skip Data to a freq file.
                 * It's not used now, but must be implemented to be compatible with Lucene
                 */
                $skipOffset = $frqFile->tell() - $freqPointer;
            } else {
                $skipOffset = 0;
            }

            // In the dictionary files the term refers to its field by number,
            // not by name
            $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
                                                      $this->_fields[$this->_termDictionary[$termId]->field]->number);
            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
                                            $freqPointer, $proxPointer, $skipOffset);

            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);

            if ($termCount % self::$indexInterval == 0) {
                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);

                // Index delta: distance in .tis from the previous index entry
                $indexPosition = $tisFile->tell();
                $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
                $prevIndexPosition = $indexPosition;
            }
            $termCount++;
        }

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';
    }


    /**
     * Generate compound index file
     *
     * Packs all files listed in $_files into '<segment name>.cfs' and
     * deletes the individual files from the directory.
     */
    private function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // First pass: write the file table with placeholder data offsets
        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        // Second pass: append file contents and patch the real offsets in
        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);
            $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName)));

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written if no document has been added.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null null if the segment contains no documents
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();

        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
                                                        $this->_docCount,
                                                        $this->_directory);
    }

}