Blame | Letzte Änderung | Log anzeigen | RSS feed
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';

/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';


/**
 * Collects documents in memory and writes them out as a single index segment.
 *
 * Stored field data (.fdx/.fdt) is written while documents are added; the
 * field infos (.fnm), term dictionary (.tis/.tii) and postings (.frq/.prx)
 * are written by close(), which then packs every file into one compound
 * (.cfs) file.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM. Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower. Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more accelerable cases.
     * More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that skip data is not used
     * (the value usually used by Lucene is 16).
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory;

    /**
     * List of the index file names.
     * Used for automatic compound file generation
     *
     * @var array
     */
    private $_files;

    /**
     * Term Dictionary
     * Array of Zend_Search_Lucene_Index_Term objects, indexed by term key
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Documents, which contain the term.
     * termKey => array(docId => array of term positions)
     *
     * @var array
     */
    private $_termDocs;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    private $_fields;

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    private $_norms;

    /**
     * '.fdx' file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdxFile;

    /**
     * '.fdt' file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdtFile;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
        $this->_docCount  = 0;

        $this->_fields         = array();
        // Must be an array even when no indexed field is ever added:
        // _dumpDictionary() counts and iterates it unconditionally.
        $this->_termDictionary = array();
        $this->_termDocs       = array();
        $this->_files          = array();
        $this->_norms          = array();

        $this->_fdxFile = null;
        $this->_fdtFile = null;
    }


    /**
     * Add field to the segment
     *
     * A new field gets the next free field number; for an already known
     * field the "indexed" and "store term vector" flags are merged in.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    private function _addFieldInfo(Zend_Search_Lucene_Field $field)
    {
        if (!isset($this->_fields[$field->name])) {
            $this->_fields[$field->name] =
                new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                       $field->isIndexed,
                                                       count($this->_fields),
                                                       $field->storeTermVector);
        } else {
            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
        }
    }


    /**
     * Adds a document to this segment.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if any field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        // Validate every field before touching the segment state, so that a
        // rejected document does not leave partially indexed data behind.
        $fields = array();
        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            $fields[] = $field;
        }

        $storedFields = array();
        foreach ($fields as $field) {
            $this->_addFieldInfo($field);

            if ($field->isIndexed) {
                $this->_indexField($field);
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }
        }

        // An entry is written for every document, even one without stored
        // fields: the .fdx file is addressed by document number, so skipping
        // a document would shift the entries of all following documents.
        $this->_writeStoredFields($storedFields);

        $this->_docCount++;
    }


    /**
     * Adds the terms of an indexed field of the current document
     * to the in-memory term dictionary and postings.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    private function _indexField(Zend_Search_Lucene_Field $field)
    {
        if ($field->isTokenized) {
            $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
        } else {
            // An untokenized field is indexed as one token covering the whole value
            $tokenList = array(
                new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
            );
        }

        $position = 0;
        foreach ($tokenList as $token) {
            $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
            $termKey = $term->key();

            if (!isset($this->_termDictionary[$termKey])) {
                // New term
                $this->_termDictionary[$termKey] = $term;
                $this->_termDocs[$termKey]       = array();
            }

            if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                // First occurrence of the term in the current document
                $this->_termDocs[$termKey][$this->_docCount] = array();
            }

            $position += $token->getPositionIncrement();
            $this->_termDocs[$termKey][$this->_docCount][] = $position;
        }
    }


    /**
     * Writes the stored fields entry of the current document
     * to the '.fdx' and '.fdt' files, creating them on first use.
     *
     * @param array $storedFields  Zend_Search_Lucene_Field objects to store (may be empty)
     */
    private function _writeStoredFields(array $storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        // Field index entry: position of the document data within the '.fdt' file
        $this->_fdxFile->writeLong($this->_fdtFile->tell());

        $this->_fdtFile->writeVInt(count($storedFields));
        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);

            // Each flag needs its own parentheses: '|' binds tighter than '?:'
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary    ? 0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            $this->_fdtFile->writeByte($fieldBits);

            if ($field->isBinary) {
                $this->_fdtFile->writeVInt(strlen($field->stringValue));
                $this->_fdtFile->writeBytes($field->stringValue);
            } else {
                $this->_fdtFile->writeString($field->stringValue);
            }
        }
    }


    /**
     * Dump Field Info (.fnm) segment file
     */
    private function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            $fnmFile->writeByte(
                ($field->isIndexed       ? 0x01 : 0x00) |
                ($field->storeTermVector ? 0x02 : 0x00) |
                // 0x04 - term positions are stored with the term vectors (not supported yet)
                // 0x08 - term offsets are stored with the term vectors (not supported yet)
                0x10 /* norms are omitted for the indexed field (this writer produces no norms file) */
            );
        }

        $this->_files[] = $this->_name . '.fnm';
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * $prevTerm and $prevTermInfo are passed by reference and are replaced
     * by $term and $termInfo, so the next call can write deltas against them.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term|null $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo|null $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        $prefixLength = 0;
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Length of the prefix shared with the previous term of the same field
            $maxLength = min(strlen($prevTerm->text), strlen($term->text));
            while ($prefixLength < $maxLength &&
                   $prevTerm->text[$prefixLength] == $term->text[$prefixLength]) {
                $prefixLength++;
            }
        }

        // Write prefix length
        $dicFile->writeVInt($prefixLength);
        // Write suffix (the whole text when nothing is shared)
        if ($prefixLength == 0) {
            $dicFile->writeString($term->text);
        } else {
            $dicFile->writeString( substr($term->text, $prefixLength) );
        }

        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // First entry: pointers are written as is
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }

        // Write SkipOffset - it's not 0 when $termInfo->docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files,
     * together with the term frequencies (.frq) and positions (.prx) files.
     */
    private function _dumpDictionary()
    {
        $termTotal = count($this->_termDictionary);
        // One index entry per $indexInterval terms, starting with the first term.
        // Equals (int)(($termTotal - 1)/$indexInterval) + 1 for a non-empty
        // dictionary and 0 for an empty one (when no entry is written at all).
        $indexEntryTotal = (int) ceil($termTotal / self::$indexInterval);

        $tisFile = $this->_directory->createFile($this->_name . '.tis');
        $tisFile->writeInt((int)0xFFFFFFFE);
        $tisFile->writeLong($termTotal);
        $tisFile->writeInt(self::$indexInterval);
        $tisFile->writeInt(self::$skipInterval);

        $tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $tiiFile->writeInt((int)0xFFFFFFFE);
        $tiiFile->writeLong($indexEntryTotal);
        $tiiFile->writeInt(self::$indexInterval);
        $tiiFile->writeInt(self::$skipInterval);

        $frqFile = $this->_directory->createFile($this->_name . '.frq');
        $prxFile = $this->_directory->createFile($this->_name . '.prx');

        $termKeys = array_keys($this->_termDictionary);
        sort($termKeys, SORT_STRING);

        $termCount = 0;

        $prevTerm     = null;
        $prevTermInfo = null;

        $prevIndexTerm     = null;
        $prevIndexTermInfo = null;
        $prevIndexPosition = 0;

        foreach ($termKeys as $termId) {
            $freqPointer = $frqFile->tell();
            $proxPointer = $prxFile->tell();

            $prevDoc = 0;
            foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
                // Doc delta is shifted left by one bit; the low bit set
                // means "frequency is one" and no frequency value follows
                $docDelta = ($docId - $prevDoc)*2;
                $prevDoc  = $docId;

                if (count($termPositions) > 1) {
                    $frqFile->writeVInt($docDelta);
                    $frqFile->writeVInt(count($termPositions));
                } else {
                    $frqFile->writeVInt($docDelta + 1);
                }

                // Positions are stored as deltas within the document
                $prevPosition = 0;
                foreach ($termPositions as $position) {
                    $prxFile->writeVInt($position - $prevPosition);
                    $prevPosition = $position;
                }
            }

            if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
                /**
                 * @todo Write Skip Data to a freq file.
                 * It's not used now, but must be implemented to be compatible with Lucene
                 */
                $skipOffset = $frqFile->tell() - $freqPointer;
            } else {
                $skipOffset = 0;
            }

            // The dictionary files refer to a field by its number, not by its name
            $dictionaryTerm = $this->_termDictionary[$termId];
            $term = new Zend_Search_Lucene_Index_Term($dictionaryTerm->text,
                                                      $this->_fields[$dictionaryTerm->field]->number);
            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
                                                              $freqPointer, $proxPointer, $skipOffset);

            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);

            if ($termCount % self::$indexInterval == 0) {
                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);

                // Index entry also carries the delta of the '.tis' file position
                $indexPosition = $tisFile->tell();
                $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
                $prevIndexPosition = $indexPosition;
            }
            $termCount++;
        }

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';
    }


    /**
     * Generate compound index file
     *
     * Writes a directory of (data offset, file name) entries followed by the
     * content of every segment file; each source file is deleted once copied.
     */
    private function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // First pass: write directory entries with placeholder offsets
        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        // Second pass: append file data and patch the real offsets in
        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);
            $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName)));

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned when no document has been added.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();
        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
                                                        $this->_docCount,
                                                        $this->_directory);
    }
}