/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using FieldSelector = Lucene.Net.Documents.FieldSelector;
using FieldSelectorResult = Lucene.Net.Documents.FieldSelectorResult;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;

namespace Lucene.Net.Index
{
    /// <summary>
    /// The SegmentMerger class combines two or more Segments, represented by an IndexReader
    /// (<see cref="Add"/>), into a single Segment.  After adding the appropriate readers, call
    /// the merge method to combine the segments.
    ///
    /// <para>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </para>
    /// </summary>
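    /// <example>
    /// A minimal usage sketch (illustrative only: the variable names and the literal segment
    /// name are not from the original sources, and the Directory-based constructor is noted
    /// below as being used by test code; normally an IndexWriter drives the merge). Add the
    /// readers to combine, call <see cref="Merge"/>, and optionally collect the new segment's
    /// files into a compound file.
    /// <code>
    /// SegmentMerger merger = new SegmentMerger(dir, "newseg");
    /// merger.Add(readerA);
    /// merger.Add(readerB);
    /// int mergedDocs = merger.Merge();          // writes the new segment's files
    /// merger.CloseReaders();                    // close the source readers after merging
    /// merger.CreateCompoundFile("newseg.cfs");  // only if the compound file format is wanted
    /// </code>
    /// </example>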
    ///
    public sealed class SegmentMerger
    {
        private class AnonymousClassFieldSelector : FieldSelector
        {
            public AnonymousClassFieldSelector(SegmentMerger enclosingInstance)
            {
                InitBlock(enclosingInstance);
            }
            private void InitBlock(SegmentMerger enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }
            private SegmentMerger enclosingInstance;
            public SegmentMerger Enclosing_Instance
            {
                get { return enclosingInstance; }
            }
            public FieldSelectorResult Accept(System.String fieldName)
            {
                return FieldSelectorResult.LOAD_FOR_MERGE;
            }
        }

        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }

        /// <summary>norms header placeholder</summary>
        internal static readonly byte[] NORMS_HEADER = new byte[] { (byte) 'N', (byte) 'R', (byte) 'M', (byte) 255 };

        private Directory directory;
        private System.String segment;
        private int termIndexInterval;

        private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        private FieldInfos fieldInfos;

        /// <summary>This ctor used only by test code.</summary>
        /// <param name="dir">The Directory to merge the other segments into</param>
        /// <param name="name">The name of the new segment</param>
        public SegmentMerger(Directory dir, System.String name)
        {
            InitBlock();
            directory = dir;
            segment = name;
        }

        internal SegmentMerger(IndexWriter writer, System.String name)
        {
            InitBlock();
            directory = writer.GetDirectory();
            segment = name;
            termIndexInterval = writer.GetTermIndexInterval();
        }

        /// <summary>Add an IndexReader to the collection of readers that are to be merged.</summary>
        /// <param name="reader">reader</param>
        public void Add(IndexReader reader)
        {
            readers.Add(reader);
        }

        /// <param name="i">The index of the reader to return</param>
        /// <returns>The ith reader to be merged</returns>
        internal IndexReader SegmentReader(int i)
        {
            return (IndexReader) readers[i];
        }

        /// <summary>Merges the readers specified by the <see cref="Add"/> method
        /// into the directory passed to the constructor.
        /// </summary>
        /// <returns>The number of documents that were merged</returns>
        /// <exception cref="System.IO.IOException"></exception>
        public int Merge()
        {
            int value_Renamed;

            value_Renamed = MergeFields();
            MergeTerms();
            MergeNorms();

            if (fieldInfos.HasVectors())
                MergeVectors();

            return value_Renamed;
        }

        /// <summary>Close all IndexReaders that have been added.
        /// Should not be called before merge().
        /// </summary>
        /// <exception cref="System.IO.IOException"></exception>
        public void CloseReaders()
        {
            for (int i = 0; i < readers.Count; i++)
            {
                // close readers
                IndexReader reader = (IndexReader) readers[i];
                reader.Close();
            }
        }

        public System.Collections.ArrayList CreateCompoundFile(System.String fileName)
        {
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName);

            System.Collections.ArrayList files = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.Length + 1));

            // Basic files
            for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
            {
                files.Add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
            }

            // Fieldable norm files
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    files.Add(segment + "." + IndexFileNames.NORMS_EXTENSION);
                    break;
                }
            }

            // Vector files
            if (fieldInfos.HasVectors())
            {
                for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.Length; i++)
                {
                    files.Add(segment + "."
                        + IndexFileNames.VECTOR_EXTENSIONS[i]);
                }
            }

            // Now merge all added files
            System.Collections.IEnumerator it = files.GetEnumerator();
            while (it.MoveNext())
            {
                cfsWriter.AddFile((System.String) it.Current);
            }

            // Perform the merge
            cfsWriter.Close();

            return files;
        }

        private void AddIndexed(IndexReader reader, FieldInfos fieldInfos, System.Collections.ICollection names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
        {
            System.Collections.IEnumerator i = names.GetEnumerator();
            while (i.MoveNext())
            {
                System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry) i.Current;
                System.String field = (System.String) e.Key;
                fieldInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field));
            }
        }

        /// <returns>The number of documents in all of the readers</returns>
        /// <exception cref="System.IO.IOException"></exception>
        private int MergeFields()
        {
            fieldInfos = new FieldInfos(); // merge field names
            int docCount = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
                fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
            }
            fieldInfos.Write(directory, segment + ".fnm");

            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);

            // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
            // in merge mode, we use this FieldSelector
            FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this);

            try
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader) readers[i];
                    int maxDoc = reader.MaxDoc();
                    for (int j = 0; j < maxDoc; j++)
                        if (!reader.IsDeleted(j))
                        {
                            // skip deleted docs
                            fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge));
                            docCount++;
                        }
                }
            }
            finally
            {
                fieldsWriter.Close();
            }
            return docCount;
        }

        /// <summary>Merge the TermVectors from each of the segments into the new one.</summary>
        /// <exception cref="System.IO.IOException"></exception>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader) readers[r];
                    int maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                            continue;
                        termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }

        private IndexOutput freqOutput = null;
        private IndexOutput proxOutput = null;
        private TermInfosWriter termInfosWriter = null;
        private int skipInterval;
        private SegmentMergeQueue queue = null;

        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }

        private void MergeTermInfos()
        {
            int base_Renamed = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                    queue.Put(smi); // initialize queue
                else
                    smi.Close();
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                    top = (SegmentMergeInfo) queue.Top();
                }

                MergeTermInfo(match, matchSize); // add new TermInfo

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Put(smi); // restore queue
                    else
                        smi.Close(); // done with a segment
                }
            }
        }

        private TermInfo termInfo = new TermInfo(); // minimize consing

        /// <summary>Merge one term found in one or more segments. The array smis
        /// contains segments that are positioned at the same term. N
        /// is the number of cells in the array actually occupied.
        /// </summary>
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            int df = AppendPostings(smis, n); // append posting data

            long skipPointer = WriteSkip();

            if (df > 0)
            {
                // add an entry to the dictionary with pointers to prox and freq files
                termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
                termInfosWriter.Add(smis[0].term, termInfo);
            }
        }

        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into the freqOutput and
        /// proxOutput streams.
        /// </summary>
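        /// <remarks>
        /// As a worked illustration of the encoding used below: doc numbers go to the .frq
        /// stream as deltas from the previous doc, shifted left by one bit, with the low bit
        /// flagging freq == 1. If the previous doc was 3 and the current doc is 5 with freq 1,
        /// a single VInt of (5 - 3) * 2 + 1 = 5 is written; if freq were 3 instead, the VInt 4
        /// is written, then the VInt 3, and the three position deltas go to the .prx stream.
        /// </remarks>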
        /// <param name="smis">array of segments</param>
        /// <param name="n">number of cells in the array actually occupied</param>
        /// <returns>number of documents across all segments where this term was found</returns>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df = 0; // number of docs w/ term
            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.GetPositions();
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.GetDocMap();
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space

                    if (doc < 0 || (df > 0 && doc <= lastDoc))
                        throw new System.SystemException("docs out of order (" + doc + " <= " + lastDoc + " )");

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq); // write frequency in doc
                    }

                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }

        private RAMOutputStream skipBuffer = new RAMOutputStream();
        private int lastSkipDoc;
        private long lastSkipFreqPointer;
        private long lastSkipProxPointer;

        private void ResetSkip()
        {
            skipBuffer.Reset();
            lastSkipDoc = 0;
            lastSkipFreqPointer = freqOutput.GetFilePointer();
            lastSkipProxPointer = proxOutput.GetFilePointer();
        }

        private void BufferSkip(int doc)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            skipBuffer.WriteVInt(doc - lastSkipDoc);
            skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
            skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));

            lastSkipDoc = doc;
            lastSkipFreqPointer = freqPointer;
            lastSkipProxPointer = proxPointer;
        }

        private long WriteSkip()
        {
            long skipPointer = freqOutput.GetFilePointer();
            skipBuffer.WriteTo(freqOutput);
            return skipPointer;
        }

        private void MergeNorms()
        {
            byte[] normBuffer = null;
            IndexOutput output = null;
            try
            {
                for (int i = 0; i < fieldInfos.Size(); i++)
                {
                    FieldInfo fi = fieldInfos.FieldInfo(i);
                    if (fi.isIndexed && !fi.omitNorms)
                    {
                        if (output == null)
                        {
                            output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
                            output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length);
                        }
                        for (int j = 0; j < readers.Count; j++)
                        {
                            IndexReader reader = (IndexReader) readers[j];
                            int maxDoc = reader.MaxDoc();
                            if (normBuffer == null || normBuffer.Length < maxDoc)
                            {
                                // the buffer is too small for the current segment
                                normBuffer = new byte[maxDoc];
                            }
                            reader.Norms(fi.name, normBuffer, 0);
                            if (!reader.HasDeletions())
                            {
                                // optimized case for segments without deleted docs
                                output.WriteBytes(normBuffer, maxDoc);
                            }
                            else
                            {
                                // this segment has deleted docs, so we have to
                                // check for every doc if it is deleted or not
                                for (int k = 0; k < maxDoc; k++)
                                {
                                    if (!reader.IsDeleted(k))
                                    {
                                        output.WriteByte(normBuffer[k]);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                if (output != null)
                {
                    output.Close();
                }
            }
        }
    }
}