/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using Document = Lucene.Net.Documents.Document; using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException; using Directory = Lucene.Net.Store.Directory; using FSDirectory = Lucene.Net.Store.FSDirectory; using Lock = Lucene.Net.Store.Lock; using LockObtainFailedException = Lucene.Net.Store.LockObtainFailedException; using BitVector = Lucene.Net.Util.BitVector; using Analyzer = Lucene.Net.Analysis.Analyzer; using Similarity = Lucene.Net.Search.Similarity; using System.Collections; namespace Lucene.Net.Index { /// An IndexWriter creates and maintains an index. ///

/// The create argument to the constructor determines whether a new
/// index is created, or whether an existing index is opened. Note that
/// you can open an index with create=true even while readers are using
/// the index. The old readers will continue to search the "point in
/// time" snapshot they had opened, and won't see the newly created
/// index until they re-open. There are also constructors with no
/// create argument which will create a new index if there is not
/// already an index at the provided path and otherwise open the
/// existing index.
///
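/// For example, a minimal sketch (the path and the StandardAnalyzer
/// choice below are placeholders, not requirements):
/// 
///     // create a new index, overwriting any existing index at that path:
///     IndexWriter writer = new IndexWriter("/tmp/index", new StandardAnalyzer(), true);
///     writer.Close();
///     // append if an index already exists there, otherwise create one:
///     IndexWriter writer2 = new IndexWriter("/tmp/index", new StandardAnalyzer());
///     writer2.Close();
///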

/// In either case, documents are added with addDocument and removed
/// with deleteDocuments. A document can be updated with updateDocument
/// (which just deletes and then adds the entire document). When
/// finished adding, deleting and updating documents, close should be
/// called.
///
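/// A sketch of that lifecycle (field names, values and the id-based
/// update scheme are illustrative assumptions):
/// 
///     Document doc = new Document();
///     doc.Add(new Field("id", "42", Field.Store.YES, Field.Index.UN_TOKENIZED));
///     doc.Add(new Field("body", "hello world", Field.Store.NO, Field.Index.TOKENIZED));
///     writer.AddDocument(doc);                          // add
///     writer.DeleteDocuments(new Term("id", "41"));     // delete by term
///     writer.UpdateDocument(new Term("id", "42"), doc); // delete then re-add
///     writer.Close();                                   // flush and commit
///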

/// These changes are buffered in memory and periodically flushed to the
/// {@link Directory} (during the above method calls). A flush is
/// triggered when there are enough buffered deletes (see {@link
/// #setMaxBufferedDeleteTerms}) or enough added documents since the
/// last flush, whichever is sooner. For the added documents, flushing
/// is triggered either by RAM usage of the documents (see {@link
/// #setRAMBufferSizeMB}) or the number of added documents. The default
/// is to flush when RAM usage hits 16 MB. For best indexing speed you
/// should flush by RAM usage with a large RAM buffer. You can also
/// force a flush by calling {@link #flush}. When a flush occurs, both
/// pending deletes and added documents are flushed to the index. A
/// flush may also trigger one or more segment merges which by default
/// run with a background thread so as not to block the addDocument
/// calls (see below for changing the {@link MergeScheduler}).
///
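/// For example, to flush by RAM usage with a larger buffer (the 48 MB
/// figure is only an illustration):
/// 
///     writer.SetRAMBufferSizeMB(48.0);   // flush once buffered docs use ~48 MB
///     writer.Flush();                    // or force a flush explicitly
///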

/// The optional autoCommit argument to the constructors controls
/// visibility of the changes to {@link IndexReader} instances reading
/// the same index. When this is false, changes are not visible until
/// {@link #Close()} is called. Note that changes will still be flushed
/// to the {@link Lucene.Net.Store.Directory} as new files, but are not
/// committed (no new segments_N file is written referencing the new
/// files) until {@link #close} is called. If something goes terribly
/// wrong (for example the JVM crashes) before {@link #Close()}, then
/// the index will reflect none of the changes made (it will remain in
/// its starting state). You can also call {@link #Abort()}, which
/// closes the writer without committing any changes, and removes any
/// index files that had been flushed but are now unreferenced. This
/// mode is useful for preventing readers from refreshing at a bad time
/// (for example after you've done all your deletes but before you've
/// done your adds). It can also be used to implement simple
/// single-writer transactional semantics ("all or none").
///
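/// A sketch of the "all or none" pattern (the directory and documents
/// are assumed to exist already):
/// 
///     IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer());
///     try {
///         writer.AddDocument(doc1);
///         writer.AddDocument(doc2);
///         writer.Close();    // commit: changes become visible to new readers
///     } catch (System.IO.IOException) {
///         writer.Abort();    // discard everything since the writer was opened
///     }
///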

/// When autoCommit is true then every flush is also a commit ({@link
/// IndexReader} instances will see each flush as changes to the index).
/// This is the default, to match the behavior before 2.2. When running
/// in this mode, be careful not to refresh your readers while optimize
/// or segment merges are taking place as this can tie up substantial
/// disk space.
///

/// Regardless of autoCommit, an {@link IndexReader} or {@link
/// Lucene.Net.Search.IndexSearcher} will only see the index as of the
/// "point in time" that it was opened. Any changes committed to the
/// index after the reader was opened are not visible until the reader
/// is re-opened.
///

/// If an index will not have more documents added for a while and
/// optimal search performance is desired, then the optimize method
/// should be called before the index is closed.
///

/// Opening an IndexWriter creates a lock file for the directory in use.
/// Trying to open another IndexWriter on the same directory will lead
/// to a {@link LockObtainFailedException}. The {@link
/// LockObtainFailedException} is also thrown if an IndexReader on the
/// same directory is used to delete documents from the index.
///
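/// A defensive sketch for the second-writer case (how to retry or back
/// off is left to the caller):
/// 
///     try {
///         IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer());
///     } catch (LockObtainFailedException) {
///         // another IndexWriter (or a deleting IndexReader) holds write.lock
///     }
///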
/// ///

/// Expert: IndexWriter allows an optional {@link IndexDeletionPolicy}
/// implementation to be specified. You can use this to control when
/// prior commits are deleted from the index. The default policy is
/// {@link KeepOnlyLastCommitDeletionPolicy} which removes all prior
/// commits as soon as a new commit is done (this matches behavior
/// before 2.2). Creating your own policy can allow you to explicitly
/// keep previous "point in time" commits alive in the index for some
/// time, to allow readers to refresh to the new commit without having
/// the old commit deleted out from under them. This is necessary on
/// filesystems like NFS that do not support "delete on last close"
/// semantics, which Lucene's "point in time" search normally relies on.
///
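/// For example, passing the default policy explicitly (a custom
/// IndexDeletionPolicy implementation would be supplied the same way):
/// 
///     IndexWriter writer = new IndexWriter(dir, true, new StandardAnalyzer(),
///                                          new KeepOnlyLastCommitDeletionPolicy());
///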

/// Expert: IndexWriter allows you to separately change the {@link
/// MergePolicy} and the {@link MergeScheduler}. The {@link MergePolicy}
/// is invoked whenever there are changes to the segments in the index.
/// Its role is to select which merges to do, if any, and return a
/// {@link MergePolicy.MergeSpecification} describing the merges. It
/// also selects merges to do for optimize(). (The default is {@link
/// LogByteSizeMergePolicy}.) Then, the {@link MergeScheduler} is
/// invoked with the requested merges and it decides when and how to
/// run the merges. The default is {@link ConcurrentMergeScheduler}.
///
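/// For example, setting the defaults explicitly (any other MergePolicy
/// or MergeScheduler implementation is installed the same way):
/// 
///     writer.SetMergePolicy(new LogByteSizeMergePolicy());
///     writer.SetMergeScheduler(new ConcurrentMergeScheduler());
///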
/* * Clarification: Check Points (and commits) * Being able to set autoCommit=false allows IndexWriter to flush and * write new index files to the directory without writing a new segments_N * file which references these new files. It also means that the state of * the in memory SegmentInfos object is different than the most recent * segments_N file written to the directory. * * Each time the SegmentInfos is changed, and matches the (possibly * modified) directory files, we have a new "check point". * If the modified/new SegmentInfos is written to disk - as a new * (generation of) segments_N file - this check point is also an * IndexCommitPoint. * * With autoCommit=true, every checkPoint is also a CommitPoint. * With autoCommit=false, some checkPoints may not be commits. * * A new checkpoint always replaces the previous checkpoint and * becomes the new "front" of the index. This allows the IndexFileDeleter * to delete files that are referenced only by stale checkpoints. * (files that were created since the last commit, but are no longer * referenced by the "front" of the index). For this, IndexFileDeleter * keeps track of the last non commit checkpoint. */ public class IndexWriter { private void InitBlock() { similarity = Similarity.GetDefault(); } /// Default value for the write lock timeout (1,000). /// /// public static long WRITE_LOCK_TIMEOUT = 1000; private long writeLockTimeout = WRITE_LOCK_TIMEOUT; /// Name of the write lock in the index. public const System.String WRITE_LOCK_NAME = "write.lock"; /// /// /// /// public static readonly int DEFAULT_MERGE_FACTOR; /// Value to denote a flush trigger is disabled public const int DISABLE_AUTO_FLUSH = - 1; /// Disabled by default (because IndexWriter flushes by RAM usage /// by default). Change using {@link #SetMaxBufferedDocs(int)}. /// public static readonly int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH; /// Default value is 16 MB (which means flush when buffered /// docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}. /// public const double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0; /// Disabled by default (because IndexWriter flushes by RAM usage /// by default). Change using {@link #SetMaxBufferedDeleteTerms(int)}. /// public static readonly int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH; /// /// /// /// public static readonly int DEFAULT_MAX_MERGE_DOCS; /// Default value is 10,000. Change using {@link #SetMaxFieldLength(int)}. public const int DEFAULT_MAX_FIELD_LENGTH = 10000; /// Default value is 128. Change using {@link #SetTermIndexInterval(int)}. public const int DEFAULT_TERM_INDEX_INTERVAL = 128; /// Absolute hard maximum length for a term. If a term /// arrives from the analyzer longer than this length, it /// is skipped and a message is printed to infoStream, if /// set (see {@link #setInfoStream}). /// public static readonly int MAX_TERM_LENGTH; // The normal read buffer size defaults to 1024, but // increasing this during merging seems to yield // performance gains. However we don't want to increase // it too much because there are quite a few // BufferedIndexInputs created during merging. See // LUCENE-888 for details. 
private const int MERGE_READ_BUFFER_SIZE = 4096; // Used for printing messages private static System.Object MESSAGE_ID_LOCK = new System.Object(); private static int MESSAGE_ID = 0; private int messageID = - 1; private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text private Similarity similarity; // how to normalize private bool commitPending; // true if segmentInfos has changes not yet committed private SegmentInfos rollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails private SegmentInfos localRollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails private bool localAutoCommit; // saved autoCommit during local transaction private bool autoCommit = true; // false if we should commit only on close private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private DocumentsWriter docWriter; private IndexFileDeleter deleter; private System.Collections.Hashtable segmentsToOptimize = new System.Collections.Hashtable(); // used by optimize to note those needing optimization private Lock writeLock; private int termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; private bool closeDir; private bool closed; private bool closing; // Holds all SegmentInfo instances currently involved in // merges private System.Collections.Hashtable mergingSegments = new System.Collections.Hashtable(); private MergePolicy mergePolicy = new LogByteSizeMergePolicy(); private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); private System.Collections.ArrayList pendingMerges = new System.Collections.ArrayList(); private System.Collections.Hashtable runningMerges = new System.Collections.Hashtable(); private System.Collections.IList mergeExceptions = new System.Collections.ArrayList(); private long mergeGen; private bool stopMerges; /// Used internally to throw an {@link /// AlreadyClosedException} if this IndexWriter has been /// closed. /// /// AlreadyClosedException if this IndexWriter is protected internal void EnsureOpen() { if (closed) { throw new AlreadyClosedException("this IndexWriter is closed"); } } /// Prints a message to the infoStream (if non-null), /// prefixed with the identifying information for this /// writer and the thread that's calling it. /// public virtual void Message(System.String message) { if (infoStream != null) infoStream.WriteLine("IW " + messageID + " [" + SupportClass.ThreadClass.Current().Name + "]: " + message); } private void SetMessageID() { lock (this) { if (infoStream != null && messageID == - 1) { lock (MESSAGE_ID_LOCK) { messageID = MESSAGE_ID++; } } } } /// Casts current mergePolicy to LogMergePolicy, and throws /// an exception if the mergePolicy is not a LogMergePolicy. /// private LogMergePolicy GetLogMergePolicy() { if (mergePolicy is LogMergePolicy) return (LogMergePolicy) mergePolicy; else throw new System.ArgumentException("this method can only be called when the merge policy is the default LogMergePolicy"); } ///

/// Get the current setting of whether newly flushed segments will use
/// the compound file format. Note that this just returns the value
/// previously set with setUseCompoundFile(boolean), or the default
/// value (true). You cannot use this to query the status of previously
/// flushed segments.
///
/// Note that this method is a convenience method: it just calls
/// mergePolicy.getUseCompoundFile as long as mergePolicy is an instance
/// of {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
/// /// public virtual bool GetUseCompoundFile() { return GetLogMergePolicy().GetUseCompoundFile(); } ///

/// Setting to turn on usage of a compound file. When on, multiple files
/// for each segment are merged into a single file when a new segment is
/// flushed.
///
/// Note that this method is a convenience method: it just calls
/// mergePolicy.setUseCompoundFile as long as mergePolicy is an instance
/// of {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
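/// For example (valid only while the merge policy is a LogMergePolicy,
/// such as the default LogByteSizeMergePolicy):
/// 
///     writer.SetUseCompoundFile(true);
///     bool usingCompound = writer.GetUseCompoundFile();   // now true
///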
public virtual void SetUseCompoundFile(bool value_Renamed) { GetLogMergePolicy().SetUseCompoundFile(value_Renamed); GetLogMergePolicy().SetUseCompoundDocStore(value_Renamed); } /// Expert: Set the Similarity implementation used by this IndexWriter. /// /// /// /// public virtual void SetSimilarity(Similarity similarity) { EnsureOpen(); this.similarity = similarity; } /// Expert: Return the Similarity implementation used by this IndexWriter. /// ///

/// This defaults to the current value of {@link Similarity#GetDefault()}.
///

public virtual Similarity GetSimilarity() { EnsureOpen(); return this.similarity; } /// Expert: Set the interval between indexed terms. Large values cause less /// memory to be used by IndexReader, but slow random-access to terms. Small /// values cause more memory to be used by an IndexReader, and speed /// random-access to terms. /// /// This parameter determines the amount of computation required per query /// term, regardless of the number of documents that contain that term. In /// particular, it is the maximum number of other terms that must be /// scanned before a term is located and its frequency and position information /// may be processed. In a large index with user-entered query terms, query /// processing time is likely to be dominated not by term lookup but rather /// by the processing of frequency and positional data. In a small index /// or when many uncommon query terms are generated (e.g., by wildcard /// queries) term lookup may become a dominant cost. /// /// In particular, numUniqueTerms/interval terms are read into /// memory by an IndexReader, and, on average, interval/2 terms /// must be scanned for each random term access. /// /// /// /// public virtual void SetTermIndexInterval(int interval) { EnsureOpen(); this.termIndexInterval = interval; } /// Expert: Return the interval between indexed terms. /// /// /// /// public virtual int GetTermIndexInterval() { EnsureOpen(); return termIndexInterval; } /// Constructs an IndexWriter for the index in path. /// Text will be analyzed with a. If create /// is true, then a new, empty index will be created in /// path, replacing the index already there, if any. /// /// /// the path to the index directory /// /// the analyzer to use /// /// true to create the index or overwrite /// the existing one; false to append to the existing /// index /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be read/written to, or /// if it does not exist and create is /// false or if there is any other low-level /// IO error /// public IndexWriter(System.String path, Analyzer a, bool create) { InitBlock(); Init(FSDirectory.GetDirectory(path), a, create, true, null, true); } /// Constructs an IndexWriter for the index in path. /// Text will be analyzed with a. If create /// is true, then a new, empty index will be created in /// path, replacing the index already there, if any. /// /// /// the path to the index directory /// /// the analyzer to use /// /// true to create the index or overwrite /// the existing one; false to append to the existing /// index /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be read/written to, or /// if it does not exist and create is /// false or if there is any other low-level /// IO error /// public IndexWriter(System.IO.FileInfo path, Analyzer a, bool create) { InitBlock(); Init(FSDirectory.GetDirectory(path), a, create, true, null, true); } /// Constructs an IndexWriter for the index in d. /// Text will be analyzed with a. If create /// is true, then a new, empty index will be created in /// d, replacing the index already there, if any. 
/// /// /// the index directory /// /// the analyzer to use /// /// true to create the index or overwrite /// the existing one; false to append to the existing /// index /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be read/written to, or /// if it does not exist and create is /// false or if there is any other low-level /// IO error /// public IndexWriter(Directory d, Analyzer a, bool create) { InitBlock(); Init(d, a, create, false, null, true); } /// Constructs an IndexWriter for the index in /// path, first creating it if it does not /// already exist. Text will be analyzed with /// a. /// /// /// the path to the index directory /// /// the analyzer to use /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be /// read/written to or if there is any other low-level /// IO error /// public IndexWriter(System.String path, Analyzer a) { InitBlock(); Init(FSDirectory.GetDirectory(path), a, true, null, true); } /// Constructs an IndexWriter for the index in /// path, first creating it if it does not /// already exist. Text will be analyzed with /// a. /// /// /// the path to the index directory /// /// the analyzer to use /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be /// read/written to or if there is any other low-level /// IO error /// public IndexWriter(System.IO.FileInfo path, Analyzer a) { InitBlock(); Init(FSDirectory.GetDirectory(path), a, true, null, true); } /// Constructs an IndexWriter for the index in /// d, first creating it if it does not /// already exist. Text will be analyzed with /// a. /// /// /// the index directory /// /// the analyzer to use /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be /// read/written to or if there is any other low-level /// IO error /// public IndexWriter(Directory d, Analyzer a) { InitBlock(); Init(d, a, false, null, true); } /// Constructs an IndexWriter for the index in /// d, first creating it if it does not /// already exist. Text will be analyzed with /// a. /// /// /// the index directory /// /// see above /// /// the analyzer to use /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be /// read/written to or if there is any other low-level /// IO error /// public IndexWriter(Directory d, bool autoCommit, Analyzer a) { InitBlock(); Init(d, a, false, null, autoCommit); } /// Constructs an IndexWriter for the index in d. /// Text will be analyzed with a. If create /// is true, then a new, empty index will be created in /// d, replacing the index already there, if any. 
/// /// /// the index directory /// /// see above /// /// the analyzer to use /// /// true to create the index or overwrite /// the existing one; false to append to the existing /// index /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be read/written to, or /// if it does not exist and create is /// false or if there is any other low-level /// IO error /// public IndexWriter(Directory d, bool autoCommit, Analyzer a, bool create) { InitBlock(); Init(d, a, create, false, null, autoCommit); } /// Expert: constructs an IndexWriter with a custom {@link /// IndexDeletionPolicy}, for the index in d, /// first creating it if it does not already exist. Text /// will be analyzed with a. /// /// /// the index directory /// /// see above /// /// the analyzer to use /// /// see above /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be /// read/written to or if there is any other low-level /// IO error /// public IndexWriter(Directory d, bool autoCommit, Analyzer a, IndexDeletionPolicy deletionPolicy) { InitBlock(); Init(d, a, false, deletionPolicy, autoCommit); } /// Expert: constructs an IndexWriter with a custom {@link /// IndexDeletionPolicy}, for the index in d. /// Text will be analyzed with a. If /// create is true, then a new, empty index /// will be created in d, replacing the index /// already there, if any. /// /// /// the index directory /// /// see above /// /// the analyzer to use /// /// true to create the index or overwrite /// the existing one; false to append to the existing /// index /// /// see above /// /// CorruptIndexException if the index is corrupt /// LockObtainFailedException if another writer /// has this index open (write.lock could not /// be obtained) /// /// IOException if the directory cannot be read/written to, or /// if it does not exist and create is /// false or if there is any other low-level /// IO error /// public IndexWriter(Directory d, bool autoCommit, Analyzer a, bool create, IndexDeletionPolicy deletionPolicy) { InitBlock(); Init(d, a, create, false, deletionPolicy, autoCommit); } private void Init(Directory d, Analyzer a, bool closeDir, IndexDeletionPolicy deletionPolicy, bool autoCommit) { if (IndexReader.IndexExists(d)) { Init(d, a, false, closeDir, deletionPolicy, autoCommit); } else { Init(d, a, true, closeDir, deletionPolicy, autoCommit); } } private void Init(Directory d, Analyzer a, bool create, bool closeDir, IndexDeletionPolicy deletionPolicy, bool autoCommit) { this.closeDir = closeDir; directory = d; analyzer = a; this.infoStream = defaultInfoStream; SetMessageID(); if (create) { // Clear the write lock in case it's leftover: directory.ClearLock(IndexWriter.WRITE_LOCK_NAME); } Lock writeLock = directory.MakeLock(IndexWriter.WRITE_LOCK_NAME); if (!writeLock.Obtain(writeLockTimeout)) // obtain write lock { throw new LockObtainFailedException("Index locked for write: " + writeLock); } this.writeLock = writeLock; // save it try { if (create) { // Try to read first. This is to allow create // against an index that's currently open for // searching. 
In this case we write the next // segments_N file with no segments: try { segmentInfos.Read(directory); segmentInfos.Clear(); } catch (System.IO.IOException e) { // Likely this means it's a fresh directory } segmentInfos.Write(directory); } else { segmentInfos.Read(directory); } this.autoCommit = autoCommit; if (!autoCommit) { rollbackSegmentInfos = (SegmentInfos) segmentInfos.Clone(); } docWriter = new DocumentsWriter(directory, this); docWriter.SetInfoStream(infoStream); // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, segmentInfos, infoStream, docWriter); PushMaxBufferedDocs(); if (infoStream != null) { Message("init: create=" + create); MessageState(); } } catch (System.IO.IOException e) { this.writeLock.Release(); this.writeLock = null; throw e; } } /// Expert: set the merge policy used by this writer. public virtual void SetMergePolicy(MergePolicy mp) { EnsureOpen(); if (mp == null) throw new System.NullReferenceException("MergePolicy must be non-null"); if (mergePolicy != mp) mergePolicy.Close(); mergePolicy = mp; PushMaxBufferedDocs(); if (infoStream != null) { Message("setMergePolicy " + mp); } } /// Expert: returns the current MergePolicy in use by this writer. /// /// public virtual MergePolicy GetMergePolicy() { EnsureOpen(); return mergePolicy; } /// Expert: set the merge scheduler used by this writer. public virtual void SetMergeScheduler(MergeScheduler mergeScheduler) { EnsureOpen(); if (mergeScheduler == null) throw new System.NullReferenceException("MergeScheduler must be non-null"); if (this.mergeScheduler != mergeScheduler) { FinishMerges(true); this.mergeScheduler.Close(); } this.mergeScheduler = mergeScheduler; if (infoStream != null) { Message("setMergeScheduler " + mergeScheduler); } } /// Expert: returns the current MergePolicy in use by this /// writer. /// /// /// public virtual MergeScheduler GetMergeScheduler() { EnsureOpen(); return mergeScheduler; } ///

/// Determines the largest segment (measured by document count) that may
/// be merged with other segments. Small values (e.g., less than 10,000)
/// are best for interactive indexing, as this limits the length of
/// pauses while indexing to a few seconds. Larger values are best for
/// batched indexing and speedier searches.
///
/// The default value is {@link Integer#MAX_VALUE}.
///
/// Note that this method is a convenience method: it just calls
/// mergePolicy.setMaxMergeDocs as long as mergePolicy is an instance of
/// {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
/// The default merge policy ({@link LogByteSizeMergePolicy}) also
/// allows you to set this limit by net size (in MB) of the segment,
/// using {@link LogByteSizeMergePolicy#setMaxMergeMB}.
///
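/// For example (the cap of 100,000 documents is illustrative and, like
/// the setter itself, requires a LogMergePolicy-based merge policy):
/// 
///     writer.SetMaxMergeDocs(100000);   // never merge a segment past 100k docs
///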
public virtual void SetMaxMergeDocs(int maxMergeDocs) { GetLogMergePolicy().SetMaxMergeDocs(maxMergeDocs); } ///

/// Returns the largest segment (measured by document count) that may be
/// merged with other segments.
///
/// Note that this method is a convenience method: it just calls
/// mergePolicy.getMaxMergeDocs as long as mergePolicy is an instance of
/// {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
/// /// public virtual int GetMaxMergeDocs() { return GetLogMergePolicy().GetMaxMergeDocs(); } /// The maximum number of terms that will be indexed for a single field in a /// document. This limits the amount of memory required for indexing, so that /// collections with very large files will not crash the indexing process by /// running out of memory. This setting refers to the number of running terms, /// not to the number of different terms.

/// Note: this silently truncates large documents, excluding from the
/// index all terms that occur further in the document. If you know your
/// source documents are large, be sure to set this value high enough to
/// accommodate the expected size. If you set it to Integer.MAX_VALUE,
/// then the only limit is your memory, but you should anticipate an
/// OutOfMemoryError.

/// By default, no more than 10,000 terms will be indexed for a field. ///
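/// For example (the cap shown is arbitrary):
/// 
///     writer.SetMaxFieldLength(50000);        // index at most 50k terms per field
///     int cap = writer.GetMaxFieldLength();   // 50000
///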

public virtual void SetMaxFieldLength(int maxFieldLength) { EnsureOpen(); this.maxFieldLength = maxFieldLength; if (infoStream != null) Message("setMaxFieldLength " + maxFieldLength); } /// Returns the maximum number of terms that will be /// indexed for a single field in a document. /// /// /// public virtual int GetMaxFieldLength() { EnsureOpen(); return maxFieldLength; } /// Determines the minimal number of documents required /// before the buffered in-memory documents are flushed as /// a new Segment. Large values generally gives faster /// indexing. /// ///

/// When this is set, the writer will flush every maxBufferedDocs added
/// documents. Pass in {@link #DISABLE_AUTO_FLUSH} to prevent triggering
/// a flush due to number of buffered documents. Note that if flushing
/// by RAM usage is also enabled, then the flush will be triggered by
/// whichever comes first.
///
/// Disabled by default (writer flushes by RAM usage).
///
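/// For example, to flush strictly by document count (the count is
/// illustrative; note the order, since at least one flush trigger must
/// stay enabled at all times):
/// 
///     writer.SetMaxBufferedDocs(1000);                            // flush every 1000 added docs
///     writer.SetRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);  // then disable RAM-based flushing
///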
/// IllegalArgumentException if maxBufferedDocs is /// enabled but smaller than 2, or it disables maxBufferedDocs /// when ramBufferSize is already disabled /// /// /// public virtual void SetMaxBufferedDocs(int maxBufferedDocs) { EnsureOpen(); if (maxBufferedDocs != DISABLE_AUTO_FLUSH && maxBufferedDocs < 2) throw new System.ArgumentException("maxBufferedDocs must at least be 2 when enabled"); if (maxBufferedDocs == DISABLE_AUTO_FLUSH && GetRAMBufferSizeMB() == DISABLE_AUTO_FLUSH) throw new System.ArgumentException("at least one of ramBufferSize and maxBufferedDocs must be enabled"); docWriter.SetMaxBufferedDocs(maxBufferedDocs); PushMaxBufferedDocs(); if (infoStream != null) Message("setMaxBufferedDocs " + maxBufferedDocs); } /// If we are flushing by doc count (not by RAM usage), and /// using LogDocMergePolicy then push maxBufferedDocs down /// as its minMergeDocs, to keep backwards compatibility. /// private void PushMaxBufferedDocs() { if (docWriter.GetMaxBufferedDocs() != DISABLE_AUTO_FLUSH) { MergePolicy mp = mergePolicy; if (mp is LogDocMergePolicy) { LogDocMergePolicy lmp = (LogDocMergePolicy) mp; int maxBufferedDocs = docWriter.GetMaxBufferedDocs(); if (lmp.GetMinMergeDocs() != maxBufferedDocs) { if (infoStream != null) Message("now push maxBufferedDocs " + maxBufferedDocs + " to LogDocMergePolicy"); lmp.SetMinMergeDocs(maxBufferedDocs); } } } } /// Returns the number of buffered added documents that will /// trigger a flush if enabled. /// /// /// public virtual int GetMaxBufferedDocs() { EnsureOpen(); return docWriter.GetMaxBufferedDocs(); } /// Determines the amount of RAM that may be used for /// buffering added documents before they are flushed as a /// new Segment. Generally for faster indexing performance /// it's best to flush by RAM usage instead of document /// count and use as large a RAM buffer as you can. /// ///

/// When this is set, the writer will flush whenever buffered documents
/// use this much RAM. Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
/// triggering a flush due to RAM usage. Note that if flushing by
/// document count is also enabled, then the flush will be triggered by
/// whichever comes first.
///
/// The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.
///
/// IllegalArgumentException if ramBufferSize is /// enabled but non-positive, or it disables ramBufferSize /// when maxBufferedDocs is already disabled /// public virtual void SetRAMBufferSizeMB(double mb) { if (mb != DISABLE_AUTO_FLUSH && mb <= 0.0) throw new System.ArgumentException("ramBufferSize should be > 0.0 MB when enabled"); if (mb == DISABLE_AUTO_FLUSH && GetMaxBufferedDocs() == DISABLE_AUTO_FLUSH) throw new System.ArgumentException("at least one of ramBufferSize and maxBufferedDocs must be enabled"); docWriter.SetRAMBufferSizeMB(mb); if (infoStream != null) Message("setRAMBufferSizeMB " + mb); } /// Returns the value set by {@link #setRAMBufferSizeMB} if enabled. public virtual double GetRAMBufferSizeMB() { return docWriter.GetRAMBufferSizeMB(); } ///

/// Determines the minimal number of delete terms required before the
/// buffered in-memory delete terms are applied and flushed. If there
/// are documents buffered in memory at the time, they are merged and a
/// new segment is created.
///
/// Disabled by default (writer flushes by RAM usage).
///
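/// For example (the threshold is arbitrary):
/// 
///     writer.SetMaxBufferedDeleteTerms(100);   // apply deletes once 100 terms are buffered
///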
/// IllegalArgumentException if maxBufferedDeleteTerms /// is enabled but smaller than 1 /// /// /// public virtual void SetMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) { EnsureOpen(); if (maxBufferedDeleteTerms != DISABLE_AUTO_FLUSH && maxBufferedDeleteTerms < 1) throw new System.ArgumentException("maxBufferedDeleteTerms must at least be 1 when enabled"); docWriter.SetMaxBufferedDeleteTerms(maxBufferedDeleteTerms); if (infoStream != null) Message("setMaxBufferedDeleteTerms " + maxBufferedDeleteTerms); } /// Returns the number of buffered deleted terms that will /// trigger a flush if enabled. /// /// /// public virtual int GetMaxBufferedDeleteTerms() { EnsureOpen(); return docWriter.GetMaxBufferedDeleteTerms(); } /// Determines how often segment indices are merged by addDocument(). With /// smaller values, less RAM is used while indexing, and searches on /// unoptimized indices are faster, but indexing speed is slower. With larger /// values, more RAM is used during indexing, and while searches on unoptimized /// indices are slower, indexing is faster. Thus larger values (> 10) are best /// for batch index creation, and smaller values (< 10) for indices that are /// interactively maintained. /// ///

/// Note that this method is a convenience method: it just calls
/// mergePolicy.setMergeFactor as long as mergePolicy is an instance of
/// {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
/// This must never be less than 2. The default value is 10.
///
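/// For example (a larger factor suits batch indexing; 10 is the default):
/// 
///     writer.SetMergeFactor(30);   // merge less often, in larger batches
///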

public virtual void SetMergeFactor(int mergeFactor) { GetLogMergePolicy().SetMergeFactor(mergeFactor); } ///

/// Returns the number of segments that are merged at once and also
/// controls the total number of segments allowed to accumulate in the
/// index.
///
/// Note that this method is a convenience method: it just calls
/// mergePolicy.getMergeFactor as long as mergePolicy is an instance of
/// {@link LogMergePolicy}. Otherwise an IllegalArgumentException is
/// thrown.
///
/// /// public virtual int GetMergeFactor() { return GetLogMergePolicy().GetMergeFactor(); } /// If non-null, this will be the default infoStream used /// by a newly instantiated IndexWriter. /// /// /// public static void SetDefaultInfoStream(System.IO.TextWriter infoStream) { IndexWriter.defaultInfoStream = infoStream; } /// Returns the current default infoStream for newly /// instantiated IndexWriters. /// /// /// public static System.IO.TextWriter GetDefaultInfoStream() { return IndexWriter.defaultInfoStream; } /// If non-null, information about merges, deletes and a /// message when maxFieldLength is reached will be printed /// to this. /// public virtual void SetInfoStream(System.IO.TextWriter infoStream) { EnsureOpen(); this.infoStream = infoStream; SetMessageID(); docWriter.SetInfoStream(infoStream); deleter.SetInfoStream(infoStream); if (infoStream != null) MessageState(); } private void MessageState() { Message("setInfoStream: dir=" + directory + " autoCommit=" + autoCommit + " mergePolicy=" + mergePolicy + " mergeScheduler=" + mergeScheduler + " ramBufferSizeMB=" + docWriter.GetRAMBufferSizeMB() + " maxBuffereDocs=" + docWriter.GetMaxBufferedDocs() + " maxBuffereDeleteTerms=" + docWriter.GetMaxBufferedDeleteTerms() + " maxFieldLength=" + maxFieldLength + " index=" + SegString()); } /// Returns the current infoStream in use by this writer. /// /// public virtual System.IO.TextWriter GetInfoStream() { EnsureOpen(); return infoStream; } /// /// /// /// public virtual void SetWriteLockTimeout(long writeLockTimeout) { EnsureOpen(); this.writeLockTimeout = writeLockTimeout; } /// Returns allowed timeout when acquiring the write lock. /// /// public virtual long GetWriteLockTimeout() { EnsureOpen(); return writeLockTimeout; } /// Sets the default (for any instance of IndexWriter) maximum time to wait for a write lock (in /// milliseconds). /// public static void SetDefaultWriteLockTimeout(long writeLockTimeout) { IndexWriter.WRITE_LOCK_TIMEOUT = writeLockTimeout; } /// Returns default write lock timeout for newly /// instantiated IndexWriters. /// /// /// public static long GetDefaultWriteLockTimeout() { return IndexWriter.WRITE_LOCK_TIMEOUT; } /// Flushes all changes to an index and closes all /// associated files. /// ///

/// If an Exception is hit during close, e.g. due to disk full or some
/// other reason, then both the on-disk index and the internal state of
/// the IndexWriter instance will be consistent. However, the close will
/// not be complete even though part of it (flushing buffered documents)
/// may have succeeded, so the write lock will still be held.
///
/// If you can correct the underlying cause (e.g. free up some disk
/// space) then you can call close() again. Failing that, if you want to
/// force the write lock to be released (dangerous, because you may then
/// lose buffered docs in the IndexWriter instance) then you can do
/// something like this:
///
		/// try {
		///     writer.Close();
		/// } finally {
		///     if (IndexReader.IsLocked(directory)) {
		///         IndexReader.Unlock(directory);
		///     }
		/// }
		/// 
/// /// after which, you must be certain not to use the writer /// instance anymore.

///
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void Close() { Close(true); } /// Closes the index with or without waiting for currently /// running merges to finish. This is only meaningful when /// using a MergeScheduler that runs merges in background /// threads. /// /// if true, this call will block /// until all merges complete; else, it will ask all /// running merges to abort, wait until those merges have /// finished (which should be at most a few seconds), and /// then return. /// public virtual void Close(bool waitForMerges) { bool doClose; lock (this) { // Ensure that only one thread actually gets to do the closing: if (!closing) { doClose = true; closing = true; } else doClose = false; } if (doClose) CloseInternal(waitForMerges); // Another thread beat us to it (is actually doing the // close), so we will block until that other thread // has finished closing else WaitForClose(); } private void WaitForClose() { lock (this) { while (!closed && closing) { try { System.Threading.Monitor.Wait(this); } catch (System.Threading.ThreadInterruptedException ie) { } } } } private void CloseInternal(bool waitForMerges) { try { if (infoStream != null) Message("now flush at close"); docWriter.Close(); // Only allow a new merge to be triggered if we are // going to wait for merges: Flush(waitForMerges, true); mergePolicy.Close(); FinishMerges(waitForMerges); mergeScheduler.Close(); lock (this) { if (commitPending) { bool success = false; try { segmentInfos.Write(directory); // now commit changes success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception committing segments file during close"); DeletePartialSegmentsFile(); } } if (infoStream != null) Message("close: wrote segments file \"" + segmentInfos.GetCurrentSegmentFileName() + "\""); deleter.Checkpoint(segmentInfos, true); commitPending = false; rollbackSegmentInfos = null; } if (infoStream != null) Message("at close: " + SegString()); docWriter = null; deleter.Close(); } if (closeDir) directory.Close(); if (writeLock != null) { writeLock.Release(); // release write lock writeLock = null; } closed = true; } finally { lock (this) { if (!closed) closing = false; System.Threading.Monitor.PulseAll(this); } } } /// Tells the docWriter to close its currently open shared /// doc stores (stored fields & vectors files). /// Return value specifices whether new doc store files are compound or not. /// private bool FlushDocStores() { lock (this) { System.Collections.IList files = docWriter.Files(); bool useCompoundDocStore = false; if (files.Count > 0) { System.String docStoreSegment; bool success = false; try { docStoreSegment = docWriter.CloseDocStore(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception closing doc store segment"); docWriter.Abort(null); } } useCompoundDocStore = mergePolicy.UseCompoundDocStore(segmentInfos); if (useCompoundDocStore && docStoreSegment != null) { // Now build compound doc store file success = false; int numSegments = segmentInfos.Count; System.String compoundFileName = docStoreSegment + "." 
+ IndexFileNames.COMPOUND_FILE_STORE_EXTENSION; try { CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, compoundFileName); int size = files.Count; for (int i = 0; i < size; i++) cfsWriter.AddFile((System.String) files[i]); // Perform the merge cfsWriter.Close(); for (int i = 0; i < numSegments; i++) { SegmentInfo si = segmentInfos.Info(i); if (si.GetDocStoreOffset() != - 1 && si.GetDocStoreSegment().Equals(docStoreSegment)) si.SetDocStoreIsCompoundFile(true); } Checkpoint(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception building compound file doc store for segment " + docStoreSegment); // Rollback to no compound file for (int i = 0; i < numSegments; i++) { SegmentInfo si = segmentInfos.Info(i); if (si.GetDocStoreOffset() != - 1 && si.GetDocStoreSegment().Equals(docStoreSegment)) si.SetDocStoreIsCompoundFile(false); } deleter.DeleteFile(compoundFileName); DeletePartialSegmentsFile(); } } deleter.Checkpoint(segmentInfos, false); } } return useCompoundDocStore; } } /// Release the write lock, if needed. ~IndexWriter() { try { if (writeLock != null) { writeLock.Release(); // release write lock writeLock = null; } } finally { } } /// Returns the Directory used by this index. public virtual Directory GetDirectory() { EnsureOpen(); return directory; } /// Returns the analyzer used by this index. public virtual Analyzer GetAnalyzer() { EnsureOpen(); return analyzer; } /// Returns the number of documents currently in this index. public virtual int DocCount() { lock (this) { EnsureOpen(); int count = docWriter.GetNumDocsInRAM(); for (int i = 0; i < segmentInfos.Count; i++) { SegmentInfo si = segmentInfos.Info(i); count += si.docCount; } return count; } } /// The maximum number of terms that will be indexed for a single field in a /// document. This limits the amount of memory required for indexing, so that /// collections with very large files will not crash the indexing process by /// running out of memory.

/// Note that this effectively truncates large documents, excluding from
/// the index terms that occur further in the document. If you know your
/// source documents are large, be sure to set this value high enough to
/// accommodate the expected size. If you set it to Integer.MAX_VALUE,
/// then the only limit is your memory, but you should anticipate an
/// OutOfMemoryError.

/// By default, no more than 10,000 terms will be indexed for a field. /// ///

private int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH; /// Adds a document to this index. If the document contains more than /// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are /// discarded. /// ///

/// Note that if an Exception is hit (for example disk full) then the
/// index will be consistent, but this document may not have been added.
/// Furthermore, it's possible the index will have one segment in
/// non-compound format even when using compound files (when a merge has
/// partially succeeded).
///
/// This method periodically flushes pending documents to the Directory
/// (every {@link #setMaxBufferedDocs}), and also periodically merges
/// segments in the index (every {@link #setMergeFactor} flushes). When
/// this occurs, the method will take more time to run (possibly a long
/// time if the index is large), and will require free temporary space
/// in the Directory to do the merging.
///
/// The amount of free space required when a merge is triggered is up to
/// 1X the size of all segments being merged, when no readers/searchers
/// are open against the index, and up to 2X the size of all segments
/// being merged when readers/searchers are open against the index (see
/// {@link #Optimize()} for details). The sequence of primitive merge
/// operations performed is governed by the merge policy.
///
/// Note that each term in the document can be no longer than 16383
/// characters, otherwise an IllegalArgumentException will be thrown.
///
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void AddDocument(Document doc) { AddDocument(doc, analyzer); } /// Adds a document to this index, using the provided analyzer instead of the /// value of {@link #GetAnalyzer()}. If the document contains more than /// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are /// discarded. /// ///

/// See {@link #AddDocument(Document)} for details on index and
/// IndexWriter state after an Exception, and flushing/merging temporary
/// free space requirements.
///
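/// For example, analyzing one document with a different analyzer (the
/// WhitespaceAnalyzer here is only an illustration):
/// 
///     writer.AddDocument(doc, new WhitespaceAnalyzer());
///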
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void AddDocument(Document doc, Analyzer analyzer) { EnsureOpen(); bool doFlush = false; bool success = false; try { doFlush = docWriter.AddDocument(doc, analyzer); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception adding document"); lock (this) { // If docWriter has some aborted files that were // never incref'd, then we clean them up here if (docWriter != null) { System.Collections.IList files = docWriter.AbortedFiles(); if (files != null) deleter.DeleteNewFiles(files); } } } } if (doFlush) Flush(true, false); } /// Deletes the document(s) containing term. /// the term to identify the documents to be deleted /// /// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void DeleteDocuments(Term term) { EnsureOpen(); bool doFlush = docWriter.BufferDeleteTerm(term); if (doFlush) Flush(true, false); } /// Deletes the document(s) containing any of the /// terms. All deletes are flushed at the same time. /// /// array of terms to identify the documents /// to be deleted /// /// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void DeleteDocuments(Term[] terms) { EnsureOpen(); bool doFlush = docWriter.BufferDeleteTerms(terms); if (doFlush) Flush(true, false); } /// Updates a document by first deleting the document(s) /// containing term and then adding the new /// document. The delete and then add are atomic as seen /// by a reader on the same index (flush may happen only after /// the add). /// /// the term to identify the document(s) to be /// deleted /// /// the document to be added /// /// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void UpdateDocument(Term term, Document doc) { EnsureOpen(); UpdateDocument(term, doc, GetAnalyzer()); } /// Updates a document by first deleting the document(s) /// containing term and then adding the new /// document. The delete and then add are atomic as seen /// by a reader on the same index (flush may happen only after /// the add). 
/// /// the term to identify the document(s) to be /// deleted /// /// the document to be added /// /// the analyzer to use when analyzing the document /// /// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void UpdateDocument(Term term, Document doc, Analyzer analyzer) { EnsureOpen(); bool doFlush = false; bool success = false; try { doFlush = docWriter.UpdateDocument(term, doc, analyzer); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception updating document"); lock (this) { // If docWriter has some aborted files that were // never incref'd, then we clean them up here System.Collections.IList files = docWriter.AbortedFiles(); if (files != null) deleter.DeleteNewFiles(files); } } } if (doFlush) Flush(true, false); } // for test purpose public /*internal*/ int GetSegmentCount() { lock (this) { return segmentInfos.Count; } } // for test purpose public /*internal*/ int GetNumBufferedDocuments() { lock (this) { return docWriter.GetNumDocsInRAM(); } } // for test purpose public /*internal*/ int GetDocCount(int i) { lock (this) { if (i >= 0 && i < segmentInfos.Count) { return segmentInfos.Info(i).docCount; } else { return - 1; } } } internal System.String NewSegmentName() { // Cannot synchronize on IndexWriter because that causes // deadlock lock (segmentInfos) { // Important to set commitPending so that the // segmentInfos is written on close. Otherwise we // could close, re-open and re-return the same segment // name that was previously returned which can cause // problems at least with ConcurrentMergeScheduler. commitPending = true; return "_" + SupportClass.Number.ToString(segmentInfos.counter++); } } /// If non-null, information about merges will be printed to this. private System.IO.TextWriter infoStream = null; private static System.IO.TextWriter defaultInfoStream = null; /// Requests an "optimize" operation on an index, priming the index /// for the fastest available search. Traditionally this has meant /// merging all segments into a single segment as is done in the /// default merge policy, but individaul merge policies may implement /// optimize in different ways. /// /// /// /// ///

/// It is recommended that this method be called upon completion of
/// indexing. In environments with frequent updates, optimize is best
/// done during low volume times, if at all.
///
/// See http://www.gossamer-threads.com/lists/lucene/java-dev/47895 for
/// more discussion.
///
/// Note that this can require substantial temporary free space in the
/// Directory (see LUCENE-764 for details).
///
/// The actual temporary usage could be much less than these figures (it
/// depends on many factors).
///
/// In general, once the optimize completes, the total size of the index
/// will be less than the size of the starting index. It could be quite
/// a bit smaller (if there were many pending deletes) or just slightly
/// smaller.
///
/// If an Exception is hit during optimize(), for example due to disk
/// full, the index will not be corrupt and no documents will have been
/// lost. However, it may have been partially optimized (some segments
/// were merged but not all), and it's possible that one of the segments
/// in the index will be in non-compound format even when using compound
/// file format. This will occur when the Exception is hit during
/// conversion of the segment into compound format.
///
/// This call will optimize those segments present in the index when the
/// call started. If other threads are still adding documents and
/// flushing segments, those newly created segments will not be
/// optimized unless you call optimize again.
///
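/// For example, a non-blocking partial optimize down to at most 5
/// segments (the segment count is arbitrary; with the default
/// ConcurrentMergeScheduler the merges continue on background threads):
/// 
///     writer.Optimize(5, false);   // returns once the merges are scheduled
///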
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void Optimize() { Optimize(true); } /// Optimize the index down to <= maxNumSegments. If /// maxNumSegments==1 then this is the same as {@link /// #Optimize()}. /// /// maximum number of segments left /// in the index after optimization finishes /// public virtual void Optimize(int maxNumSegments) { Optimize(maxNumSegments, true); } /// Just like {@link #Optimize()}, except you can specify /// whether the call should block until the optimize /// completes. This is only meaningful with a /// {@link MergeScheduler} that is able to run merges in /// background threads. /// public virtual void Optimize(bool doWait) { Optimize(1, true); } /// Just like {@link #Optimize(int)}, except you can /// specify whether the call should block until the /// optimize completes. This is only meaningful with a /// {@link MergeScheduler} that is able to run merges in /// background threads. /// public virtual void Optimize(int maxNumSegments, bool doWait) { EnsureOpen(); if (maxNumSegments < 1) throw new System.ArgumentException("maxNumSegments must be >= 1; got " + maxNumSegments); if (infoStream != null) Message("optimize: index now " + SegString()); Flush(); lock (this) { ResetMergeExceptions(); segmentsToOptimize = new System.Collections.Hashtable(); int numSegments = segmentInfos.Count; for (int i = 0; i < numSegments; i++) if (!segmentsToOptimize.ContainsKey(segmentInfos.Info(i))) segmentsToOptimize.Add(segmentInfos.Info(i), segmentInfos.Info(i)); // Now mark all pending & running merges as optimize // merge: System.Collections.IEnumerator it = pendingMerges.GetEnumerator(); while (it.MoveNext()) { MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.Current; merge.optimize = true; merge.maxNumSegmentsOptimize = maxNumSegments; } it = runningMerges.GetEnumerator(); while (it.MoveNext()) { MergePolicy.OneMerge merge = (MergePolicy.OneMerge)((DictionaryEntry)it.Current).Value; merge.optimize = true; merge.maxNumSegmentsOptimize = maxNumSegments; } } MaybeMerge(maxNumSegments, true); if (doWait) { lock (this) { while (OptimizeMergesPending()) { try { System.Threading.Monitor.Wait(this); } catch (System.Threading.ThreadInterruptedException ie) { } if (mergeExceptions.Count > 0) { // Forward any exceptions in background merge // threads to the current thread: int size = mergeExceptions.Count; for (int i = 0; i < size; i++) { MergePolicy.OneMerge merge = (MergePolicy.OneMerge) mergeExceptions[0]; if (merge.optimize) { System.IO.IOException err = new System.IO.IOException("background merge hit exception: " + merge.SegString(directory), merge.GetException()); throw err; } } } } } } // NOTE: in the ConcurrentMergeScheduler case, when // doWait is false, we can return immediately while // background threads accomplish the optimization } /// Returns true if any merges in pendingMerges or /// runningMerges are optimization merges. 
/// private bool OptimizeMergesPending() { lock (this) { System.Collections.IEnumerator it = pendingMerges.GetEnumerator(); while (it.MoveNext()) { if (((MergePolicy.OneMerge) it.Current).optimize) return true; } it = runningMerges.GetEnumerator(); while (it.MoveNext()) { if (((MergePolicy.OneMerge) ((DictionaryEntry)it.Current).Value).optimize) return true; } return false; } } /// Expert: asks the mergePolicy whether any merges are /// necessary now and if so, runs the requested merges and /// then iterate (test again if merges are needed) until no /// more merges are returned by the mergePolicy. /// /// Explicit calls to maybeMerge() are usually not /// necessary. The most common case is when merge policy /// parameters have changed. /// public void MaybeMerge() { MaybeMerge(false); } private void MaybeMerge(bool optimize) { MaybeMerge(1, optimize); } private void MaybeMerge(int maxNumSegmentsOptimize, bool optimize) { UpdatePendingMerges(maxNumSegmentsOptimize, optimize); mergeScheduler.Merge(this); } private void UpdatePendingMerges(int maxNumSegmentsOptimize, bool optimize) { lock (this) { System.Diagnostics.Debug.Assert(!optimize || maxNumSegmentsOptimize > 0); if (stopMerges) return ; MergePolicy.MergeSpecification spec; if (optimize) { spec = mergePolicy.FindMergesForOptimize(segmentInfos, this, maxNumSegmentsOptimize, segmentsToOptimize); if (spec != null) { int numMerges = spec.merges.Count; for (int i = 0; i < numMerges; i++) { MergePolicy.OneMerge merge = ((MergePolicy.OneMerge) spec.merges[i]); merge.optimize = true; merge.maxNumSegmentsOptimize = maxNumSegmentsOptimize; } } } else spec = mergePolicy.FindMerges(segmentInfos, this); if (spec != null) { int numMerges = spec.merges.Count; for (int i = 0; i < numMerges; i++) RegisterMerge((MergePolicy.OneMerge) spec.merges[i]); } } } /// Expert: the {@link MergeScheduler} calls this method /// to retrieve the next merge requested by the /// MergePolicy /// public /*internal*/ virtual MergePolicy.OneMerge GetNextMerge() { lock (this) { if (pendingMerges.Count == 0) return null; else { // Advance the merge from pending to running System.Object tempObject; tempObject = pendingMerges[0]; pendingMerges.RemoveAt(0); MergePolicy.OneMerge merge = (MergePolicy.OneMerge) tempObject; runningMerges.Add(merge, merge); return merge; } } } /* * Begin a transaction. During a transaction, any segment * merges that happen (or ram segments flushed) will not * write a new segments file and will not remove any files * that were present at the start of the transaction. You * must make a matched (try/finally) call to * commitTransaction() or rollbackTransaction() to finish * the transaction. * * Note that buffered documents and delete terms are not handled * within the transactions, so they must be flushed before the * transaction is started. 
*/ private void StartTransaction() { if (infoStream != null) Message("now start transaction"); System.Diagnostics.Debug.Assert(docWriter.GetNumBufferedDeleteTerms() == 0, "calling startTransaction with buffered delete terms not supported"); System.Diagnostics.Debug.Assert(docWriter.GetNumDocsInRAM() == 0, "calling startTransaction with buffered documents not supported"); localRollbackSegmentInfos = (SegmentInfos) segmentInfos.Clone(); localAutoCommit = autoCommit; if (localAutoCommit) { if (infoStream != null) Message("flush at startTransaction"); Flush(); // Turn off auto-commit during our local transaction: autoCommit = false; } // We must "protect" our files at this point from // deletion in case we need to rollback: else deleter.IncRef(segmentInfos, false); } /* * Rolls back the transaction and restores state to where * we were at the start. */ private void RollbackTransaction() { if (infoStream != null) Message("now rollback transaction"); // First restore autoCommit in case we hit an exception below: autoCommit = localAutoCommit; // Keep the same segmentInfos instance but replace all // of its SegmentInfo instances. This is so the next // attempt to commit using this instance of IndexWriter // will always write to a new generation ("write once"). segmentInfos.Clear(); segmentInfos.AddRange(localRollbackSegmentInfos); localRollbackSegmentInfos = null; // Ask deleter to locate unreferenced files we had // created & remove them: deleter.Checkpoint(segmentInfos, false); if (!autoCommit) // Remove the incRef we did in startTransaction: deleter.DecRef(segmentInfos); deleter.Refresh(); FinishMerges(false); stopMerges = false; } /* * Commits the transaction. This will write the new * segments file and remove and pending deletions we have * accumulated during the transaction */ private void CommitTransaction() { if (infoStream != null) Message("now commit transaction"); // First restore autoCommit in case we hit an exception below: autoCommit = localAutoCommit; bool success = false; try { Checkpoint(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception committing transaction"); RollbackTransaction(); } } if (!autoCommit) // Remove the incRef we did in startTransaction. deleter.DecRef(localRollbackSegmentInfos); localRollbackSegmentInfos = null; // Give deleter a chance to remove files now: deleter.Checkpoint(segmentInfos, autoCommit); } /// Close the IndexWriter without committing /// any of the changes that have occurred since it was /// opened. This removes any temporary files that had been /// created, after which the state of the index will be the /// same as it was when this writer was first opened. This /// can only be called when this IndexWriter was opened /// with autoCommit=false. /// /// IllegalStateException if this is called when /// the writer was opened with autoCommit=true. 
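// Usage sketch for Abort(): a minimal "all or none" pattern, assuming a
// caller-supplied Directory "dir", Analyzer "analyzer", and document
// collection "docs" (all illustrative names), with the writer opened with
// autoCommit=false so nothing becomes visible until Close().
//
//   IndexWriter writer = new IndexWriter(dir, false, analyzer);  // autoCommit=false
//   try
//   {
//       foreach (Document doc in docs)
//           writer.AddDocument(doc);
//       writer.Close();       // commit: readers now see all of the changes
//   }
//   catch (System.Exception)
//   {
//       writer.Abort();       // discard every change made since the writer was opened
//       throw;
//   }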
/// /// IOException if there is a low-level IO error public virtual void Abort() { EnsureOpen(); if (autoCommit) throw new System.SystemException("abort() can only be called when IndexWriter was opened with autoCommit=false"); bool doClose; lock (this) { // Ensure that only one thread actually gets to do the closing: if (!closing) { doClose = true; closing = true; } else doClose = false; } if (doClose) { FinishMerges(false); // Must pre-close these two, in case they set // commitPending=true, so that we can then set it to // false before calling closeInternal mergePolicy.Close(); mergeScheduler.Close(); lock (this) { // Keep the same segmentInfos instance but replace all // of its SegmentInfo instances. This is so the next // attempt to commit using this instance of IndexWriter // will always write to a new generation ("write // once"). segmentInfos.Clear(); segmentInfos.AddRange(rollbackSegmentInfos); docWriter.Abort(null); // Ask deleter to locate unreferenced files & remove // them: deleter.Checkpoint(segmentInfos, false); deleter.Refresh(); } commitPending = false; CloseInternal(false); } else WaitForClose(); } private void FinishMerges(bool waitForMerges) { lock (this) { if (!waitForMerges) { stopMerges = true; // Abort all pending & running merges: System.Collections.IEnumerator it = pendingMerges.GetEnumerator(); while (it.MoveNext()) { MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.Current; if (infoStream != null) Message("now abort pending merge " + merge.SegString(directory)); merge.Abort(); MergeFinish(merge); } pendingMerges.Clear(); it = runningMerges.GetEnumerator(); while (it.MoveNext()) { MergePolicy.OneMerge merge = (MergePolicy.OneMerge)((DictionaryEntry)it.Current).Value; if (infoStream != null) Message("now abort running merge " + merge.SegString(directory)); merge.Abort(); } // These merges periodically check whether they have // been aborted, and stop if so. We wait here to make // sure they all stop. It should not take very long // because the merge threads periodically check if // they are aborted. while (runningMerges.Count > 0) { if (infoStream != null) Message("now wait for " + runningMerges.Count + " running merge to abort"); try { System.Threading.Monitor.Wait(this); } catch (System.Threading.ThreadInterruptedException ie) { SupportClass.ThreadClass.Current().Interrupt(); } } System.Diagnostics.Debug.Assert(0 == mergingSegments.Count); if (infoStream != null) Message("all running merges have aborted"); } else { while (pendingMerges.Count > 0 || runningMerges.Count > 0) { try { System.Threading.Monitor.Wait(this); } catch (System.Threading.ThreadInterruptedException ie) { } } System.Diagnostics.Debug.Assert(0 == mergingSegments.Count); } } } /* * Called whenever the SegmentInfos has been updated and * the index files referenced exist (correctly) in the * index directory. If we are in autoCommit mode, we * commit the change immediately. Else, we mark * commitPending. */ private void Checkpoint() { lock (this) { if (autoCommit) { segmentInfos.Write(directory); commitPending = false; if (infoStream != null) Message("checkpoint: wrote segments file \"" + segmentInfos.GetCurrentSegmentFileName() + "\""); } else { commitPending = true; } } } /// Merges all segments from an array of indexes into this index. /// ///

/// This may be used to parallelize batch indexing. A large document
/// collection can be broken into sub-collections. Each sub-collection can be
/// indexed in parallel, on a different thread, process or machine. The
/// complete index can then be created by merging sub-collection indexes
/// with this method.
///
/// NOTE: the index in each Directory must not be
/// changed (opened by a writer) while this method is
/// running. This method does not acquire a write lock in
/// each input Directory, so it is up to the caller to
/// enforce this.
///
/// After this completes, the index is optimized.
///
/// This method is transactional in how Exceptions are
/// handled: it does not commit a new segments_N file until
/// all indexes are added. This means if an Exception
/// occurs (for example disk full), then either no indexes
/// will have been added or they all will have been.
///
/// If an Exception is hit, it's still possible that all
/// indexes were successfully added. This happens when the
/// Exception is hit when trying to build a CFS file. In
/// this case, one segment in the index will be in non-CFS
/// format, even when using compound file format.
///
/// Also note that on an Exception, the index may still
/// have been partially or fully optimized even though none
/// of the input indexes were added.
///
/// Note that this requires temporary free space in the
/// Directory up to 2X the sum of all input indexes
/// (including the starting index). If readers/searchers
/// are open against the starting index, then temporary
/// free space required will be higher by the size of the
/// starting index (see {@link #Optimize()} for details).
///
/// Once this completes, the final size of the index
/// will be less than the sum of all input index sizes
/// (including the starting index). It could be quite a
/// bit smaller (if there were many pending deletes) or
/// just slightly smaller.
///
/// See LUCENE-702 for details.
///
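// Usage sketch for the parallel batch-indexing pattern described above: a
// minimal example, assuming the per-worker sub-indexes were built under
// paths "part0" and "part1" and that "analyzer" exists in the caller's code
// (all illustrative names).
//
//   Directory[] parts = new Directory[]
//   {
//       FSDirectory.GetDirectory("part0"),
//       FSDirectory.GetDirectory("part1")
//   };
//
//   IndexWriter writer = new IndexWriter(FSDirectory.GetDirectory("whole"), analyzer, true);
//   writer.AddIndexes(parts);   // transactional: a new segments_N is committed only if every part is added
//   writer.Close();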
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void AddIndexes(Directory[] dirs) { lock (this) { EnsureOpen(); if (infoStream != null) Message("flush at addIndexes"); Flush(); bool success = false; StartTransaction(); try { for (int i = 0; i < dirs.Length; i++) { SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.Read(dirs[i]); for (int j = 0; j < sis.Count; j++) { segmentInfos.Add(sis.Info(j)); // add each info } } Optimize(); success = true; } finally { if (success) { CommitTransaction(); } else { RollbackTransaction(); } } } } private void ResetMergeExceptions() { lock (this) { mergeExceptions = new System.Collections.ArrayList(); mergeGen++; } } /// Merges all segments from an array of indexes into this index. ///

/// This is similar to addIndexes(Directory[]). However, no optimize()
/// is called either at the beginning or at the end. Instead, merges
/// are carried out as necessary.
///
/// NOTE: the index in each Directory must not be
/// changed (opened by a writer) while this method is
/// running. This method does not acquire a write lock in
/// each input Directory, so it is up to the caller to
/// enforce this.
///
/// This requires that this index not be among those to be added, and that
/// the upper bound of each added segment's doc count not exceed maxMergeDocs.
///
/// See {@link #AddIndexes(Directory[])} for
/// details on transactional semantics, temporary free
/// space required in the Directory, and non-CFS segments
/// on an Exception.
///
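// Usage sketch: same set-up as the AddIndexes(Directory[]) example above
// ("part0"/"part1" are illustrative paths), but without the implicit
// optimize; ordinary merges are triggered as the merge policy requires.
// Note that the target index must not itself be one of the inputs.
//
//   writer.AddIndexesNoOptimize(new Directory[]
//   {
//       FSDirectory.GetDirectory("part0"),
//       FSDirectory.GetDirectory("part1")
//   });
//   writer.Close();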
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void AddIndexesNoOptimize(Directory[] dirs) { lock (this) { EnsureOpen(); if (infoStream != null) Message("flush at addIndexesNoOptimize"); Flush(); bool success = false; StartTransaction(); try { for (int i = 0; i < dirs.Length; i++) { if (directory == dirs[i]) { // cannot add this index: segments may be deleted in merge before added throw new System.ArgumentException("Cannot add this index to itself"); } SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.Read(dirs[i]); for (int j = 0; j < sis.Count; j++) { SegmentInfo info = sis.Info(j); segmentInfos.Add(info); // add each info } } MaybeMerge(); // If after merging there remain segments in the index // that are in a different directory, just copy these // over into our index. This is necessary (before // finishing the transaction) to avoid leaving the // index in an unusable (inconsistent) state. CopyExternalSegments(); success = true; } finally { if (success) { CommitTransaction(); } else { RollbackTransaction(); } } } } /* If any of our segments are using a directory != ours * then copy them over. Currently this is only used by * addIndexesNoOptimize(). */ private void CopyExternalSegments() { lock (this) { int numSegments = segmentInfos.Count; for (int i = 0; i < numSegments; i++) { SegmentInfo info = segmentInfos.Info(i); if (info.dir != directory) { MergePolicy.OneMerge merge = new MergePolicy.OneMerge(segmentInfos.Range(i, 1 + i), info.GetUseCompoundFile()); if (RegisterMerge(merge)) { pendingMerges.Remove(merge); runningMerges.Add(merge, merge); Merge(merge); } // This means there is a bug in the // MergeScheduler. MergeSchedulers in general are // not allowed to run a merge involving segments // external to this IndexWriter's directory in the // background because this would put the index // into an inconsistent state (where segmentInfos // has been written with such external segments // that an IndexReader would fail to load). else throw new MergePolicy.MergeException("segment \"" + info.name + " exists in external directory yet the MergeScheduler executed the merge in a separate thread"); } } } } /// Merges the provided indexes into this index. ///

/// After this completes, the index is optimized.
///
/// The provided IndexReaders are not closed.
///
/// See {@link #AddIndexes(Directory[])} for
/// details on transactional semantics, temporary free
/// space required in the Directory, and non-CFS segments
/// on an Exception.
///
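// Usage sketch: merging via already-open readers instead of Directories,
// assuming "writer", "dirA" and "dirB" exist in the caller's code
// (illustrative names). The readers are left open by AddIndexes and must be
// closed by the caller.
//
//   IndexReader[] readers = new IndexReader[]
//   {
//       IndexReader.Open(dirA),
//       IndexReader.Open(dirB)
//   };
//   writer.AddIndexes(readers);
//   foreach (IndexReader r in readers)
//       r.Close();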
/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public virtual void AddIndexes(IndexReader[] readers) { lock (this) { EnsureOpen(); Optimize(); // start with zero or 1 seg System.String mergedName = NewSegmentName(); SegmentMerger merger = new SegmentMerger(this, mergedName, null); SegmentInfo info; IndexReader sReader = null; try { if (segmentInfos.Count == 1) { // add existing index, if any sReader = SegmentReader.Get(segmentInfos.Info(0)); merger.Add(sReader); } for (int i = 0; i < readers.Length; i++) // add new indexes merger.Add(readers[i]); bool success = false; StartTransaction(); try { int docCount = merger.Merge(); // merge 'em if (sReader != null) { sReader.Close(); sReader = null; } segmentInfos.RemoveRange(0, segmentInfos.Count); // pop old infos & add new info = new SegmentInfo(mergedName, docCount, directory, false, true, - 1, null, false); segmentInfos.Add(info); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception in addIndexes during merge"); RollbackTransaction(); } else { CommitTransaction(); } } } finally { if (sReader != null) { sReader.Close(); } } if (mergePolicy is LogMergePolicy && GetUseCompoundFile()) { bool success = false; StartTransaction(); try { merger.CreateCompoundFile(mergedName + ".cfs"); info.SetUseCompoundFile(true); } finally { if (!success) { if (infoStream != null) Message("hit exception building compound file in addIndexes during merge"); RollbackTransaction(); } else { CommitTransaction(); } } } } } // This is called after pending added and deleted // documents have been flushed to the Directory but before // the change is committed (new segments_N file written). internal virtual void DoAfterFlush() { } /// Flush all in-memory buffered updates (adds and deletes) /// to the Directory. ///

/// Note: if autoCommit=false, flushed data is still
/// not visible to readers until {@link #close} is called.
///
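// Usage sketch: forcing buffered changes out to the Directory, assuming a
// writer opened with autoCommit=true and an existing Document "doc"
// (illustrative); with autoCommit=true each flush also writes a segments_N
// that reopened readers can see, whereas with autoCommit=false the files are
// written but remain invisible until Close().
//
//   writer.AddDocument(doc);
//   writer.DeleteDocuments(new Term("id", "42"));   // illustrative field/value
//   writer.Flush();                                 // push buffered adds + deletes to disk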

/// CorruptIndexException if the index is corrupt /// IOException if there is a low-level IO error public void Flush() { Flush(true, false); } /// Flush all in-memory buffered udpates (adds and deletes) /// to the Directory. /// /// if true, we may merge segments (if /// deletes or docs were flushed) if necessary /// /// if false we are allowed to keep /// doc stores open to share with the next segment /// public /*protected internal*/ void Flush(bool triggerMerge, bool flushDocStores) { EnsureOpen(); if (DoFlush(flushDocStores) && triggerMerge) MaybeMerge(); } private bool DoFlush(bool flushDocStores) { lock (this) { // Make sure no threads are actively adding a document // Returns true if docWriter is currently aborting, in // which case we skip flushing this segment if (docWriter.PauseAllThreads()) { docWriter.ResumeAllThreads(); return false; } try { SegmentInfo newSegment = null; int numDocs = docWriter.GetNumDocsInRAM(); // Always flush docs if there are any bool flushDocs = numDocs > 0; // With autoCommit=true we always must flush the doc // stores when we flush flushDocStores |= autoCommit; System.String docStoreSegment = docWriter.GetDocStoreSegment(); if (docStoreSegment == null) flushDocStores = false; // Always flush deletes if there are any delete terms. // TODO: when autoCommit=false we don't have to flush // deletes with every flushed segment; we can save // CPU/IO by buffering longer & flushing deletes only // when they are full or writer is being closed. We // have to fix the "applyDeletesSelectively" logic to // apply to more than just the last flushed segment bool flushDeletes = docWriter.HasDeletes(); if (infoStream != null) { Message(" flush: segment=" + docWriter.GetSegment() + " docStoreSegment=" + docWriter.GetDocStoreSegment() + " docStoreOffset=" + docWriter.GetDocStoreOffset() + " flushDocs=" + flushDocs + " flushDeletes=" + flushDeletes + " flushDocStores=" + flushDocStores + " numDocs=" + numDocs + " numBufDelTerms=" + docWriter.GetNumBufferedDeleteTerms()); Message(" index before flush " + SegString()); } int docStoreOffset = docWriter.GetDocStoreOffset(); // docStoreOffset should only be non-zero when // autoCommit == false System.Diagnostics.Debug.Assert(!autoCommit || 0 == docStoreOffset); bool docStoreIsCompoundFile = false; // Check if the doc stores must be separately flushed // because other segments, besides the one we are about // to flush, reference it if (flushDocStores && (!flushDocs || !docWriter.GetSegment().Equals(docWriter.GetDocStoreSegment()))) { // We must separately flush the doc store if (infoStream != null) Message(" flush shared docStore segment " + docStoreSegment); docStoreIsCompoundFile = FlushDocStores(); flushDocStores = false; } System.String segment = docWriter.GetSegment(); // If we are flushing docs, segment must not be null: System.Diagnostics.Debug.Assert(segment != null || !flushDocs); if (flushDocs || flushDeletes) { SegmentInfos rollback = null; if (flushDeletes) rollback = (SegmentInfos) segmentInfos.Clone(); bool success = false; try { if (flushDocs) { if (0 == docStoreOffset && flushDocStores) { // This means we are flushing private doc stores // with this segment, so it will not be shared // with other segments System.Diagnostics.Debug.Assert(docStoreSegment != null); System.Diagnostics.Debug.Assert(docStoreSegment.Equals(segment)); docStoreOffset = - 1; docStoreIsCompoundFile = false; docStoreSegment = null; } int flushedDocCount = docWriter.Flush(flushDocStores); newSegment = new SegmentInfo(segment, flushedDocCount, 
directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile); segmentInfos.Add(newSegment); } if (flushDeletes) { // we should be able to change this so we can // buffer deletes longer and then flush them to // multiple flushed segments, when // autoCommit=false ApplyDeletes(flushDocs); DoAfterFlush(); } Checkpoint(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception flushing segment " + segment); if (flushDeletes) { // Carefully check if any partial .del files // should be removed: int size = rollback.Count; for (int i = 0; i < size; i++) { System.String newDelFileName = segmentInfos.Info(i).GetDelFileName(); System.String delFileName = rollback.Info(i).GetDelFileName(); if (newDelFileName != null && !newDelFileName.Equals(delFileName)) deleter.DeleteFile(newDelFileName); } // Fully replace the segmentInfos since flushed // deletes could have changed any of the // SegmentInfo instances: segmentInfos.Clear(); segmentInfos.AddRange(rollback); } else { // Remove segment we added, if any: if (newSegment != null && segmentInfos.Count > 0 && segmentInfos.Info(segmentInfos.Count - 1) == newSegment) segmentInfos.RemoveAt(segmentInfos.Count - 1); } if (flushDocs) docWriter.Abort(null); DeletePartialSegmentsFile(); deleter.Checkpoint(segmentInfos, false); if (segment != null) deleter.Refresh(segment); } } deleter.Checkpoint(segmentInfos, autoCommit); if (flushDocs && mergePolicy.UseCompoundFile(segmentInfos, newSegment)) { success = false; try { docWriter.CreateCompoundFile(segment); newSegment.SetUseCompoundFile(true); Checkpoint(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception creating compound file for newly flushed segment " + segment); newSegment.SetUseCompoundFile(false); deleter.DeleteFile(segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION); DeletePartialSegmentsFile(); } } deleter.Checkpoint(segmentInfos, autoCommit); } return true; } else { return false; } } finally { docWriter.ClearFlushPending(); docWriter.ResumeAllThreads(); } } } /// Expert: Return the total size of all index files currently cached in memory. /// Useful for size management with flushRamDocs() /// public long RamSizeInBytes() { EnsureOpen(); return docWriter.GetRAMUsed(); } /// Expert: Return the number of documents whose segments are currently cached in memory. 
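// Usage sketch: flushing by buffered RAM instead of by document count,
// assuming an illustrative 32 MB budget. (SetRAMBufferSizeMB is the usual way
// to have this happen automatically; the explicit check below only
// illustrates RamSizeInBytes.)
//
//   writer.AddDocument(doc);
//   if (writer.RamSizeInBytes() > 32 * 1024 * 1024)
//       writer.Flush();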
/// Useful when calling flush() /// public int NumRamDocs() { lock (this) { EnsureOpen(); return docWriter.GetNumDocsInRAM(); } } private int EnsureContiguousMerge(MergePolicy.OneMerge merge) { int first = segmentInfos.IndexOf(merge.segments.Info(0)); if (first == - 1) throw new MergePolicy.MergeException("could not find segment " + merge.segments.Info(0).name + " in current segments"); int numSegments = segmentInfos.Count; int numSegmentsToMerge = merge.segments.Count; for (int i = 0; i < numSegmentsToMerge; i++) { SegmentInfo info = merge.segments.Info(i); if (first + i >= numSegments || !segmentInfos.Info(first + i).Equals(info)) { if (segmentInfos.IndexOf(info) == - 1) throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is not in the index"); else { throw new MergePolicy.MergeException("MergePolicy selected non-contiguous segments to merge (" + merge + " vs " + SegString() + "), which IndexWriter (currently) cannot handle"); } } } return first; } /* FIXME if we want to support non-contiguous segment merges */ private bool CommitMerge(MergePolicy.OneMerge merge) { lock (this) { System.Diagnostics.Debug.Assert(merge.registerDone); // If merge was explicitly aborted, or, if abort() or // rollbackTransaction() had been called since our merge // started (which results in an unqualified // deleter.refresh() call that will remove any index // file that current segments does not reference), we // abort this merge if (merge.IsAborted()) { if (infoStream != null) Message("commitMerge: skipping merge " + merge.SegString(directory) + ": it was aborted"); System.Diagnostics.Debug.Assert(merge.increfDone); DecrefMergeSegments(merge); deleter.Refresh(merge.info.name); return false; } bool success = false; int start; try { SegmentInfos sourceSegmentsClone = merge.segmentsClone; SegmentInfos sourceSegments = merge.segments; start = EnsureContiguousMerge(merge); if (infoStream != null) Message("commitMerge " + merge.SegString(directory)); // Carefully merge deletes that occurred after we // started merging: BitVector deletes = null; int docUpto = 0; int numSegmentsToMerge = sourceSegments.Count; for (int i = 0; i < numSegmentsToMerge; i++) { SegmentInfo previousInfo = sourceSegmentsClone.Info(i); SegmentInfo currentInfo = sourceSegments.Info(i); System.Diagnostics.Debug.Assert(currentInfo.docCount == previousInfo.docCount); int docCount = currentInfo.docCount; if (previousInfo.HasDeletions()) { // There were deletes on this segment when the merge // started. The merge has collapsed away those // deletes, but, if new deletes were flushed since // the merge started, we must now carefully keep any // newly flushed deletes but mapping them to the new // docIDs. 
System.Diagnostics.Debug.Assert(currentInfo.HasDeletions()); // Load deletes present @ start of merge, for this segment: BitVector previousDeletes = new BitVector(previousInfo.dir, previousInfo.GetDelFileName()); if (!currentInfo.GetDelFileName().Equals(previousInfo.GetDelFileName())) { // This means this segment has had new deletes // committed since we started the merge, so we // must merge them: if (deletes == null) deletes = new BitVector(merge.info.docCount); BitVector currentDeletes = new BitVector(currentInfo.dir, currentInfo.GetDelFileName()); for (int j = 0; j < docCount; j++) { if (previousDeletes.Get(j)) System.Diagnostics.Debug.Assert(currentDeletes.Get(j)); else { if (currentDeletes.Get(j)) deletes.Set(docUpto); docUpto++; } } } else docUpto += docCount - previousDeletes.Count(); } else if (currentInfo.HasDeletions()) { // This segment had no deletes before but now it // does: if (deletes == null) deletes = new BitVector(merge.info.docCount); BitVector currentDeletes = new BitVector(directory, currentInfo.GetDelFileName()); for (int j = 0; j < docCount; j++) { if (currentDeletes.Get(j)) deletes.Set(docUpto); docUpto++; } } // No deletes before or after else docUpto += currentInfo.docCount; merge.CheckAborted(directory); } if (deletes != null) { merge.info.AdvanceDelGen(); deletes.Write(directory, merge.info.GetDelFileName()); } success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception creating merged deletes file"); deleter.Refresh(merge.info.name); } } // Simple optimization: if the doc store we are using // has been closed and is in now compound format (but // wasn't when we started), then we will switch to the // compound format as well: System.String mergeDocStoreSegment = merge.info.GetDocStoreSegment(); if (mergeDocStoreSegment != null && !merge.info.GetDocStoreIsCompoundFile()) { int size = segmentInfos.Count; for (int i = 0; i < size; i++) { SegmentInfo info = segmentInfos.Info(i); System.String docStoreSegment = info.GetDocStoreSegment(); if (docStoreSegment != null && docStoreSegment.Equals(mergeDocStoreSegment) && info.GetDocStoreIsCompoundFile()) { merge.info.SetDocStoreIsCompoundFile(true); break; } } } success = false; SegmentInfos rollback = null; try { rollback = (SegmentInfos) segmentInfos.Clone(); ((System.Collections.IList) ((System.Collections.ArrayList) segmentInfos).GetRange(start, start + merge.segments.Count - start)).Clear(); segmentInfos.Insert(start, merge.info); Checkpoint(); success = true; } finally { if (!success && rollback != null) { if (infoStream != null) Message("hit exception when checkpointing after merge"); segmentInfos.Clear(); segmentInfos.AddRange(rollback); DeletePartialSegmentsFile(); deleter.Refresh(merge.info.name); } } if (merge.optimize) segmentsToOptimize.Add(merge.info, merge.info); // Must checkpoint before decrefing so any newly // referenced files in the new merge.info are incref'd // first: deleter.Checkpoint(segmentInfos, autoCommit); DecrefMergeSegments(merge); return true; } } private void DecrefMergeSegments(MergePolicy.OneMerge merge) { SegmentInfos sourceSegmentsClone = merge.segmentsClone; int numSegmentsToMerge = sourceSegmentsClone.Count; System.Diagnostics.Debug.Assert(merge.increfDone); merge.increfDone = false; for (int i = 0; i < numSegmentsToMerge; i++) { SegmentInfo previousInfo = sourceSegmentsClone.Info(i); // Decref all files for this SegmentInfo (this // matches the incref in mergeInit): if (previousInfo.dir == directory) deleter.DecRef(previousInfo.Files()); } } /// 
Merges the indicated segments, replacing them in the stack with a /// single segment. /// public /*internal*/ void Merge(MergePolicy.OneMerge merge) { System.Diagnostics.Debug.Assert(merge.registerDone); System.Diagnostics.Debug.Assert(!merge.optimize || merge.maxNumSegmentsOptimize > 0); bool success = false; try { try { if (merge.info == null) MergeInit(merge); if (infoStream != null) Message("now merge\n merge=" + merge.SegString(directory) + "\n index=" + SegString()); MergeMiddle(merge); success = true; } catch (MergePolicy.MergeAbortedException e) { merge.SetException(e); AddMergeException(merge); // We can ignore this exception, unless the merge // involves segments from external directories, in // which case we must throw it so, for example, the // rollbackTransaction code in addIndexes* is // executed. if (merge.isExternal) throw e; } } finally { lock (this) { try { if (!success && infoStream != null) Message("hit exception during merge"); MergeFinish(merge); // This merge (and, generally, any change to the // segments) may now enable new merges, so we call // merge policy & update pending merges. if (success && !merge.IsAborted() && !closed && !closing) UpdatePendingMerges(merge.maxNumSegmentsOptimize, merge.optimize); } finally { runningMerges.Remove(merge); // Optimize may be waiting on the final optimize // merge to finish; and finishMerges() may be // waiting for all merges to finish: System.Threading.Monitor.PulseAll(this); } } } } /// Checks whether this merge involves any segments /// already participating in a merge. If not, this merge /// is "registered", meaning we record that its segments /// are now participating in a merge, and true is /// returned. Else (the merge conflicts) false is /// returned. /// internal bool RegisterMerge(MergePolicy.OneMerge merge) { lock (this) { if (merge.registerDone) return true; int count = merge.segments.Count; bool isExternal = false; for (int i = 0; i < count; i++) { SegmentInfo info = merge.segments.Info(i); if (mergingSegments.Contains(info)) return false; if (segmentInfos.IndexOf(info) == - 1) return false; if (info.dir != directory) isExternal = true; } pendingMerges.Add(merge); if (infoStream != null) Message("add merge to pendingMerges: " + merge.SegString(directory) + " [total " + pendingMerges.Count + " pending]"); merge.mergeGen = mergeGen; merge.isExternal = isExternal; // OK it does not conflict; now record that this merge // is running (while synchronized) to avoid race // condition where two conflicting merges from different // threads, start for (int i = 0; i < count; i++) if (!mergingSegments.Contains(merge.segments.Info(i))) mergingSegments.Add(merge.segments.Info(i), merge.segments.Info(i)); // Merge is now registered merge.registerDone = true; return true; } } /// Does initial setup for a merge, which is fast but holds /// the synchronized lock on IndexWriter instance. /// internal void MergeInit(MergePolicy.OneMerge merge) { lock (this) { System.Diagnostics.Debug.Assert(merge.registerDone); if (merge.IsAborted()) return ; SegmentInfos sourceSegments = merge.segments; int end = sourceSegments.Count; EnsureContiguousMerge(merge); // Check whether this merge will allow us to skip // merging the doc stores (stored field & vectors). // This is a very substantial optimization (saves tons // of IO) that can only be applied with // autoCommit=false. 
Directory lastDir = directory; System.String lastDocStoreSegment = null; int next = - 1; bool mergeDocStores = false; bool doFlushDocStore = false; System.String currentDocStoreSegment = docWriter.GetDocStoreSegment(); // Test each segment to be merged: check if we need to // flush/merge doc stores for (int i = 0; i < end; i++) { SegmentInfo si = sourceSegments.Info(i); // If it has deletions we must merge the doc stores if (si.HasDeletions()) mergeDocStores = true; // If it has its own (private) doc stores we must // merge the doc stores if (- 1 == si.GetDocStoreOffset()) mergeDocStores = true; // If it has a different doc store segment than // previous segments, we must merge the doc stores System.String docStoreSegment = si.GetDocStoreSegment(); if (docStoreSegment == null) mergeDocStores = true; else if (lastDocStoreSegment == null) lastDocStoreSegment = docStoreSegment; else if (!lastDocStoreSegment.Equals(docStoreSegment)) mergeDocStores = true; // Segments' docScoreOffsets must be in-order, // contiguous. For the default merge policy now // this will always be the case but for an arbitrary // merge policy this may not be the case if (- 1 == next) next = si.GetDocStoreOffset() + si.docCount; else if (next != si.GetDocStoreOffset()) mergeDocStores = true; else next = si.GetDocStoreOffset() + si.docCount; // If the segment comes from a different directory // we must merge if (lastDir != si.dir) mergeDocStores = true; // If the segment is referencing the current "live" // doc store outputs then we must merge if (si.GetDocStoreOffset() != - 1 && currentDocStoreSegment != null && si.GetDocStoreSegment().Equals(currentDocStoreSegment)) doFlushDocStore = true; } int docStoreOffset; System.String docStoreSegment2; bool docStoreIsCompoundFile; if (mergeDocStores) { docStoreOffset = - 1; docStoreSegment2 = null; docStoreIsCompoundFile = false; } else { SegmentInfo si = sourceSegments.Info(0); docStoreOffset = si.GetDocStoreOffset(); docStoreSegment2 = si.GetDocStoreSegment(); docStoreIsCompoundFile = si.GetDocStoreIsCompoundFile(); } if (mergeDocStores && doFlushDocStore) { // SegmentMerger intends to merge the doc stores // (stored fields, vectors), and at least one of the // segments to be merged refers to the currently // live doc stores. // TODO: if we know we are about to merge away these // newly flushed doc store files then we should not // make compound file out of them... if (infoStream != null) Message("flush at merge"); Flush(false, true); } // We must take a full copy at this point so that we can // properly merge deletes in commitMerge() merge.segmentsClone = (SegmentInfos) merge.segments.Clone(); for (int i = 0; i < end; i++) { SegmentInfo si = merge.segmentsClone.Info(i); // IncRef all files for this segment info to make sure // they are not removed while we are trying to merge. if (si.dir == directory) deleter.IncRef(si.Files()); } merge.increfDone = true; merge.mergeDocStores = mergeDocStores; // Bind a new segment name here so even with // ConcurrentMergePolicy we keep deterministic segment // names. merge.info = new SegmentInfo(NewSegmentName(), 0, directory, false, true, docStoreOffset, docStoreSegment2, docStoreIsCompoundFile); // Also enroll the merged segment into mergingSegments; // this prevents it from getting selected for a merge // after our merge is done but while we are building the // CFS: mergingSegments.Add(merge.info, merge.info); } } /// Does fininishing for a merge, which is fast but holds /// the synchronized lock on IndexWriter instance. 
/// internal void MergeFinish(MergePolicy.OneMerge merge) { lock (this) { if (merge.increfDone) DecrefMergeSegments(merge); System.Diagnostics.Debug.Assert(merge.registerDone); SegmentInfos sourceSegments = merge.segments; int end = sourceSegments.Count; for (int i = 0; i < end; i++) mergingSegments.Remove(sourceSegments.Info(i)); if (merge.info != null) mergingSegments.Remove(merge.info); merge.registerDone = false; } } /// Does the actual (time-consuming) work of the merge, /// but without holding synchronized lock on IndexWriter /// instance /// private int MergeMiddle(MergePolicy.OneMerge merge) { merge.CheckAborted(directory); System.String mergedName = merge.info.name; SegmentMerger merger = null; int mergedDocCount = 0; SegmentInfos sourceSegments = merge.segments; SegmentInfos sourceSegmentsClone = merge.segmentsClone; int numSegments = sourceSegments.Count; if (infoStream != null) Message("merging " + merge.SegString(directory)); merger = new SegmentMerger(this, mergedName, merge); // This is try/finally to make sure merger's readers are // closed: bool success = false; try { int totDocCount = 0; for (int i = 0; i < numSegments; i++) { SegmentInfo si = sourceSegmentsClone.Info(i); IndexReader reader = SegmentReader.Get(si, MERGE_READ_BUFFER_SIZE, merge.mergeDocStores); // no need to set deleter (yet) merger.Add(reader); totDocCount += reader.NumDocs(); } if (infoStream != null) { Message("merge: total " + totDocCount + " docs"); } merge.CheckAborted(directory); mergedDocCount = merge.info.docCount = merger.Merge(merge.mergeDocStores); System.Diagnostics.Debug.Assert(mergedDocCount == totDocCount); success = true; } finally { // close readers before we attempt to delete // now-obsolete segments if (merger != null) { merger.CloseReaders(); } if (!success) { if (infoStream != null) Message("hit exception during merge; now refresh deleter on segment " + mergedName); lock (this) { AddMergeException(merge); deleter.Refresh(mergedName); } } } if (!CommitMerge(merge)) // commitMerge will return false if this merge was aborted return 0; if (merge.useCompoundFile) { success = false; bool skip = false; System.String compoundFileName = mergedName + "." + IndexFileNames.COMPOUND_FILE_EXTENSION; try { try { merger.CreateCompoundFile(compoundFileName); success = true; } catch (System.IO.IOException ioe) { lock (this) { if (segmentInfos.IndexOf(merge.info) == - 1) { // If another merge kicked in and merged our // new segment away while we were trying to // build the compound file, we can hit a // FileNotFoundException and possibly // IOException over NFS. We can tell this has // happened because our SegmentInfo is no // longer in the segments; if this has // happened it is safe to ignore the exception // & skip finishing/committing our compound // file creating. if (infoStream != null) Message("hit exception creating compound file; ignoring it because our info (segment " + merge.info.name + ") has been merged away"); skip = true; } else throw ioe; } } } finally { if (!success) { if (infoStream != null) Message("hit exception creating compound file during merge: skip=" + skip); lock (this) { if (!skip) AddMergeException(merge); deleter.DeleteFile(compoundFileName); } } } if (!skip) { lock (this) { if (skip || segmentInfos.IndexOf(merge.info) == - 1 || merge.IsAborted()) { // Our segment (committed in non-compound // format) got merged away while we were // building the compound format. 
deleter.DeleteFile(compoundFileName); } else { success = false; try { merge.info.SetUseCompoundFile(true); Checkpoint(); success = true; } finally { if (!success) { if (infoStream != null) Message("hit exception checkpointing compound file during merge"); // Must rollback: AddMergeException(merge); merge.info.SetUseCompoundFile(false); DeletePartialSegmentsFile(); deleter.DeleteFile(compoundFileName); } } // Give deleter a chance to remove files now. deleter.Checkpoint(segmentInfos, autoCommit); } } } } return mergedDocCount; } internal virtual void AddMergeException(MergePolicy.OneMerge merge) { lock (this) { if (!mergeExceptions.Contains(merge) && mergeGen == merge.mergeGen) mergeExceptions.Add(merge); } } private void DeletePartialSegmentsFile() { if (segmentInfos.GetLastGeneration() != segmentInfos.GetGeneration()) { System.String segmentFileName = IndexFileNames.FileNameFromGeneration(IndexFileNames.SEGMENTS, "", segmentInfos.GetGeneration()); if (infoStream != null) Message("now delete partial segments file \"" + segmentFileName + "\""); deleter.DeleteFile(segmentFileName); } } // Called during flush to apply any buffered deletes. If // flushedNewSegment is true then a new segment was just // created and flushed from the ram segments, so we will // selectively apply the deletes to that new segment. private void ApplyDeletes(bool flushedNewSegment) { System.Collections.Hashtable bufferedDeleteTerms = docWriter.GetBufferedDeleteTerms(); System.Collections.IList bufferedDeleteDocIDs = docWriter.GetBufferedDeleteDocIDs(); if (infoStream != null) Message("flush " + docWriter.GetNumBufferedDeleteTerms() + " buffered deleted terms and " + bufferedDeleteDocIDs.Count + " deleted docIDs on " + segmentInfos.Count + " segments."); if (flushedNewSegment) { IndexReader reader = null; try { // Open readers w/o opening the stored fields / // vectors because these files may still be held // open for writing by docWriter reader = SegmentReader.Get(segmentInfos.Info(segmentInfos.Count - 1), false); // Apply delete terms to the segment just flushed from ram // apply appropriately so that a delete term is only applied to // the documents buffered before it, not those buffered after it. ApplyDeletesSelectively(bufferedDeleteTerms, bufferedDeleteDocIDs, reader); } finally { if (reader != null) { try { reader.DoCommit(); } finally { reader.DoClose(); } } } } int infosEnd = segmentInfos.Count; if (flushedNewSegment) { infosEnd--; } for (int i = 0; i < infosEnd; i++) { IndexReader reader = null; try { reader = SegmentReader.Get(segmentInfos.Info(i), false); // Apply delete terms to disk segments // except the one just flushed from ram. ApplyDeletes(bufferedDeleteTerms, reader); } finally { if (reader != null) { try { reader.DoCommit(); } finally { reader.DoClose(); } } } } // Clean up bufferedDeleteTerms. docWriter.ClearBufferedDeletes(); } // For test purposes. public /*internal*/ int GetBufferedDeleteTermsSize() { lock (this) { return docWriter.GetBufferedDeleteTerms().Count; } } // For test purposes. public /*internal*/ int GetNumBufferedDeleteTerms() { lock (this) { return docWriter.GetNumBufferedDeleteTerms(); } } // Apply buffered delete terms to the segment just flushed from ram // apply appropriately so that a delete term is only applied to // the documents buffered before it, not those buffered after it. 
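// Illustration of the cutoff rule implemented below, phrased in terms of the
// public API (document contents and the resulting doc numbers are made up):
//
//   writer.AddDocument(docA);                       // buffered as doc 0
//   writer.DeleteDocuments(new Term("id", "A"));    // buffered delete records num == 1
//   writer.AddDocument(docAagain);                  // buffered as doc 1, also matches id:A
//   writer.Flush();
//   // Only doc 0 is deleted: the buffered term is applied solely to documents
//   // buffered before the DeleteDocuments call (doc < num), so doc 1 survives.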
private void ApplyDeletesSelectively(System.Collections.Hashtable deleteTerms, System.Collections.IList deleteIds, IndexReader reader) { System.Collections.IEnumerator iter = new System.Collections.Hashtable(deleteTerms).GetEnumerator(); while (iter.MoveNext()) { System.Collections.DictionaryEntry entry = (System.Collections.DictionaryEntry) iter.Current; Term term = (Term) entry.Key; TermDocs docs = reader.TermDocs(term); if (docs != null) { int num = ((DocumentsWriter.Num) entry.Value).GetNum(); try { while (docs.Next()) { int doc = docs.Doc(); if (doc >= num) { break; } reader.DeleteDocument(doc); } } finally { docs.Close(); } } } if (deleteIds.Count > 0) { iter = deleteIds.GetEnumerator(); while (iter.MoveNext()) { reader.DeleteDocument(((System.Int32) iter.Current)); } } } // Apply buffered delete terms to this reader. private void ApplyDeletes(System.Collections.Hashtable deleteTerms, IndexReader reader) { System.Collections.IEnumerator iter = new System.Collections.Hashtable(deleteTerms).GetEnumerator(); while (iter.MoveNext()) { System.Collections.DictionaryEntry entry = (System.Collections.DictionaryEntry) iter.Current; reader.DeleteDocuments((Term) entry.Key); } } // utility routines for tests public /*internal*/ virtual SegmentInfo NewestSegment() { return segmentInfos.Info(segmentInfos.Count - 1); } public virtual System.String SegString() { lock (this) { System.Text.StringBuilder buffer = new System.Text.StringBuilder(); for (int i = 0; i < segmentInfos.Count; i++) { if (i > 0) { buffer.Append(' '); } buffer.Append(segmentInfos.Info(i).SegString(directory)); } return buffer.ToString(); } } static IndexWriter() { DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; DEFAULT_MAX_MERGE_DOCS = LogDocMergePolicy.DEFAULT_MAX_MERGE_DOCS; MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH; } } }