/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
namespace Lucene.Net.Index
{
/// This class implements a MergePolicy that tries
/// to merge segments into levels of exponentially
/// increasing size, where each level has fewer segments than
/// the value of the merge factor. Whenever extra segments
/// (beyond the merge factor upper bound) are encountered,
/// all segments within the level are merged. You can get or
/// set the merge factor using the MergeFactor property.
///
/// This class is abstract and requires a subclass to
/// define the Size method, which specifies how a
/// segment's size is determined. LogDocMergePolicy
/// is one subclass that measures size by document count in
/// the segment. LogByteSizeMergePolicy is another
/// subclass that measures size as the total byte size of the
/// file(s) for the segment.
///
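///
/// A minimal configuration sketch (an illustration, not taken from this
/// file: it assumes the Lucene.Net 3.x StandardAnalyzer/FSDirectory/
/// IndexWriter API and the LogDocMergePolicy subclass noted above):
///
///   using Lucene.Net.Analysis.Standard;
///   using Lucene.Net.Index;
///   using Lucene.Net.Store;
///
///   Directory dir = FSDirectory.Open(new System.IO.DirectoryInfo("index"));
///   IndexWriter writer = new IndexWriter(dir,
///       new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30),
///       IndexWriter.MaxFieldLength.UNLIMITED);
///
///   // Size measured by document count; MergeFactor and MaxMergeDocs are
///   // the knobs defined on this class.
///   LogMergePolicy policy = new LogDocMergePolicy(writer);
///   policy.MergeFactor = 20;        // favor batch-indexing throughput
///   policy.MaxMergeDocs = 100000;   // never merge segments past 100k docs
///   policy.CalibrateSizeByDeletes = true;
///   policy.SetUseCompoundFile(true);
///   writer.SetMergePolicy(policy);  // assumed writer setter; confirm for your version
///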
public abstract class LogMergePolicy : MergePolicy
{
/// Defines the allowed range of log(size) for each
/// level. A level is computed by taking the max segment
/// log size, minus LEVEL_LOG_SPAN, and finding all
/// segments falling within that range.
///
public const double LEVEL_LOG_SPAN = 0.75;
/// Default merge factor, which is how many segments are
/// merged at a time
///
public const int DEFAULT_MERGE_FACTOR = 10;
/// Default maximum segment size. A segment of this size
/// or larger will never be merged.
/// See MaxMergeDocs.
public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;
/// Default noCFSRatio. If a merge's size is >= 10% of
/// the index, then we disable compound file for it.
/// See NoCFSRatio.
///
public static double DEFAULT_NO_CFS_RATIO = 0.1;
private int mergeFactor = DEFAULT_MERGE_FACTOR;
internal long minMergeSize;
internal long maxMergeSize;
internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
protected double internalNoCFSRatio = DEFAULT_NO_CFS_RATIO;
/* TODO 3.0: change this default to true */
protected internal bool internalCalibrateSizeByDeletes = true;
private bool useCompoundFile = true;
private bool useCompoundDocStore = true;
protected LogMergePolicy(IndexWriter writer):base(writer)
{
}
protected internal virtual bool Verbose()
{
return writer != null && writer.Verbose;
}
public double NoCFSRatio
{
get { return internalNoCFSRatio; }
set
{
if (value < 0.0 || value > 1.0)
{
throw new ArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + value);
}
this.internalNoCFSRatio = value;
}
}
/* If a merged segment will be more than this percentage
* of the total size of the index, leave the segment as
* non-compound file even if compound file is enabled.
* Set to 1.0 to always use CFS regardless of merge
* size. */
private void Message(System.String message)
{
if (Verbose())
writer.Message("LMP: " + message);
}
/// Gets or sets how often segment indices are merged by
/// addDocument(). With smaller values, less RAM is used
/// while indexing, and searches on unoptimized indices are
/// faster, but indexing speed is slower. With larger
/// values, more RAM is used during indexing, and while
/// searches on unoptimized indices are slower, indexing is
/// faster. Thus larger values (> 10) are best for batch
/// index creation, and smaller values (< 10) for indices
/// that are interactively maintained.
///
public virtual int MergeFactor
{
get { return mergeFactor; }
set
{
if (value < 2)
throw new System.ArgumentException("mergeFactor cannot be less than 2");
this.mergeFactor = value;
}
}
public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
{
return useCompoundFile;
}
/// Sets whether compound file format should be used for
/// newly flushed and newly merged segments.
///
public virtual void SetUseCompoundFile(bool useCompoundFile)
{
this.useCompoundFile = useCompoundFile;
}
[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1024:UsePropertiesWhereAppropriate")]
public virtual bool GetUseCompoundFile()
{
return useCompoundFile;
}
// Javadoc inherited
public override bool UseCompoundDocStore(SegmentInfos infos)
{
return useCompoundDocStore;
}
/// Sets whether compound file format should be used for
/// newly flushed and newly merged doc store
/// segment files (term vectors and stored fields).
///
public virtual void SetUseCompoundDocStore(bool useCompoundDocStore)
{
this.useCompoundDocStore = useCompoundDocStore;
}
/// Returns true if newly flushed and newly merged doc
/// store segment files (term vectors and stored fields)
/// are written in compound file format.
///
///
[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1024:UsePropertiesWhereAppropriate")]
public virtual bool GetUseCompoundDocStore()
{
return useCompoundDocStore;
}
/// Gets or sets whether the segment size should be calibrated by
/// the number of deletes when choosing segments for merge.
///
public virtual bool CalibrateSizeByDeletes
{
set { this.internalCalibrateSizeByDeletes = value; }
get { return internalCalibrateSizeByDeletes; }
}
abstract protected internal long Size(SegmentInfo info);
protected internal virtual long SizeDocs(SegmentInfo info)
{
if (internalCalibrateSizeByDeletes)
{
int delCount = writer.NumDeletedDocs(info);
return (info.docCount - (long) delCount);
}
else
{
return info.docCount;
}
}
protected internal virtual long SizeBytes(SegmentInfo info)
{
long byteSize = info.SizeInBytes();
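// When calibrating by deletes, prorate the on-disk byte size by the
// fraction of live (non-deleted) documents, so a half-deleted segment
// appears half as large to the merge-selection logic.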
if (internalCalibrateSizeByDeletes)
{
int delCount = writer.NumDeletedDocs(info);
float delRatio = (info.docCount <= 0?0.0f:((float) delCount / (float) info.docCount));
return (info.docCount <= 0?byteSize:(long) (byteSize * (1.0f - delRatio)));
}
else
{
return byteSize;
}
}
private bool IsOptimized(SegmentInfos infos, int maxNumSegments, ISet<SegmentInfo> segmentsToOptimize)
{
int numSegments = infos.Count;
int numToOptimize = 0;
SegmentInfo optimizeInfo = null;
for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
{
SegmentInfo info = infos.Info(i);
if (segmentsToOptimize.Contains(info))
{
numToOptimize++;
optimizeInfo = info;
}
}
return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(optimizeInfo));
}
/// Returns true if this single info is optimized (has no
/// pending norms or deletes, is in the same dir as the
/// writer, and matches the current compound file setting).
///
private bool IsOptimized(SegmentInfo info)
{
bool hasDeletions = writer.NumDeletedDocs(info) > 0;
return !hasDeletions && !info.HasSeparateNorms() && info.dir == writer.Directory &&
(info.GetUseCompoundFile() == useCompoundFile || internalNoCFSRatio < 1.0);
}
/// Returns the merges necessary to optimize the index.
/// This merge policy defines "optimized" to mean only one
/// segment in the index, where that segment has no
/// deletions pending nor separate norms, and it is in
/// compound file format if the current useCompoundFile
/// setting is true. This method returns multiple merges
/// (mergeFactor at a time) so the MergeScheduler
/// in use may make use of concurrency.
///
public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, int maxNumSegments, ISet<SegmentInfo> segmentsToOptimize)
{
MergeSpecification spec;
System.Diagnostics.Debug.Assert(maxNumSegments > 0);
if (!IsOptimized(infos, maxNumSegments, segmentsToOptimize))
{
// Find the newest (rightmost) segment that needs to
// be optimized (other segments may have been flushed
// since optimize started):
int last = infos.Count;
while (last > 0)
{
SegmentInfo info = infos.Info(--last);
if (segmentsToOptimize.Contains(info))
{
last++;
break;
}
}
if (last > 0)
{
spec = new MergeSpecification();
// First, enroll all "full" merges (size
// mergeFactor) to potentially be run concurrently:
while (last - maxNumSegments + 1 >= mergeFactor)
{
spec.Add(MakeOneMerge(infos, infos.Range(last - mergeFactor, last)));
last -= mergeFactor;
}
// Only if there are no full merges pending do we
// add a final partial (< mergeFactor segments) merge:
if (0 == spec.merges.Count)
{
if (maxNumSegments == 1)
{
// Since we must optimize down to 1 segment, the
// choice is simple:
if (last > 1 || !IsOptimized(infos.Info(0)))
spec.Add(MakeOneMerge(infos, infos.Range(0, last)));
}
else if (last > maxNumSegments)
{
// Take care to pick a partial merge that is
// least cost, but does not make the index too
// lopsided. If we always just picked the
// partial tail then we could produce a highly
// lopsided index over time:
// We must merge this many segments to leave
// maxNumSegments in the index (from when
// optimize was first kicked off):
int finalMergeSize = last - maxNumSegments + 1;
// Consider all possible starting points:
long bestSize = 0;
int bestStart = 0;
for (int i = 0; i < last - finalMergeSize + 1; i++)
{
long sumSize = 0;
for (int j = 0; j < finalMergeSize; j++)
sumSize += Size(infos.Info(j + i));
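// Accept the leftmost window unconditionally; after that, prefer a
// window only if it is both the smallest candidate so far and less
// than twice the size of the segment just to its left, so the
// resulting merged segment does not dwarf its neighbor.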
if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
{
bestStart = i;
bestSize = sumSize;
}
}
spec.Add(MakeOneMerge(infos, infos.Range(bestStart, bestStart + finalMergeSize)));
}
}
}
else
spec = null;
}
else
spec = null;
return spec;
}
/// Finds merges necessary to expunge all deletes from the
/// index. We simply merge adjacent segments that have
/// deletes, up to mergeFactor at a time.
///
public override MergeSpecification FindMergesToExpungeDeletes(SegmentInfos segmentInfos)
{
int numSegments = segmentInfos.Count;
if (Verbose())
Message("findMergesToExpungeDeletes: " + numSegments + " segments");
MergeSpecification spec = new MergeSpecification();
int firstSegmentWithDeletions = - 1;
for (int i = 0; i < numSegments; i++)
{
SegmentInfo info = segmentInfos.Info(i);
int delCount = writer.NumDeletedDocs(info);
if (delCount > 0)
{
if (Verbose())
Message(" segment " + info.name + " has deletions");
if (firstSegmentWithDeletions == - 1)
firstSegmentWithDeletions = i;
else if (i - firstSegmentWithDeletions == mergeFactor)
{
// We've seen mergeFactor segments in a row with
// deletions, so force a merge now:
if (Verbose())
Message(" add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = i;
}
}
else if (firstSegmentWithDeletions != - 1)
{
// End of a sequence of segments with deletions, so,
// merge those past segments even if it's fewer than
// mergeFactor segments
if (Verbose())
Message(" add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = - 1;
}
}
if (firstSegmentWithDeletions != - 1)
{
if (Verbose())
Message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments - 1) + " inclusive");
spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, numSegments)));
}
return spec;
}
/// Checks if any merges are now necessary and returns a
/// MergeSpecification if so. A merge
/// is necessary when there are more than MergeFactor
/// segments at a given level. When
/// multiple levels have too many segments, this method
/// will return multiple merges, allowing the
/// MergeScheduler to use concurrency.
///
public override MergeSpecification FindMerges(SegmentInfos infos)
{
int numSegments = infos.Count;
if (Verbose())
Message("findMerges: " + numSegments + " segments");
// Compute levels, which is just log (base mergeFactor)
// of the size of each segment
float[] levels = new float[numSegments];
float norm = (float) System.Math.Log(mergeFactor);
for (int i = 0; i < numSegments; i++)
{
SegmentInfo info = infos.Info(i);
long size = Size(info);
// Floor tiny segments
if (size < 1)
size = 1;
levels[i] = (float) System.Math.Log(size) / norm;
}
float levelFloor;
if (minMergeSize <= 0)
levelFloor = (float) 0.0;
else
{
levelFloor = (float) (System.Math.Log(minMergeSize) / norm);
}
// Now, we quantize the log values into levels. The
// first level is any segment whose log size is within
// LEVEL_LOG_SPAN of the max size, or which has such a
// segment "to the right". Then, we find the max of all
// other segments and use that to define the next level
// segment, etc.
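// For example, with the default mergeFactor of 10 (norm = ln 10) and a
// doc-count-based Size(), a 1,000,000-doc segment sits at level
// log10(1e6) = 6.0; with LEVEL_LOG_SPAN = 0.75, every segment at level
// 5.25 or higher is grouped into that same (topmost) level.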
MergeSpecification spec = null;
int start = 0;
while (start < numSegments)
{
// Find max level of all segments not already
// quantized.
float maxLevel = levels[start];
for (int i = 1 + start; i < numSegments; i++)
{
float level = levels[i];
if (level > maxLevel)
maxLevel = level;
}
// Now search backwards for the rightmost segment that
// falls into this level:
float levelBottom;
if (maxLevel < levelFloor)
// All remaining segments fall into the min level
levelBottom = - 1.0F;
else
{
levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
// Force a boundary at the level floor
if (levelBottom < levelFloor && maxLevel >= levelFloor)
levelBottom = levelFloor;
}
int upto = numSegments - 1;
while (upto >= start)
{
if (levels[upto] >= levelBottom)
{
break;
}
upto--;
}
if (Verbose())
Message(" level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");
// Finally, record all merges that are viable at this level:
int end = start + mergeFactor;
while (end <= 1 + upto)
{
bool anyTooLarge = false;
for (int i = start; i < end; i++)
{
SegmentInfo info = infos.Info(i);
anyTooLarge |= (Size(info) >= maxMergeSize || SizeDocs(info) >= maxMergeDocs);
}
if (!anyTooLarge)
{
if (spec == null)
spec = new MergeSpecification();
if (Verbose())
Message(" " + start + " to " + end + ": add this merge");
spec.Add(MakeOneMerge(infos, infos.Range(start, end)));
}
else if (Verbose())
Message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
start = end;
end = start + mergeFactor;
}
start = 1 + upto;
}
return spec;
}
protected OneMerge MakeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge)
{
bool doCFS;
if (!useCompoundFile)
{
doCFS = false;
}
else if (internalNoCFSRatio == 1.0)
{
doCFS = true;
}
else
{
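// Compare the merge's combined size against noCFSRatio times the
// total index size; with the default ratio of 0.1, a merge amounting
// to more than 10% of the index is written in non-compound format.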
long totSize = 0;
foreach(SegmentInfo info in infos)
{
totSize += Size(info);
}
long mergeSize = 0;
foreach(SegmentInfo info in infosToMerge)
{
mergeSize += Size(info);
}
doCFS = mergeSize <= internalNoCFSRatio * totSize;
}
return new OneMerge(infosToMerge, doCFS);
}
///
/// Gets or sets the largest segment (measured by document
/// count) that may be merged with other segments.
/// Small values (e.g., less than 10,000) are best for
/// interactive indexing, as this limits the length of
/// pauses while indexing to a few seconds. Larger values
/// are best for batched indexing and speedier
/// searches.
///
/// The default value is DEFAULT_MAX_MERGE_DOCS (Int32.MaxValue).
///
/// The default merge policy (LogByteSizeMergePolicy)
/// also allows you to set this
/// limit by net size (in MB) of the segment, using
/// LogByteSizeMergePolicy.MaxMergeMB.
///
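///
/// A brief tuning sketch for interactive indexing (assumes a policy
/// instance configured as in the class-level example above):
///
///   policy.MaxMergeDocs = 10000;   // keep merge pauses short while indexing
///   policy.MergeFactor = 4;        // fewer segments per level, more frequent merges
///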
public virtual int MaxMergeDocs
{
set { this.maxMergeDocs = value; }
get { return maxMergeDocs; }
}
}
}