package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer segments than
 *  the value of the merge factor. Whenever extra segments
 *  (beyond the merge factor upper bound) are encountered,
 *  all segments within the level are merged. You can get or
 *  set the merge factor using {@link #getMergeFactor()} and
 *  {@link #setMergeFactor(int)} respectively.</p>
 *
 * <p>This class is abstract and requires a subclass to
 * define the {@link #size} method which specifies how a
 * segment's size is determined.  {@link LogDocMergePolicy}
 * is one subclass that measures size by document count in
 * the segment.  {@link LogByteSizeMergePolicy} is another
 * subclass that measures size as the total byte size of the
 * file(s) for the segment.</p>
 */
public abstract class LogMergePolicy extends MergePolicy {

  /** Defines the allowed range of log(size) for each
   *  level.  A level is computed by taking the max segment
   *  log size, minus LEVEL_LOG_SPAN, and finding all
   *  segments falling within that range. */
  public static final double LEVEL_LOG_SPAN = 0.75;

  /** Default merge factor, which is how many segments are
   *  merged at a time */
  public static final int DEFAULT_MERGE_FACTOR = 10;

  /** Default maximum segment size.  A segment of this size
   *  or larger will never be merged.
   *  @see #setMaxMergeDocs */
  public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;

  private int mergeFactor = DEFAULT_MERGE_FACTOR;

  // Package-private size bounds; they are not assigned anywhere in the
  // visible part of this class (presumably set by subclasses -- confirm).
  long minMergeSize;
  long maxMergeSize;

  int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  /* TODO 3.0: change this default to true */
  protected boolean calibrateSizeByDeletes = false;

  private boolean useCompoundFile = true;
  private boolean useCompoundDocStore = true;

  /** Creates the policy and hands the given writer to the superclass.
   *  @param writer the writer passed on to {@link MergePolicy} */
  public LogMergePolicy(IndexWriter writer) {
    super(writer);
  }

  /** Returns true only when a writer is present and that writer
   *  reports itself as verbose. */
  protected boolean verbose() {
    return writer != null && writer.verbose();
  }

  /** Sends the message, prefixed with "LMP: ", to the writer; does
   *  nothing unless {@link #verbose()} is true. */
  private void message(String message) {
    if (verbose()) {
      writer.message("LMP: " + message);
    }
  }

  /** <p>Returns the number of segments that are merged at
   * once and also controls the total number of segments
   * allowed to accumulate in the index.</p> */
  public int getMergeFactor() {
    return mergeFactor;
  }

  /** Determines how often segment indices are merged by
   * addDocument().  With smaller values, less RAM is used
   * while indexing, and searches on unoptimized indices are
   * faster, but indexing speed is slower.  With larger
   * values, more RAM is used during indexing, and while
   * searches on unoptimized indices are slower, indexing is
   * faster.  Thus larger values (> 10) are best for batch
   * index creation, and smaller values (< 10) for indices
   * that are interactively maintained.
   *
   * @param mergeFactor the new merge factor; must be at least 2
   * @throws IllegalArgumentException if mergeFactor is less than 2 */
  public void setMergeFactor(int mergeFactor) {
    if (mergeFactor < 2) {
      throw new IllegalArgumentException("mergeFactor cannot be less than 2");
    }
    this.mergeFactor = mergeFactor;
  }

  // Javadoc inherited
  @Override
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo info) {
    return useCompoundFile;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments. */
  public void setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
  }

  /** Returns true if newly flushed and newly merge segments
   *  are written in compound file format.
   *  @see #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  // Javadoc inherited
  @Override
  public boolean useCompoundDocStore(SegmentInfos infos) {
    return useCompoundDocStore;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged doc store
   *  segment files (term vectors and stored fields). */
  public void setUseCompoundDocStore(boolean useCompoundDocStore) {
    this.useCompoundDocStore = useCompoundDocStore;
  }

  /** Returns true if newly flushed and newly merge doc
   *  store segment files (term vectors and stored fields)
   *  are written in compound file format.
   *  @see #setUseCompoundDocStore */
  public boolean getUseCompoundDocStore() {
    return useCompoundDocStore;
  }

  /** Sets whether the segment size should be calibrated by
   *  the number of deletes when choosing segments for merge. */
  public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
    this.calibrateSizeByDeletes = calibrateSizeByDeletes;
  }

  /** Returns true if the segment size should be calibrated
   *  by the number of deletes when choosing segments for merge. */
  public boolean getCalibrateSizeByDeletes() {
    return calibrateSizeByDeletes;
  }

  /** Does nothing in this class. */
  @Override
  public void close() {}

  /** Returns the size of the given segment, in whatever unit the
   *  subclass chooses.
   *  @throws IOException declared for implementations that need to
   *          consult the index to determine the size */
  protected abstract long size(SegmentInfo info) throws IOException;

  /** Returns the segment's document count.  When
   *  {@link #getCalibrateSizeByDeletes} is true, the number of
   *  deleted documents reported by the writer is subtracted first.
   *
   *  @param info the segment to measure
   *  @return the (optionally calibrated) document count */
  protected long sizeDocs(SegmentInfo info) throws IOException {
    if (calibrateSizeByDeletes) {
      int delCount = writer.numDeletedDocs(info);
      return (info.docCount - (long) delCount);
    } else {
      return info.docCount;
    }
  }

  /** Returns the segment's size in bytes.  When
   *  {@link #getCalibrateSizeByDeletes} is true, the byte size is
   *  scaled down by the fraction of documents that are deleted; a
   *  segment whose document count is zero or negative is reported
   *  unscaled.
   *
   *  @param info the segment to measure
   *  @return the (optionally calibrated) size in bytes */
  protected long sizeBytes(SegmentInfo info) throws IOException {
    long byteSize = info.sizeInBytes();
    if (calibrateSizeByDeletes) {
      int delCount = writer.numDeletedDocs(info);
      if (info.docCount <= 0) {
        return byteSize;
      }
      // Use double, not float: a float has only 24 significant bits, so
      // "byteSize * (1.0f - ratio)" silently rounds any size above 16 MB
      // (e.g. 16,777,217 bytes with zero deletes came back as 16,777,216).
      double liveRatio = 1.0 - ((double) delCount / (double) info.docCount);
      return (long) (byteSize * liveRatio);
    } else {
      return byteSize;
    }
  }

  // NOTE(review): the text of this file is truncated at this point.  It
  // breaks off in the middle of the declaration
  //
  //   private boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set
  //
  // and resumes inside the Javadoc of setMaxMergeDocs below.  The rest of
  // isOptimized and every member that originally sat between it and
  // setMaxMergeDocs are missing here and have deliberately NOT been
  // reconstructed; restore them from version control before building.

  /** <p>Determines the largest segment (measured by
   * document count) that may be merged with other segments.
   * Small values (e.g., less than 10,000) are best for
   * interactive indexing, as this limits the length of
   * pauses while indexing to a few seconds.  Larger values
   * are best for batched indexing and speedier
   * searches.</p>
   *
   * <p>The default value is {@link Integer#MAX_VALUE}.</p>
   *
   * <p>The default merge policy ({@link
   * LogByteSizeMergePolicy}) also allows you to set this
   * limit by net size (in MB) of the segment, using {@link
   * LogByteSizeMergePolicy#setMaxMergeMB}.</p>
   */
  public void setMaxMergeDocs(int maxMergeDocs) {
    this.maxMergeDocs = maxMergeDocs;
  }

  /** Returns the largest segment (measured by document
   *  count) that may be merged with other segments.
   *  @see #setMaxMergeDocs */
  public int getMaxMergeDocs() {
    return maxMergeDocs;
  }
}