/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
namespace Lucene.Net.Index
{
/// <summary>
/// This stores a monotonically increasing set of &lt;Term, TermInfo&gt; pairs in a
/// Directory. A TermInfos can be written once, in order.
/// </summary>
sealed class TermInfosWriter : IDisposable
{
/// <summary>The file format version, a negative number.</summary>
public const int FORMAT = - 3;
// Changed strings to true utf8 with length-in-bytes not
// length-in-chars
public const int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = - 4;
// NOTE: always change this if you switch to a new format!
// This is the value written at the start of every output file by Initialize.
public static readonly int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
// Set by Dispose; makes repeated Dispose calls a no-op.
private bool isDisposed;
// Used to translate between field names and field numbers.
private FieldInfos fieldInfos;
// The ".tis" output (main writer) or ".tii" output (index writer).
private IndexOutput output;
// Copy of the TermInfo of the most recently added term; pointers are written
// as deltas against it.
private TermInfo lastTi = new TermInfo();
// Number of entries added so far; written into the file header by Dispose.
private long size;
// TODO: the default values for these two parameters should be settable from
// IndexWriter. However, once that's done, folks will start setting them to
// ridiculous values and complaining that things don't work well, as with
// mergeFactor. So, let's wait until a number of folks find that alternate
// values work better. Note that both of these values are stored in the
// segment, so that it's safe to change these w/o rebuilding all indexes.
/// <summary>
/// Expert: The fraction of terms in the "dictionary" which should be stored
/// in RAM. Smaller values use more memory, but make searching slightly
/// faster, while larger values use less memory and make searching slightly
/// slower. Searching is typically not dominated by dictionary lookup, so
/// tweaking this is rarely useful.
/// The initializer below is only a default: Initialize overwrites it with
/// the interval passed to the constructor.
/// </summary>
internal int indexInterval = 128;
/// <summary>
/// Expert: The fraction of entries stored in skip tables,
/// used to accelerate skipping in posting lists (NOTE(review): the original
/// cross-reference was lost in conversion, presumably TermDocs.SkipTo(int);
/// confirm). Larger values result in
/// smaller indexes, greater acceleration, but fewer accelerable cases, while
/// smaller values result in bigger indexes, less acceleration and more
/// accelerable cases. More detailed experiments would be useful here.
/// A term's skip offset is only written when its docFreq is at least this value.
/// </summary>
internal int skipInterval = 16;
/// <summary>
/// Expert: The maximum number of skip levels. Smaller values result in
/// slightly smaller indexes, but slower skipping in big posting lists.
/// </summary>
internal int maxSkipLevels = 10;
// Index writer only: file pointer of the main writer's output recorded with
// the previous index entry; the next entry stores the delta against it.
private long lastIndexPointer;
// True for the ".tii" index writer, false for the main ".tis" writer.
private bool isIndex;
// UTF-8 bytes of the previously written term; grown on demand by WriteTerm.
private byte[] lastTermBytes = new byte[10];
// Number of valid bytes in lastTermBytes.
private int lastTermBytesLength = 0;
// Field number of the previously written term; -1 until a term has been added.
private int lastFieldNumber = - 1;
// The paired writer: the index writer for the main writer, and vice versa.
private TermInfosWriter other;
// Reusable buffer receiving the UTF-8 encoding of each term's text.
private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
/// <summary>
/// Creates the main term dictionary writer (segment + ".tis") together with
/// its companion index writer (segment + ".tii"). The two writers are linked
/// to each other through their <c>other</c> fields.
/// </summary>
/// <param name="directory">Directory in which the output files are created.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
/// <param name="fis">Field infos used to map field names to field numbers.</param>
/// <param name="interval">Index interval; an index entry is added every <paramref name="interval"/> terms.</param>
internal TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval)
{
    Initialize(directory, segment, fis, interval, false);
    try
    {
        other = new TermInfosWriter(directory, segment, fis, interval, true);
        other.other = this;
    }
    catch
    {
        // The ".tis" output is already open at this point. Because the
        // constructor is failing, no caller will ever get a reference on
        // which to call Dispose, so release the output here.
        try
        {
            output.Dispose();
        }
        catch (Exception)
        {
            // Deliberately ignored: the original failure is the one to report.
        }
        throw;
    }
}
/// <summary>
/// Creates a single writer without a companion; used by the public
/// constructor to build the ".tii" index writer.
/// </summary>
private TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval, bool isIndex)
{
    Initialize(directory, segment, fis, interval, isIndex);
}
/// <summary>
/// Stores the configuration, creates the output file (".tii" when
/// <paramref name="isi"/> is true, ".tis" otherwise) and writes the header:
/// format, a placeholder for the entry count (filled in by Dispose),
/// indexInterval, skipInterval and maxSkipLevels.
/// </summary>
/// <param name="directory">Directory in which the output file is created.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
/// <param name="fis">Field infos used to map field names to field numbers.</param>
/// <param name="interval">Value stored in indexInterval and written to the header.</param>
/// <param name="isi">True when this instance is the index (".tii") writer.</param>
private void Initialize(Directory directory, System.String segment, FieldInfos fis, int interval, bool isi)
{
    indexInterval = interval;
    fieldInfos = fis;
    isIndex = isi;
    output = directory.CreateOutput(segment + (isIndex?".tii":".tis"));
    try
    {
        output.WriteInt(FORMAT_CURRENT); // write format
        output.WriteLong(0); // leave space for size
        output.WriteInt(indexInterval); // write indexInterval
        output.WriteInt(skipInterval); // write skipInterval
        output.WriteInt(maxSkipLevels); // write maxSkipLevels
        // Debug.Assert calls are compiled out of non-DEBUG builds, so the
        // UTF-16 scratch buffers are only allocated when assertions are active.
        System.Diagnostics.Debug.Assert(InitUTF16Results());
    }
    catch
    {
        // A failed header write aborts construction; nobody else can close
        // the freshly created output, so do it here before propagating.
        try
        {
            output.Dispose();
        }
        catch (Exception)
        {
            // Deliberately ignored: the original failure is the one to report.
        }
        throw;
    }
}
/// <summary>
/// Encodes the term's text as UTF-8 into the shared scratch buffer, resolves
/// the term's field to its number, and forwards to the byte-based Add overload.
/// </summary>
/// <param name="term">Term to add.</param>
/// <param name="ti">Term info associated with <paramref name="term"/>.</param>
internal void Add(Term term, TermInfo ti)
{
    System.String text = term.Text;
    UnicodeUtil.UTF16toUTF8(text, 0, text.Length, utf8Result);
    int fieldNumber = fieldInfos.FieldNumber(term.Field);
    Add(fieldNumber, utf8Result.result, utf8Result.length, ti);
}
// Scratch buffers for the term-order check; currently used only by assert statements.
internal UnicodeUtil.UTF16Result utf16Result1;
internal UnicodeUtil.UTF16Result utf16Result2;
/// <summary>
/// Allocates the two UTF-16 scratch buffers. Always returns true so that the
/// call can be wrapped in an assertion; currently invoked only that way.
/// </summary>
private bool InitUTF16Results()
{
    this.utf16Result1 = new UnicodeUtil.UTF16Result();
    this.utf16Result2 = new UnicodeUtil.UTF16Result();
    return true;
}
/// <summary>
/// Compares the previously written term with the given one: first by field
/// name (ordinal), then by the UTF-16 code units decoded from the UTF-8
/// bytes. Currently used only by assert statements.
/// </summary>
/// <returns>
/// A negative value when the previous term sorts before the given term,
/// zero when they compare equal, a positive value otherwise.
/// </returns>
private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
{
    if (lastFieldNumber != fieldNumber)
    {
        System.String previousField = fieldInfos.FieldName(lastFieldNumber);
        System.String currentField = fieldInfos.FieldName(fieldNumber);
        int fieldOrder = String.CompareOrdinal(previousField, currentField);
        // A field named "" (empty string) compares equal to the name
        // reported before any term was written (lastFieldNumber == -1), and
        // that is fine. Two different real field numbers mapping to the
        // same name is not, so the 0 is returned in that case.
        if (fieldOrder != 0 || lastFieldNumber != -1)
        {
            return fieldOrder;
        }
    }

    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

    int previousLength = utf16Result1.length;
    int currentLength = utf16Result2.length;
    int sharedLength = Math.Min(previousLength, currentLength);
    for (int pos = 0; pos < sharedLength; pos++)
    {
        int difference = utf16Result1.result[pos] - utf16Result2.result[pos];
        if (difference != 0)
        {
            return difference;
        }
    }
    // One term is a prefix of the other: the shorter one sorts first.
    return previousLength - currentLength;
}
/// <summary>
/// Adds a new &lt;&lt;fieldNumber, termBytes&gt;, TermInfo&gt; pair to the set.
/// The term must sort after all previously added terms, and the TermInfo
/// pointers must not be smaller than those of the previous entry; both
/// conditions are checked by debug assertions only.
/// </summary>
/// <param name="fieldNumber">Number of the term's field.</param>
/// <param name="termBytes">Buffer holding the UTF-8 encoded term text.</param>
/// <param name="termBytesLength">Number of valid bytes in <paramref name="termBytes"/>.</param>
/// <param name="ti">Term info to record for the term.</param>
internal void Add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
{
    // The second alternative admits the index writer's initial entry, which
    // is the empty term handed over by the main writer before any term exists.
    System.Diagnostics.Debug.Assert(
        CompareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || (isIndex && termBytesLength == 0 && lastTermBytesLength == 0),
        "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
        " text=" + System.Text.Encoding.UTF8.GetString(termBytes, 0, termBytesLength) + " lastText=" + System.Text.Encoding.UTF8.GetString(lastTermBytes, 0, lastTermBytesLength));
    System.Diagnostics.Debug.Assert(
        ti.freqPointer >= lastTi.freqPointer,
        "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
    System.Diagnostics.Debug.Assert(
        ti.proxPointer >= lastTi.proxPointer,
        "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

    // Main writer only: every indexInterval-th entry is preceded by an index
    // entry that records the previous term.
    bool needsIndexEntry = !isIndex && size % indexInterval == 0;
    if (needsIndexEntry)
    {
        other.Add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi);
    }

    WriteTerm(fieldNumber, termBytes, termBytesLength);

    // Document frequency, then both pointers as deltas against the previous entry.
    output.WriteVInt(ti.docFreq);
    output.WriteVLong(ti.freqPointer - lastTi.freqPointer);
    output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

    if (ti.docFreq >= skipInterval)
    {
        output.WriteVInt(ti.skipOffset);
    }

    if (isIndex)
    {
        // Index entries also store how far the main file advanced since the
        // previous index entry.
        long mainFilePointer = other.output.FilePointer;
        output.WriteVLong(mainFilePointer - lastIndexPointer);
        lastIndexPointer = mainFilePointer;
    }

    lastFieldNumber = fieldNumber;
    lastTi.Set(ti);
    size++;
}
/// <summary>
/// Writes one term as: length of the byte prefix shared with the previous
/// term, length of the remaining suffix, the suffix bytes, and the field
/// number. Afterwards the given term is remembered as the previous term.
/// </summary>
/// <param name="fieldNumber">Number of the term's field.</param>
/// <param name="termBytes">Buffer holding the UTF-8 encoded term text.</param>
/// <param name="termBytesLength">Number of valid bytes in <paramref name="termBytes"/>.</param>
private void WriteTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
{
    // TODO: UTF16toUTF8 could tell us this prefix
    // Count the leading bytes shared with the previous term.
    int comparable = Math.Min(termBytesLength, lastTermBytesLength);
    int prefixLength = 0;
    while (prefixLength < comparable && termBytes[prefixLength] == lastTermBytes[prefixLength])
    {
        prefixLength++;
    }
    int suffixLength = termBytesLength - prefixLength;

    output.WriteVInt(prefixLength); // shared prefix length
    output.WriteVInt(suffixLength); // delta length
    output.WriteBytes(termBytes, prefixLength, suffixLength); // delta bytes
    output.WriteVInt(fieldNumber); // field number

    if (lastTermBytes.Length < termBytesLength)
    {
        // Grow with headroom; only the shared prefix has to be carried over
        // because the suffix is copied in just below.
        byte[] grown = new byte[(int) (termBytesLength * 1.5)];
        Array.Copy(lastTermBytes, 0, grown, 0, prefixLength);
        lastTermBytes = grown;
    }
    Array.Copy(termBytes, prefixLength, lastTermBytes, prefixLength, suffixLength);
    lastTermBytesLength = termBytesLength;
}
/// <summary>
/// Called to complete TermInfos creation: writes the final entry count into
/// the header slot reserved after the format int, then closes the output.
/// The main writer also disposes its companion index writer. Calling this
/// more than once has no effect after the first call.
/// </summary>
public void Dispose()
{
    // Move to protected method if class becomes unsealed
    if (isDisposed) return;
    // Mark first: the finally blocks below release everything even when a
    // write fails, so a second call must not touch the closed output again.
    isDisposed = true;
    try
    {
        output.Seek(4); // write size after format
        output.WriteLong(size);
    }
    finally
    {
        try
        {
            output.Dispose();
        }
        finally
        {
            // Only the main writer owns the companion; the index writer's
            // "other" points back at the main writer and must not close it.
            if (!isIndex)
                other.Dispose();
        }
    }
}
}
}