/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Document = Lucene.Net.Documents.Document;
using Directory = Lucene.Net.Store.Directory;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using IndexInput = Lucene.Net.Store.IndexInput;
namespace Lucene.Net.Index
{
/// <summary>
/// Basic tool to check the health of an index and write a
/// new segments file that removes references to problematic
/// segments. There are many more checks that this tool
/// could do but does not yet, e.g.: reconstructing a segments
/// file by looking for all loadable segments (if no segments
/// file is found), removing explicitly specified segments,
/// listing files that exist but are not referenced, etc.
/// </summary>
public class CheckIndex
{
/// Writer that Check and Main send all of their report output to. The static
/// constructor of this class initializes it to an auto-flushing writer over
/// the process's standard output stream.
public static System.IO.TextWriter out_Renamed;
/// <summary>
/// SegmentTermDocs subclass that keeps a running tally of SkippingDoc()
/// callbacks received since the most recent Seek(Term). Check reads that
/// tally as the number of deleted documents seen for a term.
/// </summary>
private class MySegmentTermDocs : SegmentTermDocs
{
    // Number of SkippingDoc() callbacks since the last Seek(Term).
    internal int delCount;

    internal MySegmentTermDocs(SegmentReader segmentReader) : base(segmentReader)
    {
    }

    /// Seeks the underlying enumerator to the term, then restarts the tally.
    public override void Seek(Term term)
    {
        base.Seek(term);
        this.delCount = 0;
    }

    /// Hook overridden from the base class; each invocation adds one to the tally.
    protected internal override void SkippingDoc()
    {
        this.delCount += 1;
    }
}
/// <summary>
/// Checks every segment referenced by the current segments file of
/// <paramref name="dir"/>, writing a progress report to <c>out_Renamed</c>.
/// For each segment it opens a reader and tests norms, terms/frequencies/
/// positions, stored fields and term vectors; any exception raised while
/// doing so marks the segment as broken.
/// </summary>
/// <param name="dir">Directory that holds the index to check.</param>
/// <param name="doFix">When true and at least one segment is broken, a new
/// segments file referencing only the segments that passed is written after
/// a five-step, one-second-per-step countdown. When false nothing is written.</param>
/// <returns>True only if the segments file could be read, its format is
/// recognized and no segment failed a test. False in every other case,
/// including after a new segments file has been written.</returns>
public static bool Check(Directory dir, bool doFix)
{
    System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat;

    SegmentInfos sis = new SegmentInfos();
    try
    {
        sis.Read(dir);
    }
    catch (System.Exception t)
    {
        out_Renamed.WriteLine("ERROR: could not read any segments file in directory");
        // ToString() includes the exception type and message in addition to
        // the stack trace; StackTrace alone hides the reason for the failure.
        out_Renamed.WriteLine(t.ToString());
        out_Renamed.Flush();
        return false;
    }

    int numSegments = sis.Count;
    System.String segmentsFileName = sis.GetCurrentSegmentFileName();

    IndexInput input = null;
    try
    {
        input = dir.OpenInput(segmentsFileName);
    }
    catch (System.Exception t)
    {
        out_Renamed.WriteLine("ERROR: could not open segments file in directory");
        out_Renamed.WriteLine(t.ToString());
        out_Renamed.Flush();
        return false;
    }

    // The first int of the segments file is its format marker.
    int format = 0;
    try
    {
        format = input.ReadInt();
    }
    catch (System.Exception t)
    {
        out_Renamed.WriteLine("ERROR: could not read segment file version in directory");
        out_Renamed.WriteLine(t.ToString());
        out_Renamed.Flush();
        return false;
    }
    finally
    {
        if (input != null)
        {
            input.Close();
        }
    }

    // Translate the format marker into a readable label. This must be a single
    // if / else-if chain: with a standalone first "if", a FORMAT match fell
    // through to the trailing branches and its label was overwritten.
    System.String sFormat;
    bool skip = false;
    if (format == SegmentInfos.FORMAT)
    {
        sFormat = "FORMAT [Lucene Pre-2.1]";
    }
    else if (format == SegmentInfos.FORMAT_LOCKLESS)
    {
        sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
    }
    else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
    {
        sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
    }
    else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
    {
        sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
    }
    else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE)
    {
        sFormat = "int=" + format + " [newer version of Lucene than this tool]";
        skip = true;
    }
    else
    {
        sFormat = format + " [Lucene 1.3 or prior]";
    }

    out_Renamed.WriteLine("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat);

    if (skip)
    {
        out_Renamed.WriteLine("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
        return false;
    }

    // newSIS starts as an emptied clone of sis and collects the segments that pass.
    SegmentInfos newSIS = (SegmentInfos) sis.Clone();
    newSIS.Clear();
    bool changed = false;
    int totLoseDocCount = 0;
    int numBadSegments = 0;

    for (int i = 0; i < numSegments; i++)
    {
        SegmentInfo info = sis.Info(i);
        out_Renamed.WriteLine(" " + (1 + i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);

        // Until the reader has been opened, assume every doc of the segment
        // would be lost; once open, this is narrowed to the live doc count.
        int toLoseDocCount = info.docCount;
        SegmentReader reader = null;

        try
        {
            out_Renamed.WriteLine(" compound=" + info.GetUseCompoundFile());
            out_Renamed.WriteLine(" numFiles=" + info.Files().Count);
            out_Renamed.WriteLine(String.Format(nf, " size (MB)={0:f}", new Object[] { (info.SizeInBytes() / (1024.0 * 1024.0)) }));

            int docStoreOffset = info.GetDocStoreOffset();
            if (docStoreOffset != -1)
            {
                out_Renamed.WriteLine(" docStoreOffset=" + docStoreOffset);
                out_Renamed.WriteLine(" docStoreSegment=" + info.GetDocStoreSegment());
                out_Renamed.WriteLine(" docStoreIsCompoundFile=" + info.GetDocStoreIsCompoundFile());
            }

            System.String delFileName = info.GetDelFileName();
            if (delFileName == null)
            {
                out_Renamed.WriteLine(" no deletions");
            }
            else
            {
                out_Renamed.WriteLine(" has deletions [delFileName=" + delFileName + "]");
            }

            out_Renamed.Write(" test: open reader.........");
            reader = SegmentReader.Get(info);
            int numDocs = reader.NumDocs();
            toLoseDocCount = numDocs;
            if (reader.HasDeletions())
            {
                out_Renamed.WriteLine("OK [" + (info.docCount - numDocs) + " deleted docs]");
            }
            else
            {
                out_Renamed.WriteLine("OK");
            }

            // Every field's norms array must have one entry per document slot.
            out_Renamed.Write(" test: fields, norms.......");
            System.Collections.ICollection fieldNames = reader.GetFieldNames(IndexReader.FieldOption.ALL);
            System.Collections.IEnumerator it = fieldNames.GetEnumerator();
            while (it.MoveNext())
            {
                System.String fieldName = (System.String) it.Current;
                byte[] b = reader.Norms(fieldName);
                if (b.Length != info.docCount)
                {
                    throw new System.SystemException("norms for field \"" + fieldName + "\" is length " + b.Length + " != maxDoc " + info.docCount);
                }
            }
            out_Renamed.WriteLine("OK [" + fieldNames.Count + " fields]");

            out_Renamed.Write(" test: terms, freq, prox...");
            TermEnum termEnum = reader.Terms();
            TermPositions termPositions = reader.TermPositions();

            // Used only to count up # deleted docs for this
            // term
            MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

            long termCount = 0;
            long totFreq = 0;
            long totPos = 0;
            while (termEnum.Next())
            {
                termCount++;
                Term term = termEnum.Term();
                int docFreq = termEnum.DocFreq();
                termPositions.Seek(term);
                int lastDoc = -1;
                int freq0 = 0; // number of docs actually enumerated for this term
                totFreq += docFreq;
                while (termPositions.Next())
                {
                    freq0++;
                    int doc = termPositions.Doc();
                    int freq = termPositions.Freq();
                    if (doc <= lastDoc)
                    {
                        throw new System.SystemException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
                    }
                    lastDoc = doc;
                    if (freq <= 0)
                    {
                        throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                    }

                    int lastPos = -1;
                    totPos += freq;
                    for (int j = 0; j < freq; j++)
                    {
                        int pos = termPositions.NextPosition();
                        if (pos < 0)
                        {
                            throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                        }
                        // Positions must not go backwards within a document. Equal
                        // positions are accepted here.
                        // NOTE(review): presumably several tokens may share one
                        // position (zero position increment) - confirm.
                        if (pos < lastPos)
                        {
                            throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                        }
                        // Remember the position; without this the ordering check
                        // above could never fire.
                        lastPos = pos;
                    }
                }

                // Now count how many deleted docs occurred in
                // this term:
                int delCount;
                if (reader.HasDeletions())
                {
                    myTermDocs.Seek(term);
                    while (myTermDocs.Next())
                    {
                    }
                    delCount = myTermDocs.delCount;
                }
                else
                {
                    delCount = 0;
                }

                if (freq0 + delCount != docFreq)
                {
                    throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
                }
            }
            out_Renamed.WriteLine("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");

            // Load every non-deleted document and compare the count with the reader's.
            out_Renamed.Write(" test: stored fields.......");
            int docCount = 0;
            long totFields = 0;
            for (int j = 0; j < info.docCount; j++)
            {
                if (!reader.IsDeleted(j))
                {
                    docCount++;
                    Document doc = reader.Document(j);
                    totFields += doc.GetFields().Count;
                }
            }

            if (docCount != reader.NumDocs())
            {
                // Report the reader's count against the count seen, not docCount twice.
                throw new System.SystemException("docCount=" + reader.NumDocs() + " but saw " + docCount + " undeleted docs");
            }

            out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total field count; avg {1:f} fields per doc]", new Object[] { totFields, (((float) totFields) / docCount) }));

            out_Renamed.Write(" test: term vectors........");
            int totVectors = 0;
            for (int j = 0; j < info.docCount; j++)
            {
                if (!reader.IsDeleted(j))
                {
                    TermFreqVector[] tfv = reader.GetTermFreqVectors(j);
                    if (tfv != null)
                    {
                        totVectors += tfv.Length;
                    }
                }
            }

            out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new Object[] { totVectors, (((float) totVectors) / docCount) }));
            out_Renamed.WriteLine("");
        }
        catch (System.Exception t)
        {
            out_Renamed.WriteLine("FAILED");
            System.String comment;
            if (doFix)
            {
                comment = "will remove reference to this segment (-fix is specified)";
            }
            else
            {
                comment = "would remove reference to this segment (-fix was not specified)";
            }
            out_Renamed.WriteLine(" WARNING: " + comment + "; full exception:");
            // Include type and message: the diagnostics thrown above live in
            // the exception message, which StackTrace alone does not print.
            out_Renamed.WriteLine(t.ToString());
            out_Renamed.Flush();
            out_Renamed.WriteLine("");
            totLoseDocCount += toLoseDocCount;
            numBadSegments++;
            changed = true;
            continue;
        }
        finally
        {
            if (reader != null)
            {
                reader.Close();
            }
        }

        // Keeper
        newSIS.Add(info.Clone());
    }

    if (!changed)
    {
        out_Renamed.WriteLine("No problems were detected with this index.\n");
        return true;
    }

    out_Renamed.WriteLine("WARNING: " + numBadSegments + " broken segments detected");
    if (doFix)
    {
        out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents will be lost");
    }
    else
    {
        out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified");
    }
    out_Renamed.WriteLine();

    if (doFix)
    {
        out_Renamed.WriteLine("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
        for (int i = 0; i < 5; i++)
        {
            try
            {
                // 10,000,000 ticks of 100 ns each = one second.
                System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * 1000));
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                // NOTE(review): the interrupt is re-raised and the same step is
                // retried; if the pending interrupt makes the next Sleep throw
                // immediately this never makes progress - confirm the intent.
                SupportClass.ThreadClass.Current().Interrupt();
                i--;
                continue;
            }

            out_Renamed.WriteLine(" " + (5 - i) + "...");
        }

        out_Renamed.Write("Writing...");
        try
        {
            newSIS.Write(dir);
        }
        catch (System.Exception t)
        {
            out_Renamed.WriteLine("FAILED; exiting");
            out_Renamed.WriteLine(t.ToString());
            out_Renamed.Flush();
            return false;
        }
        out_Renamed.WriteLine("OK");
        out_Renamed.WriteLine("Wrote new segments file \"" + newSIS.GetCurrentSegmentFileName() + "\"");
    }
    else
    {
        out_Renamed.WriteLine("NOTE: would write new segments file [-fix was not specified]");
    }
    out_Renamed.WriteLine("");

    return false;
}
/// <summary>
/// Command-line entry point. Expects the path of the index and, optionally,
/// the <c>-fix</c> switch; the two may be given in either order. Prints a
/// usage message and exits with code 1 when the arguments do not match.
/// Otherwise opens the directory, runs <c>Check</c> and exits with code 0
/// when it returns true, else 1.
/// </summary>
/// <param name="args">Command-line arguments: <c>pathToIndex [-fix]</c>.</param>
[STAThread]
public static void Main(System.String[] args)
{
    bool doFix = false;
    System.String dirName = null;

    // Take the first argument that is not the switch as the index path, so
    // that "-fix pathToIndex" does not end up opening a directory named "-fix".
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].Equals("-fix"))
        {
            doFix = true;
        }
        else if (dirName == null)
        {
            dirName = args[i];
        }
    }

    // Exactly one path is required, plus the switch when it is present.
    // dirName is null when every argument was "-fix".
    if (args.Length != (doFix ? 2 : 1) || dirName == null)
    {
        out_Renamed.WriteLine("\nUsage: java Lucene.Net.Index.CheckIndex pathToIndex [-fix]\n"
            + "\n"
            + " -fix: actually write a new segments_N file, removing any problematic segments\n"
            + "\n"
            + "**WARNING**: -fix should only be used on an emergency basis as it will cause\n"
            + "documents (perhaps many) to be permanently removed from the index. Always make\n"
            + "a backup copy of your index before running this! Do not run this tool on an index\n"
            + "that is actively being written to. You have been warned!\n"
            + "\n"
            + "Run without -fix, this tool will open the index, report version information\n"
            + "and report any exceptions it hits and what action it would take if -fix were\n"
            + "specified. With -fix, this tool will remove any segments that have issues and\n"
            + "write a new segments_N file. This means all documents contained in the affected\n"
            + "segments will be removed.\n"
            + "\n"
            + "This tool exits with exit code 1 if the index cannot be opened or has has any\n"
            + "corruption, else 0.\n");
        System.Environment.Exit(1);
    }

    out_Renamed.WriteLine("\nOpening index @ " + dirName + "\n");

    Directory dir = null;
    try
    {
        dir = FSDirectory.GetDirectory(dirName);
    }
    catch (System.Exception t)
    {
        out_Renamed.WriteLine("ERROR: could not open directory \"" + dirName + "\"; exiting");
        // ToString() prints the exception type and message along with the
        // stack trace; StackTrace alone omits why the open failed.
        out_Renamed.WriteLine(t.ToString());
        out_Renamed.Flush();
        System.Environment.Exit(1);
    }

    bool isClean = Check(dir, doFix);
    System.Environment.Exit(isClean ? 0 : 1);
}
/// Type initializer: points out_Renamed at a writer over the standard output
/// stream, using the console's current output encoding, with AutoFlush
/// switched on.
static CheckIndex()
{
    System.IO.Stream stdoutStream = System.Console.OpenStandardOutput();
    System.Text.Encoding consoleEncoding = System.Console.Out.Encoding;

    System.IO.StreamWriter consoleWriter = new System.IO.StreamWriter(stdoutStream, consoleEncoding);
    consoleWriter.AutoFlush = true;

    out_Renamed = consoleWriter;
}
}
}