/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using Document = Lucene.Net.Documents.Document; using Directory = Lucene.Net.Store.Directory; using FSDirectory = Lucene.Net.Store.FSDirectory; using IndexInput = Lucene.Net.Store.IndexInput; namespace Lucene.Net.Index { /// Basic tool to check the health of an index and write a /// new segments file that removes reference to problematic /// segments. There are many more checks that this tool /// could do but does not yet, eg: reconstructing a segments /// file by looking for all loadable segments (if no segments /// file is found), removing specifically specified segments, /// listing files that exist but are not referenced, etc. /// public class CheckIndex { public static System.IO.TextWriter out_Renamed; private class MySegmentTermDocs : SegmentTermDocs { internal int delCount; internal MySegmentTermDocs(SegmentReader p) : base(p) { } public override void Seek(Term term) { base.Seek(term); delCount = 0; } protected internal override void SkippingDoc() { delCount++; } } /// Returns true if index is clean, else false. public static bool Check(Directory dir, bool doFix) { System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat; SegmentInfos sis = new SegmentInfos(); try { sis.Read(dir); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not read any segments file in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return false; } int numSegments = sis.Count; System.String segmentsFileName = sis.GetCurrentSegmentFileName(); IndexInput input = null; try { input = dir.OpenInput(segmentsFileName); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not open segments file in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return false; } int format = 0; try { format = input.ReadInt(); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not read segment file version in directory"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return false; } finally { if (input != null) input.Close(); } System.String sFormat = ""; bool skip = false; if (format == SegmentInfos.FORMAT) sFormat = "FORMAT [Lucene Pre-2.1]"; if (format == SegmentInfos.FORMAT_LOCKLESS) sFormat = "FORMAT_LOCKLESS [Lucene 2.1]"; else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]"; else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]"; else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; } else { sFormat = format + " [Lucene 1.3 or prior]"; } out_Renamed.WriteLine("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat); if (skip) { out_Renamed.WriteLine("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting"); return false; } SegmentInfos newSIS = (SegmentInfos) sis.Clone(); newSIS.Clear(); bool changed = false; int totLoseDocCount = 0; int numBadSegments = 0; for (int i = 0; i < numSegments; i++) { SegmentInfo info = sis.Info(i); out_Renamed.WriteLine(" " + (1 + i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount); int toLoseDocCount = info.docCount; SegmentReader reader = null; try { out_Renamed.WriteLine(" compound=" + info.GetUseCompoundFile()); out_Renamed.WriteLine(" numFiles=" + info.Files().Count); out_Renamed.WriteLine(String.Format(nf, " size (MB)={0:f}", new Object[] { (info.SizeInBytes() / (1024.0 * 1024.0)) })); int docStoreOffset = info.GetDocStoreOffset(); if (docStoreOffset != - 1) { out_Renamed.WriteLine(" docStoreOffset=" + docStoreOffset); out_Renamed.WriteLine(" docStoreSegment=" + info.GetDocStoreSegment()); out_Renamed.WriteLine(" docStoreIsCompoundFile=" + info.GetDocStoreIsCompoundFile()); } System.String delFileName = info.GetDelFileName(); if (delFileName == null) out_Renamed.WriteLine(" no deletions"); else out_Renamed.WriteLine(" has deletions [delFileName=" + delFileName + "]"); out_Renamed.Write(" test: open reader........."); reader = SegmentReader.Get(info); int numDocs = reader.NumDocs(); toLoseDocCount = numDocs; if (reader.HasDeletions()) out_Renamed.WriteLine("OK [" + (info.docCount - numDocs) + " deleted docs]"); else out_Renamed.WriteLine("OK"); out_Renamed.Write(" test: fields, norms......."); System.Collections.ICollection fieldNames = reader.GetFieldNames(IndexReader.FieldOption.ALL); System.Collections.IEnumerator it = fieldNames.GetEnumerator(); while (it.MoveNext()) { System.String fieldName = (System.String) it.Current; byte[] b = reader.Norms(fieldName); if (b.Length != info.docCount) throw new System.SystemException("norms for field \"" + fieldName + "\" is length " + b.Length + " != maxDoc " + info.docCount); } out_Renamed.WriteLine("OK [" + fieldNames.Count + " fields]"); out_Renamed.Write(" test: terms, freq, prox..."); TermEnum termEnum = reader.Terms(); TermPositions termPositions = reader.TermPositions(); // Used only to count up # deleted docs for this // term MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); long termCount = 0; long totFreq = 0; long totPos = 0; while (termEnum.Next()) { termCount++; Term term = termEnum.Term(); int docFreq = termEnum.DocFreq(); termPositions.Seek(term); int lastDoc = - 1; int freq0 = 0; totFreq += docFreq; while (termPositions.Next()) { freq0++; int doc = termPositions.Doc(); int freq = termPositions.Freq(); if (doc <= lastDoc) { throw new System.SystemException("term " + term + ": doc " + doc + " < lastDoc " + lastDoc); } lastDoc = doc; if (freq <= 0) { throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); } int lastPos = - 1; totPos += freq; for (int j = 0; j < freq; j++) { int pos = termPositions.NextPosition(); if (pos < 0) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds"); } if (pos <= lastPos) { throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos); } } } // Now count how many deleted docs occurred in // this term: int delCount; if (reader.HasDeletions()) { myTermDocs.Seek(term); while (myTermDocs.Next()) { } delCount = myTermDocs.delCount; } else delCount = 0; if (freq0 + delCount != docFreq) { throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount); } } out_Renamed.WriteLine("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]"); out_Renamed.Write(" test: stored fields......."); int docCount = 0; long totFields = 0; for (int j = 0; j < info.docCount; j++) if (!reader.IsDeleted(j)) { docCount++; Document doc = reader.Document(j); totFields += doc.GetFields().Count; } if (docCount != reader.NumDocs()) throw new System.SystemException("docCount=" + docCount + " but saw " + docCount + " undeleted docs"); out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total field count; avg {1:f} fields per doc]", new Object[] { totFields, (((float)totFields) / docCount) })); out_Renamed.Write(" test: term vectors........"); int totVectors = 0; for (int j = 0; j < info.docCount; j++) if (!reader.IsDeleted(j)) { TermFreqVector[] tfv = reader.GetTermFreqVectors(j); if (tfv != null) totVectors += tfv.Length; } out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new Object[] { totVectors, (((float)totVectors) / docCount) })); out_Renamed.WriteLine(""); } catch (System.Exception t) { out_Renamed.WriteLine("FAILED"); System.String comment; if (doFix) comment = "will remove reference to this segment (-fix is specified)"; else comment = "would remove reference to this segment (-fix was not specified)"; out_Renamed.WriteLine(" WARNING: " + comment + "; full exception:"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); out_Renamed.WriteLine(""); totLoseDocCount += toLoseDocCount; numBadSegments++; changed = true; continue; } finally { if (reader != null) reader.Close(); } // Keeper newSIS.Add(info.Clone()); } if (!changed) { out_Renamed.WriteLine("No problems were detected with this index.\n"); return true; } else { out_Renamed.WriteLine("WARNING: " + numBadSegments + " broken segments detected"); if (doFix) out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents will be lost"); else out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified"); out_Renamed.WriteLine(); } if (doFix) { out_Renamed.WriteLine("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!"); for (int i = 0; i < 5; i++) { try { System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * 1000)); } catch (System.Threading.ThreadInterruptedException) { SupportClass.ThreadClass.Current().Interrupt(); i--; continue; } out_Renamed.WriteLine(" " + (5 - i) + "..."); } out_Renamed.Write("Writing..."); try { newSIS.Write(dir); } catch (System.Exception t) { out_Renamed.WriteLine("FAILED; exiting"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); return false; } out_Renamed.WriteLine("OK"); out_Renamed.WriteLine("Wrote new segments file \"" + newSIS.GetCurrentSegmentFileName() + "\""); } else { out_Renamed.WriteLine("NOTE: would write new segments file [-fix was not specified]"); } out_Renamed.WriteLine(""); return false; } [STAThread] public static void Main(System.String[] args) { bool doFix = false; for (int i = 0; i < args.Length; i++) if (args[i].Equals("-fix")) { doFix = true; break; } if (args.Length != (doFix ? 2 : 1)) { out_Renamed.WriteLine("\nUsage: java Lucene.Net.Index.CheckIndex pathToIndex [-fix]\n" + "\n" + " -fix: actually write a new segments_N file, removing any problematic segments\n" + "\n" + "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" + "documents (perhaps many) to be permanently removed from the index. Always make\n" + "a backup copy of your index before running this! Do not run this tool on an index\n" + "that is actively being written to. You have been warned!\n" + "\n" + "Run without -fix, this tool will open the index, report version information\n" + "and report any exceptions it hits and what action it would take if -fix were\n" + "specified. With -fix, this tool will remove any segments that have issues and\n" + "write a new segments_N file. This means all documents contained in the affected\n" + "segments will be removed.\n" + "\n" + "This tool exits with exit code 1 if the index cannot be opened or has has any\n" + "corruption, else 0.\n"); System.Environment.Exit(1); } System.String dirName = args[0]; out_Renamed.WriteLine("\nOpening index @ " + dirName + "\n"); Directory dir = null; try { dir = FSDirectory.GetDirectory(dirName); } catch (System.Exception t) { out_Renamed.WriteLine("ERROR: could not open directory \"" + dirName + "\"; exiting"); out_Renamed.Write(t.StackTrace); out_Renamed.Flush(); System.Environment.Exit(1); } bool isClean = Check(dir, doFix); int exitCode; if (isClean) exitCode = 0; else exitCode = 1; System.Environment.Exit(exitCode); } static CheckIndex() { System.IO.StreamWriter temp_writer; temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding); temp_writer.AutoFlush = true; out_Renamed = temp_writer; } } }