/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using Lucene.Net.Documents;
using Lucene.Net.Store;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using Lucene.Net.Analysis;

using NUnit.Framework;

namespace Lucene.Net.Index
{
	[TestFixture]
	public class TestStressIndexing2 : LuceneTestCase
	{
		internal class AnonymousClassComparator : System.Collections.IComparer
		{
			// Orders fields by name + string value so that two documents can be
			// compared field-by-field independently of the original field order.
			// Locals (rather than shared instance fields) keep this comparator
			// safe to use from multiple threads.
			public virtual int Compare(System.Object o1, System.Object o2)
			{
				if (o1 == o2)
					return 0;
				Fieldable f1 = (Fieldable) o1;
				Fieldable f2 = (Fieldable) o2;
				return String.CompareOrdinal(f1.Name() + f1.StringValue(), f2.Name() + f2.StringValue());
			}
		}
		
		internal static int maxFields = 4;
		internal static int bigFieldSize = 10;
		internal static bool sameFieldOrder = false;
		internal static bool autoCommit = false;
		internal static int mergeFactor = 3;
		internal static int maxBufferedDocs = 3;
		internal static int seed = 0;
		
		internal static System.Random r = new System.Random(0);
		
		[Test]
		public virtual void TestRandom()
		{
			Directory dir1 = new MockRAMDirectory();
			Directory dir2 = new MockRAMDirectory();
			// index the same set of random docs concurrently (dir1) and
			// serially (dir2), then verify the two indexes are equivalent
			System.Collections.IDictionary docs = IndexRandom(10, 100, 100, dir1);
			IndexSerial(docs, dir2);
			VerifyEquals(dir1, dir2, "id");
		}
		
		[Test]
		public virtual void TestMultiConfig()
		{
			// test lots of smaller, different parameter combinations together
			for (int i = 0; i < 100; i++)
			{
				// increase iterations for better testing
				sameFieldOrder = r.NextDouble() > 0.5;
				autoCommit = r.NextDouble() > 0.5;
				mergeFactor = r.Next(3) + 2;
				maxBufferedDocs = r.Next(3) + 2;
				seed++;
				int nThreads = r.Next(5) + 1;
				int iter = r.Next(10) + 1;
				int range = r.Next(20) + 1;
				
				Directory dir1 = new MockRAMDirectory();
				Directory dir2 = new MockRAMDirectory();
				System.Collections.IDictionary docs = IndexRandom(nThreads, iter, range, dir1);
				IndexSerial(docs, dir2);
				VerifyEquals(dir1, dir2, "id");
			}
		}
		
		internal static Term idTerm = new Term("id", "");
		internal IndexingThread[] threads;
		internal static System.Collections.IComparer fieldNameComparator;
		
		// This test avoids using any extra synchronization in the multiple
		// indexing threads to test that IndexWriter does correctly synchronize
		// everything.
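		
		// Verification strategy: IndexRandom indexes randomly generated docs
		// from several threads with no external locking, IndexSerial re-indexes
		// the same docs in a single thread, and VerifyEquals then checks that
		// stored fields, term vectors and postings of the two indexes match.
		// When comparing postings, each (doc, freq) pair is packed into one
		// long so that a plain array sort orders them by doc id:
		//
		//   long packed = (((long) doc) << 32) | freq;  // pack
		//   int doc  = (int) (packed >> 32);            // unpack doc id
		//   int freq = (int) (packed & 0xFFFFFFFFL);    // unpack freq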
		public virtual System.Collections.IDictionary IndexRandom(int nThreads, int iterations, int range, Directory dir)
		{
			IndexWriter w = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true);
			w.SetUseCompoundFile(false);
			
			// force many merges
			w.SetMergeFactor(mergeFactor);
			w.SetRAMBufferSizeMB(.1);
			w.SetMaxBufferedDocs(maxBufferedDocs);
			
			threads = new IndexingThread[nThreads];
			for (int i = 0; i < threads.Length; i++)
			{
				IndexingThread th = new IndexingThread();
				th.w = w;
				th.base_Renamed = 1000000 * i;
				th.range = range;
				th.iterations = iterations;
				threads[i] = th;
			}
			
			for (int i = 0; i < threads.Length; i++)
			{
				threads[i].Start();
			}
			for (int i = 0; i < threads.Length; i++)
			{
				threads[i].Join();
			}
			
			// w.Optimize();
			w.Close();
			
			System.Collections.IDictionary docs = new System.Collections.Hashtable();
			for (int i = 0; i < threads.Length; i++)
			{
				IndexingThread th = threads[i];
				// lock on the thread object so its docs map, written by that
				// thread, is safely visible here
				lock (th)
				{
					System.Collections.IEnumerator e = th.docs.Keys.GetEnumerator();
					while (e.MoveNext())
					{
						docs[e.Current] = th.docs[e.Current];
					}
				}
			}
			
			return docs;
		}
		
		public static void IndexSerial(System.Collections.IDictionary docs, Directory dir)
		{
			IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer());
			
			// index all docs in a single thread
			System.Collections.IEnumerator iter = docs.Values.GetEnumerator();
			while (iter.MoveNext())
			{
				Document d = (Document) iter.Current;
				System.Collections.ArrayList fields = new System.Collections.ArrayList();
				fields.AddRange(d.GetFields());
				
				// We can't sort the fields here: if multiple fields have the
				// same name, each instance must be added in the same order as in
				// the original doc, because such fields are effectively
				// concatenated and term position/offset information must be
				// maintained.
				//fields.Sort(fieldNameComparator);
				
				Document d1 = new Document();
				d1.SetBoost(d.GetBoost());
				for (int i = 0; i < fields.Count; i++)
				{
					d1.Add((Fieldable) fields[i]);
				}
				w.AddDocument(d1);
				// System.Console.Out.WriteLine("indexing " + d1);
			}
			
			w.Close();
		}
		
		public static void VerifyEquals(Directory dir1, Directory dir2, System.String idField)
		{
			IndexReader r1 = IndexReader.Open(dir1);
			IndexReader r2 = IndexReader.Open(dir2);
			VerifyEquals(r1, r2, idField);
			r1.Close();
			r2.Close();
		}
		
		public static void VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
		{
			Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
			bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());
			
			int[] r2r1 = new int[r2.MaxDoc()]; // r2 id to r1 id mapping
			
			TermDocs termDocs1 = r1.TermDocs();
			TermDocs termDocs2 = r2.TermDocs();
			
			// create mapping from id2 space to id1 space based on idField
			idField = String.Intern(idField);
			TermEnum termEnum = r1.Terms(new Term(idField, ""));
			do 
			{
				Term term = termEnum.Term();
				// idField is interned, so a reference comparison is safe here
				if (term == null || (System.Object) term.Field() != (System.Object) idField)
					break;
				
				termDocs1.Seek(termEnum);
				Assert.IsTrue(termDocs1.Next());
				int id1 = termDocs1.Doc();
				Assert.IsFalse(termDocs1.Next());
				
				termDocs2.Seek(termEnum);
				Assert.IsTrue(termDocs2.Next());
				int id2 = termDocs2.Doc();
				Assert.IsFalse(termDocs2.Next());
				
				r2r1[id2] = id1;
				
				// verify stored fields are equivalent
				VerifyEquals(r1.Document(id1), r2.Document(id2));
				
				try
				{
					// verify term vectors are equivalent
					VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
				}
				catch (System.ApplicationException e)
				{
					System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
					TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
					System.Console.Out.WriteLine("  d1=" + tv1);
					if (tv1 != null)
						for (int i = 0; i < tv1.Length; i++)
						{
							System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);
						}
					
					TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
					System.Console.Out.WriteLine("  d2=" + tv2);
					if (tv2 != null)
						for (int i = 0; i < tv2.Length; i++)
						{
							System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);
						}
					
					throw e;
				}
			}
			while (termEnum.Next());
			
			termEnum.Close();
			
			// Verify postings
			TermEnum termEnum1 = r1.Terms(new Term("", ""));
			TermEnum termEnum2 = r2.Terms(new Term("", ""));
			
			// pack both doc and freq into a single element for easy sorting
			long[] info1 = new long[r1.NumDocs()];
			long[] info2 = new long[r2.NumDocs()];
			
			for (; ; )
			{
				Term term1, term2;
				
				// iterate until we get some docs
				int len1;
				for (; ; )
				{
					len1 = 0;
					term1 = termEnum1.Term();
					if (term1 == null)
						break;
					termDocs1.Seek(termEnum1);
					while (termDocs1.Next())
					{
						int d1 = termDocs1.Doc();
						int f1 = termDocs1.Freq();
						info1[len1] = (((long) d1) << 32) | f1;
						len1++;
					}
					if (len1 > 0)
						break;
					if (!termEnum1.Next())
						break;
				}
				
				// iterate until we get some docs
				int len2;
				for (; ; )
				{
					len2 = 0;
					term2 = termEnum2.Term();
					if (term2 == null)
						break;
					termDocs2.Seek(termEnum2);
					while (termDocs2.Next())
					{
						int d2 = termDocs2.Doc();
						int f2 = termDocs2.Freq();
						// map the r2 doc id into r1's id space before packing
						info2[len2] = (((long) r2r1[d2]) << 32) | f2;
						len2++;
					}
					if (len2 > 0)
						break;
					if (!termEnum2.Next())
						break;
				}
				
				if (!hasDeletes)
					Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());
				
				Assert.AreEqual(len1, len2);
				if (len1 == 0)
					break; // no more terms
				
				Assert.AreEqual(term1, term2);
				
				// sort info2 to get it into ascending doc id order
				System.Array.Sort(info2, 0, len2);
				
				// now compare
				for (int i = 0; i < len1; i++)
				{
					Assert.AreEqual(info1[i], info2[i]);
				}
				
				termEnum1.Next();
				termEnum2.Next();
			}
		}
		
		public static void VerifyEquals(Document d1, Document d2)
		{
			System.Collections.ArrayList ff1 = new System.Collections.ArrayList(d1.GetFields());
			System.Collections.ArrayList ff2 = new System.Collections.ArrayList(d2.GetFields());
			
			ff1.Sort(fieldNameComparator);
			ff2.Sort(fieldNameComparator);
			
			if (ff1.Count != ff2.Count)
			{
				// print out whole doc on error
				System.Console.Write("Doc 1:");
				for (int j = 0; j < ff1.Count; j++)
				{
					Fieldable field = (Fieldable) ff1[j];
					System.Console.Write(" {0}={1};", field.Name(), field.StringValue());
				}
				System.Console.WriteLine();
				System.Console.Write("Doc 2:");
				for (int j = 0; j < ff2.Count; j++)
				{
					Fieldable field = (Fieldable) ff2[j];
					System.Console.Write(" {0}={1};", field.Name(), field.StringValue());
				}
				System.Console.WriteLine();
				Assert.AreEqual(ff1.Count, ff2.Count);
			}
			
			for (int i = 0; i < ff1.Count; i++)
			{
				Fieldable f1 = (Fieldable) ff1[i];
				Fieldable f2 = (Fieldable) ff2[i];
				if (f1.IsBinary())
				{
					System.Diagnostics.Debug.Assert(f2.IsBinary());
					//TODO: compare the binary values as well
				}
				else
				{
					System.String s1 = f1.StringValue();
					System.String s2 = f2.StringValue();
					if (!s1.Equals(s2))
					{
						// print out whole doc on error
						System.Console.Write("Doc 1:");
						for (int j = 0; j < ff1.Count; j++)
						{
							Fieldable field = (Fieldable) ff1[j];
							System.Console.Write(" {0}={1};", field.Name(), field.StringValue());
						}
						System.Console.WriteLine();
						System.Console.Write("Doc 2:");
						for (int j = 0; j < ff2.Count; j++)
						{
							Fieldable field = (Fieldable) ff2[j];
							System.Console.Write(" {0}={1};", field.Name(), field.StringValue());
						}
						System.Console.WriteLine();
						Assert.AreEqual(s1, s2);
					}
				}
			}
		}
		
		public static void VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
		{
			if (d1 == null)
			{
				Assert.IsTrue(d2 == null);
				return;
			}
			Assert.IsTrue(d2 != null);
			
			Assert.AreEqual(d1.Length, d2.Length);
			for (int i = 0; i < d1.Length; i++)
			{
				TermFreqVector v1 = d1[i];
				TermFreqVector v2 = d2[i];
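				// For each pair of vectors, the term strings and frequencies must
				// match exactly; when positions are stored, the positions (and
				// offsets, if present) are compared element-by-element below.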
				Assert.AreEqual(v1.Size(), v2.Size());
				int numTerms = v1.Size();
				System.String[] terms1 = v1.GetTerms();
				System.String[] terms2 = v2.GetTerms();
				int[] freq1 = v1.GetTermFrequencies();
				int[] freq2 = v2.GetTermFrequencies();
				for (int j = 0; j < numTerms; j++)
				{
					if (!terms1[j].Equals(terms2[j]))
						Assert.AreEqual(terms1[j], terms2[j]);
					Assert.AreEqual(freq1[j], freq2[j]);
				}
				if (v1 is TermPositionVector)
				{
					Assert.IsTrue(v2 is TermPositionVector);
					TermPositionVector tpv1 = (TermPositionVector) v1;
					TermPositionVector tpv2 = (TermPositionVector) v2;
					for (int j = 0; j < numTerms; j++)
					{
						int[] pos1 = tpv1.GetTermPositions(j);
						int[] pos2 = tpv2.GetTermPositions(j);
						Assert.AreEqual(pos1.Length, pos2.Length);
						TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
						TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
						if (offsets1 == null)
							Assert.IsTrue(offsets2 == null);
						else
							Assert.IsTrue(offsets2 != null);
						for (int k = 0; k < pos1.Length; k++)
						{
							Assert.AreEqual(pos1[k], pos2[k]);
							if (offsets1 != null)
							{
								Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
								Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
							}
						}
					}
				}
			}
		}
		
		internal class IndexingThread : SupportClass.ThreadClass
		{
			internal IndexWriter w;
			internal int base_Renamed; // per-thread doc id offset ("base" in the original Java)
			internal int range;
			internal int iterations;
			internal System.Collections.IDictionary docs = new System.Collections.Hashtable(); // Map<String, Document>
			internal System.Random r;
			
			public virtual int NextInt(int lim)
			{
				return r.Next(lim);
			}
			
			public virtual System.String GetString(int nTokens)
			{
				// 0 means "pick a small random token count"
				nTokens = nTokens != 0 ? nTokens : r.Next(4) + 1;
				// build directly into a char[]; the original Java avoided
				// StringBuffer because it adds extra synchronization
				char[] arr = new char[nTokens * 2];
				for (int i = 0; i < nTokens; i++)
				{
					arr[i * 2] = (char) ('A' + r.Next(10));
					arr[i * 2 + 1] = ' ';
				}
				return new System.String(arr);
			}
			
			public virtual void IndexDoc()
			{
				Document d = new Document();
				
				System.Collections.ArrayList fields = new System.Collections.ArrayList();
				int id = base_Renamed + NextInt(range);
				System.String idString = "" + id;
				Field idField = new Field("id", idString, Field.Store.YES, Field.Index.NO_NORMS);
				fields.Add(idField);
				
				int nFields = NextInt(Lucene.Net.Index.TestStressIndexing2.maxFields);
				for (int i = 0; i < nFields; i++)
				{
					// pick a random term-vector mode...
					Field.TermVector tvVal = Field.TermVector.NO;
					switch (NextInt(4))
					{
						case 0: 
							tvVal = Field.TermVector.NO;
							break;
						case 1: 
							tvVal = Field.TermVector.YES;
							break;
						case 2: 
							tvVal = Field.TermVector.WITH_POSITIONS;
							break;
						case 3: 
							tvVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
							break;
					}
					
					// ...and a random field flavor
					switch (NextInt(4))
					{
						case 0: 
							fields.Add(new Field("f0", GetString(1), Field.Store.YES, Field.Index.NO_NORMS, tvVal));
							break;
						case 1: 
							fields.Add(new Field("f1", GetString(0), Field.Store.NO, Field.Index.TOKENIZED, tvVal));
							break;
						case 2: 
							fields.Add(new Field("f2", GetString(0), Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
							break;
						case 3: 
							fields.Add(new Field("f3", GetString(Lucene.Net.Index.TestStressIndexing2.bigFieldSize), Field.Store.YES, Field.Index.TOKENIZED, tvVal));
							break;
					}
				}
				
				if (Lucene.Net.Index.TestStressIndexing2.sameFieldOrder)
				{
					fields.Sort(Lucene.Net.Index.TestStressIndexing2.fieldNameComparator);
				}
				else
				{
					// random placement of id field also
					int index = NextInt(fields.Count);
					fields[0] = fields[index];
					fields[index] = idField;
				}
				
				for (int i = 0; i < fields.Count; i++)
				{
					d.Add((Fieldable) fields[i]);
				}
				w.UpdateDocument(Lucene.Net.Index.TestStressIndexing2.idTerm.CreateTerm(idString), d);
				// System.Console.Out.WriteLine("indexing " + d);
				docs[idString] = d;
			}
			
			override public void Run()
			{
				try
				{
					r = new System.Random(base_Renamed + range + Lucene.Net.Index.TestStressIndexing2.seed);
					for (int i = 0; i < iterations; i++)
					{
						IndexDoc();
					}
				}
				catch (System.Exception e)
				{
					System.Console.Error.WriteLine(e.StackTrace);
					Assert.Fail(e.ToString());
				}
				
				lock (this)
				{
					// dummy read inside the lock so the docs map is safely
					// published to the thread that joins us (mirrors the
					// synchronized block in the original Java test)
					int generatedAux = docs.Count;
				}
			}
		}
		
		static TestStressIndexing2()
		{
			fieldNameComparator = new AnonymousClassComparator();
		}
	}
}