/// float score = search(String text, Query query) ////// /// Each instance can hold at most one Lucene "document", with a document containing /// zero or more "fields", each field having a name and a fulltext value. The /// fulltext value is tokenized (split and transformed) into zero or more index terms /// (aka words) on
addField()
, according to the policy implemented by an
/// Analyzer. For example, Lucene analyzers can split on whitespace, normalize to lower case
/// for case insensitivity, ignore common terms with little discriminatory value such as "he", "in", "and" (stop
/// words), reduce the terms to their natural linguistic root form such as "fishing"
/// being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri
/// (upon indexing and/or querying), etc. For details, see
/// Lucene Analyzer Intro.
/// /// Arbitrary Lucene queries can be run against this class - see Lucene Query Syntax /// as well as Query Parser Rules. /// Note that a Lucene query selects on the field names and associated (indexed) /// tokenized terms, not on the original fulltext(s) - the latter are not stored /// but rather thrown away immediately after tokenization. ///
/// For some interesting background information on search technology, see Bob Wyman's /// Prospective Search, /// Jim Gray's /// /// A Call to Arms - Custom subscriptions, and Tim Bray's /// On Search, the Series. /// /// ///
/// Analyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER; /// //Analyzer analyzer = new SimpleAnalyzer(); /// MemoryIndex index = new MemoryIndex(); /// index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer); /// index.addField("author", "Tales of James", analyzer); /// QueryParser parser = new QueryParser("content", analyzer); /// float score = index.search(parser.parse("+author:james +salmon~ +fish/// manual~")); /// if (score > 0.0f) { /// System.out.println("it's a match"); /// } else { /// System.out.println("no match found"); /// } /// System.out.println("indexData=" + index.toString()); ////// /// ///
/// (: An XQuery that finds all books authored by James that have something to do with "salmon fishing manuals", sorted by relevance :) /// declare namespace lucene = "java:nux.xom.pool.FullTextUtil"; /// declare variable $query := "+salmon~ +fish/// manual~"; (: any arbitrary Lucene query can go here :) /// /// for $book in /books/book[author="James" and lucene:match(abstract, $query) > 0.0] /// let $score := lucene:match($book/abstract, $query) /// order by $score descending /// return $book ////// /// ///
/// MemoryIndex index = ... /// synchronized (index) { /// // read and/or write index (i.e. add fields and/or query) /// } ////// /// ///
RAMDirectory
.
/// Note that a RAMDirectory has particularly
/// large efficiency overheads for small to medium sized texts, both in time and space.
/// Indexing a field with N tokens takes O(N) in the best case, and O(N log N) in the worst
/// case. Memory consumption is probably larger than for a RAMDirectory.
///
/// Example throughput of many simple term queries over a single MemoryIndex:
/// ~500000 queries/sec on a MacBook Pro, jdk 1.5.0_06, server VM.
/// As always, your mileage may vary.
///
/// If you're curious about
/// the whereabouts of bottlenecks, run java 1.5 with the non-perturbing '-server
/// -agentlib:hprof=cpu=samples,depth=10' flags, then study the trace log and
/// correlate its hotspot trailer with its call stack headers (see
/// hprof tracing ).
///
/**
 * Convenience method; equivalent to
 * <code>AddField(fieldName, stream, 1.0f)</code>.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from
 */
public void AddField(String fieldName, TokenStream stream)
{
    AddField(fieldName, stream, 1.0f);
}
/**
* Iterates over the given token stream and adds the resulting terms to the index;
* Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
* Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added with this method via
 * {@link #CreateKeywordTokenStream(Collection)}, the Lucene contrib KeywordTokenizer,
 * or similar utilities.
*
* @param fieldName
* a name to be associated with the text
* @param stream
* the token stream to retrieve tokens from.
* @param boost
* the boost factor for hits for this field
* @see org.apache.lucene.document.Field#setBoost(float)
*/
public void AddField(String fieldName, TokenStream stream, float boost)
{
try
{
if (fieldName == null)
throw new ArgumentException("fieldName must not be null");
if (stream == null)
throw new ArgumentException("token stream must not be null");
if (boost <= 0.0f)
throw new ArgumentException("boost factor must be greater than 0.0");
if (fields[fieldName] != null)
throw new ArgumentException("field must not be added more than once");
var terms = new HashMapint
elements;
* implemented with arrays.
*/
/**
 * Simple, minimalistic growable list of primitive ints, implemented with
 * plain arrays; a lightweight substitute for a List&lt;int&gt;, tuned for
 * the (pos, start, end) triple-append pattern used by this index.
 */
[Serializable]
private sealed class ArrayIntList
{
    private int[] elements;  // backing store; grown geometrically on demand
    private int size = 0;    // number of slots of elements currently in use

    // Java serialVersionUID leftover from the Lucene port; .NET binary
    // serialization ignores static/const fields, so this is kept only for
    // fidelity with the Java original. Made const since it is never mutated.
    private const long serialVersionUID = 2282195016849084649L;

    // Parameterless ctor with a small default capacity; kept private —
    // presumably retained for serialization frameworks (TODO confirm).
    private ArrayIntList()
        : this(10)
    {
    }

    /** Creates a list that can hold initialCapacity ints before growing. */
    public ArrayIntList(int initialCapacity)
    {
        elements = new int[initialCapacity];
    }

    /** Appends a single int, growing the backing array if necessary. */
    public void Add(int elem)
    {
        if (size == elements.Length) EnsureCapacity(size + 1);
        elements[size++] = elem;
    }

    /** Appends a (pos, start, end) triple in one call. */
    public void Add(int pos, int start, int end)
    {
        if (size + 3 > elements.Length) EnsureCapacity(size + 3);
        elements[size] = pos;
        elements[size + 1] = start;
        elements[size + 2] = end;
        size += 3;
    }

    /**
     * Returns the int stored at the given index.
     * Throws IndexOutOfRangeException for a negative or too-large index.
     */
    public int Get(int index)
    {
        // Also reject negative indexes so callers always get the descriptive
        // message from ThrowIndex instead of the bare array exception.
        if (index < 0 || index >= size) ThrowIndex(index);
        return elements[index];
    }

    /** Returns the number of ints currently stored. */
    public int Size()
    {
        return size;
    }

    /**
     * Returns every stride-th element (elements 0, stride, 2*stride, ...)
     * as a new array; with stride == 1 this is a plain copy of the contents.
     */
    public int[] ToArray(int stride)
    {
        int[] arr = new int[Size()/stride];
        if (stride == 1)
        {
            Array.Copy(elements, 0, arr, 0, size); // fast path: bulk copy
        }
        else
        {
            for (int i = 0, j = 0; j < size; i++, j += stride) arr[i] = elements[j];
        }
        return arr;
    }

    // Grows the backing array to at least minCapacity (typically 1.5x + 1).
    private void EnsureCapacity(int minCapacity)
    {
        int newCapacity = Math.Max(minCapacity, (elements.Length*3)/2 + 1);
        int[] newElements = new int[newCapacity];
        Array.Copy(elements, 0, newElements, 0, size);
        elements = newElements;
    }

    // Throws a descriptive exception for an out-of-bounds index.
    private void ThrowIndex(int index)
    {
        throw new IndexOutOfRangeException("index: " + index
            + ", size: " + size);
    }

    /** returns the first few positions (without offsets); debug only */
    public string ToString(int stride)
    {
        int s = Size()/stride;
        int len = Math.Min(10, s); // avoid printing huge lists
        StringBuilder buf = new StringBuilder(4*len);
        buf.Append("[");
        for (int i = 0; i < len; i++)
        {
            buf.Append(Get(i*stride));
            if (i < len - 1) buf.Append(", ");
        }
        if (len != s) buf.Append(", ..."); // and some more...
        buf.Append("]");
        return buf.ToString();
    }
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
// Sentinel term with an empty field and text; Terms() passes it as the
// starting term so enumeration begins at the front of the sorted term table.
private static readonly Term MATCH_ALL_TERM = new Term("");
/**
* Search support for Lucene framework integration; implements all methods
* required by the Lucene IndexReader contracts.
*/
private sealed partial class MemoryIndexReader : IndexReader
{
// The MemoryIndex whose in-memory postings this reader exposes.
private readonly MemoryIndex _index;
private Searcher searcher; // needed to find searcher.getSimilarity()

/** Creates a reader view over the given index; the index remains caller-owned. */
internal MemoryIndexReader(MemoryIndex index)
{
    _index = index;
}
/**
 * Looks up the per-field index data for the given field name; appears to
 * yield null for fields that were never added (callers null-check the result).
 */
private Info GetInfo(String fieldName)
{
    Info result = _index.fields[fieldName];
    return result;
}
/** Looks up the per-field index data at position pos of the sorted field table. */
private Info GetInfo(int pos)
{
    var entry = _index.sortedFields[pos];
    return entry.Value;
}
/**
 * Returns 1 if the term occurs in the index, else 0. Since a MemoryIndex
 * holds at most one document, the document frequency can never exceed 1.
 */
public override int DocFreq(Term term)
{
    Info info = GetInfo(term.Field);
    int freq = (info != null && info.GetPositions(term.Text) != null) ? 1 : 0;
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.docFreq: " + term + ", freq:" + freq);
    return freq;
}
/**
 * Returns an enumeration over all terms in the index; delegates to
 * Terms(Term) with the empty sentinel term so enumeration starts at the
 * beginning of the sorted term table.
 */
public override TermEnum Terms()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.terms()");
    return Terms(MATCH_ALL_TERM);
}
public override TermEnum Terms(Term term)
{
if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.terms: " + term);
int i; // index into info.sortedTerms
int j; // index into sortedFields
_index.SortFields();
if (_index.sortedFields.Length == 1 && _index.sortedFields[0].Key == term.Field)
{
j = 0; // fast path
}
else
{
j = Array.BinarySearch(_index.sortedFields, new KeyValuePair