/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Search.Similar
{
/// <summary>
/// Simple similarity measures.
/// </summary>
public sealed class SimilarityQueries
{
    /// <summary>
    /// Private constructor: the class only exposes static members, so it is never instantiated.
    /// </summary>
    private SimilarityQueries()
    {
    }

    /// <summary>
    /// Simple similarity query generator.
    /// Takes every unique word and forms a boolean query where all words are optional.
    /// After you get this you'll use it to query your index for similar docs.
    /// The only caveat is the first hit returned should be your source document - you'll
    /// need to then ignore that.
    /// </summary>
    /// <remarks>
    /// <para>
    /// So, if you have a code fragment like this:
    /// </para>
    /// <code>
    /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
    /// </code>
    /// <para>
    /// The query returned, in string form, will be '(i use lucene to search fast searchers are good)'.
    /// </para>
    /// <para>
    /// The philosophy behind this method is "two documents are similar if they share lots of words".
    /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
    /// higher similarity score if they share more uncommon words.
    /// </para>
    /// <para>
    /// This method is fail-safe in that if a long 'body' is passed in and
    /// <c>BooleanQuery.Add</c> (used internally) throws <c>BooleanQuery.TooManyClauses</c>,
    /// the query as it is (i.e. with the clauses added so far) will be returned.
    /// </para>
    /// </remarks>
    /// <param name="body">the body of the document you want to find similar documents to</param>
    /// <param name="a">the analyzer to use to parse the body</param>
    /// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
    /// <param name="stop">optional set of stop words to ignore; may be null</param>
    /// <returns>a query with all unique words in 'body'</returns>
    /// <exception cref="System.IO.IOException">
    /// Not expected in practice: the body is read from an in-memory <see cref="System.IO.StringReader"/>.
    /// </exception>
    public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
    {
        BooleanQuery tmp = new BooleanQuery();

        // Words that already contributed a clause, so each word is added only once.
        ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>();

        // Dispose the token stream on every exit path (normal completion, the
        // TooManyClauses bail-out, or an exception raised while tokenizing).
        using (TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)))
        {
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                String word = termAtt.Term;

                // Ignore optional stop words.
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }

                // Ignore duplicates.
                if (already.Contains(word))
                {
                    continue;
                }
                already.Add(word);

                // Every word becomes an optional (SHOULD) term clause on the requested field.
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // Fail-safe: just return what we have, not the end of the world.
                    break;
                }
            }
        }

        return tmp;
    }
}
}