/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Search.Similar
{
    /// <summary>
    /// Simple similarity measures.
    /// </summary>
    public sealed class SimilarityQueries
    {
        /// <summary>
        /// Private constructor: the class only exposes static members and is
        /// not meant to be instantiated.
        /// </summary>
        private SimilarityQueries()
        {
        }

        /// <summary>
        /// Simple similarity query generator.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your index searcher for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document -
        /// you'll need to then ignore that.
        /// <para>
        /// So, if you have a code fragment like this:
        /// </para>
        /// <code>
        /// Query q = SimilarityQueries.FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// <para>
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        /// </para>
        /// <para>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
        /// higher similarity score if they share more uncommon words.
        /// </para>
        /// <para>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <c>BooleanQuery.Add</c> (used internally) throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the query as built so far will be returned.
        /// </para>
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to</param>
        /// <param name="a">the analyzer to use to parse the body</param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
        /// <param name="stop">optional set of stop words to ignore; may be <c>null</c></param>
        /// <returns>a query with all unique words in 'body'</returns>
        /// <exception cref="ArgumentNullException">
        /// if <paramref name="body"/> or <paramref name="a"/> is <c>null</c>
        /// </exception>
        /// <remarks>
        /// The body is read from an in-memory <see cref="System.IO.StringReader"/>, so an
        /// I/O failure while tokenizing is not expected.
        /// </remarks>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
        {
            if (body == null)
            {
                throw new ArgumentNullException("body");
            }
            if (a == null)
            {
                throw new ArgumentNullException("a");
            }

            BooleanQuery tmp = new BooleanQuery();

            // Words already turned into a clause, so each unique word is added only once.
            ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>();

            // Dispose the token stream (and the reader it wraps) once tokenizing is done,
            // including when an exception escapes the loop.
            using (TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)))
            {
                ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

                while (ts.IncrementToken())
                {
                    String word = termAtt.Term;

                    // ignore optional stop words
                    if (stop != null && stop.Contains(word))
                    {
                        continue;
                    }

                    // ignore dups: Add returns false when the word is already present
                    if (!already.Add(word))
                    {
                        continue;
                    }

                    // add to query as an optional (SHOULD) clause
                    TermQuery tq = new TermQuery(new Term(field, word));
                    try
                    {
                        tmp.Add(tq, Occur.SHOULD);
                    }
                    catch (BooleanQuery.TooManyClauses)
                    {
                        // fail-safe, just return what we have, not the end of the world
                        break;
                    }
                }
            }

            return tmp;
        }
    }
}