/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Term = Lucene.Net.Index.Term;
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using Query = Lucene.Net.Search.Query;
using TermQuery = Lucene.Net.Search.TermQuery;
using BooleanClause = Lucene.Net.Search.BooleanClause;

namespace Similarity.Net
{
    /// <summary>
    /// Simple similarity measures.
    /// </summary>
    public sealed class SimilarityQueries
    {
        /// <summary>
        /// Private constructor: the class only exposes static members and is
        /// not meant to be instantiated.
        /// </summary>
        private SimilarityQueries()
        {
        }

        /// <summary>
        /// Simple similarity query generator.
        /// Takes every unique word of <paramref name="body"/> and forms a boolean
        /// query in which all words are optional.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Once you have the query, use it to query your <see cref="IndexSearcher"/>
        /// for similar documents. The only caveat is that the first hit returned
        /// <b>should be</b> your source document - you will then need to ignore it.
        /// </para>
        /// <para>
        /// So, if you have a code fragment like this:
        /// <code>
        /// Query q = SimilarityQueries.FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// the query returned, in string form, will be along the lines of
        /// <c>(i use lucene to search fast searchers are good)</c>
        /// (the exact terms depend on what the analyzer emits).
        /// </para>
        /// <para>
        /// The philosophy behind this method is "two documents are similar if they
        /// share lots of words". Note that behind the scenes, Lucene's scoring
        /// algorithm will tend to give two documents a higher similarity score if
        /// they share more uncommon words.
        /// </para>
        /// <para>
        /// This method is fail-safe in that if a long <paramref name="body"/> is
        /// passed in and <c>BooleanQuery.Add()</c> (used internally) throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the query as built so far
        /// is returned.
        /// </para>
        /// </remarks>
        /// <param name="body">the body of the document you want to find similar documents to</param>
        /// <param name="a">the analyzer to use to parse the body</param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
        /// <param name="stop">optional set of stop words to ignore; may be <c>null</c></param>
        /// <returns>a query with all unique words in <paramref name="body"/></returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="body"/> or <paramref name="a"/> is <c>null</c>.
        /// </exception>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            if (body == null)
            {
                throw new ArgumentNullException("body");
            }
            if (a == null)
            {
                throw new ArgumentNullException("a");
            }

            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            BooleanQuery tmp = new BooleanQuery();
            // Words already turned into a clause, so each term is added only once.
            System.Collections.Hashtable already = new System.Collections.Hashtable();

            try
            {
                Lucene.Net.Analysis.Token t;
                while ((t = ts.Next()) != null)
                {
                    System.String word = t.TermText();

                    // ignore optional stop words
                    if (stop != null && stop.Contains(word))
                    {
                        continue;
                    }

                    // ignore duplicates
                    if (already.Contains(word))
                    {
                        continue;
                    }
                    already.Add(word, word);

                    // add the word to the query as an optional clause
                    TermQuery tq = new TermQuery(new Term(field, word));
                    try
                    {
                        tmp.Add(tq, BooleanClause.Occur.SHOULD);
                    }
                    catch (BooleanQuery.TooManyClauses)
                    {
                        // fail-safe: return what we have so far, not the end of the world
                        break;
                    }
                }
            }
            finally
            {
                // Release the token stream even when tokenizing throws or we bail out early.
                ts.Close();
            }

            return tmp;
        }
    }
}