/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using BooleanClause = Lucene.Net.Search.BooleanClause;
using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
using TermQuery = Lucene.Net.Search.TermQuery;
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using Query = Lucene.Net.Search.Query;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Document = Lucene.Net.Documents.Document;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Search.Similar
{
    /// <summary>
    /// Generate "more like this" similarity queries.
    /// Based on this mail:
    /// <code>
    /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
    /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
    /// is usually fast enough.  But looking up the DocFreq() of every term in the document is
    /// probably too slow.
    /// 
    /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
    /// or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
    /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
    /// reduce the number of terms under consideration.  Another heuristic is that terms with a
    /// high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
    /// number of characters, not selecting anything less than, e.g., six or seven characters.
    /// With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms
    /// that do a pretty good job of characterizing a document.
    /// 
    /// It all depends on what you're trying to do.  If you're trying to eke out that last percent
    /// of precision and recall regardless of computational difficulty so that you can win a TREC
    /// competition, then the techniques I mention above are useless.  But if you're trying to
    /// provide a "more like this" button on a search results page that does a decent job and has
    /// good performance, such techniques might be useful.
    /// 
    /// An efficient, effective "more-like-this" query generator would be a great contribution, if
    /// anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
    /// text), an Analyzer, and return a set of representative terms using heuristics like those
    /// above.  The frequency and length thresholds could be parameters, etc.
    /// 
    /// Doug
    /// 
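    /// </code>
    ///
    /// As a concrete illustration of the pruning heuristics described above, here is a
    /// minimal sketch (not part of this class; the threshold values and the "contents"
    /// field name are assumed examples):
    /// <code>
    /// // skip low-tf terms and short words before paying for a DocFreq() lookup per term
    /// int tfThreshold = 2, minLen = 6;
    /// foreach (var e in termFreqs) // termFreqs: word -> tf map you built by tokenizing the doc
    /// {
    ///     if (e.Value < tfThreshold || e.Key.Length < minLen) continue;
    ///     int df = ir.DocFreq(new Term("contents", e.Key)); // few enough calls now
    ///     // ... score the surviving terms by tf*idf ...
    /// }
    /// </code>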
    ///
    /// <h3>Initial Usage</h3>
    ///
    /// This class has lots of options to try to make it efficient and flexible.
    /// See the body of <see cref="Main"/> below in the source for real code, or
    /// if you want pseudo code, the simplest possible usage is as follows
    /// (only the MoreLikeThis lines are specific to this class):
    ///
    /// <code>
    /// IndexReader ir = ...
    /// IndexSearcher searcher = ...
    ///
    /// MoreLikeThis mlt = new MoreLikeThis(ir);
    /// TextReader target = ... // orig source of doc you want to find similarities to
    /// Query query = mlt.Like(target);
    ///
    /// TopDocs hits = searcher.Search(query, 25);
    /// // now the usual iteration through 'hits' - the only thing to watch for is to make sure
    /// // you ignore the doc if it matches your 'target' document, as it should be similar to itself
    /// </code>
    ///
    /// Thus you:
    /// <ol>
    /// <li>do your normal, Lucene setup for searching,</li>
    /// <li>create a MoreLikeThis,</li>
    /// <li>get the text of the doc you want to find similarities to,</li>
    /// <li>then call one of the Like() calls to generate a similarity query,</li>
    /// <li>call the searcher to find the similar docs (see the sketch below for skipping the source doc).</li>
    /// </ol>
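    ///
    /// In that last step, remember to skip the source document itself: it will normally
    /// be the best match for its own similarity query. A minimal sketch (targetDocId is
    /// an assumed variable holding the Lucene doc id of the source document):
    /// <code>
    /// foreach (ScoreDoc sd in searcher.Search(query, 25).ScoreDocs)
    /// {
    ///     if (sd.Doc == targetDocId) continue; // a doc is always similar to itself
    ///     Document doc = searcher.Doc(sd.Doc);
    ///     // ... display or collect doc ...
    /// }
    /// </code>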
    ///
    /// <h3>More Advanced Usage</h3>
    ///
    /// You may want to use <see cref="SetFieldNames"/> so you can examine
    /// multiple fields (e.g. body and title) for similarity.
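    /// For example, a minimal sketch (the field names "body" and "title" are just
    /// placeholders for whatever fields your index actually uses):
    /// <code>
    /// MoreLikeThis mlt = new MoreLikeThis(ir);
    /// mlt.SetFieldNames(new[] { "body", "title" }); // examine both fields for similarity
    /// Query query = mlt.Like(target);
    /// </code>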
    ///
    /// Depending on the size of your index and the size and makeup of your documents you
    /// may want to call the other set methods to control how the similarity queries are
    /// generated (a configuration sketch follows the list):
    /// <ul>
    /// <li><see cref="MinTermFreq"/></li>
    /// <li><see cref="MinDocFreq"/></li>
    /// <li><see cref="MaxDocFreq"/></li>
    /// <li><see cref="SetMaxDocFreqPct"/></li>
    /// <li><see cref="MinWordLen"/></li>
    /// <li><see cref="MaxWordLen"/></li>
    /// <li><see cref="MaxQueryTerms"/></li>
    /// <li><see cref="MaxNumTokensParsed"/></li>
    /// <li><see cref="SetStopWords"/></li>
    /// </ul>
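    ///
    /// A hedged configuration sketch (the values shown are arbitrary examples, not
    /// recommendations; tune them against your own index):
    /// <code>
    /// MoreLikeThis mlt = new MoreLikeThis(ir);
    /// mlt.MinTermFreq = 1;       // accept terms occurring only once in the source doc
    /// mlt.MinDocFreq = 2;        // ...but require them to appear in at least 2 docs
    /// mlt.MaxQueryTerms = 10;    // keep the generated query small
    /// mlt.SetStopWords(stopSet); // stopSet: a set of stop words you supply
    /// Query query = mlt.Like(target);
    /// </code>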
///
    /// Changes: Mark Harwood 29/02/04
    /// Some bugfixing, some refactoring, some optimisation.
    /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
    /// - bugfix: No significant terms being created for fields with a termvector - because
    ///   code was only counting one occurrence per term/field pair in calculations (i.e. not including frequency info from TermVector)
    /// - refactor: moved common code into isNoiseWord()
    /// - optimise: when no termvector support available - used maxNumTokensParsed to limit amount of tokenization
    ///
    /// </summary>
    public sealed class MoreLikeThis
    {
        /// <summary>
        /// Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
        /// </summary>
        /// <seealso cref="MaxNumTokensParsed"/>
        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

        /// <summary>Default analyzer to parse source doc with.</summary>
        /// <seealso cref="Analyzer"/>
        public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);

        /// <summary>Ignore terms with less than this frequency in the source doc.</summary>
        /// <seealso cref="MinTermFreq"/>
        public const int DEFAULT_MIN_TERM_FREQ = 2;

        /// <summary>Ignore words which do not occur in at least this many docs.</summary>
        /// <seealso cref="MinDocFreq"/>
        public const int DEFAULT_MIN_DOC_FREQ = 5;

        /// <summary>Ignore words which occur in more than this many docs.</summary>
        /// <seealso cref="MaxDocFreq"/>
        public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue;

        /// <summary>Boost terms in query based on score.</summary>
        /// <seealso cref="Boost"/>
        public const bool DEFAULT_BOOST = false;

        /// <summary>
        /// Default field names. Null is used to specify that the field names should be looked
        /// up at runtime from the provided reader.
        /// </summary>
        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };

        /// <summary>Ignore words less than this length, or if 0 then this has no effect.</summary>
        /// <seealso cref="MinWordLen"/>
        public const int DEFAULT_MIN_WORD_LENGTH = 0;

        /// <summary>Ignore words greater than this length, or if 0 then this has no effect.</summary>
        /// <seealso cref="MaxWordLen"/>
        public const int DEFAULT_MAX_WORD_LENGTH = 0;

        /// <summary>
        /// Default set of stopwords.
        /// If null, it means to allow stop words.
        /// </summary>
        /// <seealso cref="SetStopWords"/>
        /// <seealso cref="GetStopWords"/>
        public static readonly ISet<string> DEFAULT_STOP_WORDS = null;

        /// <summary>Current set of stop words.</summary>
        private ISet<string> stopWords = DEFAULT_STOP_WORDS;

        /// <summary>Return a Query with no more than this many terms.</summary>
        /// <seealso cref="MaxQueryTerms"/>
        public const int DEFAULT_MAX_QUERY_TERMS = 25;

        /// <summary>Analyzer that will be used to parse the doc.</summary>
        private Analyzer analyzer = DEFAULT_ANALYZER;

        /// <summary>Ignore words less frequent than this.</summary>
        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

        /// <summary>Ignore words which do not occur in at least this many docs.</summary>
        private int minDocFreq = DEFAULT_MIN_DOC_FREQ;

        /// <summary>Ignore words which occur in more than this many docs.</summary>
        private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;

        /// <summary>Should we apply a boost to the Query based on the scores?</summary>
        private bool boost = DEFAULT_BOOST;

        /// <summary>Field names we'll analyze.</summary>
        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;

        /// <summary>The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;

        /// <summary>Ignore words if less than this length.</summary>
        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

        /// <summary>Ignore words if greater than this length.</summary>
        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

        /// <summary>Don't return a query longer than this.</summary>
        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

        /// <summary>For idf() calculations.</summary>
        private Lucene.Net.Search.Similarity similarity = null;

        /// <summary>IndexReader to use.</summary>
        private IndexReader ir;

        /// <summary>Boost factor to use when boosting the terms.</summary>
        private float boostFactor = 1;

        /// <summary>
        /// Gets or sets the boost factor used when boosting terms.
        /// </summary>
        public float BoostFactor
        {
            get { return boostFactor; }
            set { this.boostFactor = value; }
        }

        /// <summary>Constructor requiring an IndexReader.</summary>
        public MoreLikeThis(IndexReader ir) : this(ir, new DefaultSimilarity())
        {
        }

        public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim)
        {
            this.ir = ir;
            this.similarity = sim;
        }

        public Similarity Similarity
        {
            get { return similarity; }
            set { this.similarity = value; }
        }

        /// <summary>
        /// Gets or sets the analyzer used to parse source doc with. The default analyzer
        /// is <see cref="DEFAULT_ANALYZER"/>.
        /// An analyzer is not required for generating a query with the <see cref="Like(int)"/>
        /// method; all other 'like' methods require an analyzer.
        /// </summary>
        /// <value>the analyzer that will be used to parse source doc with.</value>
        public Analyzer Analyzer
        {
            get { return analyzer; }
            set { this.analyzer = value; }
        }

        /// <summary>
        /// Gets or sets the frequency below which terms will be ignored in the source doc. The default
        /// frequency is <see cref="DEFAULT_MIN_TERM_FREQ"/>.
        /// </summary>
        public int MinTermFreq
        {
            get { return minTermFreq; }
            set { this.minTermFreq = value; }
        }

        /// <summary>
        /// Gets or sets the minimum number of documents in which a word must occur to be considered;
        /// words occurring in fewer docs are ignored. The default is <see cref="DEFAULT_MIN_DOC_FREQ"/>.
        /// </summary>
        public int MinDocFreq
        {
            get { return minDocFreq; }
            set { this.minDocFreq = value; }
        }

        /// <summary>
        /// Gets or sets the maximum number of documents in which words may still appear.
        /// Words that appear in more than this many docs will be ignored. The default is
        /// <see cref="DEFAULT_MAX_DOC_FREQ"/>.
        /// </summary>
        public int MaxDocFreq
        {
            get { return this.maxDocFreq; }
            set { this.maxDocFreq = value; }
        }

        /// <summary>
        /// Set the maximum percentage in which words may still appear. Words that appear
        /// in more than this percentage of all docs will be ignored.
        /// </summary>
        /// <param name="maxPercentage">
        /// the maximum percentage of documents (0-100) that a term may appear
        /// in to be still considered relevant
        /// </param>
        public void SetMaxDocFreqPct(int maxPercentage)
        {
            this.maxDocFreq = maxPercentage * ir.NumDocs() / 100;
        }

        /// <summary>
        /// Gets or sets a boolean indicating whether to boost terms in query based
        /// on "score" or not. The default is <see cref="DEFAULT_BOOST"/>.
        /// </summary>
        public bool Boost
        {
            get { return boost; }
            set { this.boost = value; }
        }

        /// <summary>
        /// Returns the field names that will be used when generating the 'More Like This' query.
        /// The default field names that will be used are <see cref="DEFAULT_FIELD_NAMES"/>.
        /// </summary>
        /// <returns>the field names that will be used when generating the 'More Like This' query.</returns>
        public System.String[] GetFieldNames()
        {
            return fieldNames;
        }

        /// <summary>
        /// Sets the field names that will be used when generating the 'More Like This' query.
        /// Set this to null for the field names to be determined at runtime from the IndexReader
        /// provided in the constructor.
        /// </summary>
        /// <param name="fieldNames">
        /// the field names that will be used when generating the 'More Like This' query.
        /// </param>
        public void SetFieldNames(System.String[] fieldNames)
        {
            this.fieldNames = fieldNames;
        }

        /// <summary>
        /// Gets or sets the minimum word length below which words will be ignored.
        /// Set this to 0 for no minimum word length. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
        /// </summary>
        public int MinWordLen
        {
            get { return minWordLen; }
            set { this.minWordLen = value; }
        }

        /// <summary>
        /// Gets or sets the maximum word length above which words will be ignored. Set this to 0 for no
        /// maximum word length. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
        /// </summary>
        public int MaxWordLen
        {
            get { return maxWordLen; }
            set { this.maxWordLen = value; }
        }

        /// <summary>
        /// Set the set of stopwords.
        /// Any word in this set is considered "uninteresting" and ignored.
        /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
        /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
        /// </summary>
        /// <param name="stopWords">set of stopwords, if null it means to allow stop words</param>
        /// <seealso cref="GetStopWords"/>
        public void SetStopWords(ISet<string> stopWords)
        {
            this.stopWords = stopWords;
        }

        /// <summary>Get the current stop words being used.</summary>
        /// <seealso cref="SetStopWords"/>
        public ISet<string> GetStopWords()
        {
            return stopWords;
        }

        /// <summary>
        /// Gets or sets the maximum number of query terms that will be included in any generated query.
        /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
        /// </summary>
        public int MaxQueryTerms
        {
            get { return maxQueryTerms; }
            set { this.maxQueryTerms = value; }
        }

        /// <summary>
        /// Gets or sets the maximum number of tokens to parse in each example doc
        /// field that is not stored with TermVector support.
        /// </summary>
        /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>
        public int MaxNumTokensParsed
        {
            get { return maxNumTokensParsed; }
            set { maxNumTokensParsed = value; }
        }

        /// <summary>Return a query that will return docs like the passed lucene document ID.</summary>
        /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This' query for.</param>
        /// <returns>a query that will return docs like the passed lucene document ID.</returns>
        public Query Like(int docNum)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return CreateQuery(RetrieveTerms(docNum));
        }

        /// <summary>Return a query that will return docs like the passed file.</summary>
        /// <returns>a query that will return docs like the passed file.</returns>
        public Query Like(System.IO.FileInfo f)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
        }

        /// <summary>Return a query that will return docs like the passed URL.</summary>
        /// <returns>a query that will return docs like the passed URL.</returns>
        public Query Like(System.Uri u)
        {
            return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
        }

        /// <summary>Return a query that will return docs like the passed stream.</summary>
        /// <returns>a query that will return docs like the passed stream.</returns>
        public Query Like(System.IO.Stream is_Renamed)
        {
            return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
        }

        /// <summary>Return a query that will return docs like the passed Reader.</summary>
        /// <returns>a query that will return docs like the passed Reader.</returns>
        public Query Like(System.IO.TextReader r)
        {
            return CreateQuery(RetrieveTerms(r));
        }

        /// <summary>Create the More like query from a PriorityQueue.</summary>
        private Query CreateQuery(PriorityQueue<object[]> q)
        {
            BooleanQuery query = new BooleanQuery();
            System.Object cur;
            int qterms = 0;
            float bestScore = 0;

            while (((cur = q.Pop()) != null))
            {
                System.Object[] ar = (System.Object[])cur;
                TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0]));

                if (boost)
                {
                    if (qterms == 0)
                    {
                        bestScore = (float)ar[2];
                    }
                    float myScore = (float)ar[2];

                    tq.Boost = boostFactor * myScore / bestScore;
                }

                try
                {
                    query.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    break;
                }

                qterms++;
                if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
                {
                    break;
                }
            }

            return query;
        }

        /// <summary>Create a PriorityQueue from a word->tf map.</summary>
        /// <param name="words">a map of words keyed on the word (String) with Int objects as the values.</param>
        private PriorityQueue<object[]> CreateQueue(IDictionary<string, Int> words)
        {
            // have collected all words in doc and their freqs
            int numDocs = ir.NumDocs();
            FreqQ res = new FreqQ(words.Count); // will order words by score

            var it = words.Keys.GetEnumerator();
            while (it.MoveNext())
            {
                // for every word
                System.String word = it.Current;

                int tf = words[word].x; // term freq in the source doc
                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                System.String topField = fieldNames[0];
                int docFreq = 0;
                for (int i = 0; i < fieldNames.Length; i++)
                {
                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
                    topField = (freq > docFreq) ? fieldNames[i] : topField;
                    docFreq = (freq > docFreq) ? freq : docFreq;
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > maxDocFreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
            }
            return res;
        }

        /// <summary>Describe the parameters that control how the "more like this" query is formed.</summary>
        public System.String DescribeParams()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
            sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
            sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
            sb.Append("\t" + "fieldNames : ");
            System.String delim = "";
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                sb.Append(delim).Append(fieldName);
                delim = ", ";
            }
            sb.Append("\n");
            sb.Append("\t" + "boost : " + boost + "\n");
            sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
            sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
            return sb.ToString();
        }

        /// <summary>
        /// Test driver.
        /// Pass in "-i INDEX" and then either "-f FILE" or "-url URL".
        /// </summary>
        [STAThread]
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri url = null;
            for (int i = 0; i < a.Length; i++)
            {
                if (a[i].Equals("-i"))
                {
                    indexName = a[++i];
                }
                else if (a[i].Equals("-f"))
                {
                    fn = a[++i];
                }
                else if (a[i].Equals("-url"))
                {
                    url = new System.Uri(a[++i]);
                }
            }

            System.IO.StreamWriter temp_writer;
            temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            temp_writer.AutoFlush = true;
            System.IO.StreamWriter o = temp_writer;
            FSDirectory dir = FSDirectory.Open(new DirectoryInfo(indexName));
            IndexReader r = IndexReader.Open(dir, true);
            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

            MoreLikeThis mlt = new MoreLikeThis(r);

            o.WriteLine("Query generation parameters:");
            o.WriteLine(mlt.DescribeParams());
            o.WriteLine();

            Query query = null;
            if (url != null)
            {
                o.WriteLine("Parsing URL: " + url);
                query = mlt.Like(url);
            }
            else if (fn != null)
            {
                o.WriteLine("Parsing file: " + fn);
                query = mlt.Like(new System.IO.FileInfo(fn));
            }

            o.WriteLine("q: " + query);
            o.WriteLine();
            IndexSearcher searcher = new IndexSearcher(dir, true);

            TopDocs hits = searcher.Search(query, null, 25);
            int len = hits.TotalHits;
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();
            ScoreDoc[] scoreDocs = hits.ScoreDocs;
            for (int i = 0; i < System.Math.Min(25, len); i++)
            {
                Document d = searcher.Doc(scoreDocs[i].Doc);
                System.String summary = d.Get("summary");
                o.WriteLine("score : " + scoreDocs[i].Score);
                o.WriteLine("url : " + d.Get("url"));
                o.WriteLine("\ttitle : " + d.Get("title"));
                if (summary != null)
                    o.WriteLine("\tsummary: " + d.Get("summary"));
                o.WriteLine();
            }
        }

        /// <summary>Find words for a more-like-this query former.</summary>
        /// <param name="docNum">the id of the lucene document from which to find terms</param>
        private PriorityQueue<object[]> RetrieveTerms(int docNum)
        {
            IDictionary<string, Int> termFreqMap = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

                // field does not store term vector info
                if (vector == null)
                {
                    Document d = ir.Document(docNum);
                    System.String[] text = d.GetValues(fieldName);
                    if (text != null)
                    {
                        for (int j = 0; j < text.Length; j++)
                        {
                            AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                        }
                    }
                }
                else
                {
                    AddTermFrequencies(termFreqMap, vector);
                }
            }

            return CreateQueue(termFreqMap);
        }

        /// <summary>Adds terms and frequencies found in vector into the Map termFreqMap.</summary>
        /// <param name="termFreqMap">a Map of terms and their frequencies</param>
        /// <param name="vector">List of terms and their frequencies for a doc/field</param>
        private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector)
        {
            System.String[] terms = vector.GetTerms();
            int[] freqs = vector.GetTermFrequencies();
            for (int j = 0; j < terms.Length; j++)
            {
                System.String term = terms[j];

                if (IsNoiseWord(term))
                {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x = freqs[j];
                }
                else
                {
                    cnt.x += freqs[j];
                }
            }
        }

        /// <summary>Adds term frequencies found by tokenizing text from reader into the Map words.</summary>
        /// <param name="r">a source of text to be tokenized</param>
        /// <param name="termFreqMap">a Map of terms and their frequencies</param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis</param>
        private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts = analyzer.TokenStream(fieldName, r);
            int tokenCount = 0;
            // for every token
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }

        /// <summary>Determines if the passed term is likely to be of interest in "more like" comparisons.</summary>
        /// <param name="term">The word being considered</param>
        /// <returns>true if the term should be ignored, false if it should be used in further analysis</returns>
        private bool IsNoiseWord(System.String term)
        {
            int len = term.Length;
            if (minWordLen > 0 && len < minWordLen)
            {
                return true;
            }
            if (maxWordLen > 0 && len > maxWordLen)
            {
                return true;
            }
            if (stopWords != null && stopWords.Contains(term))
            {
                return true;
            }
            return false;
        }

        /// <summary>
        /// Find words for a more-like-this query former.
        /// The result is a priority queue of arrays with one entry for every word in the document.
        /// Each array has 6 elements.
        /// The elements are:
        /// <ol>
        /// <li>The word (String)</li>
        /// <li>The top field that this word comes from (String)</li>
        /// <li>The score for this word (Float)</li>
        /// <li>The IDF value (Float)</li>
        /// <li>The frequency of this word in the index (Integer)</li>
        /// <li>The frequency of this word in the source document (Integer)</li>
        /// </ol>
        /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
        /// This method is exposed so that you can identify the "interesting words" in a document.
        /// For an easier method to call see <see cref="RetrieveInterestingTerms(System.IO.TextReader)"/>.
        /// </summary>
        /// <param name="r">the reader that has the content of the document</param>
        /// <returns>
        /// the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
        /// </returns>
        /// <seealso cref="RetrieveInterestingTerms(System.IO.TextReader)"/>
        public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
        {
            IDictionary<string, Int> words = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                AddTermFrequencies(r, words, fieldName);
            }
            return CreateQueue(words);
        }

        public System.String[] RetrieveInterestingTerms(int docNum)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(docNum);
            System.Object cur;
            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
            // we just want to return the top words
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            //System.String[] res = new System.String[al.Count];
            //return al.toArray(res);
            return al.Select(x => x.ToString()).ToArray();
        }

        /// <summary>
        /// Convenience routine to make it easy to return the most interesting words in a document.
        /// More advanced users will call <see cref="RetrieveTerms(System.IO.TextReader)"/> directly.
        /// </summary>
        /// <param name="r">the source document</param>
        /// <returns>the most interesting words in the document</returns>
        /// <seealso cref="RetrieveTerms(System.IO.TextReader)"/>
        /// <seealso cref="MaxQueryTerms"/>
        public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(r);
            System.Object cur;
            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
            // we just want to return the top words
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            //System.String[] res = new System.String[al.Count];
            // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
            return al.Select(x => x.ToString()).ToArray();
        }

        /// <summary>PriorityQueue that orders words by score.</summary>
        private class FreqQ : PriorityQueue<System.Object[]>
        {
            internal FreqQ(int s)
            {
                Initialize(s);
            }

            public override bool LessThan(System.Object[] aa, System.Object[] bb)
            {
                float fa = (float)aa[2];
                float fb = (float)bb[2];
                return fa > fb;
            }
        }

        /// <summary>Use for frequencies and to avoid renewing Integers.</summary>
        private class Int
        {
            internal int x;

            internal Int()
            {
                x = 1;
            }
        }
    }
}