/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using PriorityQueue = Lucene.Net.Util.PriorityQueue;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using TermFreqVector = Lucene.Net.Index.TermFreqVector;
using BooleanClause = Lucene.Net.Search.BooleanClause;
using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
using TermQuery = Lucene.Net.Search.TermQuery;
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using Query = Lucene.Net.Search.Query;
using Hits = Lucene.Net.Search.Hits;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Document = Lucene.Net.Documents.Document;

namespace Similarity.Net
{
    /// <summary>
    /// Generate "more like this" similarity queries.
    /// Based on this mail:
    /// <code>
    /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
    /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
    /// is usually fast enough.  But looking up the DocFreq() of every term in the document is
    /// probably too slow.
    /// 
    /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
    /// or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
    /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
    /// reduce the number of terms under consideration.  Another heuristic is that terms with a
    /// high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
    /// number of characters, not selecting anything less than, e.g., six or seven characters.
    /// With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms
    /// that do a pretty good job of characterizing a document.
    /// 
    /// It all depends on what you're trying to do.  If you're trying to eke out that last percent
    /// of precision and recall regardless of computational difficulty so that you can win a TREC
    /// competition, then the techniques I mention above are useless.  But if you're trying to
    /// provide a "more like this" button on a search results page that does a decent job and has
    /// good performance, such techniques might be useful.
    /// 
    /// An efficient, effective "more-like-this" query generator would be a great contribution, if
    /// anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
    /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
    /// above.  The frequency and length thresholds could be parameters, etc.
    /// 
    /// Doug
    /// 
    /// </code>
    ///
    /// <h3>Initial Usage</h3>
    ///
    /// This class has lots of options to try to make it efficient and flexible.
    /// See the body of {@link #Main Main()} below in the source for real code, or
    /// if you want pseudo code, the simplest possible usage is as follows. The bold
    /// fragment is specific to this class.
    ///
    /// <code>
    /// IndexReader ir = ...
    /// IndexSearcher is = ...
    ///
    /// <b>
    /// MoreLikeThis mlt = new MoreLikeThis(ir);
    /// Reader target = ... // orig source of doc you want to find similarities to
    /// Query query = mlt.Like(target);
    /// </b>
    ///
    /// Hits hits = is.Search(query);
    /// // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
    /// // you ignore the doc if it matches your 'target' document, as it should be similar to itself
    /// </code>
    ///
    /// Thus you:
    /// <ol>
    /// <li>do your normal, Lucene setup for searching,</li>
    /// <li>create a MoreLikeThis,</li>
    /// <li>get the text of the doc you want to find similarities to,</li>
    /// <li>then call one of the Like() calls to generate a similarity query,</li>
    /// <li>call the searcher to find the similar docs.</li>
    /// </ol>
    ///
    /// <h3>More Advanced Usage</h3>
    ///
    /// You may want to use {@link #SetFieldNames SetFieldNames(...)} so you can examine
    /// multiple fields (e.g. body and title) for similarity.
    ///
    /// Depending on the size of your index and the size and makeup of your documents you
    /// may want to call the other set methods to control how the similarity queries are
    /// generated:
    /// <ul>
    /// <li>{@link #SetMinTermFreq SetMinTermFreq(...)}</li>
    /// <li>{@link #SetMinDocFreq SetMinDocFreq(...)}</li>
    /// <li>{@link #SetMinWordLen SetMinWordLen(...)}</li>
    /// <li>{@link #SetMaxWordLen SetMaxWordLen(...)}</li>
    /// <li>{@link #SetMaxQueryTerms SetMaxQueryTerms(...)}</li>
    /// <li>{@link #SetMaxNumTokensParsed SetMaxNumTokensParsed(...)}</li>
    /// <li>{@link #SetStopWords SetStopWords(...)}</li>
    /// </ul>
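    ///
    /// A hypothetical tuning example (the values are illustrative, not recommendations):
    /// <code>
    /// MoreLikeThis mlt = new MoreLikeThis(ir);
    /// mlt.SetFieldNames(new System.String[] { "title", "body" });
    /// mlt.SetMinTermFreq(3);    // term must appear at least 3 times in the source doc
    /// mlt.SetMinDocFreq(10);    // ...and in at least 10 docs of the index
    /// mlt.SetMaxQueryTerms(20); // cap the generated query at 20 terms
    /// </code>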
    ///
    /// Changes: Mark Harwood 29/02/04
    /// Some bugfixing, some refactoring, some optimisation.
    /// - bugfix: RetrieveTerms(int docNum) was not working for indexes without a termvector - added missing code
    /// - bugfix: no significant terms were being created for fields with a termvector, because
    ///   only one occurrence per term/field pair was counted (i.e. frequency info from the TermVector was not included)
    /// - refactor: moved common code into IsNoiseWord()
    /// - optimise: when no termvector support is available, use maxNumTokensParsed to limit the amount of tokenization
    ///
    /// </summary>
    /// <author>David Spencer</author>
    /// <author>Bruce Ritchie</author>
    /// <author>Mark Harwood</author>
    public sealed class MoreLikeThis
    {
        /// <summary>Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        /// <seealso cref="GetMaxNumTokensParsed"/>
        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

        /// <summary>Default analyzer to parse source doc with.</summary>
        /// <seealso cref="GetAnalyzer"/>
        public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();

        /// <summary>Ignore terms with less than this frequency in the source doc.</summary>
        /// <seealso cref="GetMinTermFreq"/>
        /// <seealso cref="SetMinTermFreq"/>
        public const int DEFAULT_MIN_TERM_FREQ = 2;

        /// <summary>Ignore words which do not occur in at least this many docs.</summary>
        /// <seealso cref="GetMinDocFreq"/>
        /// <seealso cref="SetMinDocFreq"/>
        public const int DEFALT_MIN_DOC_FREQ = 5;

        /// <summary>Boost terms in query based on score.</summary>
        /// <seealso cref="IsBoost"/>
        /// <seealso cref="SetBoost"/>
        public const bool DEFAULT_BOOST = false;

        /// <summary>
        /// Default field names. Null is used to specify that the field names should be looked
        /// up at runtime from the provided reader.
        /// </summary>
        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };

        /// <summary>Ignore words less than this length, or if 0 then this has no effect.</summary>
        /// <seealso cref="GetMinWordLen"/>
        /// <seealso cref="SetMinWordLen"/>
        public const int DEFAULT_MIN_WORD_LENGTH = 0;

        /// <summary>Ignore words greater than this length, or if 0 then this has no effect.</summary>
        /// <seealso cref="GetMaxWordLen"/>
        /// <seealso cref="SetMaxWordLen"/>
        public const int DEFAULT_MAX_WORD_LENGTH = 0;

        /// <summary>
        /// Default set of stopwords.
        /// If null, stop words are allowed.
        /// </summary>
        /// <seealso cref="SetStopWords"/>
        /// <seealso cref="GetStopWords"/>
        public static readonly System.Collections.Hashtable DEFAULT_STOP_WORDS = null;

        /// <summary>Current set of stop words.</summary>
        private System.Collections.Hashtable stopWords = DEFAULT_STOP_WORDS;

        /// <summary>Return a Query with no more than this many terms.</summary>
        /// <seealso cref="GetMaxQueryTerms"/>
        /// <seealso cref="SetMaxQueryTerms"/>
        public const int DEFAULT_MAX_QUERY_TERMS = 25;

        /// <summary>Analyzer that will be used to parse the doc.</summary>
        private Analyzer analyzer = DEFAULT_ANALYZER;

        /// <summary>Ignore words less frequent than this.</summary>
        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

        /// <summary>Ignore words which do not occur in at least this many docs.</summary>
        private int minDocFreq = DEFALT_MIN_DOC_FREQ;

        /// <summary>Should we apply a boost to the Query based on the scores?</summary>
        private bool boost = DEFAULT_BOOST;

        /// <summary>Field names we'll analyze.</summary>
        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;

        /// <summary>The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;

        /// <summary>Ignore words if less than this length.</summary>
        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

        /// <summary>Ignore words if greater than this length.</summary>
        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

        /// <summary>Don't return a query longer than this.</summary>
        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

        /// <summary>For idf() calculations.</summary>
        private Lucene.Net.Search.Similarity similarity = new DefaultSimilarity();

        /// <summary>IndexReader to use.</summary>
        private IndexReader ir;

        /// <summary>Constructor requiring an IndexReader.</summary>
        public MoreLikeThis(IndexReader ir)
        {
            this.ir = ir;
        }

        /// <summary>
        /// Returns the analyzer that will be used to parse the source doc. The default analyzer
        /// is {@link #DEFAULT_ANALYZER}.
        /// </summary>
        /// <returns>the analyzer that will be used to parse the source doc.</returns>
        /// <seealso cref="DEFAULT_ANALYZER"/>
        public Analyzer GetAnalyzer()
        {
            return analyzer;
        }

        /// <summary>
        /// Sets the analyzer to use. An analyzer is not required for generating a query with the
        /// {@link #Like(int)} method; all other 'like' methods require an analyzer.
        /// </summary>
        /// <param name="analyzer">the analyzer to use to tokenize text.</param>
        public void SetAnalyzer(Analyzer analyzer)
        {
            this.analyzer = analyzer;
        }
        /// <summary>
        /// Returns the frequency below which terms will be ignored in the source doc. The default
        /// frequency is {@link #DEFAULT_MIN_TERM_FREQ}.
        /// </summary>
        /// <returns>the frequency below which terms will be ignored in the source doc.</returns>
        public int GetMinTermFreq()
        {
            return minTermFreq;
        }

        /// <summary>Sets the frequency below which terms will be ignored in the source doc.</summary>
        /// <param name="minTermFreq">the frequency below which terms will be ignored in the source doc.</param>
        public void SetMinTermFreq(int minTermFreq)
        {
            this.minTermFreq = minTermFreq;
        }

        /// <summary>
        /// Returns the minimum document frequency: words which occur in fewer docs than this are
        /// ignored. The default is {@link #DEFALT_MIN_DOC_FREQ}.
        /// </summary>
        /// <returns>the minimum number of docs a word must occur in to be considered.</returns>
        public int GetMinDocFreq()
        {
            return minDocFreq;
        }

        /// <summary>Sets the minimum document frequency: words which occur in fewer docs than this are ignored.</summary>
        /// <param name="minDocFreq">the minimum number of docs a word must occur in to be considered.</param>
        public void SetMinDocFreq(int minDocFreq)
        {
            this.minDocFreq = minDocFreq;
        }

        /// <summary>
        /// Returns whether to boost terms in the query based on "score" or not. The default is
        /// {@link #DEFAULT_BOOST}.
        /// </summary>
        /// <returns>whether to boost terms in the query based on "score" or not.</returns>
        /// <seealso cref="SetBoost"/>
        public bool IsBoost()
        {
            return boost;
        }

        /// <summary>Sets whether to boost terms in the query based on "score" or not.</summary>
        /// <param name="boost">true to boost terms in the query based on "score", false otherwise.</param>
        /// <seealso cref="IsBoost"/>
        public void SetBoost(bool boost)
        {
            this.boost = boost;
        }

        /// <summary>
        /// Returns the field names that will be used when generating the 'More Like This' query.
        /// The default field names are {@link #DEFAULT_FIELD_NAMES}.
        /// </summary>
        /// <returns>the field names that will be used when generating the 'More Like This' query.</returns>
        public System.String[] GetFieldNames()
        {
            return fieldNames;
        }

        /// <summary>
        /// Sets the field names that will be used when generating the 'More Like This' query.
        /// Set this to null for the field names to be determined at runtime from the IndexReader
        /// provided in the constructor.
        /// </summary>
        /// <param name="fieldNames">the field names that will be used when generating the 'More Like This' query.</param>
        public void SetFieldNames(System.String[] fieldNames)
        {
            this.fieldNames = fieldNames;
        }

        /// <summary>
        /// Returns the minimum word length below which words will be ignored. Set this to 0 for no
        /// minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
        /// </summary>
        /// <returns>the minimum word length below which words will be ignored.</returns>
        public int GetMinWordLen()
        {
            return minWordLen;
        }

        /// <summary>Sets the minimum word length below which words will be ignored.</summary>
        /// <param name="minWordLen">the minimum word length below which words will be ignored.</param>
        public void SetMinWordLen(int minWordLen)
        {
            this.minWordLen = minWordLen;
        }

        /// <summary>
        /// Returns the maximum word length above which words will be ignored. Set this to 0 for no
        /// maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
        /// </summary>
        /// <returns>the maximum word length above which words will be ignored.</returns>
        public int GetMaxWordLen()
        {
            return maxWordLen;
        }

        /// <summary>Sets the maximum word length above which words will be ignored.</summary>
        /// <param name="maxWordLen">the maximum word length above which words will be ignored.</param>
        public void SetMaxWordLen(int maxWordLen)
        {
            this.maxWordLen = maxWordLen;
        }
        /// <summary>
        /// Set the set of stopwords.
        /// Any word in this set is considered "uninteresting" and ignored.
        /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
        /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
        /// </summary>
        /// <param name="stopWords">set of stopwords; if null, stop words are allowed.</param>
        /// <seealso cref="GetStopWords"/>
        public void SetStopWords(System.Collections.Hashtable stopWords)
        {
            this.stopWords = stopWords;
        }

        /// <summary>Get the current stop words being used.</summary>
        /// <seealso cref="SetStopWords"/>
        public System.Collections.Hashtable GetStopWords()
        {
            return stopWords;
        }

        /// <summary>
        /// Returns the maximum number of query terms that will be included in any generated query.
        /// The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
        /// </summary>
        /// <returns>the maximum number of query terms that will be included in any generated query.</returns>
        public int GetMaxQueryTerms()
        {
            return maxQueryTerms;
        }

        /// <summary>Sets the maximum number of query terms that will be included in any generated query.</summary>
        /// <param name="maxQueryTerms">the maximum number of query terms that will be included in any generated query.</param>
        public void SetMaxQueryTerms(int maxQueryTerms)
        {
            this.maxQueryTerms = maxQueryTerms;
        }

        /// <summary>The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>
        public int GetMaxNumTokensParsed()
        {
            return maxNumTokensParsed;
        }

        /// <summary>Sets the maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        public void SetMaxNumTokensParsed(int i)
        {
            maxNumTokensParsed = i;
        }

        /// <summary>Return a query that will return docs like the passed lucene document ID.</summary>
        /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This' query for.</param>
        /// <returns>a query that will return docs like the passed lucene document ID.</returns>
        public Query Like(int docNum)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                System.Collections.ICollection fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                System.Collections.IEnumerator e = fields.GetEnumerator();
                fieldNames = new System.String[fields.Count];
                int index = 0;
                while (e.MoveNext())
                    fieldNames[index++] = (System.String) e.Current;
            }
            return CreateQuery(RetrieveTerms(docNum));
        }

        /// <summary>Return a query that will return docs like the passed file.</summary>
        /// <returns>a query that will return docs like the passed file.</returns>
        public Query Like(System.IO.FileInfo f)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                System.Collections.ICollection fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                System.Collections.IEnumerator e = fields.GetEnumerator();
                fieldNames = new System.String[fields.Count];
                int index = 0;
                while (e.MoveNext())
                    fieldNames[index++] = (System.String) e.Current;
            }
            return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
        }

        /// <summary>Return a query that will return docs like the passed URL.</summary>
        /// <returns>a query that will return docs like the passed URL.</returns>
        public Query Like(System.Uri u)
        {
            return Like(new System.IO.StreamReader(((System.Net.HttpWebRequest) System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
        }

        /// <summary>Return a query that will return docs like the passed stream.</summary>
        /// <returns>a query that will return docs like the passed stream.</returns>
        public Query Like(System.IO.Stream is_Renamed)
        {
            return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
        }
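        // Illustrative usage of the Like(...) family (the names here are hypothetical):
        //
        //   Query q = mlt.Like(docId);          // docId = id of a doc already in the index
        //   Hits similar = searcher.Search(q);  // searcher = an IndexSearcher over the same index
        //
        // Remember to skip the source doc itself when iterating over 'similar'.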
        /// <summary>Return a query that will return docs like the passed Reader.</summary>
        /// <returns>a query that will return docs like the passed Reader.</returns>
        public Query Like(System.IO.StreamReader r)
        {
            return CreateQuery(RetrieveTerms(r));
        }

        /// <summary>Create the "more like" query from a PriorityQueue.</summary>
        private Query CreateQuery(PriorityQueue q)
        {
            BooleanQuery query = new BooleanQuery();
            System.Object cur;
            int qterms = 0;
            float bestScore = 0;

            while (((cur = q.Pop()) != null))
            {
                System.Object[] ar = (System.Object[]) cur;
                TermQuery tq = new TermQuery(new Term((System.String) ar[1], (System.String) ar[0]));

                if (boost)
                {
                    if (qterms == 0)
                    {
                        // the first (highest scoring) term sets the normalization factor,
                        // so it gets boost 1.0 and every later term gets a smaller boost
                        bestScore = (float) ((System.Single) ar[2]);
                    }
                    float myScore = (float) ((System.Single) ar[2]);

                    tq.SetBoost(myScore / bestScore);
                }

                try
                {
                    query.Add(tq, BooleanClause.Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    break;
                }

                qterms++;
                if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
                {
                    break;
                }
            }

            return query;
        }

        /// <summary>Create a PriorityQueue from a word->tf map.</summary>
        /// <param name="words">a map of words keyed on the word (String) with Int objects as the values.</param>
        private PriorityQueue CreateQueue(System.Collections.IDictionary words)
        {
            // have collected all words in doc and their freqs
            int numDocs = ir.NumDocs();
            FreqQ res = new FreqQ(words.Count); // will order words by score

            System.Collections.IEnumerator it = words.Keys.GetEnumerator();
            while (it.MoveNext())
            {
                // for every word
                System.String word = (System.String) it.Current;

                int tf = ((Int) words[word]).x; // term freq in the source doc
                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                System.String topField = fieldNames[0];
                int docFreq = 0;
                for (int i = 0; i < fieldNames.Length; i++)
                {
                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
                    topField = (freq > docFreq) ? fieldNames[i] : topField;
                    docFreq = (freq > docFreq) ? freq : docFreq;
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need the 1st 3 entries, the others are for troubleshooting
                res.Insert(new System.Object[] { word, topField, (float) score, (float) idf, (System.Int32) docFreq, (System.Int32) tf });
            }
            return res;
        }
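        // Worked example for the scoring above (illustrative numbers, not from a real index):
        // with tf = 4 for a term, docFreq = 20 and numDocs = 10000, DefaultSimilarity
        // computes idf = 1 + ln(10000 / (20 + 1)) ~= 7.17, so score ~= 4 * 7.17 ~= 28.7;
        // the FreqQ priority queue then ranks terms by this tf*idf score.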
        /// <summary>Describe the parameters that control how the "more like this" query is formed.</summary>
        public System.String DescribeParams()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
            sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
            sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
            sb.Append("\t" + "fieldNames : \"");
            System.String delim = "";
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                sb.Append(delim).Append(fieldName);
                delim = ", ";
            }
            sb.Append("\"\n");
            sb.Append("\t" + "boost : " + boost + "\n");
            sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
            sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
            return sb.ToString();
        }

        /// <summary>
        /// Test driver.
        /// Pass in "-i INDEX" and then either "-f FILE" or "-url URL".
        /// </summary>
        [STAThread]
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri url = null;
            for (int i = 0; i < a.Length; i++)
            {
                if (a[i].Equals("-i"))
                {
                    indexName = a[++i];
                }
                else if (a[i].Equals("-f"))
                {
                    fn = a[++i];
                }
                else if (a[i].Equals("-url"))
                {
                    url = new System.Uri(a[++i]);
                }
            }

            System.IO.StreamWriter temp_writer;
            temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            temp_writer.AutoFlush = true;
            System.IO.StreamWriter o = temp_writer;
            IndexReader r = IndexReader.Open(indexName);
            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

            MoreLikeThis mlt = new MoreLikeThis(r);

            o.WriteLine("Query generation parameters:");
            o.WriteLine(mlt.DescribeParams());
            o.WriteLine();

            Query query = null;
            if (url != null)
            {
                o.WriteLine("Parsing URL: " + url);
                query = mlt.Like(url);
            }
            else if (fn != null)
            {
                o.WriteLine("Parsing file: " + fn);
                query = mlt.Like(new System.IO.FileInfo(fn));
            }

            o.WriteLine("q: " + query);
            o.WriteLine();
            IndexSearcher searcher = new IndexSearcher(indexName);

            Hits hits = searcher.Search(query);
            int len = hits.Length();
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();
            for (int i = 0; i < System.Math.Min(25, len); i++)
            {
                Document d = hits.Doc(i);
                System.String summary = d.Get("summary");
                o.WriteLine("score  : " + hits.Score(i));
                o.WriteLine("url    : " + d.Get("url"));
                o.WriteLine("\ttitle  : " + d.Get("title"));
                if (summary != null)
                    o.WriteLine("\tsummary: " + summary);
                o.WriteLine();
            }
        }

        /// <summary>Find words for a more-like-this query former.</summary>
        /// <param name="docNum">the id of the lucene document from which to find terms.</param>
        private PriorityQueue RetrieveTerms(int docNum)
        {
            System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                TermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

                // field does not store term vector info, so re-analyze the stored text instead
                if (vector == null)
                {
                    Document d = ir.Document(docNum);
                    System.String[] text = d.GetValues(fieldName);
                    if (text != null)
                    {
                        for (int j = 0; j < text.Length; j++)
                        {
                            // the stored field value is raw document text, not a file name,
                            // so wrap it in a StringReader rather than opening it as a file path
                            AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                        }
                    }
                }
                else
                {
                    AddTermFrequencies(termFreqMap, vector);
                }
            }

            return CreateQueue(termFreqMap);
        }

        /// <summary>Adds terms and frequencies found in vector into the Map termFreqMap.</summary>
        /// <param name="termFreqMap">a Map of terms and their frequencies.</param>
        /// <param name="vector">List of terms and their frequencies for a doc/field.</param>
        private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
        {
            System.String[] terms = vector.GetTerms();
            int[] freqs = vector.GetTermFrequencies();
            for (int j = 0; j < terms.Length; j++)
            {
                System.String term = terms[j];

                if (IsNoiseWord(term))
                {
                    continue;
                }

                // increment frequency
                Int cnt = (Int) termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x = freqs[j];
                }
                else
                {
                    cnt.x += freqs[j];
                }
            }
        }
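        // Illustrative merge behavior (hypothetical numbers): if a doc's "body" vector reports
        // { lucene: 4, search: 2 } and its "title" vector reports { lucene: 1 }, the shared
        // termFreqMap ends up with lucene -> 5 and search -> 2, because RetrieveTerms(int)
        // feeds every field's frequencies into the same map.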
        /// <summary>Adds term frequencies found by tokenizing text from reader into the Map words.</summary>
        /// <param name="r">a source of text to be tokenized.</param>
        /// <param name="termFreqMap">a Map of terms and their frequencies.</param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis.</param>
        private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
        {
            TokenStream ts = analyzer.TokenStream(fieldName, r);
            Lucene.Net.Analysis.Token token;
            int tokenCount = 0;
            while ((token = ts.Next()) != null)
            {
                // for every token
                System.String word = token.TermText();
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = (Int) termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }

        /// <summary>Determines if the passed term is likely to be of interest in "more like" comparisons.</summary>
        /// <param name="term">The word being considered.</param>
        /// <returns>true if the term should be ignored, false if it should be used in further analysis.</returns>
        private bool IsNoiseWord(System.String term)
        {
            int len = term.Length;
            if (minWordLen > 0 && len < minWordLen)
            {
                return true;
            }
            if (maxWordLen > 0 && len > maxWordLen)
            {
                return true;
            }
            if (stopWords != null && stopWords.Contains(term))
            {
                return true;
            }
            return false;
        }
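        // For example (illustrative settings): with minWordLen = 6, maxWordLen = 0 and no stop
        // word set, "the" (length 3) is rejected as noise while "lucene" (length 6) survives.
        // Cheap length checks like this prune candidates before any costly DocFreq() lookups.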
        /// <summary>
        /// Find words for a more-like-this query former.
        /// The result is a priority queue of arrays with one entry for every word in the document.
        /// Each array has 6 elements.
        /// The elements are:
        /// <ol>
        /// <li>The word (String)</li>
        /// <li>The top field that this word comes from (String)</li>
        /// <li>The score for this word (Float)</li>
        /// <li>The IDF value (Float)</li>
        /// <li>The frequency of this word in the index (Integer)</li>
        /// <li>The frequency of this word in the source document (Integer)</li>
        /// </ol>
        /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
        /// This method is exposed so that you can identify the "interesting words" in a document.
        /// For an easier method to call, see {@link #RetrieveInterestingTerms RetrieveInterestingTerms()}.
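        ///
        /// For example (illustrative values only), the top entry popped from the queue might be:
        /// <code>
        /// new System.Object[] { "lucene", "body", 28.7f, 7.2f, 20, 4 }
        /// </code>
        /// i.e. the word, its best field, the tf*idf score, the idf, its doc freq in the index,
        /// and its term freq in the source document.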
        /// </summary>
        /// <param name="r">the reader that has the content of the document.</param>
        /// <returns>
        /// the most interesting words in the document, ordered by score, with the highest scoring,
        /// or best entry, first.
        /// </returns>
        /// <seealso cref="RetrieveInterestingTerms"/>
        public PriorityQueue RetrieveTerms(System.IO.StreamReader r)
        {
            System.Collections.IDictionary words = new System.Collections.Hashtable();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                AddTermFrequencies(r, words, fieldName);
            }
            return CreateQueue(words);
        }

        /// <summary>
        /// Convenience routine to make it easy to return the most interesting words in a document.
        /// More advanced users will call {@link #RetrieveTerms(System.IO.StreamReader) RetrieveTerms()} directly.
        /// </summary>
        /// <param name="r">the source document.</param>
        /// <returns>the most interesting words in the document.</returns>
        /// <seealso cref="RetrieveTerms"/>
        /// <seealso cref="SetMaxQueryTerms"/>
        public System.String[] RetrieveInterestingTerms(System.IO.StreamReader r)
        {
            System.Collections.ArrayList al = new System.Collections.ArrayList(maxQueryTerms);
            PriorityQueue pq = RetrieveTerms(r);
            System.Object cur;
            // have to be careful, RetrieveTerms returns all words, but that's probably not useful
            // to our caller... we just want to return the top words
            int lim = maxQueryTerms;
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[]) cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            return (System.String[]) al.ToArray(typeof(System.String));
        }

        /// <summary>PriorityQueue that orders words by score.</summary>
        private class FreqQ : PriorityQueue
        {
            internal FreqQ(int s)
            {
                Initialize(s);
            }

            override public bool LessThan(System.Object a, System.Object b)
            {
                System.Object[] aa = (System.Object[]) a;
                System.Object[] bb = (System.Object[]) b;
                System.Single fa = (System.Single) aa[2];
                System.Single fb = (System.Single) bb[2];
                return (float) fa > (float) fb;
            }
        }

        /// <summary>Use for frequencies and to avoid renewing Integers.</summary>
        private class Int
        {
            internal int x;

            internal Int()
            {
                x = 1;
            }
        }
    }
}