/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search.Spans;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Class used to extract <see cref="WeightedSpanTerm"/>s from a <see cref="Query"/> based on whether
    /// <see cref="Term"/>s from the <see cref="Query"/> are contained in a supplied <see cref="TokenStream"/>.
    /// </summary>
    public class WeightedSpanTermExtractor
    {
        // Field to restrict term extraction to (interned); null means "match any field".
        private String fieldName;
        // Token stream of the text being highlighted; lazily wrapped in a CachingTokenFilter.
        private TokenStream tokenStream;
        // Per-field in-memory readers built from the token stream; closed via CloseReaders().
        private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10);
        private String defaultField;
        private bool expandMultiTermQuery;
        private bool cachedTokenStream;
        private bool wrapToCaching = true;

        public WeightedSpanTermExtractor()
        {
        }

        /// <param name="defaultField">default field name used when a query term's field matches neither
        /// the restriction field nor this value (interned for reference comparison)</param>
        public WeightedSpanTermExtractor(String defaultField)
        {
            if (defaultField != null)
            {
                this.defaultField = StringHelper.Intern(defaultField);
            }
        }

        /// <summary>
        /// Closes the per-field MemoryIndex readers created during extraction.
        /// Close failures are deliberately ignored (best-effort cleanup).
        /// NOTE(review): the map is not cleared here — presumably each public entry
        /// point is used with a fresh extraction; verify before reusing an instance.
        /// </summary>
        private void CloseReaders()
        {
            ICollection<IndexReader> readerSet = readers.Values;

            foreach (IndexReader reader in readerSet)
            {
                try
                {
                    reader.Close();
                }
                catch (IOException)
                {
                    // alert?
                }
            }
        }

        /// <summary>
        /// Fills a Map with <see cref="WeightedSpanTerm"/>s using the terms from the supplied Query.
        /// </summary>
/// /// Query to extract Terms from /// Map to place created WeightedSpanTerms in private void Extract(Query query, IDictionary terms) { if (query is BooleanQuery) { BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses(); for (int i = 0; i < queryClauses.Length; i++) { if (!queryClauses[i].IsProhibited) { Extract(queryClauses[i].Query, terms); } } } else if (query is PhraseQuery) { PhraseQuery phraseQuery = ((PhraseQuery) query); Term[] phraseQueryTerms = phraseQuery.GetTerms(); SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length]; for (int i = 0; i < phraseQueryTerms.Length; i++) { clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); } int slop = phraseQuery.Slop; int[] positions = phraseQuery.GetPositions(); // add largest position increment to slop if (positions.Length > 0) { int lastPos = positions[0]; int largestInc = 0; int sz = positions.Length; for (int i = 1; i < sz; i++) { int pos = positions[i]; int inc = pos - lastPos; if (inc > largestInc) { largestInc = inc; } lastPos = pos; } if (largestInc > 1) { slop += largestInc; } } bool inorder = slop == 0; SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); sp.Boost = query.Boost; ExtractWeightedSpanTerms(terms, sp); } else if (query is TermQuery) { ExtractWeightedTerms(terms, query); } else if (query is SpanQuery) { ExtractWeightedSpanTerms(terms, (SpanQuery) query); } else if (query is FilteredQuery) { Extract(((FilteredQuery) query).Query, terms); } else if (query is DisjunctionMaxQuery) { foreach (var q in ((DisjunctionMaxQuery) query)) { Extract(q, terms); } } else if (query is MultiTermQuery && expandMultiTermQuery) { MultiTermQuery mtq = ((MultiTermQuery) query); if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) { mtq = (MultiTermQuery) mtq.Clone(); mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE; query = mtq; } FakeReader fReader = new FakeReader(); MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq); if (fReader.Field 
!= null) { IndexReader ir = GetReaderForField(fReader.Field); Extract(query.Rewrite(ir), terms); } } else if (query is MultiPhraseQuery) { MultiPhraseQuery mpq = (MultiPhraseQuery) query; IList termArrays = mpq.GetTermArrays(); int[] positions = mpq.GetPositions(); if (positions.Length > 0) { int maxPosition = positions[positions.Length - 1]; for (int i = 0; i < positions.Length - 1; ++i) { if (positions[i] > maxPosition) { maxPosition = positions[i]; } } var disjunctLists = new List[maxPosition + 1]; int distinctPositions = 0; for (int i = 0; i < termArrays.Count; ++i) { Term[] termArray = termArrays[i]; List disjuncts = disjunctLists[positions[i]]; if (disjuncts == null) { disjuncts = (disjunctLists[positions[i]] = new List(termArray.Length)); ++distinctPositions; } for (int j = 0; j < termArray.Length; ++j) { disjuncts.Add(new SpanTermQuery(termArray[j])); } } int positionGaps = 0; int position = 0; SpanQuery[] clauses = new SpanQuery[distinctPositions]; for (int i = 0; i < disjunctLists.Length; ++i) { List disjuncts = disjunctLists[i]; if (disjuncts != null) { clauses[position++] = new SpanOrQuery(disjuncts.ToArray()); } else { ++positionGaps; } } int slop = mpq.Slop; bool inorder = (slop == 0); SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); sp.Boost = query.Boost; ExtractWeightedSpanTerms(terms, sp); } } } /// /// Fills a Map with s using the terms from the supplied SpanQuery. 
/// /// Map to place created WeightedSpanTerms in /// SpanQuery to extract Terms from private void ExtractWeightedSpanTerms(IDictionary terms, SpanQuery spanQuery) { HashSet fieldNames; if (fieldName == null) { fieldNames = new HashSet(); CollectSpanQueryFields(spanQuery, fieldNames); } else { fieldNames = new HashSet(); fieldNames.Add(fieldName); } // To support the use of the default field name if (defaultField != null) { fieldNames.Add(defaultField); } IDictionary queries = new HashMap(); var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet(); bool mustRewriteQuery = MustRewriteQuery(spanQuery); if (mustRewriteQuery) { foreach (String field in fieldNames) { SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(GetReaderForField(field)); queries[field] = rewrittenQuery; rewrittenQuery.ExtractTerms(nonWeightedTerms); } } else { spanQuery.ExtractTerms(nonWeightedTerms); } List spanPositions = new List(); foreach (String field in fieldNames) { IndexReader reader = GetReaderForField(field); Spans.Spans spans; if (mustRewriteQuery) { spans = queries[field].GetSpans(reader); } else { spans = spanQuery.GetSpans(reader); } // collect span positions while (spans.Next()) { spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1)); } } if (spanPositions.Count == 0) { // no spans found return; } foreach (Term queryTerm in nonWeightedTerms) { if (FieldNameComparator(queryTerm.Field)) { WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text]; if (weightedSpanTerm == null) { weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text); weightedSpanTerm.AddPositionSpans(spanPositions); weightedSpanTerm.SetPositionSensitive(true); terms[queryTerm.Text] = weightedSpanTerm; } else { if (spanPositions.Count > 0) { weightedSpanTerm.AddPositionSpans(spanPositions); } } } } } /// /// Fills a Map with s using the terms from the supplied Query. 
/// /// /// private void ExtractWeightedTerms(IDictionary terms, Query query) { var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet(); query.ExtractTerms(nonWeightedTerms); foreach (Term queryTerm in nonWeightedTerms) { if (FieldNameComparator(queryTerm.Field)) { WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text); terms[queryTerm.Text] = weightedSpanTerm; } } } /// /// Necessary to implement matches for queries against defaultField /// private bool FieldNameComparator(String fieldNameToCheck) { bool rv = fieldName == null || fieldNameToCheck == fieldName || fieldNameToCheck == defaultField; return rv; } private IndexReader GetReaderForField(String field) { if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter)) { tokenStream = new CachingTokenFilter(tokenStream); cachedTokenStream = true; } IndexReader reader = readers[field]; if (reader == null) { MemoryIndex indexer = new MemoryIndex(); indexer.AddField(field, tokenStream); tokenStream.Reset(); IndexSearcher searcher = indexer.CreateSearcher(); reader = searcher.IndexReader; readers[field] = reader; } return reader; } /// /// Creates a Map of WeightedSpanTerms from the given Query and TokenStream. /// /// query that caused hit /// TokenStream of text to be highlighted /// Map containing WeightedSpanTerms public IDictionary GetWeightedSpanTerms(Query query, TokenStream tokenStream) { return GetWeightedSpanTerms(query, tokenStream, null); } /// /// Creates a Map of WeightedSpanTerms from the given Query and TokenStream. 
/// /// query that caused hit /// tokenStream of text to be highlighted /// restricts Term's used based on field name /// Map containing WeightedSpanTerms public IDictionary GetWeightedSpanTerms(Query query, TokenStream tokenStream, String fieldName) { if (fieldName != null) { this.fieldName = StringHelper.Intern(fieldName); } else { this.fieldName = null; } IDictionary terms = new PositionCheckingMap(); this.tokenStream = tokenStream; try { Extract(query, terms); } finally { CloseReaders(); } return terms; } /// /// Creates a Map of WeightedSpanTerms from the given Query and TokenStream. Uses a supplied /// IndexReader to properly Weight terms (for gradient highlighting). /// /// Query that caused hit /// Tokenstream of text to be highlighted /// restricts Term's used based on field name /// to use for scoring /// Map of WeightedSpanTerms with quasi tf/idf scores public IDictionary GetWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName, IndexReader reader) { if (fieldName != null) { this.fieldName = StringHelper.Intern(fieldName); } else { this.fieldName = null; } this.tokenStream = tokenStream; IDictionary terms = new PositionCheckingMap(); Extract(query, terms); int totalNumDocs = reader.NumDocs(); var weightedTerms = terms.Keys; try { foreach (var wt in weightedTerms) { WeightedSpanTerm weightedSpanTerm = terms[wt]; int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.Term)); // docFreq counts deletes if (totalNumDocs < docFreq) { docFreq = totalNumDocs; } // IDF algorithm taken from DefaultSimilarity class float idf = (float) (Math.Log((float) totalNumDocs/(double) (docFreq + 1)) + 1.0); weightedSpanTerm.Weight *= idf; } } finally { CloseReaders(); } return terms; } private void CollectSpanQueryFields(SpanQuery spanQuery, HashSet fieldNames) { if (spanQuery is FieldMaskingSpanQuery) { CollectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames); } else if (spanQuery is SpanFirstQuery) { 
CollectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames); } else if (spanQuery is SpanNearQuery) { foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses()) { CollectSpanQueryFields(clause, fieldNames); } } else if (spanQuery is SpanNotQuery) { CollectSpanQueryFields(((SpanNotQuery) spanQuery).Include, fieldNames); } else if (spanQuery is SpanOrQuery) { foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses()) { CollectSpanQueryFields(clause, fieldNames); } } else { fieldNames.Add(spanQuery.Field); } } private bool MustRewriteQuery(SpanQuery spanQuery) { if (!expandMultiTermQuery) { return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery. } else if (spanQuery is FieldMaskingSpanQuery) { return MustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).MaskedQuery); } else if (spanQuery is SpanFirstQuery) { return MustRewriteQuery(((SpanFirstQuery)spanQuery).Match); } else if (spanQuery is SpanNearQuery) { foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses()) { if (MustRewriteQuery(clause)) { return true; } } return false; } else if (spanQuery is SpanNotQuery) { SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery; return MustRewriteQuery(spanNotQuery.Include) || MustRewriteQuery(spanNotQuery.Exclude); } else if (spanQuery is SpanOrQuery) { foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses()) { if (MustRewriteQuery(clause)) { return true; } } return false; } else if (spanQuery is SpanTermQuery) { return false; } else { return true; } } /// /// This class makes sure that if both position sensitive and insensitive /// versions of the same term are added, the position insensitive one wins. 
/// /// private class PositionCheckingMap : HashMap { public PositionCheckingMap() { } public PositionCheckingMap(IEnumerable> m) { PutAll(m); } public void PutAll(IEnumerable> m) { foreach (var entry in m) { Add(entry.Key, entry.Value); } } public override void Add(K key, WeightedSpanTerm value) { base.Add(key, value); WeightedSpanTerm prev = this[key]; if (prev == null) return; WeightedSpanTerm prevTerm = prev; WeightedSpanTerm newTerm = value; if (!prevTerm.IsPositionSensitive()) { newTerm.SetPositionSensitive(false); } } } public bool ExpandMultiTermQuery { set { this.expandMultiTermQuery = value; } get { return expandMultiTermQuery; } } public bool IsCachedTokenStream { get { return cachedTokenStream; } } public TokenStream TokenStream { get { return tokenStream; } } /// /// By default, s that are not of the type /// are wrapped in a to /// impl and you don't want it to be wrapped, set this to /// false. /// public void SetWrapIfNotCachingTokenFilter(bool wrap) { this.wrapToCaching = wrap; } /// /// A fake IndexReader class to extract the field from a MultiTermQuery /// protected internal sealed class FakeReader : FilterIndexReader { private static IndexReader EMPTY_MEMORY_INDEX_READER = new MemoryIndex().CreateSearcher().IndexReader; public String Field { get; private set; } protected internal FakeReader() : base(EMPTY_MEMORY_INDEX_READER) { } public override TermEnum Terms(Term t) { // only set first fieldname, maybe use a Set? if (t != null && Field == null) Field = t.Field; return base.Terms(t); } } } }