/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search.Spans;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Search.Highlight
{
/// <summary>
/// Class used to extract <see cref="WeightedSpanTerm"/>s from a <see cref="Query"/> based on whether
/// <see cref="Term"/>s from the <see cref="Query"/> are contained in a supplied <see cref="TokenStream"/>.
/// </summary>
public class WeightedSpanTermExtractor
{
    private String fieldName;
    private TokenStream tokenStream;
    // Per-field in-memory readers built lazily from the token stream; closed after each extraction.
    private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10);
    private String defaultField;
    private bool expandMultiTermQuery;
    private bool cachedTokenStream;
    private bool wrapToCaching = true;

    public WeightedSpanTermExtractor()
    {
    }

    /// <param name="defaultField">field name to also accept when matching query terms (interned)</param>
    public WeightedSpanTermExtractor(String defaultField)
    {
        if (defaultField != null)
        {
            this.defaultField = StringHelper.Intern(defaultField);
        }
    }

    /// <summary>
    /// Closes every cached per-field reader. Close failures are deliberately swallowed:
    /// this runs in finally blocks and must not mask the extraction result.
    /// </summary>
    private void CloseReaders()
    {
        ICollection<IndexReader> readerSet = readers.Values;

        foreach (IndexReader reader in readerSet)
        {
            try
            {
                reader.Close();
            }
            catch (IOException)
            {
                // best-effort close; alert?
            }
        }
    }

    /// <summary>
    /// Fills a Map with <see cref="WeightedSpanTerm"/>s using the terms from the supplied Query.
    /// Recursively decomposes boolean/phrase/disjunction queries into span queries so that
    /// position information can be recovered.
    /// </summary>
    /// <param name="query">Query to extract Terms from</param>
    /// <param name="terms">Map to place created WeightedSpanTerms in</param>
    private void Extract(Query query, IDictionary<String, WeightedSpanTerm> terms)
    {
        if (query is BooleanQuery)
        {
            BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses();

            for (int i = 0; i < queryClauses.Length; i++)
            {
                // Prohibited (NOT) clauses never produce highlightable terms.
                if (!queryClauses[i].IsProhibited)
                {
                    Extract(queryClauses[i].Query, terms);
                }
            }
        }
        else if (query is PhraseQuery)
        {
            // Convert the phrase into an equivalent SpanNearQuery so positions are honored.
            PhraseQuery phraseQuery = ((PhraseQuery) query);
            Term[] phraseQueryTerms = phraseQuery.GetTerms();
            SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
            for (int i = 0; i < phraseQueryTerms.Length; i++)
            {
                clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
            }
            int slop = phraseQuery.Slop;
            int[] positions = phraseQuery.GetPositions();
            // add largest position increment to slop
            if (positions.Length > 0)
            {
                int lastPos = positions[0];
                int largestInc = 0;
                int sz = positions.Length;
                for (int i = 1; i < sz; i++)
                {
                    int pos = positions[i];
                    int inc = pos - lastPos;
                    if (inc > largestInc)
                    {
                        largestInc = inc;
                    }
                    lastPos = pos;
                }
                if (largestInc > 1)
                {
                    slop += largestInc;
                }
            }

            // Zero slop means the terms must appear in order.
            bool inorder = slop == 0;

            SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
            sp.Boost = query.Boost;
            ExtractWeightedSpanTerms(terms, sp);
        }
        else if (query is TermQuery)
        {
            ExtractWeightedTerms(terms, query);
        }
        else if (query is SpanQuery)
        {
            ExtractWeightedSpanTerms(terms, (SpanQuery) query);
        }
        else if (query is FilteredQuery)
        {
            Extract(((FilteredQuery) query).Query, terms);
        }
        else if (query is DisjunctionMaxQuery)
        {
            foreach (var q in ((DisjunctionMaxQuery) query))
            {
                Extract(q, terms);
            }
        }
        else if (query is MultiTermQuery && expandMultiTermQuery)
        {
            MultiTermQuery mtq = ((MultiTermQuery) query);
            if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
            {
                // Clone before mutating the rewrite method so the caller's query is untouched.
                mtq = (MultiTermQuery) mtq.Clone();
                mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                query = mtq;
            }
            // Rewrite once against a FakeReader purely to discover which field the query targets.
            FakeReader fReader = new FakeReader();
            MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
            if (fReader.Field != null)
            {
                IndexReader ir = GetReaderForField(fReader.Field);
                Extract(query.Rewrite(ir), terms);
            }
        }
        else if (query is MultiPhraseQuery)
        {
            MultiPhraseQuery mpq = (MultiPhraseQuery) query;

            IList<Term[]> termArrays = mpq.GetTermArrays();
            int[] positions = mpq.GetPositions();
            if (positions.Length > 0)
            {
                int maxPosition = positions[positions.Length - 1];
                for (int i = 0; i < positions.Length - 1; ++i)
                {
                    if (positions[i] > maxPosition)
                    {
                        maxPosition = positions[i];
                    }
                }

                // Bucket the term alternatives by position so each distinct position
                // becomes one SpanOrQuery clause of the SpanNearQuery below.
                var disjunctLists = new List<SpanQuery>[maxPosition + 1];
                int distinctPositions = 0;

                for (int i = 0; i < termArrays.Count; ++i)
                {
                    Term[] termArray = termArrays[i];
                    List<SpanQuery> disjuncts = disjunctLists[positions[i]];
                    if (disjuncts == null)
                    {
                        disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
                        ++distinctPositions;
                    }
                    for (int j = 0; j < termArray.Length; ++j)
                    {
                        disjuncts.Add(new SpanTermQuery(termArray[j]));
                    }
                }

                int positionGaps = 0;
                int position = 0;
                SpanQuery[] clauses = new SpanQuery[distinctPositions];
                for (int i = 0; i < disjunctLists.Length; ++i)
                {
                    List<SpanQuery> disjuncts = disjunctLists[i];
                    if (disjuncts != null)
                    {
                        clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                    }
                    else
                    {
                        // Empty position slot: widen the slop instead of emitting a clause.
                        ++positionGaps;
                    }
                }

                int slop = mpq.Slop;
                bool inorder = (slop == 0);

                SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                sp.Boost = query.Boost;
                ExtractWeightedSpanTerms(terms, sp);
            }
        }
    }

    /// <summary>
    /// Fills a Map with <see cref="WeightedSpanTerm"/>s using the terms from the supplied SpanQuery.
    /// Spans are evaluated against a MemoryIndex built from the token stream, and the resulting
    /// positions are attached to each extracted term.
    /// </summary>
    /// <param name="terms">Map to place created WeightedSpanTerms in</param>
    /// <param name="spanQuery">SpanQuery to extract Terms from</param>
    private void ExtractWeightedSpanTerms(IDictionary<String, WeightedSpanTerm> terms, SpanQuery spanQuery)
    {
        HashSet<String> fieldNames;

        if (fieldName == null)
        {
            // No field restriction: gather every field the span query touches.
            fieldNames = new HashSet<String>();
            CollectSpanQueryFields(spanQuery, fieldNames);
        }
        else
        {
            fieldNames = new HashSet<String>();
            fieldNames.Add(fieldName);
        }
        // To support the use of the default field name
        if (defaultField != null)
        {
            fieldNames.Add(defaultField);
        }

        IDictionary<String, SpanQuery> queries = new HashMap<String, SpanQuery>();
        var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
        bool mustRewriteQuery = MustRewriteQuery(spanQuery);
        if (mustRewriteQuery)
        {
            // Multi-term span queries must be rewritten per field before terms can be extracted.
            foreach (String field in fieldNames)
            {
                SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(GetReaderForField(field));
                queries[field] = rewrittenQuery;
                rewrittenQuery.ExtractTerms(nonWeightedTerms);
            }
        }
        else
        {
            spanQuery.ExtractTerms(nonWeightedTerms);
        }

        List<PositionSpan> spanPositions = new List<PositionSpan>();

        foreach (String field in fieldNames)
        {
            IndexReader reader = GetReaderForField(field);
            Spans.Spans spans;
            if (mustRewriteQuery)
            {
                spans = queries[field].GetSpans(reader);
            }
            else
            {
                spans = spanQuery.GetSpans(reader);
            }

            // collect span positions (End() is exclusive, PositionSpan end is inclusive)
            while (spans.Next())
            {
                spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1));
            }
        }

        if (spanPositions.Count == 0)
        {
            // no spans found
            return;
        }

        foreach (Term queryTerm in nonWeightedTerms)
        {
            if (FieldNameComparator(queryTerm.Field))
            {
                // HashMap-style indexer: returns null (not an exception) when the key is absent.
                WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text];
                if (weightedSpanTerm == null)
                {
                    weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text);
                    weightedSpanTerm.AddPositionSpans(spanPositions);
                    weightedSpanTerm.SetPositionSensitive(true);
                    terms[queryTerm.Text] = weightedSpanTerm;
                }
                else
                {
                    if (spanPositions.Count > 0)
                    {
                        weightedSpanTerm.AddPositionSpans(spanPositions);
                    }
                }
            }
        }
    }

    /// <summary>
    /// Fills a Map with <see cref="WeightedSpanTerm"/>s using the terms from the supplied Query.
    /// Terms added here are position-insensitive (no span positions attached).
    /// </summary>
    /// <param name="terms">Map to place created WeightedSpanTerms in</param>
    /// <param name="query">Query to extract Terms from</param>
    private void ExtractWeightedTerms(IDictionary<String, WeightedSpanTerm> terms, Query query)
    {
        var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
        query.ExtractTerms(nonWeightedTerms);

        foreach (Term queryTerm in nonWeightedTerms)
        {
            if (FieldNameComparator(queryTerm.Field))
            {
                WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text);
                terms[queryTerm.Text] = weightedSpanTerm;
            }
        }
    }

    /// <summary>
    /// Necessary to implement matches for queries against <c>defaultField</c>.
    /// Returns true when no field restriction is set, or the field matches the
    /// restricted field or the default field.
    /// </summary>
    private bool FieldNameComparator(String fieldNameToCheck)
    {
        bool rv = fieldName == null || fieldNameToCheck == fieldName
                  || fieldNameToCheck == defaultField;
        return rv;
    }

    /// <summary>
    /// Returns (building and caching on first use) a MemoryIndex-backed reader for the
    /// given field, populated from the current token stream. The token stream is wrapped
    /// in a <see cref="CachingTokenFilter"/> first so it can be replayed per field.
    /// </summary>
    private IndexReader GetReaderForField(String field)
    {
        if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter))
        {
            tokenStream = new CachingTokenFilter(tokenStream);
            cachedTokenStream = true;
        }
        IndexReader reader = readers[field];
        if (reader == null)
        {
            MemoryIndex indexer = new MemoryIndex();
            indexer.AddField(field, tokenStream);
            // Reset so the cached stream can be consumed again for the next field.
            tokenStream.Reset();
            IndexSearcher searcher = indexer.CreateSearcher();
            reader = searcher.IndexReader;
            readers[field] = reader;
        }
        return reader;
    }

    /// <summary>
    /// Creates a Map of <see cref="WeightedSpanTerm"/>s from the given Query and TokenStream.
    /// </summary>
    /// <param name="query">query that caused hit</param>
    /// <param name="tokenStream">TokenStream of text to be highlighted</param>
    /// <returns>Map containing WeightedSpanTerms</returns>
    public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream)
    {
        return GetWeightedSpanTerms(query, tokenStream, null);
    }

    /// <summary>
    /// Creates a Map of <see cref="WeightedSpanTerm"/>s from the given Query and TokenStream.
    /// </summary>
    /// <param name="query">query that caused hit</param>
    /// <param name="tokenStream">tokenStream of text to be highlighted</param>
    /// <param name="fieldName">restricts Term's used based on field name</param>
    /// <returns>Map containing WeightedSpanTerms</returns>
    public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream,
                                                                      String fieldName)
    {
        if (fieldName != null)
        {
            this.fieldName = StringHelper.Intern(fieldName);
        }
        else
        {
            this.fieldName = null;
        }

        IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
        this.tokenStream = tokenStream;
        try
        {
            Extract(query, terms);
        }
        finally
        {
            // Always release the MemoryIndex readers, even if extraction throws.
            CloseReaders();
        }

        return terms;
    }

    /// <summary>
    /// Creates a Map of <see cref="WeightedSpanTerm"/>s from the given Query and TokenStream. Uses a supplied
    /// IndexReader to properly Weight terms (for gradient highlighting).
    /// </summary>
    /// <param name="query">Query that caused hit</param>
    /// <param name="tokenStream">Tokenstream of text to be highlighted</param>
    /// <param name="fieldName">restricts Term's used based on field name</param>
    /// <param name="reader">to use for scoring</param>
    /// <returns>Map of WeightedSpanTerms with quasi tf/idf scores</returns>
    public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
                                                                                String fieldName, IndexReader reader)
    {
        if (fieldName != null)
        {
            this.fieldName = StringHelper.Intern(fieldName);
        }
        else
        {
            this.fieldName = null;
        }
        this.tokenStream = tokenStream;

        IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
        Extract(query, terms);

        int totalNumDocs = reader.NumDocs();
        var weightedTerms = terms.Keys;

        try
        {
            foreach (var wt in weightedTerms)
            {
                WeightedSpanTerm weightedSpanTerm = terms[wt];
                int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.Term));
                // docFreq counts deletes
                if (totalNumDocs < docFreq)
                {
                    docFreq = totalNumDocs;
                }
                // IDF algorithm taken from DefaultSimilarity class
                float idf = (float) (Math.Log((float) totalNumDocs/(double) (docFreq + 1)) + 1.0);
                weightedSpanTerm.Weight *= idf;
            }
        }
        finally
        {
            CloseReaders();
        }

        return terms;
    }

    /// <summary>
    /// Recursively collects the field names a SpanQuery (and its nested clauses) targets.
    /// </summary>
    private void CollectSpanQueryFields(SpanQuery spanQuery, HashSet<String> fieldNames)
    {
        if (spanQuery is FieldMaskingSpanQuery)
        {
            CollectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames);
        }
        else if (spanQuery is SpanFirstQuery)
        {
            CollectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames);
        }
        else if (spanQuery is SpanNearQuery)
        {
            foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
            {
                CollectSpanQueryFields(clause, fieldNames);
            }
        }
        else if (spanQuery is SpanNotQuery)
        {
            CollectSpanQueryFields(((SpanNotQuery) spanQuery).Include, fieldNames);
        }
        else if (spanQuery is SpanOrQuery)
        {
            foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
            {
                CollectSpanQueryFields(clause, fieldNames);
            }
        }
        else
        {
            fieldNames.Add(spanQuery.Field);
        }
    }

    /// <summary>
    /// Determines whether the given SpanQuery (or any nested clause) must be rewritten
    /// before terms can be extracted (i.e. it contains a multi-term span query).
    /// </summary>
    private bool MustRewriteQuery(SpanQuery spanQuery)
    {
        if (!expandMultiTermQuery)
        {
            return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
        }
        else if (spanQuery is FieldMaskingSpanQuery)
        {
            return MustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).MaskedQuery);
        }
        else if (spanQuery is SpanFirstQuery)
        {
            return MustRewriteQuery(((SpanFirstQuery)spanQuery).Match);
        }
        else if (spanQuery is SpanNearQuery)
        {
            foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
            {
                if (MustRewriteQuery(clause))
                {
                    return true;
                }
            }
            return false;
        }
        else if (spanQuery is SpanNotQuery)
        {
            SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery;
            return MustRewriteQuery(spanNotQuery.Include) || MustRewriteQuery(spanNotQuery.Exclude);
        }
        else if (spanQuery is SpanOrQuery)
        {
            foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
            {
                if (MustRewriteQuery(clause))
                {
                    return true;
                }
            }
            return false;
        }
        else if (spanQuery is SpanTermQuery)
        {
            return false;
        }
        else
        {
            // Unknown SpanQuery type: be conservative and rewrite.
            return true;
        }
    }

    /// <summary>
    /// This class makes sure that if both position sensitive and insensitive
    /// versions of the same term are added, the position insensitive one wins.
    /// </summary>
    /// <typeparam name="K">key type</typeparam>
    private class PositionCheckingMap<K> : HashMap<K, WeightedSpanTerm>
    {
        public PositionCheckingMap()
        {
        }

        public PositionCheckingMap(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
        {
            PutAll(m);
        }

        public void PutAll(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
        {
            foreach (var entry in m)
            {
                Add(entry.Key, entry.Value);
            }
        }

        public override void Add(K key, WeightedSpanTerm value)
        {
            // Capture the previous mapping BEFORE overwriting it. Reading this[key]
            // after base.Add would return the value just inserted, so prev and value
            // would be the same object and the sensitivity merge would be a no-op.
            WeightedSpanTerm prev = this[key];
            base.Add(key, value);

            if (prev == null) return;

            // Position-insensitive wins: once a term was added without positions,
            // the replacement must stay insensitive too.
            if (!prev.IsPositionSensitive())
            {
                value.SetPositionSensitive(false);
            }
        }
    }

    /// <summary>Whether MultiTermQueries are expanded (rewritten) during extraction.</summary>
    public bool ExpandMultiTermQuery
    {
        set { this.expandMultiTermQuery = value; }
        get { return expandMultiTermQuery; }
    }

    /// <summary>True once the token stream has been wrapped in a <see cref="CachingTokenFilter"/>.</summary>
    public bool IsCachedTokenStream
    {
        get { return cachedTokenStream; }
    }

    /// <summary>The (possibly caching-wrapped) token stream being analyzed.</summary>
    public TokenStream TokenStream
    {
        get { return tokenStream; }
    }

    /// <summary>
    /// By default, <see cref="TokenStream"/>s that are not of the type
    /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
    /// ensure an efficient reset. If you are already using a different caching
    /// <see cref="TokenStream"/> impl and you don't want it to be wrapped, set this to
    /// false.
    /// </summary>
    public void SetWrapIfNotCachingTokenFilter(bool wrap)
    {
        this.wrapToCaching = wrap;
    }

    /// <summary>
    /// A fake IndexReader class to extract the field from a MultiTermQuery
    /// </summary>
    protected internal sealed class FakeReader : FilterIndexReader
    {
        private static readonly IndexReader EMPTY_MEMORY_INDEX_READER =
            new MemoryIndex().CreateSearcher().IndexReader;

        // First field name seen during rewrite; null until Terms() is called.
        public String Field { get; private set; }

        protected internal FakeReader()
            : base(EMPTY_MEMORY_INDEX_READER)
        {
        }

        public override TermEnum Terms(Term t)
        {
            // only set first fieldname, maybe use a Set?
            if (t != null && Field == null)
                Field = t.Field;

            return base.Terms(t);
        }
    }
}
}