/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Shingle.Codec;
using Lucene.Net.Analysis.Shingle.Matrix;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Support;
namespace Lucene.Net.Analysis.Shingle
{
/// A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream.
/// In other words, it creates combinations of tokens as a single token. For example, the sentence "please divide this sentence into shingles"
/// might be tokenized into shingles "please divide", "divide this",
/// "this sentence", "sentence into", and "into shingles". Using a shingle filter at index and query time can in some instances
/// be used to replace phrase queries, especially those with 0 slop. Without a spacer character
/// it can be used to handle composition and decomposition of words
/// such as searching for "multi dimensional" instead of "multidimensional".
/// It is a rather common human problem at query time
/// in several languages, notably the northern Germanic branch. Shingles are amongst many things also known to solve problems
/// in spell checking, language detection and document clustering. This filter is backed by a three dimensional column oriented matrix
/// used to create permutations of the second dimension, the rows,
/// and leaves the third, the z-axis, for multi token synonyms. In order to use this filter you need to define a way of positioning
/// the input stream tokens in the matrix. This is done using a
/// ShingleMatrixFilter.TokenSettingsCodec.
/// There are three simple implementations for demonstrational purposes,
/// see ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec,
/// ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec
/// and ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec. An example token matrix, and the shingles it produces, is shown further below. This implementation can be rather heap demanding
/// if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns,
/// or if each column contains a great number of rows. The problem is that in order to avoid producing duplicates
/// the filter needs to keep track of any shingle already produced and returned to the consumer. There is a bit of resource management to handle this
/// but it would of course be much better if the filter was written
/// so it never created the same shingle more than once in the first place. The filter also has basic support for calculating weights for the shingles
/// based on the weights of the tokens from the input stream, output shingle size, etc.
/// See CalculateShingleWeight.
///
/// Token[column][row][z-axis]{
/// {{hello}, {greetings, and, salutations}},
/// {{world}, {earth}, {tellus}}
/// };
///
///
/// It would produce the following 2-3 gram sized shingles:
///
///
/// "hello_world"
/// "greetings_and"
/// "greetings_and_salutations"
/// "and_salutations"
/// "and_salutations_world"
/// "salutations_world"
/// "hello_earth"
/// "and_salutations_earth"
/// "salutations_earth"
/// "hello_tellus"
/// "and_salutations_tellus"
/// "salutations_tellus"
///
///
///
/// NOTE: The Java List implementation uses a different equality comparison scheme
/// than .NET's generic List, so we have to use a custom IEqualityComparer implementation
/// to get the same behaviour.
///
///