/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Text;
namespace SpellChecker.Net.Search.Spell
{
public class NGramDistance : StringDistance
{
private int n;
///
/// Creates an N-Gram distance measure using n-grams of the specified size.
///
/// The size of the n-gram to be used to compute the string distance.
public NGramDistance(int size)
{
this.n = size;
}
///
/// Creates an N-Gram distance measure using n-grams of size 2.
///
public NGramDistance()
: this(2)
{
}
public float GetDistance(String source, String target)
{
int sl = source.Length;
int tl = target.Length;
if (sl == 0 || tl == 0)
{
if (sl == tl)
{
return 1;
}
else
{
return 0;
}
}
int cost = 0;
if (sl < n || tl < n)
{
for (int ii = 0, ni = Math.Min(sl, tl); ii < ni; ii++)
{
if (source[ii] == target[ii])
{
cost++;
}
}
return (float)cost / Math.Max(sl, tl);
}
char[] sa = new char[sl + n - 1];
float[] p; //'previous' cost array, horizontally
float[] d; // cost array, horizontally
float[] _d; //placeholder to assist in swapping p and d
//construct sa with prefix
for (int ii = 0; ii < sa.Length; ii++)
{
if (ii < n - 1)
{
sa[ii] = (char)0; //add prefix
}
else
{
sa[ii] = source[ii - n + 1];
}
}
p = new float[sl + 1];
d = new float[sl + 1];
// indexes into strings s and t
int i; // iterates through source
int j; // iterates through target
char[] t_j = new char[n]; // jth n-gram of t
for (i = 0; i <= sl; i++)
{
p[i] = i;
}
for (j = 1; j <= tl; j++)
{
//construct t_j n-gram
if (j < n)
{
for (int ti = 0; ti < n - j; ti++)
{
t_j[ti] = (char)0; //add prefix
}
for (int ti = n - j; ti < n; ti++)
{
t_j[ti] = target[ti - (n - j)];
}
}
else
{
t_j = target.Substring(j - n, n).ToCharArray();
}
d[0] = j;
for (i = 1; i <= sl; i++)
{
cost = 0;
int tn = n;
//compare sa to t_j
for (int ni = 0; ni < n; ni++)
{
if (sa[i - 1 + ni] != t_j[ni])
{
cost++;
}
else if (sa[i - 1 + ni] == 0)
{ //discount matches on prefix
tn--;
}
}
float ec = (float)cost / tn;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
}
// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return 1.0f - ((float)p[sl] / Math.Max(tl, sl));
}
}
}