/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using System.Collections.Generic; using System.Text; namespace SpellChecker.Net.Search.Spell { public class NGramDistance : StringDistance { private int n; /// /// Creates an N-Gram distance measure using n-grams of the specified size. /// /// The size of the n-gram to be used to compute the string distance. public NGramDistance(int size) { this.n = size; } /// /// Creates an N-Gram distance measure using n-grams of size 2. /// public NGramDistance() : this(2) { } public float GetDistance(String source, String target) { int sl = source.Length; int tl = target.Length; if (sl == 0 || tl == 0) { if (sl == tl) { return 1; } else { return 0; } } int cost = 0; if (sl < n || tl < n) { for (int ii = 0, ni = Math.Min(sl, tl); ii < ni; ii++) { if (source[ii] == target[ii]) { cost++; } } return (float)cost / Math.Max(sl, tl); } char[] sa = new char[sl + n - 1]; float[] p; //'previous' cost array, horizontally float[] d; // cost array, horizontally float[] _d; //placeholder to assist in swapping p and d //construct sa with prefix for (int ii = 0; ii < sa.Length; ii++) { if (ii < n - 1) { sa[ii] = (char)0; //add prefix } else { sa[ii] = source[ii - n + 1]; } } p = new float[sl + 1]; d = new float[sl + 1]; // indexes into strings s and t int i; // iterates through source int j; // iterates through target char[] t_j = new char[n]; // jth n-gram of t for (i = 0; i <= sl; i++) { p[i] = i; } for (j = 1; j <= tl; j++) { //construct t_j n-gram if (j < n) { for (int ti = 0; ti < n - j; ti++) { t_j[ti] = (char)0; //add prefix } for (int ti = n - j; ti < n; ti++) { t_j[ti] = target[ti - (n - j)]; } } else { t_j = target.Substring(j - n, n).ToCharArray(); } d[0] = j; for (i = 1; i <= sl; i++) { cost = 0; int tn = n; //compare sa to t_j for (int ni = 0; ni < n; ni++) { if (sa[i - 1 + ni] != t_j[ni]) { cost++; } else if (sa[i - 1 + ni] == 0) { //discount matches on prefix tn--; } } float ec = (float)cost / tn; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec); } // copy current distance counts to 'previous row' distance counts _d = p; p = d; d = _d; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return 1.0f - ((float)p[sl] / Math.Max(tl, sl)); } } }