/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.AR
{
/// <summary>
/// Stemmer for Arabic.
/// <para>
/// Stemming is done in-place for efficiency, operating on a term buffer.
/// </para>
/// <para>
/// Stemming is defined as:
/// <list type="bullet">
/// <item><description>Removal of attached definite article, conjunction, and prepositions.</description></item>
/// <item><description>Stemming of common suffixes.</description></item>
/// </list>
/// </para>
/// </summary>
public class ArabicStemmer
{
    // Arabic letters used to build the prefix and suffix tables. They are fixed
    // Unicode code points, so they are compile-time constants and cannot be
    // reassigned at runtime.
    public const char ALEF = '\u0627';
    public const char BEH = '\u0628';
    public const char TEH_MARBUTA = '\u0629';
    public const char TEH = '\u062A';
    public const char FEH = '\u0641';
    public const char KAF = '\u0643';
    public const char LAM = '\u0644';
    public const char NOON = '\u0646';
    public const char HEH = '\u0647';
    public const char WAW = '\u0648';
    public const char YEH = '\u064A';

    /// <summary>
    /// Prefixes removed by <see cref="StemPrefix"/>. Only the first matching entry
    /// is removed, so the order is significant: the WAW + ALEF + LAM entry must
    /// come before the bare WAW entry.
    /// </summary>
    public static readonly char[][] prefixes =
    {
        new char[] { ALEF, LAM },
        new char[] { WAW, ALEF, LAM },
        new char[] { BEH, ALEF, LAM },
        new char[] { KAF, ALEF, LAM },
        new char[] { FEH, ALEF, LAM },
        new char[] { LAM, LAM },
        new char[] { WAW },
    };

    /// <summary>
    /// Suffixes removed by <see cref="StemSuffix"/>. Every entry is tried once, in
    /// this order, against the word as shortened by the entries before it.
    /// </summary>
    public static readonly char[][] suffixes =
    {
        new char[] { HEH, ALEF },
        new char[] { ALEF, NOON },
        new char[] { ALEF, TEH },
        new char[] { WAW, NOON },
        new char[] { YEH, NOON },
        new char[] { YEH, HEH },
        new char[] { YEH, TEH_MARBUTA },
        new char[] { HEH },
        new char[] { TEH_MARBUTA },
        new char[] { YEH },
    };

    /// <summary>
    /// Stem an input buffer of Arabic text: at most one prefix is removed first,
    /// then the suffixes.
    /// </summary>
    /// <param name="s">input buffer, modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>length of input buffer after stemming</returns>
    public int Stem(char[] s, int len)
    {
        len = StemPrefix(s, len);
        len = StemSuffix(s, len);
        return len;
    }

    /// <summary>
    /// Stem a prefix off an Arabic word. Only the first matching entry of
    /// <see cref="prefixes"/> is removed.
    /// </summary>
    /// <param name="s">input buffer, modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>new length of input buffer after stemming</returns>
    public int StemPrefix(char[] s, int len)
    {
        for (int i = 0; i < prefixes.Length; i++)
        {
            if (StartsWith(s, len, prefixes[i]))
            {
                return DeleteN(s, 0, len, prefixes[i].Length);
            }
        }
        return len;
    }

    /// <summary>
    /// Stem suffix(es) off an Arabic word. The loop does not stop at the first
    /// match, so several entries of <see cref="suffixes"/> may be removed in one call.
    /// </summary>
    /// <param name="s">input buffer, modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>new length of input buffer after stemming</returns>
    public int StemSuffix(char[] s, int len)
    {
        for (int i = 0; i < suffixes.Length; i++)
        {
            if (EndsWith(s, len, suffixes[i]))
            {
                len = DeleteN(s, len - suffixes[i].Length, len, suffixes[i].Length);
            }
        }
        return len;
    }

    /// <summary>
    /// Returns true if the prefix matches and can be stemmed.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="prefix">prefix to check</param>
    /// <returns>true if the prefix matches and can be stemmed</returns>
    private bool StartsWith(char[] s, int len, char[] prefix)
    {
        // A single-character prefix (wa-) must leave at least 3 characters behind.
        if (prefix.Length == 1 && len < 4)
        {
            return false;
        }

        // Every other prefix must leave at least 2 characters behind.
        if (len < prefix.Length + 2)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (s[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Returns true if the suffix matches and can be stemmed.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="suffix">suffix to check</param>
    /// <returns>true if the suffix matches and can be stemmed</returns>
    private bool EndsWith(char[] s, int len, char[] suffix)
    {
        // All suffixes must leave at least 2 characters behind.
        if (len < suffix.Length + 2)
        {
            return false;
        }

        int start = len - suffix.Length;
        for (int i = 0; i < suffix.Length; i++)
        {
            if (s[start + i] != suffix[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Delete n characters in-place. The characters after the deleted range are
    /// moved down with a single copy; buffer contents at or beyond the returned
    /// length are unspecified.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="pos">position of the first character to delete</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="nChars">number of characters to delete</param>
    /// <returns>length of input buffer after deletion</returns>
    protected int DeleteN(char[] s, int pos, int len, int nChars)
    {
        if (nChars <= 0)
        {
            return len; // nothing to delete
        }

        // Number of characters that follow the deleted range and must be kept.
        int tail = len - pos - nChars;
        if (tail > 0)
        {
            // Array.Copy handles overlapping source and destination ranges.
            Array.Copy(s, pos + nChars, s, pos, tail);
        }
        return len - nChars;
    }

    /// <summary>
    /// Delete a character in-place.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="pos">position of character to delete</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>length of input buffer after deletion; always <c>len - 1</c></returns>
    protected int Delete(char[] s, int pos, int len)
    {
        // Deleting the last character needs no shifting, only a shorter length.
        if (pos < len - 1)
        {
            Array.Copy(s, pos + 1, s, pos, len - pos - 1);
        }
        return len - 1;
    }
}
}