/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Text;
namespace Lucene.Net.Analysis.Fr
{
/// <summary>
/// A stemmer for French words.
/// <para>
/// The algorithm is based on the work of Dr Martin Porter on his Snowball project;
/// refer to http://snowball.sourceforge.net/french/stemmer.html (French stemming
/// algorithm) for details.
/// </para>
/// <para>
/// An instance keeps the word being stemmed and its regions in mutable fields, so a
/// single instance must not be used to stem two terms at the same time.
/// </para>
/// </summary>
public class FrenchStemmer
{
    // Accented characters are written as escapes so the file does not depend on its encoding.
    private const char EAcute = '\u00e9';   // é
    private const char EGrave = '\u00e8';   // è
    private const char CCedilla = '\u00e7'; // ç

    // ---- Suffix tables ------------------------------------------------------------------
    // Order matters: the first matching entry wins, so longer suffixes come before the
    // shorter suffixes they end with.

    private static readonly string[] Step1PlainSuffixes =
        { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
    private static readonly string[] LogieSuffixes = { "logies", "logie" };
    private static readonly string[] UsionSuffixes = { "usions", "utions", "usion", "ution" };
    private static readonly string[] EnceSuffixes = { "ences", "ence" };
    private static readonly string[] AtionSuffixes =
        { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
    private static readonly string[] EmentSuffixes = { "ements", "ement" };
    private static readonly string[] IssementSuffixes = { "issements", "issement" };
    private static readonly string[] IteSuffixes = { "it\u00e9s", "it\u00e9" };
    private static readonly string[] IfSuffixes = { "ifs", "ives", "if", "ive" };
    private static readonly string[] EauxSuffixes = { "eaux" };
    private static readonly string[] AuxSuffixes = { "aux" };
    private static readonly string[] EuseSuffixes = { "euses", "euse" };
    private static readonly string[] EuxSuffixes = { "eux" };
    private static readonly string[] AmmentSuffixes = { "amment" };
    private static readonly string[] EmmentSuffixes = { "emment" };
    private static readonly string[] MentSuffixes = { "ments", "ment" };

    private static readonly string[] Step2ASuffixes =
    {
        "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
        "irent", "iriez", "irez", "irions", "irons", "iront",
        "issaIent", "issais", "issantes", "issante", "issants", "issant",
        "issait", "issions", "issons", "issiez", "issez", "issent",
        "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i"
    };

    private static readonly string[] Step2BErSuffixes =
    {
        "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
        "erons", "eront", "erez", "\u00e8rent", "era", "\u00e9es", "iez",
        "\u00e9e", "\u00e9s", "er", "ez", "\u00e9"
    };

    private static readonly string[] Step2BASuffixes =
    {
        "assions", "assiez", "assent", "asses", "asse", "aIent",
        "antes", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
        "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a"
    };

    private static readonly string[] IonsSuffixes = { "ions" };
    private static readonly string[] IonSuffixes = { "ion" };
    private static readonly string[] IereSuffixes = { "I\u00e8re", "i\u00e8re", "Ier", "ier" };
    private static readonly string[] ESuffixes = { "e" };
    private static readonly string[] EDiaeresisSuffixes = { "\u00eb" };
    private static readonly string[] DoubledEndings = { "enn", "onn", "ett", "ell", "eill" };

    // ---- State --------------------------------------------------------------------------

    /// <summary>Buffer holding the term while it is being stemmed.</summary>
    private readonly StringBuilder sb = new StringBuilder();

    /// <summary>Temporary buffer, used to derive R2 from R1.</summary>
    private readonly StringBuilder tb = new StringBuilder();

    /// <summary>Region R0: the whole buffer.</summary>
    private string R0;

    /// <summary>
    /// Region RV: "If the word begins with two vowels, RV is the region after the third letter,
    /// otherwise the region after the first vowel not at the beginning of the word,
    /// or the end of the word if these positions cannot be found."
    /// </summary>
    private string RV;

    /// <summary>
    /// Region R1: "the region after the first non-vowel following a vowel,
    /// or the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private string R1;

    /// <summary>
    /// Region R2: "the region after the first non-vowel in R1 following a vowel,
    /// or the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private string R2;

    /// <summary>Set to true if step 2 still has to run although step 1 changed the buffer.</summary>
    private bool suite;

    /// <summary>Set to true once a suffix rule has modified the buffer.</summary>
    private bool modified;

    // ---- Entry point --------------------------------------------------------------------

    /// <summary>
    /// Stems the given term to a unique discriminator.
    /// </summary>
    /// <param name="term">The term that should be stemmed; must not be null.</param>
    /// <returns>
    /// The stem of <paramref name="term"/>, or <paramref name="term"/> itself, unchanged, when
    /// it is not stemmable (contains a non-letter, or an upper-case letter anywhere but first).
    /// The upper-case markers 'U', 'I' and 'Y' introduced while stemming are not converted
    /// back, so they may appear in the result.
    /// </returns>
    protected internal string Stem(string term)
    {
        if (!IsStemmable(term))
        {
            return term;
        }

        // Lower-case independently of the current culture: under e.g. tr-TR a culture-sensitive
        // ToLower() maps 'I' to dotless 'ı', which none of the suffix rules would recognise.
        term = term.ToLowerInvariant();

        // Reset the working state for this term.
        sb.Length = 0;
        sb.Append(term);
        modified = false;
        suite = false;

        TreatVowels(sb);
        SetStrings();

        Step1();

        // Step 2 runs when step 1 changed nothing, or only touched an (a|e)mment / ment(s) suffix.
        if ((!modified || suite) && RV != null)
        {
            suite = Step2A();
            if (!suite)
            {
                Step2B();
            }
        }

        if (modified || suite)
        {
            Step3();
        }
        else
        {
            Step4();
        }

        Step5();
        Step6();

        return sb.ToString();
    }

    // ---- Region handling ----------------------------------------------------------------

    /// <summary>
    /// Recomputes the region strings R0, RV, R1 and R2 from the buffer.
    /// Has to be called each time the buffer was modified.
    /// </summary>
    private void SetStrings()
    {
        R0 = sb.ToString();
        RV = RetrieveRV(sb);
        R1 = RetrieveR(sb);

        if (R1 == null)
        {
            R2 = null;
            return;
        }

        // R2 is simply the "R" region of R1.
        tb.Length = 0;
        tb.Append(R1);
        R2 = RetrieveR(tb);
    }

    /// <summary>
    /// Ordinal "ends with" test. The parameterless-comparison overload of
    /// <c>string.EndsWith</c> is culture-sensitive, which would make suffix matching (and the
    /// number of characters cut from the buffer) depend on the current thread culture.
    /// </summary>
    private static bool EndsWithOrdinal(string value, string suffix)
    {
        return value.EndsWith(suffix, StringComparison.Ordinal);
    }

    /// <summary>
    /// Cuts <paramref name="count"/> characters off the end of the buffer, optionally appends
    /// <paramref name="replacement"/>, flags the buffer as modified and recomputes the regions.
    /// </summary>
    private void RemoveTail(int count, string replacement)
    {
        sb.Length -= count;
        if (replacement != null)
        {
            sb.Append(replacement);
        }
        modified = true;
        SetStrings();
    }

    // ---- Algorithm steps ----------------------------------------------------------------

    /// <summary>
    /// First step of the Porter algorithm (standard suffix removal).
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step1()
    {
        // The region fields are re-read for every call on purpose: each rule that fires
        // recomputes them, and the following rules must see the updated regions.
        DeleteFrom(R2, Step1PlainSuffixes);
        ReplaceFrom(R2, LogieSuffixes, "log");
        ReplaceFrom(R2, UsionSuffixes, "u");
        ReplaceFrom(R2, EnceSuffixes, "ent");

        DeleteButSuffixFromElseReplace(R2, AtionSuffixes, "ic", true, R0, "iqU");

        DeleteButSuffixFromElseReplace(R2, EmentSuffixes, "eus", false, R0, "eux");
        DeleteButSuffixFrom(R2, EmentSuffixes, "ativ", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iv", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "abl", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iqU", false);

        DeleteFromIfTestVowelBeforeIn(R1, IssementSuffixes, false, R0);
        DeleteFrom(RV, EmentSuffixes);

        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "abil", false, R0, "abl");
        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "ic", false, R0, "iqU");
        DeleteButSuffixFrom(R2, IteSuffixes, "iv", true);

        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "icat", false, R0, "iqU");
        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "at", true, R2, "iqU");

        ReplaceFrom(R0, EauxSuffixes, "eau");
        ReplaceFrom(R1, AuxSuffixes, "al");

        DeleteButSuffixFromElseReplace(R2, EuseSuffixes, "", true, R1, "eux");
        DeleteFrom(R2, EuxSuffixes);

        // If one of the next rules fires, step 2a still has to be performed.
        if (ReplaceFrom(RV, AmmentSuffixes, "ant"))
        {
            suite = true;
        }
        if (ReplaceFrom(RV, EmmentSuffixes, "ent"))
        {
            suite = true;
        }
        if (DeleteFromIfTestVowelBeforeIn(RV, MentSuffixes, true, RV))
        {
            suite = true;
        }
    }

    /// <summary>
    /// Second step (A) of the Porter algorithm: verb suffixes beginning with "i".
    /// Performed if nothing changed in the first step, or if the change concerned the
    /// amment, emment, ments or ment suffixes.
    /// </summary>
    /// <returns>true if something changed in the buffer.</returns>
    private bool Step2A()
    {
        return DeleteFromIfTestVowelBeforeIn(RV, Step2ASuffixes, false, RV);
    }

    /// <summary>
    /// Second step (B) of the Porter algorithm: other verb suffixes.
    /// Performed if step 2A did not remove anything.
    /// </summary>
    private void Step2B()
    {
        DeleteFrom(RV, Step2BErSuffixes);
        DeleteButSuffixFrom(RV, Step2BASuffixes, "e", true);
        DeleteFrom(R2, IonsSuffixes);
    }

    /// <summary>
    /// Third step of the Porter algorithm: a final 'Y' becomes 'i', a final 'ç' becomes 'c'.
    /// </summary>
    private void Step3()
    {
        if (sb.Length == 0)
        {
            return;
        }

        int last = sb.Length - 1;
        char ch = sb[last];
        if (ch == 'Y')
        {
            sb[last] = 'i';
            SetStrings();
        }
        else if (ch == CCedilla)
        {
            sb[last] = 'c';
            SetStrings();
        }
    }

    /// <summary>
    /// Fourth step of the Porter algorithm: residual suffixes.
    /// </summary>
    private void Step4()
    {
        int length = sb.Length;
        if (length > 1 && sb[length - 1] == 's')
        {
            // A final "s" is kept when it follows one of a, i, o, u, è or s.
            char before = sb[length - 2];
            bool keepFinalS = before == 'a' || before == 'i' || before == 'o'
                || before == 'u' || before == EGrave || before == 's';
            if (!keepFinalS)
            {
                // Note: this removal does not set the "modified" flag.
                sb.Length = length - 1;
                SetStrings();
            }
        }

        if (!DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "s"))
        {
            DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "t");
        }

        ReplaceFrom(RV, IereSuffixes, "i");
        DeleteFrom(RV, ESuffixes);
        DeleteFromIfPrecededIn(RV, EDiaeresisSuffixes, R0, "gu");
    }

    /// <summary>
    /// Fifth step of the Porter algorithm: un-doubling. If the word ends with
    /// enn, onn, ett, ell or eill, the last letter is removed.
    /// </summary>
    private void Step5()
    {
        if (R0 == null)
        {
            return;
        }

        foreach (string ending in DoubledEndings)
        {
            if (EndsWithOrdinal(R0, ending))
            {
                sb.Length -= 1;
                SetStrings();
                return;
            }
        }
    }

    /// <summary>
    /// Sixth (and last) step of the Porter algorithm: un-accenting. If the word ends with
    /// é or è followed by at least one non-vowel, the accented letter is replaced by e.
    /// </summary>
    private void Step6()
    {
        if (R0 == null || R0.Length == 0)
        {
            return;
        }

        // Walk back over the trailing non-vowels to the last vowel of the word.
        int i = R0.Length - 1;
        while (i >= 0 && !IsVowel(R0[i]))
        {
            i--;
        }

        bool hasTrailingNonVowel = i < R0.Length - 1;
        if (i >= 0 && hasTrailingNonVowel && (R0[i] == EAcute || R0[i] == EGrave))
        {
            // R0 mirrors the buffer, so the index is valid in sb as well.
            sb[i] = 'e';
        }
    }

    // ---- Suffix primitives --------------------------------------------------------------

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> if zone
    /// <paramref name="from"/> ends with prefix + suffix. The prefix itself is kept.
    /// This rule does not set the "modified" flag.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null).</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="from">The secondary zone that must end with prefix + suffix (may be null).</param>
    /// <param name="prefix">The prefix that must precede the suffix.</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool DeleteFromIfPrecededIn(string source, string[] search, string from, string prefix)
    {
        if (source == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            if (EndsWithOrdinal(source, suffix)
                && from != null
                && EndsWithOrdinal(from, prefix + suffix))
            {
                sb.Length -= suffix.Length;
                SetStrings();
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> if the letter preceding it
    /// is (or is not) a vowel.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null).</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="vowel">true if a vowel is required before the suffix, false if a non-vowel is.</param>
    /// <param name="from">The zone that has to contain the tested letter (may be null).</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool DeleteFromIfTestVowelBeforeIn(string source, string[] search, bool vowel, string from)
    {
        if (source == null || from == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            if (!EndsWithOrdinal(source, suffix))
            {
                continue;
            }

            // The letter before the suffix must still lie inside "from".
            int needed = suffix.Length + 1;
            if (needed > from.Length)
            {
                continue;
            }

            if (IsVowel(sb[sb.Length - needed]) == vowel)
            {
                RemoveTail(suffix.Length, null);
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Deletes prefix + suffix if zone <paramref name="source"/> ends with it; otherwise,
    /// when <paramref name="without"/> is true, deletes the bare suffix.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="prefix">The prefix deleted together with the suffix when present.</param>
    /// <param name="without">true if the suffix is deleted even when the prefix is absent.</param>
    private void DeleteButSuffixFrom(string source, string[] search, string prefix, bool without)
    {
        if (source == null)
        {
            return;
        }

        foreach (string suffix in search)
        {
            if (EndsWithOrdinal(source, prefix + suffix))
            {
                RemoveTail(prefix.Length + suffix.Length, null);
                return;
            }
            if (without && EndsWithOrdinal(source, suffix))
            {
                RemoveTail(suffix.Length, null);
                return;
            }
        }
    }

    /// <summary>
    /// For the first suffix that applies: deletes prefix + suffix if zone
    /// <paramref name="source"/> ends with it; else replaces prefix + suffix by
    /// <paramref name="replace"/> if zone <paramref name="from"/> ends with it; else, when
    /// <paramref name="without"/> is true, deletes the bare suffix found in
    /// <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null; then nothing happens).</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="prefix">The prefix tested in front of the suffix.</param>
    /// <param name="without">true if the suffix is deleted even when the prefix is absent.</param>
    /// <param name="from">The secondary zone used for the replacement rule (may be null).</param>
    /// <param name="replace">The text substituted for prefix + suffix by the replacement rule.</param>
    private void DeleteButSuffixFromElseReplace(string source, string[] search, string prefix, bool without, string from, string replace)
    {
        if (source == null)
        {
            return;
        }

        foreach (string suffix in search)
        {
            string prefixed = prefix + suffix;

            if (EndsWithOrdinal(source, prefixed))
            {
                RemoveTail(prefixed.Length, null);
                return;
            }
            if (from != null && EndsWithOrdinal(from, prefixed))
            {
                RemoveTail(prefixed.Length, replace);
                return;
            }
            if (without && EndsWithOrdinal(source, suffix))
            {
                RemoveTail(suffix.Length, null);
                return;
            }
        }
    }

    /// <summary>
    /// Replaces the first suffix of <paramref name="search"/> that ends zone
    /// <paramref name="source"/> with <paramref name="replace"/>.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="replace">The replacement text.</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool ReplaceFrom(string source, string[] search, string replace)
    {
        if (source == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            if (EndsWithOrdinal(source, suffix))
            {
                RemoveTail(suffix.Length, replace);
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Deletes the first suffix of <paramref name="suffix"/> that ends zone
    /// <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="suffix">The suffixes to look for, in priority order.</param>
    private void DeleteFrom(string source, string[] suffix)
    {
        if (source == null)
        {
            return;
        }

        foreach (string candidate in suffix)
        {
            if (EndsWithOrdinal(source, candidate))
            {
                RemoveTail(candidate.Length, null);
                return;
            }
        }
    }

    // ---- Character and region helpers ---------------------------------------------------

    /// <summary>
    /// Tests whether a char is a (lower-case) French vowel, accented ones included.
    /// The upper-case markers 'U', 'I' and 'Y' are deliberately not vowels.
    /// </summary>
    /// <param name="ch">The char to test.</param>
    /// <returns>true if the char is a vowel.</returns>
    private static bool IsVowel(char ch)
    {
        switch (ch)
        {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00e2': // â
            case '\u00e0': // à
            case '\u00eb': // ë
            case '\u00e9': // é
            case '\u00ea': // ê
            case '\u00e8': // è
            case '\u00ef': // ï
            case '\u00ee': // î
            case '\u00f4': // ô
            case '\u00fc': // ü
            case '\u00f9': // ù
            case '\u00fb': // û
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Retrieves the "R zone" of a buffer (R1 for the word buffer, R2 for a buffer holding R1):
    /// "the region after the first non-vowel following a vowel, or the null region at the
    /// end of the word if there is no such non-vowel".
    /// </summary>
    /// <param name="buffer">The buffer to inspect.</param>
    /// <returns>The region, or null when it does not exist or would be empty.</returns>
    private static string RetrieveR(StringBuilder buffer)
    {
        int len = buffer.Length;

        // First vowel of the buffer.
        int index = 0;
        while (index < len && !IsVowel(buffer[index]))
        {
            index++;
        }
        if (index == len)
        {
            return null;
        }

        // First non-vowel after that vowel.
        while (index < len && IsVowel(buffer[index]))
        {
            index++;
        }

        int start = index + 1;
        return start < len ? buffer.ToString(start, len - start) : null;
    }

    /// <summary>
    /// Retrieves the "RV zone" of a buffer: "If the word begins with two vowels, RV is the
    /// region after the third letter, otherwise the region after the first vowel not at the
    /// beginning of the word, or the end of the word if these positions cannot be found."
    /// </summary>
    /// <param name="buffer">The buffer to inspect.</param>
    /// <returns>The region, or null when the buffer has at most three chars or the region is empty.</returns>
    private static string RetrieveRV(StringBuilder buffer)
    {
        int len = buffer.Length;
        if (len <= 3)
        {
            return null;
        }

        if (IsVowel(buffer[0]) && IsVowel(buffer[1]))
        {
            return buffer.ToString(3, len - 3);
        }

        // First vowel that is not the first letter. When there is none the position stays 0,
        // so the region then starts right after the first letter (historical behaviour, kept).
        int pos = 0;
        for (int c = 1; c < len; c++)
        {
            if (IsVowel(buffer[c]))
            {
                pos = c;
                break;
            }
        }

        int start = pos + 1;
        return start < len ? buffer.ToString(start, len - start) : null;
    }

    /// <summary>
    /// Marks vowels that act as consonants by upper-casing them, in place:
    /// u and i preceded AND followed by a vowel, y preceded OR followed by a vowel,
    /// and u preceded by q.
    /// </summary>
    /// <param name="buffer">The buffer to treat.</param>
    /// <returns>The same buffer instance, after treatment.</returns>
    private StringBuilder TreatVowels(StringBuilder buffer)
    {
        int len = buffer.Length;
        for (int c = 0; c < len; c++)
        {
            char ch = buffer[c];

            if (c == 0)
            {
                // First char: only a y followed by a vowel is marked.
                if (len > 1 && ch == 'y' && IsVowel(buffer[1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else if (c == len - 1)
            {
                // Last char: there is no following letter to look at.
                if (ch == 'u' && buffer[c - 1] == 'q')
                {
                    buffer[c] = 'U';
                }
                else if (ch == 'y' && IsVowel(buffer[c - 1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else
            {
                // The previous char is read from the buffer, so an already marked
                // 'U', 'I' or 'Y' before this position no longer counts as a vowel.
                bool vowelBefore = IsVowel(buffer[c - 1]);
                bool vowelAfter = IsVowel(buffer[c + 1]);
                switch (ch)
                {
                    case 'u':
                        if (buffer[c - 1] == 'q' || (vowelBefore && vowelAfter))
                        {
                            buffer[c] = 'U';
                        }
                        break;
                    case 'i':
                        if (vowelBefore && vowelAfter)
                        {
                            buffer[c] = 'I';
                        }
                        break;
                    case 'y':
                        if (vowelBefore || vowelAfter)
                        {
                            buffer[c] = 'Y';
                        }
                        break;
                }
            }
        }
        return buffer;
    }

    /// <summary>
    /// Checks whether a term can be processed correctly.
    /// </summary>
    /// <param name="term">The term to check; must not be null.</param>
    /// <returns>
    /// true if, and only if, the term consists of letters only and contains no upper-case
    /// letter other than, possibly, its first one.
    /// </returns>
    private static bool IsStemmable(string term)
    {
        for (int c = 0; c < term.Length; c++)
        {
            char ch = term[c];

            // Discard terms that contain non-letter chars.
            if (!char.IsLetter(ch))
            {
                return false;
            }

            // An upper-case letter is only accepted as the first char; this rejects both
            // terms with several upper-case letters and terms with one inside the word.
            if (c > 0 && char.IsUpper(ch))
            {
                return false;
            }
        }
        return true;
    }
}
}