/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

using System;
using System.Text;

namespace Lucene.Net.Analysis.Fr
{
    /// <summary>
    /// A stemmer for French words.
    /// <para>
    /// The algorithm is based on the work of Dr Martin Porter on his snowball project;
    /// refer to http://snowball.sourceforge.net/french/stemmer.html
    /// (French stemming algorithm) for details.
    /// </para>
    /// <para>
    /// The working buffers and search regions are kept in instance fields that
    /// <see cref="Stem"/> overwrites on every call, so one instance should not be
    /// used for several stemming operations at the same time.
    /// </para>
    /// </summary>
    public class FrenchStemmer
    {
        /// <summary>Buffer for the term while it is being stemmed.</summary>
        private StringBuilder sb = new StringBuilder();

        /// <summary>A temporary buffer, used to reconstruct R2.</summary>
        private StringBuilder tb = new StringBuilder();

        /// <summary>Region R0 is equal to the whole buffer.</summary>
        private String R0;

        /// <summary>
        /// Region RV.
        /// "If the word begins with two vowels, RV is the region after the third letter,
        /// otherwise the region after the first vowel not at the beginning of the word,
        /// or the end of the word if these positions cannot be found."
        /// </summary>
        private String RV;

        /// <summary>
        /// Region R1.
        /// "R1 is the region after the first non-vowel following a vowel
        /// or is the null region at the end of the word if there is no such non-vowel."
        /// </summary>
        private String R1;

        /// <summary>
        /// Region R2.
        /// "R2 is the region after the first non-vowel in R1 following a vowel
        /// or is the null region at the end of the word if there is no such non-vowel."
        /// </summary>
        private String R2;

        /// <summary>Set to true if step 2 needs to be performed.</summary>
        private bool suite;

        /// <summary>Set to true if the buffer was modified.</summary>
        private bool modified;

        /// <summary>
        /// Stems the given term to a unique discriminator.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// The discriminator for <paramref name="term"/>, or <paramref name="term"/> itself
        /// when <see cref="IsStemmable"/> rejects it.
        /// </returns>
        protected internal String Stem(String term)
        {
            if (!IsStemmable(term))
            {
                return term;
            }

            // Lowercase with the invariant culture: the suffix tables below are plain
            // lowercase ASCII/Latin-1, and a culture-sensitive ToLower() would e.g. map
            // 'I' to a dotless 'ı' under a Turkish current culture.
            term = term.ToLowerInvariant();

            // Reset the StringBuilder.
            sb.Length = 0;
            sb.Insert(0, term);

            // Reset the flags.
            modified = false;
            suite = false;

            sb = TreatVowels(sb);
            SetStrings();

            Step1();

            // Step 2 runs when step 1 changed nothing, or when it only handled one of
            // the amment / emment / ment(s) suffixes (which sets "suite").
            if (!modified || suite)
            {
                if (RV != null)
                {
                    suite = Step2A();
                    if (!suite)
                    {
                        Step2B();
                    }
                }
            }

            if (modified || suite)
            {
                Step3();
            }
            else
            {
                Step4();
            }

            Step5();
            Step6();

            return sb.ToString();
        }

        /// <summary>
        /// Recomputes the search regions R0, RV, R1 and R2 from the buffer.
        /// Needs to be done each time the buffer was modified.
        /// </summary>
        private void SetStrings()
        {
            R0 = sb.ToString();
            RV = RetrieveRV(sb);
            R1 = RetrieveR(sb);
            if (R1 != null)
            {
                // R2 is the "R zone" of R1, so run the same extraction on a copy of R1.
                tb.Length = 0;
                tb.Insert(0, R1);
                R2 = RetrieveR(tb);
            }
            else
            {
                R2 = null;
            }
        }

        /// <summary>
        /// First step of the Porter Algorithm.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step1()
        {
            String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
            DeleteFrom(R2, suffix);

            ReplaceFrom(R2, new String[] { "logies", "logie" }, "log");
            ReplaceFrom(R2, new String[] { "usions", "utions", "usion", "ution" }, "u");
            ReplaceFrom(R2, new String[] { "ences", "ence" }, "ent");

            String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
            DeleteButSuffixFromElseReplace(R2, search, "ic", true, R0, "iqU");

            DeleteButSuffixFromElseReplace(R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux");
            DeleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "ativ", false);
            DeleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "iv", false);
            DeleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "abl", false);
            DeleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "iqU", false);

            DeleteFromIfTestVowelBeforeIn(R1, new String[] { "issements", "issement" }, false, R0);
            DeleteFrom(RV, new String[] { "ements", "ement" });

            DeleteButSuffixFromElseReplace(R2, new[] { "it\u00e9s", "it\u00e9" }, "abil", false, R0, "abl");
            DeleteButSuffixFromElseReplace(R2, new[] { "it\u00e9s", "it\u00e9" }, "ic", false, R0, "iqU");
            DeleteButSuffixFrom(R2, new[] { "it\u00e9s", "it\u00e9" }, "iv", true);

            String[] autre = { "ifs", "ives", "if", "ive" };
            DeleteButSuffixFromElseReplace(R2, autre, "icat", false, R0, "iqU");
            DeleteButSuffixFromElseReplace(R2, autre, "at", true, R2, "iqU");

            ReplaceFrom(R0, new String[] { "eaux" }, "eau");

            ReplaceFrom(R1, new String[] { "aux" }, "al");

            DeleteButSuffixFromElseReplace(R2, new String[] { "euses", "euse" }, "", true, R1, "eux");

            DeleteFrom(R2, new String[] { "eux" });

            // If one of the next steps is performed, step 2A will have to be performed too.
            // Each call is made unconditionally; only the flag update depends on the result.
            if (ReplaceFrom(RV, new String[] { "amment" }, "ant"))
            {
                suite = true;
            }

            if (ReplaceFrom(RV, new String[] { "emment" }, "ent"))
            {
                suite = true;
            }

            if (DeleteFromIfTestVowelBeforeIn(RV, new String[] { "ments", "ment" }, true, RV))
            {
                suite = true;
            }
        }

        /// <summary>
        /// Second step (A) of the Porter Algorithm.
        /// Will be performed if nothing changed in the first step
        /// or if changes were done in the amment, emment, ments or ment suffixes.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        /// <returns>true if something changed in the buffer.</returns>
        private bool Step2A()
        {
            String[] search = { "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
                                "irent", "iriez", "irez", "irions", "irons", "iront",
                                "issaIent", "issais", "issantes", "issante", "issants", "issant",
                                "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
                                "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i" };
            return DeleteFromIfTestVowelBeforeIn(RV, search, false, RV);
        }

        /// <summary>
        /// Second step (B) of the Porter Algorithm.
        /// Will be performed if step 2A was performed unsuccessfully.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step2B()
        {
            String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
                                "erons", "eront", "erez", "\u00e8rent", "era", "\u00e9es", "iez",
                                "\u00e9e", "\u00e9s", "er", "ez", "\u00e9" };
            DeleteFrom(RV, suffix);

            String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
                                "antes", "aIent", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
                                "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a" };
            DeleteButSuffixFrom(RV, search, "e", true);

            DeleteFrom(R2, new String[] { "ions" });
        }

        /// <summary>
        /// Third step of the Porter Algorithm: replaces a final 'Y' with 'i' or a final 'ç' with 'c'.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step3()
        {
            if (sb.Length > 0)
            {
                char ch = sb[sb.Length - 1];
                if (ch == 'Y')
                {
                    sb[sb.Length - 1] = 'i';
                    SetStrings();
                }
                else if (ch == 'ç')
                {
                    sb[sb.Length - 1] = 'c';
                    SetStrings();
                }
            }
        }

        /// <summary>
        /// Fourth step of the Porter Algorithm (residual suffixes).
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step4()
        {
            if (sb.Length > 1)
            {
                char ch = sb[sb.Length - 1];
                if (ch == 's')
                {
                    // Drop a final 's' unless it follows one of a, i, o, u, è or s.
                    char b = sb[sb.Length - 2];
                    if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
                    {
                        sb.Length = sb.Length - 1;
                        SetStrings();
                    }
                }
            }

            // Delete "ion" in R2 when preceded by 's' in RV, otherwise when preceded by 't'.
            if (!DeleteFromIfPrecededIn(R2, new String[] { "ion" }, RV, "s"))
            {
                DeleteFromIfPrecededIn(R2, new String[] { "ion" }, RV, "t");
            }

            ReplaceFrom(RV, new String[] { "I\u00e8re", "i\u00e8re", "Ier", "ier" }, "i");
            DeleteFrom(RV, new String[] { "e" });
            DeleteFromIfPrecededIn(RV, new String[] { "\u00eb" }, R0, "gu");
        }

        /// <summary>
        /// Fifth step of the Porter Algorithm: removes the last letter when the word
        /// ends with enn, onn, ett, ell or eill.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step5()
        {
            if (R0 != null)
            {
                if (R0.EndsWith("enn", StringComparison.Ordinal)
                    || R0.EndsWith("onn", StringComparison.Ordinal)
                    || R0.EndsWith("ett", StringComparison.Ordinal)
                    || R0.EndsWith("ell", StringComparison.Ordinal)
                    || R0.EndsWith("eill", StringComparison.Ordinal))
                {
                    sb.Length = sb.Length - 1;
                    SetStrings();
                }
            }
        }

        /// <summary>
        /// Sixth (and last!) step of the Porter Algorithm: an 'é' or 'è' that is followed
        /// only by non-vowels up to the end of the word is replaced by 'e'.
        /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
        /// </summary>
        private void Step6()
        {
            if (R0 != null && R0.Length > 0)
            {
                bool seenVowel = false;
                bool seenConson = false;
                int pos = -1;

                // Scan backwards from the end of the word.
                for (int i = R0.Length - 1; i > -1; i--)
                {
                    char ch = R0[i];
                    if (IsVowel(ch))
                    {
                        if (!seenVowel)
                        {
                            if (ch == 'é' || ch == 'è')
                            {
                                pos = i;
                                break;
                            }
                        }
                        seenVowel = true;
                    }
                    else
                    {
                        if (seenVowel)
                        {
                            break;
                        }
                        seenConson = true;
                    }
                }

                if (pos > -1 && seenConson && !seenVowel)
                {
                    sb[pos] = 'e';
                }
            }
        }

        /// <summary>
        /// Deletes a suffix searched in zone "source" if zone "from" ends with prefix + search string.
        /// Unlike the other helpers, this one does not set the "modified" flag.
        /// </summary>
        /// <param name="source">The primary source zone for search.</param>
        /// <param name="search">The strings to search for suppression.</param>
        /// <param name="from">The secondary source zone for search.</param>
        /// <param name="prefix">The prefix to add to the search string to test.</param>
        /// <returns>true if the buffer was modified.</returns>
        private bool DeleteFromIfPrecededIn(String source, String[] search, String from, String prefix)
        {
            bool found = false;
            if (source != null)
            {
                for (int i = 0; i < search.Length; i++)
                {
                    // Ordinal: the suffix tables are exact character sequences.
                    if (source.EndsWith(search[i], StringComparison.Ordinal)
                        && from != null
                        && from.EndsWith(prefix + search[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - search[i].Length;
                        found = true;
                        SetStrings();
                        break;
                    }
                }
            }
            return found;
        }

        /// <summary>
        /// Deletes a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel.
        /// </summary>
        /// <param name="source">The primary source zone for search.</param>
        /// <param name="search">The strings to search for suppression.</param>
        /// <param name="vowel">true if a vowel is required before the search string, false if a non-vowel is required.</param>
        /// <param name="from">The secondary source zone for search (where the vowel could be).</param>
        /// <returns>true if the buffer was modified.</returns>
        private bool DeleteFromIfTestVowelBeforeIn(String source, String[] search, bool vowel, String from)
        {
            bool found = false;
            if (source != null && from != null)
            {
                for (int i = 0; i < search.Length; i++)
                {
                    // The length check guarantees the tested letter lies inside "from".
                    if (source.EndsWith(search[i], StringComparison.Ordinal)
                        && (search[i].Length + 1) <= from.Length)
                    {
                        bool test = IsVowel(sb[sb.Length - (search[i].Length + 1)]);
                        if (test == vowel)
                        {
                            sb.Length = sb.Length - search[i].Length;
                            modified = true;
                            found = true;
                            SetStrings();
                            break;
                        }
                    }
                }
            }
            return found;
        }

        /// <summary>
        /// Deletes a suffix searched in zone "source" together with the prefix when it is
        /// preceded by the prefix, or the suffix alone when "without" is set.
        /// </summary>
        /// <param name="source">The primary source zone for search.</param>
        /// <param name="search">The strings to search for suppression.</param>
        /// <param name="prefix">The prefix to add to the search string to test.</param>
        /// <param name="without">true if the suffix will be deleted even without the prefix found.</param>
        private void DeleteButSuffixFrom(String source, String[] search, String prefix, bool without)
        {
            if (source != null)
            {
                for (int i = 0; i < search.Length; i++)
                {
                    if (source.EndsWith(prefix + search[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - (prefix.Length + search[i].Length);
                        modified = true;
                        SetStrings();
                        break;
                    }
                    else if (without && source.EndsWith(search[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - search[i].Length;
                        modified = true;
                        SetStrings();
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Deletes a suffix searched in zone "source" if preceded by the prefix,
        /// or replaces it with the replace string if preceded by the prefix in the zone "from",
        /// or deletes the suffix alone if "without" is set.
        /// </summary>
        /// <param name="source">The primary source zone for search.</param>
        /// <param name="search">The strings to search for suppression.</param>
        /// <param name="prefix">The prefix to add to the search string to test.</param>
        /// <param name="without">true if the suffix will be deleted even without the prefix found.</param>
        /// <param name="from">The secondary source zone for search.</param>
        /// <param name="replace">The text appended after prefix + suffix were removed via the "from" zone.</param>
        private void DeleteButSuffixFromElseReplace(String source, String[] search, String prefix, bool without, String from, String replace)
        {
            if (source != null)
            {
                for (int i = 0; i < search.Length; i++)
                {
                    if (source.EndsWith(prefix + search[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - (prefix.Length + search[i].Length);
                        modified = true;
                        SetStrings();
                        break;
                    }
                    else if (from != null && from.EndsWith(prefix + search[i], StringComparison.Ordinal))
                    {
                        // Truncate, then append: equivalent of a replace at the end of the buffer.
                        sb.Length = sb.Length - (prefix.Length + search[i].Length);
                        sb.Append(replace);
                        modified = true;
                        SetStrings();
                        break;
                    }
                    else if (without && source.EndsWith(search[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - search[i].Length;
                        modified = true;
                        SetStrings();
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Replaces a search string with another within the source zone.
        /// </summary>
        /// <param name="source">The source zone for search.</param>
        /// <param name="search">The strings to search for replacement.</param>
        /// <param name="replace">The replacement string.</param>
        /// <returns>true if the buffer was modified.</returns>
        private bool ReplaceFrom(String source, String[] search, String replace)
        {
            bool found = false;
            if (source != null)
            {
                for (int i = 0; i < search.Length; i++)
                {
                    if (source.EndsWith(search[i], StringComparison.Ordinal))
                    {
                        // Truncate, then append: equivalent of a replace at the end of the buffer.
                        sb.Length = sb.Length - search[i].Length;
                        sb.Append(replace);
                        modified = true;
                        found = true;
                        SetStrings();
                        break;
                    }
                }
            }
            return found;
        }

        /// <summary>
        /// Deletes a search string within the source zone.
        /// </summary>
        /// <param name="source">The source zone for search.</param>
        /// <param name="suffix">The strings to search for suppression.</param>
        private void DeleteFrom(String source, String[] suffix)
        {
            if (source != null)
            {
                for (int i = 0; i < suffix.Length; i++)
                {
                    if (source.EndsWith(suffix[i], StringComparison.Ordinal))
                    {
                        sb.Length = sb.Length - suffix[i].Length;
                        modified = true;
                        SetStrings();
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Tests if a char is a French vowel, including accented ones.
        /// Only lowercase letters count; the uppercase markers written by
        /// <see cref="TreatVowels"/> are therefore treated as non-vowels.
        /// </summary>
        /// <param name="ch">The char to test.</param>
        /// <returns>true if the char is a vowel.</returns>
        private bool IsVowel(char ch)
        {
            switch (ch)
            {
                case 'a':
                case 'e':
                case 'i':
                case 'o':
                case 'u':
                case 'y':
                case 'â':
                case 'à':
                case 'ë':
                case 'é':
                case 'ê':
                case 'è':
                case 'ï':
                case 'î':
                case 'ô':
                case 'ü':
                case 'ù':
                case 'û':
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>
        /// Retrieves the "R zone" (1 or 2 depending on the buffer) and returns the corresponding string.
        /// "R is the region after the first non-vowel following a vowel
        /// or is the null region at the end of the word if there is no such non-vowel."
        /// </summary>
        /// <param name="buffer">The input buffer.</param>
        /// <returns>The resulting string, or null when the region is empty or cannot be found.</returns>
        private String RetrieveR(StringBuilder buffer)
        {
            int len = buffer.Length;

            // Position of the first vowel.
            int pos = -1;
            for (int c = 0; c < len; c++)
            {
                if (IsVowel(buffer[c]))
                {
                    pos = c;
                    break;
                }
            }

            if (pos > -1)
            {
                // Position of the first non-vowel after that vowel.
                int consonne = -1;
                for (int c = pos; c < len; c++)
                {
                    if (!IsVowel(buffer[c]))
                    {
                        consonne = c;
                        break;
                    }
                }

                if (consonne > -1 && (consonne + 1) < len)
                {
                    return buffer.ToString(consonne + 1, len - (consonne + 1));
                }
                return null;
            }
            return null;
        }

        /// <summary>
        /// Retrieves the "RV zone" from a buffer and returns the corresponding string.
        /// "If the word begins with two vowels, RV is the region after the third letter,
        /// otherwise the region after the first vowel not at the beginning of the word,
        /// or the end of the word if these positions cannot be found."
        /// </summary>
        /// <param name="buffer">The input buffer.</param>
        /// <returns>The resulting string, or null for buffers of three chars or fewer or when the region is empty.</returns>
        private String RetrieveRV(StringBuilder buffer)
        {
            int len = buffer.Length;
            if (buffer.Length > 3)
            {
                if (IsVowel(buffer[0]) && IsVowel(buffer[1]))
                {
                    return buffer.ToString(3, len - 3);
                }

                // First vowel that is not the first letter; pos stays 0 when there is none.
                int pos = 0;
                for (int c = 1; c < len; c++)
                {
                    if (IsVowel(buffer[c]))
                    {
                        pos = c;
                        break;
                    }
                }

                if (pos + 1 < len)
                {
                    return buffer.ToString(pos + 1, len - (pos + 1));
                }
                return null;
            }
            return null;
        }

        /// <summary>
        /// Turns u and i preceded AND followed by a vowel to upper case.
        /// Turns y preceded OR followed by a vowel to upper case.
        /// Turns u preceded by q to upper case.
        /// </summary>
        /// <param name="buffer">The buffer to treat (modified in place).</param>
        /// <returns>The treated buffer (the same instance as <paramref name="buffer"/>).</returns>
        private StringBuilder TreatVowels(StringBuilder buffer)
        {
            for (int c = 0; c < buffer.Length; c++)
            {
                char ch = buffer[c];

                if (c == 0) // first char
                {
                    if (buffer.Length > 1)
                    {
                        if (ch == 'y' && IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'Y';
                        }
                    }
                }
                else if (c == buffer.Length - 1) // last char
                {
                    if (ch == 'u' && buffer[c - 1] == 'q')
                    {
                        buffer[c] = 'U';
                    }
                    if (ch == 'y' && IsVowel(buffer[c - 1]))
                    {
                        buffer[c] = 'Y';
                    }
                }
                else // other cases
                {
                    if (ch == 'u')
                    {
                        if (buffer[c - 1] == 'q')
                        {
                            buffer[c] = 'U';
                        }
                        else if (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'U';
                        }
                    }
                    if (ch == 'i')
                    {
                        if (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'I';
                        }
                    }
                    if (ch == 'y')
                    {
                        if (IsVowel(buffer[c - 1]) || IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'Y';
                        }
                    }
                }
            }

            return buffer;
        }

        /// <summary>
        /// Checks a term if it can be processed correctly.
        /// </summary>
        /// <param name="term">The term to check.</param>
        /// <returns>
        /// true if, and only if, the given term consists only of letters and contains
        /// at most one uppercase letter, which must be the first char.
        /// </returns>
        private bool IsStemmable(String term)
        {
            bool upper = false;
            int first = -1;
            for (int c = 0; c < term.Length; c++)
            {
                // Discard terms that contain non-letter chars.
                if (!char.IsLetter(term[c]))
                {
                    return false;
                }

                // Discard terms that contain multiple uppercase letters.
                if (char.IsUpper(term[c]))
                {
                    if (upper)
                    {
                        return false;
                    }

                    // First encountered uppercase letter: set flag and save position.
                    first = c;
                    upper = true;
                }
            }

            // Discard the term if it contains a single uppercase letter that
            // is not starting the term.
            if (first > 0)
            {
                return false;
            }
            return true;
        }
    }
}