You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by eh...@apache.org on 2004/01/20 11:07:01 UTC
cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr FrenchAnalyzer.java FrenchStemFilter.java FrenchStemmer.java
ehatcher 2004/01/20 02:07:01
Added: contributions/analyzers/src/java/org/apache/lucene/analysis/fr
FrenchAnalyzer.java FrenchStemFilter.java
FrenchStemmer.java
Log:
#26268 - Add FrenchAnalyzer
Revision Changes Path
1.1 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
Index: FrenchAnalyzer.java
===================================================================
package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
import org.apache.lucene.analysis.de.WordlistLoader;
/**
 * Analyzer for the French language. Supports an external list of stop words
 * (words that will not be indexed at all) and an external list of exclusions
 * (words that will be indexed, but not stemmed).
 * A default set of stop words is used unless another list is specified; the
 * exclusion list is empty by default.
 *
 * @author Patrick Talbot (based on Gerhard Schwarz work for German)
 * @version $Id: FrenchAnalyzer.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
 */
public final class FrenchAnalyzer extends Analyzer {

    /**
     * Extended list of typical French stop words.
     * Accented letters are spelled as Unicode escapes so that the table does
     * not depend on the character encoding used to store or compile this file.
     */
    private static final String[] FRENCH_STOP_WORDS = {
        "a", "afin", "ai", "ainsi", "apr\u00E8s", "attendu", "au", "aujourd", "auquel", "aussi",
        "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
        "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
        "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
        "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
        "dedans", "dehors", "del\u00E0", "depuis", "derri\u00E8re", "des", "d\u00E9sormais", "desquelles",
        "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
        "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "d\u00E8s", "elle", "elles",
        "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "except\u00E9", "hormis",
        "hors", "h\u00E9las", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
        "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "l\u00E0",
        "ma", "mais", "malgr\u00E9", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
        "moins", "mon", "moyennant", "m\u00EAme", "m\u00EAmes", "n", "ne", "ni", "non", "nos", "notre",
        "nous", "n\u00E9anmoins", "n\u00F4tre", "n\u00F4tres", "on", "ont", "ou", "outre", "o\u00F9", "par", "parmi",
        "partant", "pas", "pass\u00E9", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
        "proche", "pr\u00E8s", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
        "qui", "quoi", "quoique", "revoici", "revoil\u00E0", "s", "sa", "sans", "sauf", "se", "selon",
        "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
        "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
        "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
        "voici", "voil\u00E0", "vos", "votre", "vous", "vu", "v\u00F4tre", "v\u00F4tres", "y", "\u00E0", "\u00E7a", "\u00E8s",
        "\u00E9t\u00E9", "\u00EAtre", "\u00F4"
    };

    /**
     * Contains the stop words used with the StopFilter.
     * Always assigned by one of the constructors.
     */
    private Hashtable stoptable;

    /**
     * Contains words that should be indexed but not stemmed.
     * Empty until one of the setStemExclusionTable methods is called.
     */
    private Hashtable excltable = new Hashtable();

    /**
     * Builds an analyzer that uses the default French stop words.
     */
    public FrenchAnalyzer() {
        stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords the stop words, converted with StopFilter.makeStopTable
     */
    public FrenchAnalyzer( String[] stopwords ) {
        stoptable = StopFilter.makeStopTable( stopwords );
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords the stop word table; it is used as is (not copied)
     */
    public FrenchAnalyzer( Hashtable stopwords ) {
        stoptable = stopwords;
    }

    /**
     * Builds an analyzer with the stop words read from the given file.
     *
     * @param stopwords the file handed to WordlistLoader.getWordtable
     */
    public FrenchAnalyzer( File stopwords ) {
        stoptable = WordlistLoader.getWordtable( stopwords );
    }

    /**
     * Builds an exclusion list from an array of Strings.
     *
     * @param exclusionlist words that must not be stemmed
     */
    public void setStemExclusionTable( String[] exclusionlist ) {
        excltable = StopFilter.makeStopTable( exclusionlist );
    }

    /**
     * Uses the given Hashtable as exclusion list.
     *
     * @param exclusionlist table of words that must not be stemmed; it is
     *                      used as is (not copied)
     */
    public void setStemExclusionTable( Hashtable exclusionlist ) {
        excltable = exclusionlist;
    }

    /**
     * Builds an exclusion list from the words contained in the given file.
     *
     * @param exclusionlist the file handed to WordlistLoader.getWordtable
     */
    public void setStemExclusionTable( File exclusionlist ) {
        excltable = WordlistLoader.getWordtable( exclusionlist );
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    the text to tokenize
     * @return A TokenStream built from a StandardTokenizer filtered with
     *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter,
     *         in that order
     */
    public final TokenStream tokenStream( String fieldName, Reader reader ) {
        TokenStream result = new StandardTokenizer( reader );
        result = new StandardFilter( result );
        // NOTE(review): stop words are filtered before the text is lower-cased;
        // presumably capitalised stop words therefore pass through - confirm
        // against StopFilter before changing the order, as it affects indexes.
        result = new StopFilter( result, stoptable );
        result = new FrenchStemFilter( result, excltable );
        // Convert to lowercase after stemming!
        result = new LowerCaseFilter( result );
        return result;
    }
}
1.1 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
Index: FrenchStemFilter.java
===================================================================
package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
/**
 * A filter that stems French words. It supports a table of words that should
 * not be stemmed at all. The stemmer in use can be changed at runtime after the
 * filter object is created (as long as it is a FrenchStemmer).
 *
 * @author Patrick Talbot (based on Gerhard Schwarz work for German)
 * @version $Id: FrenchStemFilter.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
 */
public final class FrenchStemFilter extends TokenFilter {

    /**
     * The current token read from the input stream.
     */
    private Token token = null;

    /**
     * The stemmer applied to every token that is not excluded.
     */
    private FrenchStemmer stemmer = null;

    /**
     * Words that must be passed through unstemmed; may be null.
     */
    private Hashtable exclusions = null;

    /**
     * Builds a FrenchStemFilter without an exclusion table.
     *
     * @param in the token stream to filter
     */
    public FrenchStemFilter( TokenStream in ) {
        super( in );
        stemmer = new FrenchStemmer();
    }

    /**
     * Builds a FrenchStemFilter that uses an exclusion table.
     *
     * @param in             the token stream to filter
     * @param exclusiontable words that must not be stemmed; may be null
     */
    public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
        this( in );
        exclusions = exclusiontable;
    }

    /**
     * Reads the next token from the input and stems it unless it is excluded.
     *
     * @return the next token in the stream, or null at EOS
     * @throws IOException if the underlying stream fails
     */
    public final Token next()
        throws IOException {
        token = input.next();
        if ( token == null ) {
            return null;
        }
        String text = token.termText();
        // Check the exclusion table.
        // NOTE(review): Hashtable.contains looks the word up among the table's
        // VALUES, not its keys; kept as is because the layout of the tables
        // handed in by callers is not visible here.
        if ( exclusions != null && exclusions.contains( text ) ) {
            return token;
        }
        String s = stemmer.stem( text );
        // If not stemmed, don't waste the time creating a new token.
        if ( s.equals( text ) ) {
            return token;
        }
        // Keep the offsets of the original token: they describe where the word
        // sits in the source text and do not change because the term shrank.
        return new Token( s, token.startOffset(), token.endOffset(), token.type() );
    }

    /**
     * Sets an alternative/custom FrenchStemmer for this filter.
     *
     * @param stemmer the stemmer to use; a null argument is ignored
     */
    public void setStemmer( FrenchStemmer stemmer ) {
        if ( stemmer != null ) {
            this.stemmer = stemmer;
        }
    }

    /**
     * Sets an alternative exclusion list for this filter.
     *
     * @param exclusiontable words that must not be stemmed; may be null
     */
    public void setExclusionTable( Hashtable exclusiontable ) {
        exclusions = exclusiontable;
    }
}
1.1 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
Index: FrenchStemmer.java
===================================================================
package org.apache.lucene.analysis.fr;
/**
 * A stemmer for French words. The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
 * <p>
 * All accented letters in this class are written as Unicode escapes so that
 * the suffix tables do not depend on the encoding used to store or compile
 * the source file.
 * <p>
 * An instance keeps its working buffers in fields, so a single instance must
 * not be used by several threads at the same time.
 *
 * @author Patrick Talbot
 * @version $Id: FrenchStemmer.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
 */
public class FrenchStemmer {

    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * A temporary buffer, used to reconstruct R2
     */
    private StringBuffer tb = new StringBuffer();

    /**
     * Region R0 is equal to the whole buffer
     */
    private String R0;

    /**
     * Region RV
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."
     */
    private String RV;

    /**
     * Region R1
     * "R1 is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R1;

    /**
     * Region R2
     * "R2 is the region after the first non-vowel in R1 following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R2;

    /**
     * Set to true if we need to perform step 2
     */
    private boolean suite;

    /**
     * Set to true if the buffer was modified
     */
    private boolean modified;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     * Terms rejected by {@link #isStemmable(String)} are returned unchanged;
     * every other term is lower-cased before it is stemmed.
     *
     * @param term java.lang.String The term that should be stemmed
     * @return java.lang.String Discriminator for <tt>term</tt>
     */
    protected String stem( String term ) {
        if ( !isStemmable( term ) ) {
            return term;
        }

        // Use lowercase for medium stemming. An explicit locale keeps the
        // result independent of the JVM default (e.g. Turkish dotless i).
        term = term.toLowerCase( java.util.Locale.FRENCH );

        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );

        // reset the booleans
        modified = false;
        suite = false;

        sb = treatVowels( sb );

        setStrings();

        step1();

        if ( !modified || suite ) {
            if ( RV != null ) {
                suite = step2a();
                if ( !suite ) {
                    step2b();
                }
            }
        }

        if ( modified || suite ) {
            step3();
        } else {
            step4();
        }

        step5();

        step6();

        // The upper-case I, U and Y are only internal consonant markers.
        restoreMarkers();

        return sb.toString();
    }

    /**
     * Turns the remaining marker letters I, U and Y (set by treatVowels)
     * back into lower case, so that the markers never leak into the stem.
     */
    private void restoreMarkers() {
        for ( int i = 0; i < sb.length(); i++ ) {
            switch ( sb.charAt( i ) ) {
                case 'I':
                    sb.setCharAt( i, 'i' );
                    break;
                case 'U':
                    sb.setCharAt( i, 'u' );
                    break;
                case 'Y':
                    sb.setCharAt( i, 'y' );
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Sets the search region Strings<br>
     * it needs to be done each time the buffer was modified
     */
    private void setStrings() {
        // set the strings
        R0 = sb.toString();
        RV = retrieveRV( sb );
        R1 = retrieveR( sb );
        if ( R1 != null ) {
            // R2 is obtained by applying the R rule once more, on R1 itself
            tb.delete( 0, tb.length() );
            tb.insert( 0, R1 );
            R2 = retrieveR( tb );
        } else {
            R2 = null;
        }
    }

    /**
     * First step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step1() {
        String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
        deleteFrom( R2, suffix );

        replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
        replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
        replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );

        String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
        deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );

        deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
        deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
        deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
        deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
        deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );

        deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
        deleteFrom( RV, new String[] { "ements", "ement" } );

        // "it" + e-acute + "s", "it" + e-acute
        deleteButSuffixFromElseReplace( R2, new String[] { "it\u00E9s", "it\u00E9" }, "abil", false, R0, "abl" );
        deleteButSuffixFromElseReplace( R2, new String[] { "it\u00E9s", "it\u00E9" }, "ic", false, R0, "iqU" );
        deleteButSuffixFrom( R2, new String[] { "it\u00E9s", "it\u00E9" }, "iv", true );

        String[] autre = { "ifs", "ives", "if", "ive" };
        deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
        deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );

        replaceFrom( R0, new String[] { "eaux" }, "eau" );

        replaceFrom( R1, new String[] { "aux" }, "al" );

        deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );

        deleteFrom( R2, new String[] { "eux" } );

        // if one of the next steps is performed, we will need to perform step2a
        if ( replaceFrom( RV, new String[] { "amment" }, "ant" ) ) {
            suite = true;
        }
        if ( replaceFrom( RV, new String[] { "emment" }, "ent" ) ) {
            suite = true;
        }
        if ( deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ) ) {
            suite = true;
        }
    }

    /**
     * Second step (A) of the Porter Algorithm<br>
     * Will be performed if nothing changed from the first step
     * or changes were done in the amment, emment, ments or ment suffixes<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * @return boolean - true if something changed in the StringBuffer
     */
    private boolean step2a() {
        // The first two entries and "..t" near the end start with i-circumflex.
        String[] search = { "\u00EEmes", "\u00EEtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
                            "irent", "iriez", "irez", "irions", "irons", "iront",
                            "issaIent", "issais", "issantes", "issante", "issants", "issant",
                            "issait", "issions", "issons", "issiez", "issez", "issent",
                            "isses", "isse", "ir", "is", "\u00EEt", "it", "ies", "ie", "i" };
        return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
    }

    /**
     * Second step (B) of the Porter Algorithm<br>
     * Will be performed if step 2 A was performed unsuccessfully<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step2b() {
        // e-grave + "rent"; e-acute + "es" / "e" / "s"; e-acute alone
        String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
                            "erons", "eront", "erez", "\u00E8rent", "era", "\u00E9es", "iez",
                            "\u00E9e", "\u00E9s", "er", "ez", "\u00E9" };
        deleteFrom( RV, suffix );

        // a-circumflex + "mes" / "tes" / "t"; "a" or "A" + i-circumflex + "t"
        String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
                            "antes", "Aient", "ante", "\u00E2mes", "\u00E2tes", "ants", "ant",
                            "ait", "a\u00EEt", "ais", "Ait", "A\u00EEt", "Ais", "\u00E2t", "as", "ai", "Ai", "a" };
        deleteButSuffixFrom( RV, search, "e", true );

        deleteFrom( R2, new String[] { "ions" } );
    }

    /**
     * Third step of the Porter Algorithm<br>
     * Replaces a final Y by i and a final c-cedilla by c.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step3() {
        if ( sb.length() > 0 ) {
            char ch = sb.charAt( sb.length() - 1 );
            if ( ch == 'Y' ) {
                sb.setCharAt( sb.length() - 1, 'i' );
                setStrings();
            } else if ( ch == '\u00E7' ) { // c-cedilla
                sb.setCharAt( sb.length() - 1, 'c' );
                setStrings();
            }
        }
    }

    /**
     * Fourth step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step4() {
        if ( sb.length() > 1 ) {
            char ch = sb.charAt( sb.length() - 1 );
            if ( ch == 's' ) {
                char b = sb.charAt( sb.length() - 2 );
                // drop a final s unless preceded by a, i, o, u, e-grave or s
                if ( b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != '\u00E8' && b != 's' ) {
                    sb.delete( sb.length() - 1, sb.length() );
                    setStrings();
                }
            }
        }
        boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
        if ( !found ) {
            found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
        }

        // "I" / "i" + e-grave + "re", "Ier", "ier"
        replaceFrom( RV, new String[] { "I\u00E8re", "i\u00E8re", "Ier", "ier" }, "i" );
        deleteFrom( RV, new String[] { "e" } );
        // e-diaeresis preceded by "gu"
        deleteFromIfPrecededIn( RV, new String[] { "\u00EB" }, R0, "gu" );
    }

    /**
     * Fifth step of the Porter Algorithm<br>
     * Removes the last letter of a word ending in enn, onn, ett, ell or eill.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step5() {
        if ( R0 != null ) {
            if ( R0.endsWith( "enn" ) || R0.endsWith( "onn" ) || R0.endsWith( "ett" )
                    || R0.endsWith( "ell" ) || R0.endsWith( "eill" ) ) {
                sb.delete( sb.length() - 1, sb.length() );
                setStrings();
            }
        }
    }

    /**
     * Sixth (and last!) step of the Porter Algorithm<br>
     * If the word ends with e-acute or e-grave followed by at least one
     * non-vowel, the accent is removed from that e.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step6() {
        if ( R0 != null && R0.length() > 0 ) {
            boolean seenVowel = false;
            boolean seenConson = false;
            int pos = -1;
            // scan backwards: trailing non-vowels first, then the last vowel
            for ( int i = R0.length() - 1; i > -1; i-- ) {
                char ch = R0.charAt( i );
                if ( isVowel( ch ) ) {
                    if ( !seenVowel ) {
                        if ( ch == '\u00E9' || ch == '\u00E8' ) { // e-acute, e-grave
                            pos = i;
                            break;
                        }
                    }
                    seenVowel = true;
                } else {
                    if ( seenVowel ) {
                        break;
                    } else {
                        seenConson = true;
                    }
                }
            }
            if ( pos > -1 && seenConson && !seenVowel ) {
                sb.setCharAt( pos, 'e' );
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param from java.lang.String - the secondary source zone for search
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @return boolean - true if modified
     */
    private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
        boolean found = false;
        if ( source != null ) {
            for ( int i = 0; i < search.length; i++ ) {
                if ( source.endsWith( search[i] ) ) {
                    if ( from != null && from.endsWith( prefix + search[i] ) ) {
                        sb.delete( sb.length() - search[i].length(), sb.length() );
                        found = true;
                        setStrings();
                        break;
                    }
                }
            }
        }
        return found;
    }

    /**
     * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param vowel boolean - true if we need a vowel before the search string
     * @param from java.lang.String - the secondary source zone for search (where vowel could be)
     * @return boolean - true if modified
     */
    private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
        boolean found = false;
        if ( source != null && from != null ) {
            for ( int i = 0; i < search.length; i++ ) {
                if ( source.endsWith( search[i] ) ) {
                    // the letter before the suffix must lie inside "from"
                    if ( ( search[i].length() + 1 ) <= from.length() ) {
                        boolean test = isVowel( sb.charAt( sb.length() - ( search[i].length() + 1 ) ) );
                        if ( test == vowel ) {
                            sb.delete( sb.length() - search[i].length(), sb.length() );
                            modified = true;
                            found = true;
                            setStrings();
                            break;
                        }
                    }
                }
            }
        }
        return found;
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by the prefix
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     */
    private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
        if ( source != null ) {
            for ( int i = 0; i < search.length; i++ ) {
                if ( source.endsWith( prefix + search[i] ) ) {
                    sb.delete( sb.length() - ( prefix.length() + search[i].length() ), sb.length() );
                    modified = true;
                    setStrings();
                    break;
                } else if ( without && source.endsWith( search[i] ) ) {
                    sb.delete( sb.length() - search[i].length(), sb.length() );
                    modified = true;
                    setStrings();
                    break;
                }
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by prefix<br>
     * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
     * or delete the suffix if specified
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     * @param from java.lang.String - the secondary source zone for search
     * @param replace java.lang.String - the text put in place of prefix + suffix
     *                when the match is only found in "from"
     */
    private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
        if ( source != null ) {
            for ( int i = 0; i < search.length; i++ ) {
                if ( source.endsWith( prefix + search[i] ) ) {
                    sb.delete( sb.length() - ( prefix.length() + search[i].length() ), sb.length() );
                    modified = true;
                    setStrings();
                    break;
                } else if ( from != null && from.endsWith( prefix + search[i] ) ) {
                    sb.replace( sb.length() - ( prefix.length() + search[i].length() ), sb.length(), replace );
                    modified = true;
                    setStrings();
                    break;
                } else if ( without && source.endsWith( search[i] ) ) {
                    sb.delete( sb.length() - search[i].length(), sb.length() );
                    modified = true;
                    setStrings();
                    break;
                }
            }
        }
    }

    /**
     * Replace a search string with another within the source zone
     *
     * @param source java.lang.String - the source zone for search
     * @param search java.lang.String[] - the strings to search for replacement
     * @param replace java.lang.String - the replacement string
     * @return boolean - true if a replacement was made
     */
    private boolean replaceFrom( String source, String[] search, String replace ) {
        boolean found = false;
        if ( source != null ) {
            for ( int i = 0; i < search.length; i++ ) {
                if ( source.endsWith( search[i] ) ) {
                    sb.replace( sb.length() - search[i].length(), sb.length(), replace );
                    modified = true;
                    found = true;
                    setStrings();
                    break;
                }
            }
        }
        return found;
    }

    /**
     * Delete a search string within the source zone
     *
     * @param source the source zone for search
     * @param suffix the strings to search for suppression
     */
    private void deleteFrom( String source, String[] suffix ) {
        if ( source != null ) {
            for ( int i = 0; i < suffix.length; i++ ) {
                if ( source.endsWith( suffix[i] ) ) {
                    sb.delete( sb.length() - suffix[i].length(), sb.length() );
                    modified = true;
                    setStrings();
                    break;
                }
            }
        }
    }

    /**
     * Test if a char is a french vowel, including accentuated ones.
     * The upper-case markers I, U and Y are deliberately not vowels.
     *
     * @param ch the char to test
     * @return boolean - true if the char is a vowel
     */
    private boolean isVowel( char ch ) {
        switch ( ch ) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00E2': // a-circumflex
            case '\u00E0': // a-grave
            case '\u00EB': // e-diaeresis
            case '\u00E9': // e-acute
            case '\u00EA': // e-circumflex
            case '\u00E8': // e-grave
            case '\u00EF': // i-diaeresis
            case '\u00EE': // i-circumflex
            case '\u00F4': // o-circumflex
            case '\u00FC': // u-diaeresis
            case '\u00F9': // u-grave
            case '\u00FB': // u-circumflex
                return true;
            default:
                return false;
        }
    }

    /**
     * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
     * "R is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"<br>
     *
     * @param buffer java.lang.StringBuffer - the in buffer
     * @return java.lang.String - the resulting string, or null for an empty region
     */
    private String retrieveR( StringBuffer buffer ) {
        int len = buffer.length();
        int pos = -1;
        for ( int c = 0; c < len; c++ ) {
            if ( isVowel( buffer.charAt( c ) ) ) {
                pos = c;
                break;
            }
        }
        if ( pos > -1 ) {
            int consonne = -1;
            for ( int c = pos; c < len; c++ ) {
                if ( !isVowel( buffer.charAt( c ) ) ) {
                    consonne = c;
                    break;
                }
            }
            if ( consonne > -1 && ( consonne + 1 ) < len ) {
                return buffer.substring( consonne + 1, len );
            } else {
                return null;
            }
        } else {
            return null;
        }
    }

    /**
     * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."<br>
     * Buffers of three letters or fewer have no RV zone.
     *
     * @param buffer java.lang.StringBuffer - the in buffer
     * @return java.lang.String - the resulting string, or null for an empty region
     */
    private String retrieveRV( StringBuffer buffer ) {
        int len = buffer.length();
        if ( len > 3 ) {
            if ( isVowel( buffer.charAt( 0 ) ) && isVowel( buffer.charAt( 1 ) ) ) {
                return buffer.substring( 3, len );
            } else {
                // -1 means "no vowel after the first letter": RV is then the
                // empty region at the end of the word, not everything after
                // the first letter.
                int pos = -1;
                for ( int c = 1; c < len; c++ ) {
                    if ( isVowel( buffer.charAt( c ) ) ) {
                        pos = c;
                        break;
                    }
                }
                if ( pos > -1 && pos + 1 < len ) {
                    return buffer.substring( pos + 1, len );
                } else {
                    return null;
                }
            }
        } else {
            return null;
        }
    }

    /**
     * Turns u and i preceded AND followed by a vowel to UpperCase<br>
     * Turns y preceded OR followed by a vowel to UpperCase<br>
     * Turns u preceded by q to UpperCase<br>
     * The upper-case letters mark vowels that must be treated as consonants.
     *
     * @param buffer java.lang.StringBuffer - the buffer to treat
     * @return java.lang.StringBuffer - the treated buffer (the same object)
     */
    private StringBuffer treatVowels( StringBuffer buffer ) {
        for ( int c = 0; c < buffer.length(); c++ ) {
            char ch = buffer.charAt( c );

            if ( c == 0 ) { // first char
                if ( buffer.length() > 1 ) {
                    if ( ch == 'y' && isVowel( buffer.charAt( c + 1 ) ) ) {
                        buffer.setCharAt( c, 'Y' );
                    }
                }
            } else if ( c == buffer.length() - 1 ) { // last char
                if ( ch == 'u' && buffer.charAt( c - 1 ) == 'q' ) {
                    buffer.setCharAt( c, 'U' );
                }
                if ( ch == 'y' && isVowel( buffer.charAt( c - 1 ) ) ) {
                    buffer.setCharAt( c, 'Y' );
                }
            } else { // other cases
                if ( ch == 'u' ) {
                    if ( buffer.charAt( c - 1 ) == 'q' ) {
                        buffer.setCharAt( c, 'U' );
                    } else if ( isVowel( buffer.charAt( c - 1 ) ) && isVowel( buffer.charAt( c + 1 ) ) ) {
                        buffer.setCharAt( c, 'U' );
                    }
                }
                if ( ch == 'i' ) {
                    if ( isVowel( buffer.charAt( c - 1 ) ) && isVowel( buffer.charAt( c + 1 ) ) ) {
                        buffer.setCharAt( c, 'I' );
                    }
                }
                if ( ch == 'y' ) {
                    if ( isVowel( buffer.charAt( c - 1 ) ) || isVowel( buffer.charAt( c + 1 ) ) ) {
                        buffer.setCharAt( c, 'Y' );
                    }
                }
            }
        }

        return buffer;
    }

    /**
     * Checks a term if it can be processed correctly.
     *
     * @param term the term to check
     * @return boolean - true if, and only if, the given term consists only of
     *         letters and has no upper-case letter other than possibly its first one.
     */
    private boolean isStemmable( String term ) {
        boolean upper = false;
        int first = -1;
        for ( int c = 0; c < term.length(); c++ ) {
            // Discard terms that contain non-letter characters.
            if ( !Character.isLetter( term.charAt( c ) ) ) {
                return false;
            }
            // Discard terms that contain multiple uppercase letters.
            if ( Character.isUpperCase( term.charAt( c ) ) ) {
                if ( upper ) {
                    return false;
                }
                // First encountered uppercase letter, set flag and save
                // position.
                else {
                    first = c;
                    upper = true;
                }
            }
        }
        // Discard the term if it contains a single uppercase letter that
        // is not starting the term.
        if ( first > 0 ) {
            return false;
        }
        return true;
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org