You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2004/03/30 17:54:49 UTC
cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanStemFilter.java GermanStemmer.java WordlistLoader.java
otis 2004/03/30 07:54:49
Modified: src/java/org/apache/lucene/analysis/de GermanStemFilter.java
GermanStemmer.java WordlistLoader.java
Log:
- Fixed mixed-up indentation
Revision Changes Path
1.8 +31 -32 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
Index: GermanStemFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- GermanStemFilter.java 29 Mar 2004 22:48:01 -0000 1.7
+++ GermanStemFilter.java 30 Mar 2004 15:54:48 -0000 1.8
@@ -40,22 +40,21 @@
private Token token = null;
private GermanStemmer stemmer = null;
private Set exclusionSet = null;
-
+
public GermanStemFilter( TokenStream in )
{
- super(in);
- stemmer = new GermanStemmer();
+ super(in);
+ stemmer = new GermanStemmer();
}
-
+
/**
* Builds a GermanStemFilter that uses an exclusiontable.
* @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
*/
public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
{
- this( in );
- exclusionSet = new HashSet(exclusiontable.keySet());
-
+ this( in );
+ exclusionSet = new HashSet(exclusiontable.keySet());
}
/**
@@ -63,32 +62,32 @@
*/
public GermanStemFilter( TokenStream in, Set exclusionSet )
{
- this( in );
- this.exclusionSet = exclusionSet;
+ this( in );
+ this.exclusionSet = exclusionSet;
}
/**
* @return Returns the next token in the stream, or null at EOS
*/
public final Token next()
- throws IOException
+ throws IOException
{
- if ( ( token = input.next() ) == null ) {
- return null;
- }
- // Check the exclusiontable
- else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
- return token;
- }
- else {
- String s = stemmer.stem( token.termText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.equals( token.termText() ) ) {
- return new Token( s, token.startOffset(),
- token.endOffset(), token.type() );
- }
- return token;
- }
+ if ( ( token = input.next() ) == null ) {
+ return null;
+ }
+ // Check the exclusiontable
+ else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
+ return token;
+ }
+ else {
+ String s = stemmer.stem( token.termText() );
+ // If not stemmed, dont waste the time creating a new token
+ if ( !s.equals( token.termText() ) ) {
+ return new Token( s, token.startOffset(),
+ token.endOffset(), token.type() );
+ }
+ return token;
+ }
}
/**
@@ -96,9 +95,9 @@
*/
public void setStemmer( GermanStemmer stemmer )
{
- if ( stemmer != null ) {
- this.stemmer = stemmer;
- }
+ if ( stemmer != null ) {
+ this.stemmer = stemmer;
+ }
}
/**
@@ -107,7 +106,7 @@
*/
public void setExclusionTable( Hashtable exclusiontable )
{
- exclusionSet = new HashSet(exclusiontable.keySet());
+ exclusionSet = new HashSet(exclusiontable.keySet());
}
/**
@@ -115,6 +114,6 @@
*/
public void setExclusionSet( Set exclusionSet )
{
- this.exclusionSet = exclusionSet;
+ this.exclusionSet = exclusionSet;
}
}
1.10 +157 -156 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
Index: GermanStemmer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- GermanStemmer.java 30 Mar 2004 15:44:58 -0000 1.9
+++ GermanStemmer.java 30 Mar 2004 15:54:48 -0000 1.10
@@ -44,20 +44,20 @@
*/
protected String stem( String term )
{
- // Use lowercase for medium stemming.
- term = term.toLowerCase();
- if ( !isStemmable( term ) )
- return term;
- // Reset the StringBuffer.
- sb.delete( 0, sb.length() );
- sb.insert( 0, term );
- // Stemming starts here...
- substitute( sb );
- strip( sb );
- optimize( sb );
- resubstitute( sb );
- removeParticleDenotion( sb );
- return sb.toString();
+ // Use lowercase for medium stemming.
+ term = term.toLowerCase();
+ if ( !isStemmable( term ) )
+ return term;
+ // Reset the StringBuffer.
+ sb.delete( 0, sb.length() );
+ sb.insert( 0, term );
+ // Stemming starts here...
+ substitute( sb );
+ strip( sb );
+ optimize( sb );
+ resubstitute( sb );
+ removeParticleDenotion( sb );
+ return sb.toString();
}
/**
@@ -67,10 +67,11 @@
*/
private boolean isStemmable( String term )
{
- for ( int c = 0; c < term.length(); c++ ) {
- if ( !Character.isLetter( term.charAt( c ) ) ) return false;
- }
- return true;
+ for ( int c = 0; c < term.length(); c++ ) {
+ if ( !Character.isLetter( term.charAt( c ) ) )
+ return false;
+ }
+ return true;
}
/**
@@ -83,38 +84,38 @@
*/
private void strip( StringBuffer buffer )
{
- boolean doMore = true;
- while ( doMore && buffer.length() > 3 ) {
- if ( ( buffer.length() + substCount > 5 ) &&
- buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
- {
- buffer.delete( buffer.length() - 2, buffer.length() );
- }
- else if ( ( buffer.length() + substCount > 4 ) &&
- buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
- buffer.delete( buffer.length() - 2, buffer.length() );
- }
- else if ( ( buffer.length() + substCount > 4 ) &&
- buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
- buffer.delete( buffer.length() - 2, buffer.length() );
- }
- else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
- buffer.deleteCharAt( buffer.length() - 1 );
- }
- else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
- buffer.deleteCharAt( buffer.length() - 1 );
- }
- else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
- buffer.deleteCharAt( buffer.length() - 1 );
- }
- // "t" occurs only as suffix of verbs.
- else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
- buffer.deleteCharAt( buffer.length() - 1 );
- }
- else {
- doMore = false;
- }
- }
+ boolean doMore = true;
+ while ( doMore && buffer.length() > 3 ) {
+ if ( ( buffer.length() + substCount > 5 ) &&
+ buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
+ {
+ buffer.delete( buffer.length() - 2, buffer.length() );
+ }
+ else if ( ( buffer.length() + substCount > 4 ) &&
+ buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
+ buffer.delete( buffer.length() - 2, buffer.length() );
+ }
+ else if ( ( buffer.length() + substCount > 4 ) &&
+ buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
+ buffer.delete( buffer.length() - 2, buffer.length() );
+ }
+ else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
+ buffer.deleteCharAt( buffer.length() - 1 );
+ }
+ else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
+ buffer.deleteCharAt( buffer.length() - 1 );
+ }
+ else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
+ buffer.deleteCharAt( buffer.length() - 1 );
+ }
+ // "t" occurs only as suffix of verbs.
+ else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
+ buffer.deleteCharAt( buffer.length() - 1 );
+ }
+ else {
+ doMore = false;
+ }
+ }
}
/**
@@ -123,15 +124,15 @@
*/
private void optimize( StringBuffer buffer )
{
- // Additional step for female plurals of professions and inhabitants.
- if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
- buffer.deleteCharAt( buffer.length() -1 );
- strip( buffer );
- }
- // Additional step for irregular plural nouns like "Matrizen -> Matrix".
- if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
- buffer.setCharAt( buffer.length() - 1, 'x' );
- }
+ // Additional step for female plurals of professions and inhabitants.
+ if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
+ buffer.deleteCharAt( buffer.length() -1 );
+ strip( buffer );
+ }
+ // Additional step for irregular plural nouns like "Matrizen -> Matrix".
+ if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
+ buffer.setCharAt( buffer.length() - 1, 'x' );
+ }
}
/**
@@ -139,14 +140,14 @@
*/
private void removeParticleDenotion( StringBuffer buffer )
{
- if ( buffer.length() > 4 ) {
- for ( int c = 0; c < buffer.length() - 3; c++ ) {
- if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
- buffer.delete( c, c + 2 );
- return;
- }
- }
- }
+ if ( buffer.length() > 4 ) {
+ for ( int c = 0; c < buffer.length() - 3; c++ ) {
+ if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
+ buffer.delete( c, c + 2 );
+ return;
+ }
+ }
+ }
}
/**
@@ -161,64 +162,64 @@
*/
private void substitute( StringBuffer buffer )
{
- substCount = 0;
- for ( int c = 0; c < buffer.length(); c++ ) {
- // Replace the second char of a pair of the equal characters with an asterisk
- if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
- buffer.setCharAt( c, '*' );
- }
- // Substitute Umlauts.
- else if ( buffer.charAt( c ) == 'ä' ) {
- buffer.setCharAt( c, 'a' );
- }
- else if ( buffer.charAt( c ) == 'ö' ) {
- buffer.setCharAt( c, 'o' );
- }
- else if ( buffer.charAt( c ) == 'ü' ) {
- buffer.setCharAt( c, 'u' );
- }
- // Take care that at least one character is left left side from the current one
- if ( c < buffer.length() - 1 ) {
- if ( buffer.charAt( c ) == 'ß' ) {
- buffer.setCharAt( c, 's' );
- buffer.insert( c + 1, 's' );
- substCount++;
- }
- // Masking several common character combinations with an token
- else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
- buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
- {
- buffer.setCharAt( c, '$' );
- buffer.delete( c + 1, c + 3 );
- substCount =+ 2;
- }
- else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
- buffer.setCharAt( c, '§' );
- buffer.deleteCharAt( c + 1 );
- substCount++;
- }
- else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
- buffer.setCharAt( c, '%' );
- buffer.deleteCharAt( c + 1 );
- substCount++;
- }
- else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
- buffer.setCharAt( c, '&' );
- buffer.deleteCharAt( c + 1 );
- substCount++;
- }
- else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
- buffer.setCharAt( c, '#' );
- buffer.deleteCharAt( c + 1 );
- substCount++;
- }
- else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
- buffer.setCharAt( c, '!' );
- buffer.deleteCharAt( c + 1 );
- substCount++;
- }
- }
- }
+ substCount = 0;
+ for ( int c = 0; c < buffer.length(); c++ ) {
+ // Replace the second char of a pair of the equal characters with an asterisk
+ if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
+ buffer.setCharAt( c, '*' );
+ }
+ // Substitute Umlauts.
+ else if ( buffer.charAt( c ) == 'ä' ) {
+ buffer.setCharAt( c, 'a' );
+ }
+ else if ( buffer.charAt( c ) == 'ö' ) {
+ buffer.setCharAt( c, 'o' );
+ }
+ else if ( buffer.charAt( c ) == 'ü' ) {
+ buffer.setCharAt( c, 'u' );
+ }
+ // Take care that at least one character is left left side from the current one
+ if ( c < buffer.length() - 1 ) {
+ if ( buffer.charAt( c ) == 'ß' ) {
+ buffer.setCharAt( c, 's' );
+ buffer.insert( c + 1, 's' );
+ substCount++;
+ }
+ // Masking several common character combinations with an token
+ else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
+ buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
+ {
+ buffer.setCharAt( c, '$' );
+ buffer.delete( c + 1, c + 3 );
+ substCount =+ 2;
+ }
+ else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
+ buffer.setCharAt( c, '§' );
+ buffer.deleteCharAt( c + 1 );
+ substCount++;
+ }
+ else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
+ buffer.setCharAt( c, '%' );
+ buffer.deleteCharAt( c + 1 );
+ substCount++;
+ }
+ else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
+ buffer.setCharAt( c, '&' );
+ buffer.deleteCharAt( c + 1 );
+ substCount++;
+ }
+ else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
+ buffer.setCharAt( c, '#' );
+ buffer.deleteCharAt( c + 1 );
+ substCount++;
+ }
+ else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
+ buffer.setCharAt( c, '!' );
+ buffer.deleteCharAt( c + 1 );
+ substCount++;
+ }
+ }
+ }
}
/**
@@ -228,35 +229,35 @@
*/
private void resubstitute( StringBuffer buffer )
{
- for ( int c = 0; c < buffer.length(); c++ ) {
- if ( buffer.charAt( c ) == '*' ) {
- char x = buffer.charAt( c - 1 );
- buffer.setCharAt( c, x );
- }
- else if ( buffer.charAt( c ) == '$' ) {
- buffer.setCharAt( c, 's' );
- buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
- }
- else if ( buffer.charAt( c ) == '§' ) {
- buffer.setCharAt( c, 'c' );
- buffer.insert( c + 1, 'h' );
- }
- else if ( buffer.charAt( c ) == '%' ) {
- buffer.setCharAt( c, 'e' );
- buffer.insert( c + 1, 'i' );
- }
- else if ( buffer.charAt( c ) == '&' ) {
- buffer.setCharAt( c, 'i' );
- buffer.insert( c + 1, 'e' );
- }
- else if ( buffer.charAt( c ) == '#' ) {
- buffer.setCharAt( c, 'i' );
- buffer.insert( c + 1, 'g' );
- }
- else if ( buffer.charAt( c ) == '!' ) {
- buffer.setCharAt( c, 's' );
- buffer.insert( c + 1, 't' );
- }
- }
+ for ( int c = 0; c < buffer.length(); c++ ) {
+ if ( buffer.charAt( c ) == '*' ) {
+ char x = buffer.charAt( c - 1 );
+ buffer.setCharAt( c, x );
+ }
+ else if ( buffer.charAt( c ) == '$' ) {
+ buffer.setCharAt( c, 's' );
+ buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
+ }
+ else if ( buffer.charAt( c ) == '§' ) {
+ buffer.setCharAt( c, 'c' );
+ buffer.insert( c + 1, 'h' );
+ }
+ else if ( buffer.charAt( c ) == '%' ) {
+ buffer.setCharAt( c, 'e' );
+ buffer.insert( c + 1, 'i' );
+ }
+ else if ( buffer.charAt( c ) == '&' ) {
+ buffer.setCharAt( c, 'i' );
+ buffer.insert( c + 1, 'e' );
+ }
+ else if ( buffer.charAt( c ) == '#' ) {
+ buffer.setCharAt( c, 'i' );
+ buffer.insert( c + 1, 'g' );
+ }
+ else if ( buffer.charAt( c ) == '!' ) {
+ buffer.setCharAt( c, 's' );
+ buffer.insert( c + 1, 't' );
+ }
+ }
}
}
1.10 +2 -2 jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
Index: WordlistLoader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- WordlistLoader.java 30 Mar 2004 15:44:58 -0000 1.9
+++ WordlistLoader.java 30 Mar 2004 15:54:48 -0000 1.10
@@ -53,8 +53,8 @@
String word = null;
while ((word = lnr.readLine()) != null) {
result.add(word.trim());
- }
}
+ }
finally {
if (lnr != null)
lnr.close();
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org