You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2004/03/30 17:54:49 UTC

cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanStemFilter.java GermanStemmer.java WordlistLoader.java

otis        2004/03/30 07:54:49

  Modified:    src/java/org/apache/lucene/analysis/de GermanStemFilter.java
                        GermanStemmer.java WordlistLoader.java
  Log:
  - Fixed mixed-up indentation
  
  Revision  Changes    Path
  1.8       +31 -32    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- GermanStemFilter.java	29 Mar 2004 22:48:01 -0000	1.7
  +++ GermanStemFilter.java	30 Mar 2004 15:54:48 -0000	1.8
  @@ -40,22 +40,21 @@
       private Token token = null;
       private GermanStemmer stemmer = null;
       private Set exclusionSet = null;
  -    
  +
       public GermanStemFilter( TokenStream in )
       {
  -        super(in);
  -	stemmer = new GermanStemmer();
  +      super(in);
  +      stemmer = new GermanStemmer();
       }
  -    
  +
       /**
        * Builds a GermanStemFilter that uses an exclusiontable.
        * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
        */
       public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
       {
  -	this( in );
  -	exclusionSet = new HashSet(exclusiontable.keySet());
  -
  +      this( in );
  +      exclusionSet = new HashSet(exclusiontable.keySet());
       }
   
       /**
  @@ -63,32 +62,32 @@
        */
       public GermanStemFilter( TokenStream in, Set exclusionSet )
       {
  -	this( in );
  -	this.exclusionSet = exclusionSet;
  +      this( in );
  +      this.exclusionSet = exclusionSet;
       }
   
       /**
        * @return  Returns the next token in the stream, or null at EOS
        */
       public final Token next()
  -	throws IOException
  +      throws IOException
       {
  -	if ( ( token = input.next() ) == null ) {
  -	    return null;
  -	}
  -	// Check the exclusiontable
  -	else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
  -	    return token;
  -	}
  -	else {
  -	    String s = stemmer.stem( token.termText() );
  -	    // If not stemmed, dont waste the time creating a new token
  -	    if ( !s.equals( token.termText() ) ) {
  -		return new Token( s, token.startOffset(),
  -		    token.endOffset(), token.type() );
  -	    }
  -	    return token;
  -	}
  +      if ( ( token = input.next() ) == null ) {
  +        return null;
  +      }
  +      // Check the exclusiontable
  +      else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
  +        return token;
  +      }
  +      else {
  +        String s = stemmer.stem( token.termText() );
  +        // If not stemmed, dont waste the time creating a new token
  +        if ( !s.equals( token.termText() ) ) {
  +          return new Token( s, token.startOffset(),
  +            token.endOffset(), token.type() );
  +        }
  +        return token;
  +      }
       }
   
       /**
  @@ -96,9 +95,9 @@
        */
       public void setStemmer( GermanStemmer stemmer )
       {
  -	if ( stemmer != null ) {
  -	    this.stemmer = stemmer;
  -	}
  +      if ( stemmer != null ) {
  +        this.stemmer = stemmer;
  +      }
       }
   
       /**
  @@ -107,7 +106,7 @@
        */
       public void setExclusionTable( Hashtable exclusiontable )
       {
  -	exclusionSet = new HashSet(exclusiontable.keySet());
  +      exclusionSet = new HashSet(exclusiontable.keySet());
       }
   
       /**
  @@ -115,6 +114,6 @@
        */
       public void setExclusionSet( Set exclusionSet )
       {
  -	this.exclusionSet = exclusionSet;
  +      this.exclusionSet = exclusionSet;
       }
   }
  
  
  
  1.10      +157 -156  jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- GermanStemmer.java	30 Mar 2004 15:44:58 -0000	1.9
  +++ GermanStemmer.java	30 Mar 2004 15:54:48 -0000	1.10
  @@ -44,20 +44,20 @@
        */
       protected String stem( String term )
       {
  -	// Use lowercase for medium stemming.
  -	term = term.toLowerCase();
  -	if ( !isStemmable( term ) )
  -	    return term;
  -	// Reset the StringBuffer.
  -	sb.delete( 0, sb.length() );
  -	sb.insert( 0, term );
  -	// Stemming starts here...
  -	substitute( sb );
  -	strip( sb );
  -	optimize( sb );
  -	resubstitute( sb );
  -	removeParticleDenotion( sb );
  -	return sb.toString();
  +      // Use lowercase for medium stemming.
  +      term = term.toLowerCase();
  +      if ( !isStemmable( term ) )
  +        return term;
  +      // Reset the StringBuffer.
  +      sb.delete( 0, sb.length() );
  +      sb.insert( 0, term );
  +      // Stemming starts here...
  +      substitute( sb );
  +      strip( sb );
  +      optimize( sb );
  +      resubstitute( sb );
  +      removeParticleDenotion( sb );
  +      return sb.toString();
       }
   
       /**
  @@ -67,10 +67,11 @@
        */
       private boolean isStemmable( String term )
       {
  -	for ( int c = 0; c < term.length(); c++ ) {
  -	    if ( !Character.isLetter( term.charAt( c ) ) ) return false;
  -	}
  -	return true;
  +      for ( int c = 0; c < term.length(); c++ ) {
  +        if ( !Character.isLetter( term.charAt( c ) ) )
  +          return false;
  +      }
  +      return true;
       }
   
       /**
  @@ -83,38 +84,38 @@
        */
       private void strip( StringBuffer buffer )
       {
  -	boolean doMore = true;
  -	while ( doMore && buffer.length() > 3 ) {
  -	    if ( ( buffer.length() + substCount > 5 ) &&
  -		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
  -	    {
  -		buffer.delete( buffer.length() - 2, buffer.length() );
  -	    }
  -	    else if ( ( buffer.length() + substCount > 4 ) &&
  -		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  -		buffer.delete( buffer.length() - 2, buffer.length() );
  -	    }
  -	    else if ( ( buffer.length() + substCount > 4 ) &&
  -		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  -		buffer.delete( buffer.length() - 2, buffer.length() );
  -	    }
  -	    else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  -		buffer.deleteCharAt( buffer.length() - 1 );
  -	    }
  -	    else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  -		buffer.deleteCharAt( buffer.length() - 1 );
  -	    }
  -	    else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  -		buffer.deleteCharAt( buffer.length() - 1 );
  -	    }
  -	    // "t" occurs only as suffix of verbs.
  -	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
  -		buffer.deleteCharAt( buffer.length() - 1 );
  -	    }
  -	    else {
  -		doMore = false;
  -	    }
  -	}
  +      boolean doMore = true;
  +      while ( doMore && buffer.length() > 3 ) {
  +        if ( ( buffer.length() + substCount > 5 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
  +        {
  +          buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( ( buffer.length() + substCount > 4 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  +            buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( ( buffer.length() + substCount > 4 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  +            buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        // "t" occurs only as suffix of verbs.
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else {
  +          doMore = false;
  +        }
  +      }
       }
   
       /**
  @@ -123,15 +124,15 @@
        */
       private void optimize( StringBuffer buffer )
       {
  -	// Additional step for female plurals of professions and inhabitants.
  -	if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  -	    buffer.deleteCharAt( buffer.length() -1 );
  -	    strip( buffer );
  -	}
  -	// Additional step for irregular plural nouns like "Matrizen -> Matrix".
  -	if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  -	    buffer.setCharAt( buffer.length() - 1, 'x' );
  -	}
  +      // Additional step for female plurals of professions and inhabitants.
  +      if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  +        buffer.deleteCharAt( buffer.length() -1 );
  +        strip( buffer );
  +      }
  +      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
  +      if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  +        buffer.setCharAt( buffer.length() - 1, 'x' );
  +      }
       }
   
       /**
  @@ -139,14 +140,14 @@
        */
       private void removeParticleDenotion( StringBuffer buffer )
       {
  -	if ( buffer.length() > 4 ) {
  -	    for ( int c = 0; c < buffer.length() - 3; c++ ) {
  -		if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  -		    buffer.delete( c, c + 2 );
  -		    return;
  -		}
  -	    }
  -	}
  +      if ( buffer.length() > 4 ) {
  +        for ( int c = 0; c < buffer.length() - 3; c++ ) {
  +          if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  +            buffer.delete( c, c + 2 );
  +            return;
  +          }
  +        }
  +      }
       }
   
       /**
  @@ -161,64 +162,64 @@
        */
       private void substitute( StringBuffer buffer )
       {
  -	substCount = 0;
  -	for ( int c = 0; c < buffer.length(); c++ ) {
  -	    // Replace the second char of a pair of the equal characters with an asterisk
  -	    if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  -		buffer.setCharAt( c, '*' );
  -	    }
  -	    // Substitute Umlauts.
  -	    else if ( buffer.charAt( c ) == 'ä' ) {
  -		buffer.setCharAt( c, 'a' );
  -	    }
  -	    else if ( buffer.charAt( c ) == 'ö' ) {
  -		buffer.setCharAt( c, 'o' );
  -	    }
  -	    else if ( buffer.charAt( c ) == 'ü' ) {
  -		buffer.setCharAt( c, 'u' );
  -	    }
  -	    // Take care that at least one character is left left side from the current one
  -	    if ( c < buffer.length() - 1 ) {
  -		if ( buffer.charAt( c ) == 'ß' ) {
  -		    buffer.setCharAt( c, 's' );
  -		    buffer.insert( c + 1, 's' );
  -		    substCount++;
  -		}
  -		// Masking several common character combinations with an token
  -		else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
  -		    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
  -		{
  -		    buffer.setCharAt( c, '$' );
  -		    buffer.delete( c + 1, c + 3 );
  -		    substCount =+ 2;
  -		}
  -		else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  -		    buffer.setCharAt( c, '§' );
  -		    buffer.deleteCharAt( c + 1 );
  -		    substCount++;
  -		}
  -		else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  -		    buffer.setCharAt( c, '%' );
  -		    buffer.deleteCharAt( c + 1 );
  -		    substCount++;
  -		}
  -		else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  -		    buffer.setCharAt( c, '&' );
  -		    buffer.deleteCharAt( c + 1 );
  -		    substCount++;
  -		}
  -		else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  -		    buffer.setCharAt( c, '#' );
  -		    buffer.deleteCharAt( c + 1 );
  -		    substCount++;
  -		}
  -		else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  -		    buffer.setCharAt( c, '!' );
  -		    buffer.deleteCharAt( c + 1 );
  -		    substCount++;
  -		}
  -	    }
  -	}
  +      substCount = 0;
  +      for ( int c = 0; c < buffer.length(); c++ ) {
  +        // Replace the second char of a pair of the equal characters with an asterisk
  +        if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  +          buffer.setCharAt( c, '*' );
  +        }
  +        // Substitute Umlauts.
  +        else if ( buffer.charAt( c ) == 'ä' ) {
  +          buffer.setCharAt( c, 'a' );
  +        }
  +        else if ( buffer.charAt( c ) == 'ö' ) {
  +          buffer.setCharAt( c, 'o' );
  +        }
  +        else if ( buffer.charAt( c ) == 'ü' ) {
  +          buffer.setCharAt( c, 'u' );
  +        }
  +        // Take care that at least one character is left left side from the current one
  +        if ( c < buffer.length() - 1 ) {
  +          if ( buffer.charAt( c ) == 'ß' ) {
  +            buffer.setCharAt( c, 's' );
  +            buffer.insert( c + 1, 's' );
  +            substCount++;
  +          }
  +          // Masking several common character combinations with an token
  +          else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
  +            buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
  +          {
  +            buffer.setCharAt( c, '$' );
  +            buffer.delete( c + 1, c + 3 );
  +            substCount =+ 2;
  +          }
  +          else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  +            buffer.setCharAt( c, '§' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  +            buffer.setCharAt( c, '%' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  +            buffer.setCharAt( c, '&' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  +            buffer.setCharAt( c, '#' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  +            buffer.setCharAt( c, '!' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +        }
  +      }
       }
   
       /**
  @@ -228,35 +229,35 @@
        */
       private void resubstitute( StringBuffer buffer )
       {
  -	for ( int c = 0; c < buffer.length(); c++ ) {
  -	    if ( buffer.charAt( c ) == '*' ) {
  -		char x = buffer.charAt( c - 1 );
  -		buffer.setCharAt( c, x );
  -	    }
  -	    else if ( buffer.charAt( c ) == '$' ) {
  -		buffer.setCharAt( c, 's' );
  -		buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  -	    }
  -	    else if ( buffer.charAt( c ) == '§' ) {
  -		buffer.setCharAt( c, 'c' );
  -		buffer.insert( c + 1, 'h' );
  -	    }
  -	    else if ( buffer.charAt( c ) == '%' ) {
  -		buffer.setCharAt( c, 'e' );
  -		buffer.insert( c + 1, 'i' );
  -	    }
  -	    else if ( buffer.charAt( c ) == '&' ) {
  -		buffer.setCharAt( c, 'i' );
  -		buffer.insert( c + 1, 'e' );
  -	    }
  -	    else if ( buffer.charAt( c ) == '#' ) {
  -		buffer.setCharAt( c, 'i' );
  -		buffer.insert( c + 1, 'g' );
  -	    }
  -	    else if ( buffer.charAt( c ) == '!' ) {
  -		buffer.setCharAt( c, 's' );
  -		buffer.insert( c + 1, 't' );
  -	    }
  -	}
  +      for ( int c = 0; c < buffer.length(); c++ ) {
  +        if ( buffer.charAt( c ) == '*' ) {
  +          char x = buffer.charAt( c - 1 );
  +          buffer.setCharAt( c, x );
  +        }
  +        else if ( buffer.charAt( c ) == '$' ) {
  +          buffer.setCharAt( c, 's' );
  +          buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  +        }
  +        else if ( buffer.charAt( c ) == '§' ) {
  +          buffer.setCharAt( c, 'c' );
  +          buffer.insert( c + 1, 'h' );
  +        }
  +        else if ( buffer.charAt( c ) == '%' ) {
  +          buffer.setCharAt( c, 'e' );
  +          buffer.insert( c + 1, 'i' );
  +        }
  +        else if ( buffer.charAt( c ) == '&' ) {
  +          buffer.setCharAt( c, 'i' );
  +          buffer.insert( c + 1, 'e' );
  +        }
  +        else if ( buffer.charAt( c ) == '#' ) {
  +          buffer.setCharAt( c, 'i' );
  +          buffer.insert( c + 1, 'g' );
  +        }
  +        else if ( buffer.charAt( c ) == '!' ) {
  +          buffer.setCharAt( c, 's' );
  +          buffer.insert( c + 1, 't' );
  +        }
  +      }
       }
   }
  
  
  
  1.10      +2 -2      jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- WordlistLoader.java	30 Mar 2004 15:44:58 -0000	1.9
  +++ WordlistLoader.java	30 Mar 2004 15:54:48 -0000	1.10
  @@ -53,8 +53,8 @@
         String word = null;
         while ((word = lnr.readLine()) != null) {
           result.add(word.trim());
  -        }
         }
  +    }
       finally {
         if (lnr != null)
           lnr.close();
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org