You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2004/03/30 17:44:58 UTC

cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanAnalyzer.java GermanStemmer.java WordlistLoader.java

otis        2004/03/30 07:44:58

  Modified:    src/java/org/apache/lucene/analysis/de GermanAnalyzer.java
                        GermanStemmer.java WordlistLoader.java
  Log:
  - Applied a patch for bug 18410: http://issues.apache.org/bugzilla/show_bug.cgi?id=18410
  PR: 18410
  Submitted by: Daniel Naber
  
  Revision  Changes    Path
  1.15      +7 -5      jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.14
  retrieving revision 1.15
  diff -u -r1.14 -r1.15
  --- GermanAnalyzer.java	30 Mar 2004 02:56:44 -0000	1.14
  +++ GermanAnalyzer.java	30 Mar 2004 15:44:58 -0000	1.15
  @@ -17,12 +17,14 @@
    */
   
   import org.apache.lucene.analysis.Analyzer;
  +import org.apache.lucene.analysis.LowerCaseFilter;
   import org.apache.lucene.analysis.StopFilter;
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.standard.StandardFilter;
   import org.apache.lucene.analysis.standard.StandardTokenizer;
   
   import java.io.File;
  +import java.io.IOException;
   import java.io.Reader;
   import java.io.IOException;
   import java.util.HashSet;
  @@ -93,7 +95,7 @@
      * Builds an analyzer with the given stop words.
      */
     public GermanAnalyzer(File stopwords) throws IOException {
  -    stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
  +    stopSet = WordlistLoader.getWordSet(stopwords);
     }
   
     /**
  @@ -114,19 +116,19 @@
      * Builds an exclusionlist from the words contained in the given file.
      */
     public void setStemExclusionTable(File exclusionlist) throws IOException {
  -    exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
  +    exclusionSet = WordlistLoader.getWordSet(exclusionlist);
     }
   
     /**
      * Creates a TokenStream which tokenizes all the text in the provided Reader.
      *
      * @return A TokenStream build from a StandardTokenizer filtered with
  -   *         StandardFilter, StopFilter, GermanStemFilter
  +   *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
      */
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(reader);
       result = new StandardFilter(result);
  -// shouldn't there be a lowercaser before stop word filtering?
  +    result = new LowerCaseFilter(result);
       result = new StopFilter(result, stopSet);
       result = new GermanStemFilter(result, exclusionSet);
       return result;
  
  
  
  1.9       +2 -9      jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- GermanStemmer.java	29 Mar 2004 22:48:01 -0000	1.8
  +++ GermanStemmer.java	30 Mar 2004 15:44:58 -0000	1.9
  @@ -32,11 +32,6 @@
       private StringBuffer sb = new StringBuffer();
   
       /**
  -     * Indicates if a term is handled as a noun.
  -     */
  -    private boolean uppercase = false;
  -
  -    /**
        * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
        */
       private int substCount = 0;
  @@ -49,8 +44,6 @@
        */
       protected String stem( String term )
       {
  -	// Mark a possible noun.
  -	uppercase = Character.isUpperCase( term.charAt( 0 ) );
   	// Use lowercase for medium stemming.
   	term = term.toLowerCase();
   	if ( !isStemmable( term ) )
  @@ -115,7 +108,7 @@
   		buffer.deleteCharAt( buffer.length() - 1 );
   	    }
   	    // "t" occurs only as suffix of verbs.
  -	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
  +	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
   		buffer.deleteCharAt( buffer.length() - 1 );
   	    }
   	    else {
  
  
  
  1.9       +54 -47    jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- WordlistLoader.java	29 Mar 2004 22:48:01 -0000	1.8
  +++ WordlistLoader.java	30 Mar 2004 15:44:58 -0000	1.9
  @@ -20,66 +20,42 @@
   import java.io.FileReader;
   import java.io.IOException;
   import java.io.LineNumberReader;
  +import java.util.HashSet;
   import java.util.Hashtable;
  +import java.util.Iterator;
   
   /**
  - * Loads a text file and adds every line as an entry to a Hashtable. Every line
  - * should contain only one word.
  + * Loader for text files that represent a list of stopwords.
    *
    * @author Gerhard Schwarz
    * @version $Id$
  - * @todo refactor to convert to Sets instead of Hashtable
  + *
  + * @todo this is not specific to German, it should be moved up
    */
   public class WordlistLoader {
  -  /**
  -   * @param path     Path to the wordlist
  -   * @param wordfile Name of the wordlist
  -   */
  -  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
  -    if (path == null || wordfile == null) {
  -      return new Hashtable();
  -    }
  -    return getWordtable(new File(path, wordfile));
  -  }
  -
  -  /**
  -   * @param wordfile Complete path to the wordlist
  -   */
  -  public static Hashtable getWordtable(String wordfile) throws IOException {
  -    if (wordfile == null) {
  -      return new Hashtable();
  -    }
  -    return getWordtable(new File(wordfile));
  -  }
   
     /**
  +   * Loads a text file and adds every line as an entry to a HashSet (omitting
  +   * leading and trailing whitespace). Every line of the file should contain only 
  +   * one word. The words need to be in lowercase if you make use of an
  +   * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
  +   * 
      * @param wordfile File containing the wordlist
  -   * @todo Create a Set version of this method
  +   * @return A HashSet with the file's words
      */
  -  public static Hashtable getWordtable(File wordfile) throws IOException {
  -    if (wordfile == null) {
  -      return new Hashtable();
  -    }
  -    Hashtable result = null;
  +  public static HashSet getWordSet(File wordfile) throws IOException {
  +    HashSet result = new HashSet();
       FileReader freader = null;
       LineNumberReader lnr = null;
       try {
         freader = new FileReader(wordfile);
         lnr = new LineNumberReader(freader);
         String word = null;
  -      String[] stopwords = new String[100];
  -      int wordcount = 0;
         while ((word = lnr.readLine()) != null) {
  -        wordcount++;
  -        if (wordcount == stopwords.length) {
  -          String[] tmp = new String[stopwords.length + 50];
  -          System.arraycopy(stopwords, 0, tmp, 0, wordcount);
  -          stopwords = tmp;
  +        result.add(word.trim());
           }
  -        stopwords[wordcount - 1] = word;
         }
  -      result = makeWordTable(stopwords, wordcount);
  -    } finally {
  +    finally {
         if (lnr != null)
           lnr.close();
         if (freader != null)
  @@ -89,15 +65,46 @@
     }
   
     /**
  -   * Builds the wordlist table.
  +   * @param path      Path to the wordlist
  +   * @param wordfile  Name of the wordlist
  +   * 
  +   * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
  +   */
  +  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
  +    return getWordtable(new File(path, wordfile));
  +  }
  +
  +  /**
  +   * @param wordfile  Complete path to the wordlist
  +   * 
  +   * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
  +   */
  +  public static Hashtable getWordtable(String wordfile) throws IOException {
  +    return getWordtable(new File(wordfile));
  +  }
  +
  +  /**
  +   * @param wordfile  File object that points to the wordlist
  +   *
  +   * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
  +   */
  +  public static Hashtable getWordtable(File wordfile) throws IOException {
  +    HashSet wordSet = (HashSet)getWordSet(wordfile);
  +    Hashtable result = makeWordTable(wordSet);
  +    return result;
  +  }
  +
  +  /**
  +   * Builds a wordlist table, using words as both keys and values
  +   * for backward compatibility.
      *
  -   * @param words  Word that where read
  -   * @param length Amount of words that where read into <tt>words</tt>
  +   * @param wordSet   stopword set
      */
  -  private static Hashtable makeWordTable(String[] words, int length) {
  -    Hashtable table = new Hashtable(length);
  -    for (int i = 0; i < length; i++) {
  -      table.put(words[i], words[i]);
  +  private static Hashtable makeWordTable(HashSet wordSet) {
  +    Hashtable table = new Hashtable();
  +    for (Iterator iter = wordSet.iterator(); iter.hasNext();) {
  +      String word = (String)iter.next();
  +      table.put(word, word);
       }
       return table;
     }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org