Posted to java-commits@lucene.apache.org by dn...@apache.org on 2006/06/10 00:15:48 UTC

svn commit: r413180 - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/ src/java/org/apache/lucene/analysis/

Author: dnaber
Date: Fri Jun  9 15:15:47 2006
New Revision: 413180

URL: http://svn.apache.org/viewvc?rev=413180&view=rev
Log:
deprecate the analysis.nl.WordlistLoader class because it is not robust (it fails silently); use analysis.WordlistLoader instead
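
In practice the difference is that the contrib class swallows I/O errors and
returns an empty table, so a mistyped stop word file silently disables
stopping, while the core loader reports the failure as an IOException. A
rough migration sketch in Java (the file name is hypothetical):

    import java.io.File;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;

    public class WordlistMigrationSketch {
      public static void main(String[] args) throws IOException {
        File stopwordFile = new File("dutch_stopwords.txt");  // hypothetical file

        // Deprecated contrib loader: any error yields an empty table, so a
        // missing file goes unnoticed.
        Set oldStyle = new HashSet(
            org.apache.lucene.analysis.nl.WordlistLoader.getWordtable(stopwordFile).keySet());

        // Core loader: same data, but a missing file surfaces as an IOException.
        Set newStyle = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwordFile);

        System.out.println("old: " + oldStyle.size() + ", new: " + newStyle.size());
      }
    }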

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Jun  9 15:15:47 2006
@@ -14,6 +14,10 @@
  1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
     changing of termText via setTermText().  (Yonik Seeley)
 
+ 2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated;
+    use the WordlistLoader class in package org.apache.lucene.analysis
+    instead (Daniel Naber)
+    
 Bug fixes
 
  1. Fixed the web application demo (built with "ant war-demo") which

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Fri Jun  9 15:15:47 2006
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.Reader;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -68,18 +69,20 @@
    */
   private Set excltable = new HashSet();
 
-  private Map _stemdict = new HashMap();
+  private Map stemdict = new HashMap();
 
 
   /**
-   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
+   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}) 
+   * and a few default entries for the stem override dictionary.
+   * 
    */
   public DutchAnalyzer() {
     stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
-    _stemdict.put("fiets", "fiets"); //otherwise fiet
-    _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
-    _stemdict.put("ei", "eier");
-    _stemdict.put("kind", "kinder");
+    stemdict.put("fiets", "fiets"); //otherwise fiet
+    stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
+    stemdict.put("ei", "eier");
+    stemdict.put("kind", "kinder");
   }
 
   /**
@@ -106,7 +109,12 @@
    * @param stopwords
    */
   public DutchAnalyzer(File stopwords) {
-    stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+    try {
+      stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -129,17 +137,26 @@
    * Builds an exclusionlist from the words contained in the given file.
    */
   public void setStemExclusionTable(File exclusionlist) {
-    excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+    try {
+      excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
    * Reads a stemdictionary file , that overrules the stemming algorithm
    * This is a textfile that contains per line
-   * word\tstem
-   * i.e: tabseperated
+   * <tt>word<b>\t</b>stem</tt>, i.e. two tab-separated words
    */
-  public void setStemDictionary(File stemdict) {
-    _stemdict = WordlistLoader.getStemDict(stemdict);
+  public void setStemDictionary(File stemdictFile) {
+    try {
+      stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -152,7 +169,7 @@
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new StopFilter(result, stoptable);
-    result = new DutchStemFilter(result, excltable, _stemdict);
+    result = new DutchStemFilter(result, excltable, stemdict);
     return result;
   }
 }
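
Note that the File-based constructor and setters above now wrap any
IOException from the core WordlistLoader in a RuntimeException (with a TODO
to throw IOException directly later). A caller that wants the original I/O
failure back can unwrap the cause; a minimal sketch, with a hypothetical
stop word file name:

    import java.io.File;
    import java.io.IOException;
    import org.apache.lucene.analysis.nl.DutchAnalyzer;

    public class DutchAnalyzerLoadSketch {
      // Builds the analyzer but rethrows the underlying IOException, if any.
      public static DutchAnalyzer load(File stopwords) throws IOException {
        try {
          return new DutchAnalyzer(stopwords);
        } catch (RuntimeException e) {
          if (e.getCause() instanceof IOException)
            throw (IOException) e.getCause();
          throw e;
        }
      }

      public static void main(String[] args) throws IOException {
        DutchAnalyzer analyzer = load(new File("dutch_stopwords.txt"));  // hypothetical path
        System.out.println("analyzer ready: " + analyzer);
      }
    }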

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java Fri Jun  9 15:15:47 2006
@@ -23,16 +23,19 @@
 import java.util.HashMap;
 
 /**
- * @author Gerhard Schwarz
  *         <p/>
  *         Loads a text file and adds every line as an entry to a Hashtable. Every line
  *         should contain only one word. If the file is not found or on any error, an
  *         empty table is returned.
+ *         
+ * @author Gerhard Schwarz
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
  */
 public class WordlistLoader {
   /**
    * @param path     Path to the wordlist
    * @param wordfile Name of the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String path, String wordfile) {
     if (path == null || wordfile == null) {
@@ -43,6 +46,7 @@
 
   /**
    * @param wordfile Complete path to the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String wordfile) {
     if (wordfile == null) {
@@ -57,6 +61,7 @@
    * i.e. tab seperated)
    *
    * @return Stem dictionary that overrules, the stemming algorithm
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
    */
   public static HashMap getStemDict(File wordstemfile) {
     if (wordstemfile == null) {
@@ -78,6 +83,7 @@
 
   /**
    * @param wordfile File containing the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(File wordfile) {
     if (wordfile == null) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java Fri Jun  9 15:15:47 2006
@@ -21,6 +21,7 @@
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.HashMap;
 import java.util.HashSet;
 
 /**
@@ -78,6 +79,37 @@
       }
     }
     finally {
+      if (br != null)
+        br.close();
+    }
+    return result;
+  }
+
+  /**
+   * Reads a stem dictionary. Each line contains:
+   * <pre>word<b>\t</b>stem</pre>
+   * (i.e. two tab-separated words)
+   *
+   * @return stem dictionary that overrules the stemming algorithm
+   * @throws IOException 
+   */
+  public static HashMap getStemDict(File wordstemfile) throws IOException {
+    if (wordstemfile == null)
+      throw new NullPointerException("wordstemfile may not be null");
+    HashMap result = new HashMap();
+    BufferedReader br = null;
+    FileReader fr = null;
+    try {
+      fr = new FileReader(wordstemfile);
+      br = new BufferedReader(fr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] wordstem = line.split("\t", 2);
+        result.put(wordstem[0], wordstem[1]);
+      }
+    } finally {
+      if (fr != null)
+        fr.close();
       if (br != null)
         br.close();
     }
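
The new getStemDict reads one tab-separated word/stem pair per line into a
HashMap of overrides that DutchAnalyzer hands to DutchStemFilter, and it
throws IOException instead of silently returning an empty map. A small usage
sketch (the file name and the two dictionary lines are made-up examples):

    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.HashMap;
    import org.apache.lucene.analysis.WordlistLoader;

    public class StemDictSketch {
      public static void main(String[] args) throws IOException {
        // Write a tiny dictionary: word<TAB>stem per line (example data only).
        File dict = new File("stemdict.txt");
        FileWriter out = new FileWriter(dict);
        out.write("fiets\tfiets\n");
        out.write("ei\teier\n");
        out.close();

        // Loading fails loudly (IOException) rather than returning an empty map.
        HashMap stemOverrides = WordlistLoader.getStemDict(dict);
        System.out.println("stem override for 'ei': " + stemOverrides.get("ei"));
      }
    }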