You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by dn...@apache.org on 2006/06/10 00:15:48 UTC
svn commit: r413180 - in /lucene/java/trunk: ./
contrib/analyzers/src/java/org/apache/lucene/analysis/nl/
src/java/org/apache/lucene/analysis/
Author: dnaber
Date: Fri Jun 9 15:15:47 2006
New Revision: 413180
URL: http://svn.apache.org/viewvc?rev=413180&view=rev
Log:
deprecate the analysis.nl.WordlistLoader class because it's not robust (fails silently) and use analysis.WordlistLoader instead
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Jun 9 15:15:47 2006
@@ -14,6 +14,10 @@
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
changing of termText via setTermText(). (Yonik Seeley)
+ 2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
+ and is supposed to be replaced with the WordlistLoader class in
+ package org.apache.lucene.analysis (Daniel Naber)
+
Bug fixes
1. Fixed the web application demo (built with "ant war-demo") which
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Fri Jun 9 15:15:47 2006
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
+import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
@@ -68,18 +69,20 @@
*/
private Set excltable = new HashSet();
- private Map _stemdict = new HashMap();
+ private Map stemdict = new HashMap();
/**
- * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
+ * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
+ * and a few default entries for the stem exclusion table.
+ *
*/
public DutchAnalyzer() {
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
- _stemdict.put("fiets", "fiets"); //otherwise fiet
- _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
- _stemdict.put("ei", "eier");
- _stemdict.put("kind", "kinder");
+ stemdict.put("fiets", "fiets"); //otherwise fiet
+ stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
+ stemdict.put("ei", "eier");
+ stemdict.put("kind", "kinder");
}
/**
@@ -106,7 +109,12 @@
* @param stopwords
*/
public DutchAnalyzer(File stopwords) {
- stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+ try {
+ stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
}
/**
@@ -129,17 +137,26 @@
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable(File exclusionlist) {
- excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+ try {
+ excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
}
/**
* Reads a stemdictionary file , that overrules the stemming algorithm
* This is a textfile that contains per line
- * word\tstem
- * i.e: tabseperated
+ * <tt>word<b>\t</b>stem</tt>, i.e. two tab-separated words
*/
- public void setStemDictionary(File stemdict) {
- _stemdict = WordlistLoader.getStemDict(stemdict);
+ public void setStemDictionary(File stemdictFile) {
+ try {
+ stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
}
/**
@@ -152,7 +169,7 @@
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new StopFilter(result, stoptable);
- result = new DutchStemFilter(result, excltable, _stemdict);
+ result = new DutchStemFilter(result, excltable, stemdict);
return result;
}
}
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java Fri Jun 9 15:15:47 2006
@@ -23,16 +23,19 @@
import java.util.HashMap;
/**
- * @author Gerhard Schwarz
* <p/>
* Loads a text file and adds every line as an entry to a Hashtable. Every line
* should contain only one word. If the file is not found or on any error, an
* empty table is returned.
+ *
+ * @author Gerhard Schwarz
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
*/
public class WordlistLoader {
/**
* @param path Path to the wordlist
* @param wordfile Name of the wordlist
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(String path, String wordfile) {
if (path == null || wordfile == null) {
@@ -43,6 +46,7 @@
/**
* @param wordfile Complete path to the wordlist
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(String wordfile) {
if (wordfile == null) {
@@ -57,6 +61,7 @@
* i.e. tab seperated)
*
* @return Stem dictionary that overrules, the stemming algorithm
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
*/
public static HashMap getStemDict(File wordstemfile) {
if (wordstemfile == null) {
@@ -78,6 +83,7 @@
/**
* @param wordfile File containing the wordlist
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(File wordfile) {
if (wordfile == null) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java?rev=413180&r1=413179&r2=413180&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java Fri Jun 9 15:15:47 2006
@@ -21,6 +21,7 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.util.HashMap;
import java.util.HashSet;
/**
@@ -78,6 +79,37 @@
}
}
finally {
+ if (br != null)
+ br.close();
+ }
+ return result;
+ }
+
+ /**
+ * Reads a stem dictionary. Each line contains:
+ * <pre>word<b>\t</b>stem</pre>
+ * (i.e. two tab-separated words)
+ *
+ * @return stem dictionary that overrules the stemming algorithm
+ * @throws IOException
+ */
+ public static HashMap getStemDict(File wordstemfile) throws IOException {
+ if (wordstemfile == null)
+ throw new NullPointerException("wordstemfile may not be null");
+ HashMap result = new HashMap();
+ BufferedReader br = null;
+ FileReader fr = null;
+ try {
+ fr = new FileReader(wordstemfile);
+ br = new BufferedReader(fr);
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] wordstem = line.split("\t", 2);
+ result.put(wordstem[0], wordstem[1]);
+ }
+ } finally {
+ if (fr != null)
+ fr.close();
if (br != null)
br.close();
}