You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/11/10 02:21:26 UTC
svn commit: r1200080 - in /lucene/dev/trunk: lucene/contrib/
lucene/src/java/org/apache/lucene/util/
modules/analysis/common/src/java/org/apache/lucene/analysis/br/
modules/analysis/common/src/java/org/apache/lucene/analysis/core/
modules/analysis/comm...
Author: simonw
Date: Thu Nov 10 01:21:25 2011
New Revision: 1200080
URL: http://svn.apache.org/viewvc?rev=1200080&view=rev
Log:
LUCENE-2564: Cut over WordListLoader to CharArrayMap/Set and use CharSetDecoder to detect encoding problems early
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/IOUtils.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java
lucene/dev/trunk/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
lucene/dev/trunk/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Nov 10 01:21:25 2011
@@ -99,6 +99,11 @@ Changes in backwards compatibility polic
* LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core
o.a.l.search. (Simon Willnauer)
+
+ * LUCENE-2564: WordListLoader is now flagged as @lucene.internal. All methods in
+ WordListLoader now return CharArraySet/Map and expect Reader instances for
+ efficiency. Utilities to open Readers from Files, InputStreams or Java
+ resources were added to IOUtils. (Simon Willnauer, Robert Muir)
New Features
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/IOUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/IOUtils.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/IOUtils.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/IOUtils.java Thu Nov 10 01:21:25 2011
@@ -17,15 +17,35 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+import java.io.BufferedReader;
import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
import java.lang.reflect.Method;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
/** This class emulates the new Java 7 "Try-With-Resources" statement.
* Remove once Lucene is on Java 7.
* @lucene.internal */
public final class IOUtils {
-
+
+ /**
+ * UTF-8 charset string
+ * @see Charset#forName(String)
+ */
+ public static final String UTF_8 = "UTF-8";
+
+ /**
+ * UTF-8 {@link Charset} instance to prevent repeated
+ * {@link Charset#forName(String)} lookups
+ */
+ public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8");
private IOUtils() {} // no instance
/**
@@ -220,5 +240,84 @@ public final class IOUtils {
}
}
}
+
+ /**
+ * Wrapping the given {@link InputStream} in a reader using a {@link CharsetDecoder}.
+ * Unlike Java's defaults this reader will throw an exception if it detects
+ * the read charset doesn't match the expected {@link Charset}.
+ * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files
+ * to detect character set problems. However, it's not recommended for use as a general-purpose
+ * reader.
+ *
+ * @param stream the stream to wrap in a reader
+ * @param charSet the expected charset
+ * @return a wrapping reader
+ */
+ public static Reader getDecodingReader(InputStream stream, Charset charSet) {
+ final CharsetDecoder charSetDecoder = charSet.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ return new BufferedReader(new InputStreamReader(stream, charSetDecoder));
+ }
+
+ /**
+ * Opens a Reader for the given {@link File} using a {@link CharsetDecoder}.
+ * Unlike Java's defaults this reader will throw an exception if it detects
+ * the read charset doesn't match the expected {@link Charset}.
+ * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files
+ * to detect character set problems. However, it's not recommended for use as a general-purpose
+ * reader.
+ * @param file the file to open a reader on
+ * @param charSet the expected charset
+ * @return a reader to read the given file
+ */
+ public static Reader getDecodingReader(File file, Charset charSet) throws IOException {
+ FileInputStream stream = null;
+ boolean success = false;
+ try {
+ stream = new FileInputStream(file);
+ final Reader reader = getDecodingReader(stream, charSet);
+ success = true;
+ return reader;
+
+ } finally {
+ if (!success) {
+ IOUtils.close(stream);
+ }
+ }
+ }
+
+ /**
+ * Opens a Reader for the given resource using a {@link CharsetDecoder}.
+ * Unlike Java's defaults this reader will throw an exception if it detects
+ * the read charset doesn't match the expected {@link Charset}.
+ * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files
+ * to detect character set problems. However, it's not recommended for use as a general-purpose
+ * reader.
+ * @param clazz the class used to locate the resource
+ * @param resource the resource name to load
+ * @param charSet the expected charset
+ * @return a reader to read the given file
+ *
+ */
+ public static Reader getDecodingReader(Class<?> clazz, String resource, Charset charSet) throws IOException {
+ InputStream stream = null;
+ boolean success = false;
+ try {
+ stream = clazz
+ .getResourceAsStream(resource);
+ final Reader reader = getDecodingReader(stream, charSet);
+ success = true;
+ return reader;
+ } finally {
+ if (!success) {
+ IOUtils.close(stream);
+ }
+ }
+ }
+
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -64,9 +65,8 @@ public final class BrazilianAnalyzer ext
static {
try {
- DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet(
- Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class,
- DEFAULT_STOPWORD_FILE, "#"), false));
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -57,8 +57,7 @@ public final class StopAnalyzer extends
"they", "this", "to", "was", "will", "with"
);
final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
- stopWords.size(), false);
- stopSet.addAll(stopWords);
+ stopWords, false);
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
@@ -82,7 +81,7 @@ public final class StopAnalyzer extends
* @param matchVersion See <a href="#version">above</a>
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
+ this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -90,7 +89,7 @@ public final class StopAnalyzer extends
* @param matchVersion See <a href="#version">above</a>
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
@@ -70,9 +71,8 @@ public final class CzechAnalyzer extends
static {
try {
- DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
- Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
- DEFAULT_STOPWORD_FILE, "#"), false));
+ DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DanishStemmer;
@@ -62,8 +63,8 @@ public final class DanishAnalyzer extend
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.German2Stemmer;
@@ -100,8 +101,8 @@ public final class GermanAnalyzer extend
private static final Set<?> DEFAULT_SET;
static {
try {
- DEFAULT_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SpanishStemmer;
@@ -62,8 +63,8 @@ public final class SpanishAnalyzer exten
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.FinnishStemmer;
@@ -62,8 +63,8 @@ public final class FinnishAnalyzer exten
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.IOException;
@@ -118,8 +119,8 @@ public final class FrenchAnalyzer extend
static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -60,12 +61,12 @@ public final class GalicianAnalyzer exte
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set");
+ throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.HungarianStemmer;
@@ -62,8 +63,8 @@ public final class HungarianAnalyzer ext
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;
@@ -79,8 +80,8 @@ public final class ItalianAnalyzer exten
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.S
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -60,8 +62,8 @@ public final class LatvianAnalyzer exten
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -83,8 +84,8 @@ public final class DutchAnalyzer extends
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer;
@@ -62,8 +63,8 @@ public final class NorwegianAnalyzer ext
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PortugueseStemmer;
@@ -62,8 +63,8 @@ public final class PortugueseAnalyzer ex
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.core.S
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -84,12 +85,12 @@ public final class RussianAnalyzer exten
static {
try {
- DEFAULT_STOP_SET =
- WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set");
+ throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.S
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -85,7 +86,7 @@ public final class ClassicAnalyzer exten
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer exten
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.S
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@@ -86,7 +87,7 @@ public final class StandardAnalyzer exte
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -95,7 +96,7 @@ public final class StandardAnalyzer exte
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- this(matchVersion, WordlistLoader.getWordSet(stopwords));
+ this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SwedishStemmer;
@@ -62,8 +63,8 @@ public final class SwedishAnalyzer exten
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java Thu Nov 10 01:21:25 2011
@@ -17,10 +17,13 @@
package org.apache.lucene.analysis.util;
+import java.io.File;
import java.io.IOException;
+import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBa
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
final Class<? extends Analyzer> aClass, final String resource,
final String comment) throws IOException {
- final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
- comment);
- final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
- set.addAll(wordSet);
- return set;
+ Reader reader = null;
+ try {
+ reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
+ return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
+ } finally {
+ IOUtils.close(reader);
+ }
+
+ }
+
+ /**
+ * Creates a CharArraySet from a file.
+ *
+ * @param stopwords
+ * the stopwords file to load
+ *
+ * @param matchVersion
+ * the Lucene version for cross version compatibility
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(File stopwords,
+ Version matchVersion) throws IOException {
+ Reader reader = null;
+ try {
+ reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
+ return WordlistLoader.getWordSet(reader, matchVersion);
+ } finally {
+ IOUtils.close(reader);
+ }
+ }
+
+ /**
+ * Creates a CharArraySet from a reader.
+ *
+ * @param stopwords
+ * the stopwords reader to load
+ *
+ * @param matchVersion
+ * the Lucene version for cross version compatibility
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * reader
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(Reader stopwords,
+ Version matchVersion) throws IOException {
+ try {
+ return WordlistLoader.getWordSet(stopwords, matchVersion);
+ } finally {
+ IOUtils.close(stopwords);
+ }
}
-
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java Thu Nov 10 01:21:25 2011
@@ -18,165 +18,91 @@ package org.apache.lucene.analysis.util;
*/
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
+
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
/**
* Loader for text files that represent a list of stopwords.
+ *
+ * @see IOUtils to obtain {@link Reader} instances
+ * @lucene.internal
*/
public class WordlistLoader {
-
- /**
- * Loads a text file associated with a given class (See
- * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
- * to a {@link Set} (omitting leading and trailing whitespace). Every line of
- * the file should contain only one word. The words need to be in lower-case if
- * you make use of an Analyzer which uses LowerCaseFilter (like
- * StandardAnalyzer).
- *
- * @param aClass
- * a class that is associated with the given stopwordResource
- * @param stopwordResource
- * name of the resource file associated with the given class
- * @return a {@link Set} with the file's words
- */
- public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
- throws IOException {
- final Reader reader = new BufferedReader(new InputStreamReader(aClass
- .getResourceAsStream(stopwordResource), "UTF-8"));
- try {
- return getWordSet(reader);
- } finally {
- reader.close();
- }
- }
- /**
- * Loads a text file associated with a given class (See
- * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
- * to a {@link Set} (omitting leading and trailing whitespace). Every line of
- * the file should contain only one word. The words need to be in lower-case if
- * you make use of an Analyzer which uses LowerCaseFilter (like
- * StandardAnalyzer).
- *
- * @param aClass
- * a class that is associated with the given stopwordResource
- * @param stopwordResource
- * name of the resource file associated with the given class
- * @param comment
- * the comment string to ignore
- * @return a {@link Set} with the file's words
- */
- public static Set<String> getWordSet(Class<?> aClass,
- String stopwordResource, String comment) throws IOException {
- final Reader reader = new BufferedReader(new InputStreamReader(aClass
- .getResourceAsStream(stopwordResource), "UTF-8"));
- try {
- return getWordSet(reader, comment);
- } finally {
- reader.close();
- }
- }
+ private static final int INITITAL_CAPACITY = 16;
/**
- * Loads a text file and adds every line as an entry to a HashSet (omitting
- * leading and trailing whitespace). Every line of the file should contain only
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
- * @param wordfile File containing the wordlist
- * @return A HashSet with the file's words
+ * @param reader Reader containing the wordlist
+ * @param result the {@link CharArraySet} to fill with the reader's words
+ * @return the given {@link CharArraySet} with the reader's words
*/
- public static HashSet<String> getWordSet(File wordfile) throws IOException {
- FileReader reader = null;
+ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
+ BufferedReader br = null;
try {
- reader = new FileReader(wordfile);
- return getWordSet(reader);
+ br = getBufferedReader(reader);
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ result.add(word.trim());
+ }
}
finally {
- if (reader != null)
- reader.close();
+ IOUtils.close(br);
}
+ return result;
}
-
+
/**
- * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
- * leading and trailing whitespace). Every line of the file should contain only
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
- * @param wordfile File containing the wordlist
- * @param comment The comment string to ignore
- * @return A HashSet with the file's words
+ * @param reader Reader containing the wordlist
+ * @param matchVersion the Lucene {@link Version}
+ * @return A {@link CharArraySet} with the reader's words
*/
- public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
- FileReader reader = null;
- try {
- reader = new FileReader(wordfile);
- return getWordSet(reader, comment);
- }
- finally {
- if (reader != null)
- reader.close();
- }
+ public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
+ return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
}
-
/**
- * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
- * @return A HashSet with the reader's words
+ * @param comment The string representing a comment.
+ * @param matchVersion the Lucene {@link Version}
+ * @return A CharArraySet with the reader's words
*/
- public static HashSet<String> getWordSet(Reader reader) throws IOException {
- final HashSet<String> result = new HashSet<String>();
- BufferedReader br = null;
- try {
- if (reader instanceof BufferedReader) {
- br = (BufferedReader) reader;
- } else {
- br = new BufferedReader(reader);
- }
- String word = null;
- while ((word = br.readLine()) != null) {
- result.add(word.trim());
- }
- }
- finally {
- if (br != null)
- br.close();
- }
- return result;
+ public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
+ return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
}
/**
- * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
- * @return A HashSet with the reader's words
+ * @param result the {@link CharArraySet} to fill with the reader's words
+ * @return the given {@link CharArraySet} with the reader's words
*/
- public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
- final HashSet<String> result = new HashSet<String>();
+ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
BufferedReader br = null;
try {
- if (reader instanceof BufferedReader) {
- br = (BufferedReader) reader;
- } else {
- br = new BufferedReader(reader);
- }
+ br = getBufferedReader(reader);
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false){
@@ -185,34 +111,11 @@ public class WordlistLoader {
}
}
finally {
- if (br != null)
- br.close();
+ IOUtils.close(br);
}
return result;
}
- /**
- * Loads a text file in Snowball format associated with a given class (See
- * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
- * a {@link Set}. The words need to be in lower-case if you make use of an
- * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- *
- * @param aClass a class that is associated with the given stopwordResource
- * @param stopwordResource name of the resource file associated with the given
- * class
- * @return a {@link Set} with the file's words
- * @see #getSnowballWordSet(Reader)
- */
- public static Set<String> getSnowballWordSet(Class<?> aClass,
- String stopwordResource) throws IOException {
- final Reader reader = new BufferedReader(new InputStreamReader(aClass
- .getResourceAsStream(stopwordResource), "UTF-8"));
- try {
- return getSnowballWordSet(reader);
- } finally {
- reader.close();
- }
- }
/**
* Reads stopwords from a stopword list in Snowball format.
@@ -226,18 +129,14 @@ public class WordlistLoader {
* </p>
*
* @param reader Reader containing a Snowball stopword list
- * @return A Set with the reader's words
+ * @param result the {@link CharArraySet} to fill with the reader's words
+ * @return the given {@link CharArraySet} with the reader's words
*/
- public static Set<String> getSnowballWordSet(Reader reader)
+ public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
throws IOException {
- final Set<String> result = new HashSet<String>();
BufferedReader br = null;
try {
- if (reader instanceof BufferedReader) {
- br = (BufferedReader) reader;
- } else {
- br = new BufferedReader(reader);
- }
+ br = getBufferedReader(reader);
String line = null;
while ((line = br.readLine()) != null) {
int comment = line.indexOf('|');
@@ -247,10 +146,30 @@ public class WordlistLoader {
if (words[i].length() > 0) result.add(words[i]);
}
} finally {
- if (br != null) br.close();
+ IOUtils.close(br);
}
return result;
}
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ * </p>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @param matchVersion the Lucene {@link Version} used to create the
+ * returned {@link CharArraySet}
+ * @return A {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
+ return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+ }
/**
@@ -261,24 +180,24 @@ public class WordlistLoader {
* @return stem dictionary that overrules the stemming algorithm
* @throws IOException
*/
- public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
- if (wordstemfile == null)
- throw new NullPointerException("wordstemfile may not be null");
- final HashMap<String, String> result = new HashMap<String,String>();
+ public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
BufferedReader br = null;
-
try {
- br = new BufferedReader(new FileReader(wordstemfile));
+ br = getBufferedReader(reader);
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
- if(br != null)
- br.close();
+ IOUtils.close(br);
}
return result;
}
-
+
+ private static BufferedReader getBufferedReader(Reader reader) {
+ return (reader instanceof BufferedReader) ? (BufferedReader) reader
+ : new BufferedReader(reader);
+ }
+
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java Thu Nov 10 01:21:25 2011
@@ -46,7 +46,7 @@ public class TestCharArraySet extends Lu
public void testNonZeroOffset() {
String[] words={"Hello","World","this","is","a","test"};
char[] findme="xthisy".toCharArray();
- CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true);
+ CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true);
set.addAll(Arrays.asList(words));
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java Thu Nov 10 01:21:25 2011
@@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
-import java.util.HashSet;
-import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
@@ -31,22 +29,22 @@ public class TestWordlistLoader extends
public void testWordlistLoading() throws IOException {
String s = "ONE\n two \nthree";
- HashSet<String> wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
+ CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT);
checkSet(wordSet1);
- HashSet<String> wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
+ CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT);
checkSet(wordSet2);
}
public void testComments() throws Exception {
String s = "ONE\n two \nthree\n#comment";
- HashSet<String> wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+ CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT);
checkSet(wordSet1);
assertFalse(wordSet1.contains("#comment"));
assertFalse(wordSet1.contains("comment"));
}
- private void checkSet(HashSet<String> wordset) {
+ private void checkSet(CharArraySet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
@@ -68,7 +66,7 @@ public class TestWordlistLoader extends
" two \n" + // stopword with leading/trailing space
" three four five \n" + // multiple stopwords
"six seven | comment\n"; //multiple stopwords + comment
- Set<String> wordset = WordlistLoader.getSnowballWordSet(new StringReader(s));
+ CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT);
assertEquals(7, wordset.size());
assertTrue(wordset.contains("ONE"));
assertTrue(wordset.contains("two"));
Modified: lucene/dev/trunk/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -26,6 +26,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set<String> getDefaultStopSet(){
+ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer
* accesses the static final set the first time.;
*/
private static class DefaultSetHolder {
- static final Set<String> DEFAULT_STOP_SET;
+ static final CharArraySet DEFAULT_STOP_SET;
static {
try {
@@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer
}
}
- static Set<String> loadDefaultStopWordSet() throws IOException {
+ static CharArraySet loadDefaultStopWordSet() throws IOException {
InputStream stream = SmartChineseAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
try {
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
// make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+ return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader,
+ STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT));
} finally {
stream.close();
}
Modified: lucene/dev/trunk/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java?rev=1200080&r1=1200079&r2=1200080&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java Thu Nov 10 01:21:25 2011
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempe
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.egothor.stemmer.Trie;
@@ -68,8 +69,8 @@ public final class PolishAnalyzer extend
static {
try {
- DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class,
- DEFAULT_STOPWORD_FILE);
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)