You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/01/03 09:48:18 UTC
svn commit: r895339 [1/2] - in /lucene/java/trunk: ./
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ con...
Author: rmuir
Date: Sun Jan 3 08:48:17 2010
New Revision: 895339
URL: http://svn.apache.org/viewvc?rev=895339&view=rev
Log:
LUCENE-2034: Refactor analyzer reuse and stopword handling
Added:
lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/index/wordliststopwords.txt (with props)
lucene/java/trunk/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/WordlistLoader.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Jan 3 08:48:17 2010
@@ -105,6 +105,12 @@
backwards compatibility. If Version < 3.1 is passed to the constructor,
LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
+* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
+ that makes it easier to reuse TokenStreams correctly. This issue also added
+ StopwordAnalyzerBase, which improves consistency of all Analyzers that use
+ stopwords, and reimplemented many analyzers in contrib with it.
+ (Simon Willnauer via Robert Muir)
+
Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -19,17 +19,15 @@
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -52,7 +50,7 @@
* </ul>
*
*/
-public final class ArabicAnalyzer extends Analyzer {
+public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Arabic stopwords.
@@ -63,20 +61,17 @@
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
- /**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
+ // TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set<String> getDefaultStopSet(){
+ public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -85,34 +80,19 @@
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
- static final Set<String> DEFAULT_STOP_SET;
+ static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- InputStream stream = ArabicAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
@@ -129,8 +109,7 @@
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -159,54 +138,21 @@
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+ * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer( reader );
- result = new LowerCaseFilter(matchVersion, result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ArabicLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
- result = new StopFilter( matchVersion, result, stoptable );
- result = new ArabicNormalizationFilter( result );
- result = new ArabicStemFilter( result );
-
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
- * and {@link ArabicStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- // the order here is important: the stopword list is not normalized!
- streams.result = new StopFilter( matchVersion, streams.result, stoptable);
- streams.result = new ArabicNormalizationFilter(streams.result);
- streams.result = new ArabicStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ result = new StopFilter( matchVersion, result, stopwords);
+ result = new ArabicNormalizationFilter(result);
+ return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,17 +17,16 @@
* limitations under the License.
*/
+import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -43,7 +42,7 @@
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
* <p>
*/
-public final class BulgarianAnalyzer extends Analyzer {
+public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Bulgarian stopwords.
@@ -55,13 +54,11 @@
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
- /**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
+ * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
+ //TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
@@ -69,7 +66,7 @@
*
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set<String> getDefaultStopSet() {
+ public static Set<?> getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -78,35 +75,19 @@
* class accesses the static final set the first time.
*/
private static class DefaultSetHolder {
- static final Set<String> DEFAULT_STOP_SET;
+ static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
- } catch (Exception ex) {
+ DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set", ex);
- }
- }
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- final InputStream stream = BulgarianAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- if(stream != null)
- stream.close();
+ throw new RuntimeException("Unable to load default stopword set");
}
}
}
-
- private final Version matchVersion;
-
+
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@@ -119,58 +100,24 @@
* Builds an analyzer with the given stop words.
*/
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
- super();
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
- stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link BulgarianStemFilter}.
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter(matchVersion, result, stoptable);
+ result = new StopFilter(matchVersion, result, stopwords);
result = new BulgarianStemFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, and {@link BulgarianStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new BulgarianStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ return new TokenStreamComponents(source, result);
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -21,19 +21,21 @@
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@@ -49,7 +51,7 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class BrazilianAnalyzer extends Analyzer {
+public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical Brazilian Portuguese stopwords.
@@ -91,19 +93,13 @@
Arrays.asList(BRAZILIAN_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stoptable;
-
+
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this private in 3.1
private Set<?> excltable = Collections.emptySet();
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
*/
@@ -120,8 +116,7 @@
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -188,53 +183,22 @@
excltable = WordlistLoader.getWordSet( exclusionlist );
setPreviousTokenStream(null); // force a new stemmer to be created
}
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * {@link BrazilianStemFilter}.
- */
- @Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer( matchVersion, reader );
- result = new LowerCaseFilter( matchVersion, result );
- result = new StandardFilter( result );
- result = new StopFilter( matchVersion, result, stoptable );
- result = new BrazilianStemFilter( result, excltable );
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * {@link BrazilianStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new StandardFilter(streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new BrazilianStemFilter(streams.result, excltable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
+ * {@link BrazilianStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StandardFilter(result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new BrazilianStemFilter(result,
+ excltable));
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -19,12 +19,12 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
@@ -35,7 +35,7 @@
* filters with {@link StopFilter}
*
*/
-public final class CJKAnalyzer extends Analyzer {
+public final class CJKAnalyzer extends StopwordAnalyzerBase {
//~ Static fields/initializers ---------------------------------------------
/**
@@ -71,11 +71,6 @@
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
false));
}
- /**
- * stop word list
- */
- private final Set<?> stopTable;
- private final Version matchVersion;
//~ Constructors -----------------------------------------------------------
@@ -95,8 +90,7 @@
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
- stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -106,51 +100,15 @@
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
*/
public CJKAnalyzer(Version matchVersion, String... stopWords) {
- stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
- this.matchVersion = matchVersion;
+ super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
}
//~ Methods ----------------------------------------------------------------
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @param fieldName lucene field name
- * @param reader input {@link Reader}
- * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
- * {@link StopFilter}
- */
- @Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @param fieldName lucene field name
- * @param reader Input {@link Reader}
- * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
- * {@link StopFilter}
- */
@Override
- public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- /* tokenStream() is final, no back compat issue */
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new CJKTokenizer(reader);
- streams.result = new StopFilter(matchVersion, streams.source, stopTable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new CJKTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Jan 3 08:48:17 2010
@@ -25,8 +25,6 @@
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeSource.AttributeFactory;
-
/**
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,10 +17,11 @@
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
@@ -29,49 +30,19 @@
*
*/
-public final class ChineseAnalyzer extends Analyzer {
-
- public ChineseAnalyzer() {
- }
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
- * filtered with {@link ChineseFilter}.
- */
- @Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ChineseTokenizer(reader);
- result = new ChineseFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
+public final class ChineseAnalyzer extends ReusableAnalyzerBase {
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
- * provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
- * filtered with {@link ChineseFilter}.
- */
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link ChineseTokenizer} filtered with {@link ChineseFilter}
+ */
@Override
- public final TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- /* tokenStream() is final, no back compat issue */
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ChineseTokenizer(reader);
- streams.result = new ChineseFilter(streams.source);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ChineseTokenizer(reader);
+ return new TokenStreamComponents(source, new ChineseFilter(source));
}
}
\ No newline at end of file
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -30,9 +32,9 @@
import java.io.*;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
-import java.util.Collections;
/**
* {@link Analyzer} for Czech language.
@@ -53,7 +55,7 @@
* <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
-public final class CzechAnalyzer extends Analyzer {
+public final class CzechAnalyzer extends ReusableAnalyzerBase {
/**
* List of typical stopwords.
@@ -95,10 +97,11 @@
Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
}
+
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
- // TODO make this final in 3.1
+ // TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
private Set<?> stoptable;
private final Version matchVersion;
@@ -168,6 +171,7 @@
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
* and {@link #CzechAnalyzer(Version, Set)} instead
*/
+ // TODO extend StopwordAnalyzerBase once this method is gone!
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
@@ -191,58 +195,25 @@
stoptable = Collections.emptySet();
}
}
-
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
* >= LUCENE_31)
*/
@Override
- public final TokenStream tokenStream( String fieldName, Reader reader ) {
- TokenStream result = new StandardTokenizer( matchVersion, reader );
- result = new StandardFilter( result );
- result = new LowerCaseFilter( matchVersion, result );
- result = new StopFilter( matchVersion, result, stoptable );
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- result = new CzechStemFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, and {@link CzechStemFilter} (only if version is
- * >= LUCENE_31)
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter( matchVersion, streams.result, stoptable);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- streams.result = new CzechStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter( matchVersion, result, stoptable);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new CzechStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -29,13 +29,15 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@@ -51,7 +53,7 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class GermanAnalyzer extends Analyzer {
+public final class GermanAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical german stopwords.
@@ -89,17 +91,13 @@
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
- //TODO make this final in 3.1
- private Set<?> stopSet;
-
+
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this final in 3.1
private Set<?> exclusionSet;
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words:
* {@link #getDefaultStopSet()}.
@@ -131,9 +129,8 @@
* a stemming exclusion set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+ super(matchVersion, stopwords);
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
- this.matchVersion = matchVersion;
}
/**
@@ -187,51 +184,23 @@
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
- * {@link GermanStemFilter}
- */
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
- result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter( matchVersion, result, stopSet);
- result = new GermanStemFilter(result, exclusionSet);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
/**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
+ * {@link LowerCaseFilter}, {@link StopFilter}, and
* {@link GermanStemFilter}
*/
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter( matchVersion, streams.result, stopSet);
- streams.result = new GermanStemFilter(streams.result, exclusionSet);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter( matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -19,14 +19,15 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@@ -43,7 +44,7 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class GreekAnalyzer extends Analyzer
+public final class GreekAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Greek stopwords.
@@ -73,13 +74,6 @@
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stopSet;
-
- private final Version matchVersion;
-
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
@@ -93,8 +87,7 @@
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -115,47 +108,20 @@
{
this(matchVersion, stopwords.keySet());
}
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
- */
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader)
- {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new GreekLowerCaseFilter(result);
- result = new StopFilter(matchVersion, result, stopSet);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
- */
+
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ */
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new GreekLowerCaseFilter(streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stopSet);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ final TokenStream result = new GreekLowerCaseFilter(source);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -19,17 +19,15 @@
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -45,7 +43,7 @@
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
* </p>
*/
-public final class PersianAnalyzer extends Analyzer {
+public final class PersianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Persian stopwords.
@@ -58,11 +56,6 @@
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
-
- /**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
*/
@@ -85,30 +78,15 @@
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- InputStream stream = PersianAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@@ -126,8 +104,7 @@
* a stopword set
*/
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -156,18 +133,19 @@
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+ * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
* filtered with {@link LowerCaseFilter},
* {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer(reader);
- result = new LowerCaseFilter(matchVersion, result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ArabicLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
@@ -175,44 +153,6 @@
* the order here is important: the stopword list is normalized with the
* above!
*/
- result = new StopFilter(matchVersion, result, stoptable);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- }
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * filtered with {@link LowerCaseFilter},
- * {@link ArabicNormalizationFilter},
- * {@link PersianNormalizationFilter} and Persian Stop words
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new ArabicNormalizationFilter(streams.result);
- /* additional persian-specific normalization */
- streams.result = new PersianNormalizationFilter(streams.result);
- /*
- * the order here is important: the stopword list is normalized with the
- * above!
- */
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -20,7 +20,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -59,7 +61,7 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class FrenchAnalyzer extends Analyzer {
+public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/**
* Extended list of typical French stopwords.
@@ -92,17 +94,11 @@
};
/**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stoptable;
- /**
* Contains words that should be indexed but not stemmed.
*/
//TODO make this final in 3.0
private Set<?> excltable = Collections.<Object>emptySet();
- private final Version matchVersion;
-
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
@@ -148,9 +144,7 @@
*/
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
Set<?> stemExclutionSet) {
- this.matchVersion = matchVersion;
- this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
- .copy(matchVersion, stopwords));
+ super(matchVersion, stopwords);
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stemExclutionSet));
}
@@ -202,54 +196,22 @@
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link StopFilter},
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
*/
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
- result = new StopFilter(matchVersion, result, stoptable);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stopwords);
result = new FrenchStemFilter(result, excltable);
// Convert to lowercase after stemming!
- result = new LowerCaseFilter(matchVersion, result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * {@link FrenchStemFilter} and {@link LowerCaseFilter}
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new FrenchStemFilter(streams.result, excltable);
- // Convert to lowercase after stemming!
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@@ -26,7 +25,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
@@ -39,7 +40,7 @@
* A default set of stopwords is used unless an alternative list is specified.
* </p>
*/
-public final class RussianAnalyzer extends Analyzer
+public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Russian stopwords.
@@ -63,13 +64,6 @@
Arrays.asList(RUSSIAN_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stopSet;
-
- private final Version matchVersion;
-
public RussianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
@@ -91,8 +85,7 @@
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -106,52 +99,21 @@
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
- * @return A {@link TokenStream} built from a
- * {@link RussianLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter},
- * and {@link RussianStemFilter}
- */
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader)
- {
- TokenStream result = new RussianLetterTokenizer(reader);
- result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter(matchVersion, result, stopSet);
- result = new RussianStemFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a
+ * @return {@link TokenStreamComponents} built from a
* {@link RussianLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* and {@link RussianStemFilter}
*/
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new RussianLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stopSet);
- streams.result = new RussianStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new RussianLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new RussianStemFilter(result));
+
}
- return streams.result;
- }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -16,16 +16,18 @@
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@@ -35,41 +37,28 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class ThaiAnalyzer extends Analyzer {
+public final class ThaiAnalyzer extends ReusableAnalyzerBase {
private final Version matchVersion;
public ThaiAnalyzer(Version matchVersion) {
this.matchVersion = matchVersion;
}
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream ts = new StandardTokenizer(matchVersion, reader);
- ts = new StandardFilter(ts);
- ts = new ThaiWordFilter(ts);
- ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- return ts;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
+
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
+ * {@link ThaiWordFilter}, and {@link StopFilter}
+ */
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new ThaiWordFilter(streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- streams.result.reset(); // reset the ThaiWordFilter's state
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new ThaiWordFilter(result);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,10 +17,10 @@
* limitations under the License.
*/
-import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
@@ -78,7 +78,9 @@
* Test that custom stopwords work, and are not case-sensitive.
*/
public void testCustomStopwords() throws Exception {
- ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+ Set<String> set = new HashSet<String>();
+ Collections.addAll(set, "the", "and", "a");
+ ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Sun Jan 3 08:48:17 2010
@@ -17,10 +17,12 @@
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -17,11 +17,8 @@
* limitations under the License.
*/
-import java.io.StringReader;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java?rev=895339&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java Sun Jan 3 08:48:17 2010
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A convenience subclass of Analyzer that makes it easy to implement
+ * {@link TokenStream} reuse.
+ * <p>
+ * ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse
+ * for the most common use-cases. Analyzers such as
+ * {@link PerFieldAnalyzerWrapper} that behave differently depending upon the
+ * field name need to subclass Analyzer directly instead.
+ * </p>
+ * <p>
+ * To prevent consistency problems, this class does not allow subclasses to
+ * extend {@link #reusableTokenStream(String, Reader)} or
+ * {@link #tokenStream(String, Reader)} directly. Instead, subclasses must
+ * implement {@link #createComponents(String, Reader)}.
+ * </p>
+ */
+public abstract class ReusableAnalyzerBase extends Analyzer {
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance for this analyzer.
+ *
+ * @param fieldName
+ * the name of the field's content passed to the
+ * {@link TokenStreamComponents} sink as a reader
+ * @param aReader
+ * the reader passed to the {@link Tokenizer} constructor
+ * @return the {@link TokenStreamComponents} for this analyzer.
+ */
+ protected abstract TokenStreamComponents createComponents(String fieldName,
+ Reader aReader);
+
+ /**
+ * This method uses {@link #createComponents(String, Reader)} to obtain an
+ * instance of {@link TokenStreamComponents}. It returns the sink of the
+ * components and stores the components internally. Subsequent calls to this
+ * method will reuse the previously stored components if and only if the
+ * {@link TokenStreamComponents#reset(Reader)} method returned
+ * <code>true</code>. Otherwise a new instance of
+ * {@link TokenStreamComponents} is created.
+ *
+ * @param fieldName the name of the field the created TokenStream is used for
+ * @param reader the reader the streams source reads from
+ */
+ @Override
+ public final TokenStream reusableTokenStream(final String fieldName,
+ final Reader reader) throws IOException {
+ TokenStreamComponents streamChain = (TokenStreamComponents)
+ getPreviousTokenStream();
+ if (streamChain == null || !streamChain.reset(reader)) {
+ streamChain = createComponents(fieldName, reader);
+ setPreviousTokenStream(streamChain);
+ }
+ return streamChain.getTokenStream();
+ }
+
+ /**
+ * This method uses {@link #createComponents(String, Reader)} to obtain an
+ * instance of {@link TokenStreamComponents} and returns the sink of the
+ * components. Each call to this method will create a new instance of
+ * {@link TokenStreamComponents}. Created {@link TokenStream} instances are
+ * never reused.
+ *
+ * @param fieldName the name of the field the created TokenStream is used for
+ * @param reader the reader the streams source reads from
+ */
+ @Override
+ public final TokenStream tokenStream(final String fieldName,
+ final Reader reader) {
+ return createComponents(fieldName, reader).getTokenStream();
+ }
+
+ /**
+ * This class encapsulates the outer components of a token stream. It provides
+ * access to the source ({@link Tokenizer}) and the outer end (sink), an
+ * instance of {@link TokenFilter} which also serves as the
+ * {@link TokenStream} returned by
+ * {@link Analyzer#tokenStream(String, Reader)} and
+ * {@link Analyzer#reusableTokenStream(String, Reader)}.
+ */
+ public static class TokenStreamComponents {
+ final Tokenizer source;
+ final TokenStream sink;
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ * @param result
+ * the analyzer's resulting token stream
+ */
+ public TokenStreamComponents(final Tokenizer source,
+ final TokenStream result) {
+ this.source = source;
+ this.sink = result;
+ }
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ */
+ public TokenStreamComponents(final Tokenizer source) {
+ this.source = source;
+ this.sink = source;
+ }
+
+ /**
+ * Resets the encapsulated components with the given reader. This method by
+ * default returns <code>true</code> indicating that the components have
+ * been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might use
+ * their own {@link TokenStreamComponents} returning <code>false</code> if
+ * the components cannot be reset.
+ *
+ * @param reader
+ * a reader to reset the source component
+ * @return <code>true</code> if the components were reset, otherwise
+ * <code>false</code>
+ * @throws IOException
+ * if the component's reset method throws an {@link IOException}
+ */
+ protected boolean reset(final Reader reader) throws IOException {
+ source.reset(reader);
+ if(sink != source)
+ sink.reset(); // only reset if the sink reference is different from source
+ return true;
+ }
+
+ /**
+ * Returns the sink {@link TokenStream}
+ *
+ * @return the sink {@link TokenStream}
+ */
+ protected TokenStream getTokenStream() {
+ return sink;
+ }
+
+ }
+
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -18,25 +18,15 @@
*/
import java.io.Reader;
-import java.io.IOException;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter} */
-public final class SimpleAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new LowerCaseTokenizer(reader);
- }
+public final class SimpleAnalyzer extends ReusableAnalyzerBase {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new LowerCaseTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new LowerCaseTokenizer(reader));
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -24,6 +24,7 @@
import java.util.Set;
import java.util.List;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.util.Version;
/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
@@ -38,9 +39,7 @@
* </ul>
*/
-public final class StopAnalyzer extends Analyzer {
- private final Set<?> stopWords;
- private final Version matchVersion;
+public final class StopAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
@@ -65,16 +64,14 @@
* @param matchVersion See <a href="#version">above</a>
*/
public StopAnalyzer(Version matchVersion) {
- stopWords = ENGLISH_STOP_WORDS_SET;
- this.matchVersion = matchVersion;
+ this(matchVersion, ENGLISH_STOP_WORDS_SET);
}
/** Builds an analyzer with the stop words from the given set.
* @param matchVersion See <a href="#version">above</a>
* @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
- this.stopWords = stopWords;
- this.matchVersion = matchVersion;
+ super(matchVersion, stopWords);
}
/** Builds an analyzer with the stop words from the given file.
@@ -82,8 +79,7 @@
* @param matchVersion See <a href="#version">above</a>
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
- stopWords = WordlistLoader.getWordSet(stopwordsFile);
- this.matchVersion = matchVersion;
+ this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -91,34 +87,21 @@
* @param matchVersion See <a href="#version">above</a>
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- stopWords = WordlistLoader.getWordSet(stopwords);
- this.matchVersion = matchVersion;
+ this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
- /** Filters LowerCaseTokenizer with StopFilter. */
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(matchVersion,
- new LowerCaseTokenizer(reader), stopWords);
- }
-
- /** Filters LowerCaseTokenizer with StopFilter. */
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with
+ * {@link StopFilter}
+ */
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new LowerCaseTokenizer(reader);
- streams.result = new StopFilter(matchVersion,
- streams.source, stopWords);
- setPreviousTokenStream(streams);
- } else
- streams.source.reset(reader);
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new LowerCaseTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ source, stopwords));
}
}
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java?rev=895339&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java Sun Jan 3 08:48:17 2010
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets.
+ *
+ */
+public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
+
+ /**
+ * An immutable stopword set
+ */
+ protected final CharArraySet stopwords;
+
+ protected final Version matchVersion;
+
+ /**
+ * Returns the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ *
+ * @return the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ */
+ public Set<?> getStopwordSet() {
+ return stopwords;
+ }
+
+ /**
+ * Creates a new instance initialized with the given stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ * @param stopwords
+ * the analyzer's stopword set
+ */
+ protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
+ /*
+ * no need to call
+ * setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both
+ * tokenStream methods are final in this class.
+ */
+ matchVersion = version;
+ // analyzers should use char array set for stopwords!
+ this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+ .unmodifiableSet(CharArraySet.copy(version, stopwords));
+ }
+
+ /**
+ * Creates a new Analyzer with an empty stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ */
+ protected StopwordAnalyzerBase(final Version version) {
+ this(version, null);
+ }
+
+ /**
+ * Creates a CharArraySet from a file resource associated with a class. (See
+ * {@link Class#getResourceAsStream(String)}).
+ *
+ * @param ignoreCase
+ * <code>true</code> if the set should ignore the case of the
+ * stopwords, otherwise <code>false</code>
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param resource
+ * name of the resource file associated with the given class
+ * @param comment
+ * comment string to ignore in the stopword file
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+ final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
+ final String comment) throws IOException {
+ final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
+ comment);
+ final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
+ set.addAll(wordSet);
+ return set;
+ }
+
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java?rev=895339&r1=895338&r2=895339&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java Sun Jan 3 08:48:17 2010
@@ -18,24 +18,14 @@
*/
import java.io.Reader;
-import java.io.IOException;
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
-public final class WhitespaceAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new WhitespaceTokenizer(reader);
- }
+public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new WhitespaceTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new WhitespaceTokenizer(reader));
}
}