Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/08/16 14:37:07 UTC
svn commit: r804680 [1/2] - in /lucene/java/trunk/contrib: ./
analyzers/common/src/java/org/apache/lucene/analysis/ar/
analyzers/common/src/java/org/apache/lucene/analysis/br/
analyzers/common/src/java/org/apache/lucene/analysis/cjk/
analyzers/common/s...
Author: rmuir
Date: Sun Aug 16 12:37:05 2009
New Revision: 804680
URL: http://svn.apache.org/viewvc?rev=804680&view=rev
Log:
LUCENE-1794: Implement TokenStream reuse for contrib Analyzers
Added:
lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/TestSynonymTokenFilter.java (with props)
lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/testSynonyms.txt (with props)
Modified:
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
lucene/java/trunk/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
lucene/java/trunk/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Aug 16 12:37:05 2009
@@ -150,6 +150,9 @@
better performance, in ICUCollationKeyFilter. (Robert Muir via
Mike McCandless)
+ 2. LUCENE-1794: Implement TokenStream reuse for contrib Analyzers,
+ and implement reset() for TokenStreams to support reuse. (Robert Muir)
+
Documentation
(None)
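
The pattern applied throughout this commit is the same in each analyzer: a per-thread SavedStreams holder is fetched with getPreviousTokenStream(); on a miss the full tokenizer/filter chain is built once and stored with setPreviousTokenStream(); on a hit only the source Tokenizer is pointed at the new Reader via reset(reader). A minimal sketch of the consumer side, not part of the patch; the class name ReuseSketch, the field name "body", and the sample text are placeholders:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;

    public class ReuseSketch {
      public static void main(String[] args) throws IOException {
        ArabicAnalyzer analyzer = new ArabicAnalyzer();

        // First document on this thread: the analyzer builds its tokenizer/filter
        // chain and saves it via setPreviousTokenStream().
        TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("first document"));
        while (ts.incrementToken()) { /* index tokens of document one */ }

        // Second document: getPreviousTokenStream() returns the saved chain and only
        // the underlying Tokenizer is reset to the new Reader, so the same
        // TokenStream instance comes back instead of a freshly allocated one.
        TokenStream reused = analyzer.reusableTokenStream("body", new StringReader("second document"));
        while (reused.incrementToken()) { /* index tokens of document two */ }
      }
    }
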
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -30,6 +30,7 @@
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
/**
@@ -109,7 +110,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from an ArabicTokenizer filtered with
+ * @return A TokenStream built from an ArabicTokenizer filtered with
* StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter.
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
@@ -121,5 +122,35 @@
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from an ArabicTokenizer filtered with
+ * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and
+ * ArabicStemFilter.
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new ArabicLetterTokenizer(reader);
+ streams.result = new StopFilter(streams.source, stoptable);
+ streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new ArabicNormalizationFilter(streams.result);
+ streams.result = new ArabicStemFilter(streams.result);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -125,8 +126,9 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * LowerCaseFilter, StandardFilter, StopFilter, and
+ * BrazilianStemFilter.
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer( reader );
@@ -136,5 +138,35 @@
result = new BrazilianStemFilter( result, excltable );
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * LowerCaseFilter, StandardFilter, StopFilter, and
+ * BrazilianStemFilter.
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new StandardFilter(streams.result);
+ streams.result = new StopFilter(streams.result, stoptable);
+ streams.result = new BrazilianStemFilter(streams.result, excltable);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -20,7 +20,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import java.io.IOException;
import java.io.Reader;
import java.util.Set;
@@ -84,4 +86,30 @@
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new CJKTokenizer(reader), stopTable);
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * get (possibly reused) token stream from input
+ *
+ * @param fieldName lucene field name
+ * @param reader input reader
+ * @return TokenStream
+ */
+ public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ /* tokenStream() is final, no back compat issue */
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new CJKTokenizer(reader);
+ streams.result = new StopFilter(streams.source, stopTable);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Aug 16 12:37:05 2009
@@ -278,5 +278,17 @@
// set final offset
final int finalOffset = offset;
this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ offset = bufferIndex = dataLen = 0;
+ preIsTokened = false;
+ tokenType = WORD_TYPE;
+ }
+
+ public void reset(Reader reader) throws IOException {
+ super.reset(reader);
+ reset();
+ }
}
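
Tokenizers touched by this commit gain the same two methods as CJKTokenizer above: reset() clears the per-stream scan state (offset, buffer index, data length), and reset(Reader) swaps in the new input and then delegates to reset(). A hedged sketch of how the pair is driven when one tokenizer instance is reused across inputs; the class name and sample strings are placeholders:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.cjk.CJKTokenizer;

    public class TokenizerResetSketch {
      public static void main(String[] args) throws IOException {
        CJKTokenizer tokenizer = new CJKTokenizer(new StringReader("first input"));
        while (tokenizer.incrementToken()) { /* consume tokens from the first reader */ }

        // Reuse the same instance: reset(Reader) installs the new reader and then
        // calls reset(), which zeroes offset/bufferIndex/dataLen so scanning starts over.
        tokenizer.reset(new StringReader("second input"));
        while (tokenizer.incrementToken()) { /* tokens now come from the second reader */ }
      }
    }
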
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -17,9 +17,11 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
/**
* Title: ChineseAnalyzer
@@ -47,4 +49,31 @@
result = new ChineseFilter(result);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text in the
+ * provided Reader.
+ *
+ * @return A TokenStream built from a ChineseTokenizer filtered with
+ * ChineseFilter.
+ */
+ public final TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ /* tokenStream() is final, no back compat issue */
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new ChineseTokenizer(reader);
+ streams.result = new ChineseFilter(streams.source);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
\ No newline at end of file
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Sun Aug 16 12:37:05 2009
@@ -146,5 +146,15 @@
// set final offset
final int finalOffset = offset;
this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ offset = bufferIndex = dataLen = 0;
+ }
+
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ reset();
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Sun Aug 16 12:37:05 2009
@@ -215,4 +215,9 @@
}
protected abstract void decomposeInternal(final Token token);
+
+ public void reset() throws IOException {
+ super.reset();
+ tokens.clear();
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -126,7 +127,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
+ * @return A TokenStream built from a StandardTokenizer filtered with
* StandardFilter, LowerCaseFilter, and StopFilter
*/
public final TokenStream tokenStream( String fieldName, Reader reader ) {
@@ -136,5 +137,33 @@
result = new StopFilter( result, stoptable );
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text in
+ * the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * StandardFilter, LowerCaseFilter, and StopFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new StopFilter(streams.result, stoptable);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -79,6 +80,7 @@
*/
public GermanAnalyzer() {
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
+ setOverridesTokenStreamMethod(GermanAnalyzer.class);
}
/**
@@ -86,6 +88,7 @@
*/
public GermanAnalyzer(String[] stopwords) {
stopSet = StopFilter.makeStopSet(stopwords);
+ setOverridesTokenStreamMethod(GermanAnalyzer.class);
}
/**
@@ -93,6 +96,7 @@
*/
public GermanAnalyzer(Map stopwords) {
stopSet = new HashSet(stopwords.keySet());
+ setOverridesTokenStreamMethod(GermanAnalyzer.class);
}
/**
@@ -100,6 +104,7 @@
*/
public GermanAnalyzer(File stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords);
+ setOverridesTokenStreamMethod(GermanAnalyzer.class);
}
/**
@@ -126,7 +131,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
+ * @return A TokenStream built from a StandardTokenizer filtered with
* StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -137,4 +142,39 @@
result = new GermanStemFilter(result, exclusionSet);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ if (overridesTokenStreamMethod) {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return tokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new StopFilter(streams.result, stopSet);
+ streams.result = new GermanStemFilter(streams.result, exclusionSet);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -20,8 +20,10 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
@@ -209,7 +211,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
+ * @return A TokenStream built from a StandardTokenizer filtered with
* GreekLowerCaseFilter and StopFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader)
@@ -219,4 +221,31 @@
result = new StopFilter(result, stopSet);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * GreekLowerCaseFilter and StopFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new GreekLowerCaseFilter(streams.source, charset);
+ streams.result = new StopFilter(streams.result, stopSet);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -128,7 +129,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
+ * @return A TokenStream built from a StandardTokenizer filtered with
* StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
@@ -144,5 +145,35 @@
result = new LowerCaseFilter(result);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new StopFilter(streams.result, stoptable);
+ streams.result = new FrenchStemFilter(streams.result, excltable);
+ // Convert to lowercase after stemming!
+ streams.result = new LowerCaseFilter(streams.result);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sun Aug 16 12:37:05 2009
@@ -161,4 +161,9 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset() throws IOException {
+ super.reset();
+ curTermBuffer = null;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Sun Aug 16 12:37:05 2009
@@ -170,4 +170,14 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ reset();
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ started = false;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Sun Aug 16 12:37:05 2009
@@ -109,4 +109,9 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset() throws IOException {
+ super.reset();
+ curTermBuffer = null;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Sun Aug 16 12:37:05 2009
@@ -115,4 +115,15 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ reset();
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ started = false;
+ pos = 0;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -78,6 +79,7 @@
*
*/
public DutchAnalyzer() {
+ setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
stemdict.put("fiets", "fiets"); //otherwise fiet
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
@@ -91,6 +93,7 @@
* @param stopwords
*/
public DutchAnalyzer(String[] stopwords) {
+ setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = StopFilter.makeStopSet(stopwords);
}
@@ -100,6 +103,7 @@
* @param stopwords
*/
public DutchAnalyzer(HashSet stopwords) {
+ setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = stopwords;
}
@@ -109,6 +113,7 @@
* @param stopwords
*/
public DutchAnalyzer(File stopwords) {
+ setOverridesTokenStreamMethod(DutchAnalyzer.class);
try {
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
} catch (IOException e) {
@@ -162,7 +167,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter,
+ * @return A TokenStream built from a StandardTokenizer filtered with StandardFilter,
* StopFilter, DutchStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -172,4 +177,39 @@
result = new DutchStemFilter(result, excltable, stemdict);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a StandardTokenizer filtered with
+ * StandardFilter, StopFilter, DutchStemFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ if (overridesTokenStreamMethod) {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return tokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new StopFilter(streams.result, stoptable);
+ streams.result = new DutchStemFilter(streams.result, excltable);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -56,6 +56,7 @@
*/
public QueryAutoStopWordAnalyzer(Analyzer delegate) {
this.delegate = delegate;
+ setOverridesTokenStreamMethod(QueryAutoStopWordAnalyzer.class);
}
/**
@@ -154,17 +155,97 @@
term = te.term();
}
stopWordsPerField.put(fieldName, stopWords);
+
+ /* if the stopwords for a field are changed,
+ * then saved streams for that field are erased.
+ */
+ Map streamMap = (Map) getPreviousTokenStream();
+ if (streamMap != null)
+ streamMap.remove(fieldName);
+
return stopWords.size();
}
public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = delegate.tokenStream(fieldName, reader);
+ TokenStream result;
+ try {
+ result = delegate.reusableTokenStream(fieldName, reader);
+ } catch (IOException e) {
+ result = delegate.tokenStream(fieldName, reader);
+ }
HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
if (stopWords != null) {
result = new StopFilter(result, stopWords);
}
return result;
}
+
+ private class SavedStreams {
+ /* the underlying stream */
+ TokenStream wrapped;
+
+ /*
+ * when there are no stopwords for the field, refers to wrapped.
+ * if there are stopwords, it is a StopFilter around wrapped.
+ */
+ TokenStream withStopFilter;
+ };
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ if (overridesTokenStreamMethod) {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return tokenStream(fieldName, reader);
+ }
+
+ /* map of SavedStreams for each field */
+ Map streamMap = (Map) getPreviousTokenStream();
+ if (streamMap == null) {
+ streamMap = new HashMap();
+ setPreviousTokenStream(streamMap);
+ }
+
+ SavedStreams streams = (SavedStreams) streamMap.get(fieldName);
+ if (streams == null) {
+ /* an entry for this field does not exist, create one */
+ streams = new SavedStreams();
+ streamMap.put(fieldName, streams);
+ streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
+
+ /* if there are any stopwords for the field, save the stopfilter */
+ HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
+ if (stopWords != null)
+ streams.withStopFilter = new StopFilter(streams.wrapped, stopWords);
+ else
+ streams.withStopFilter = streams.wrapped;
+
+ } else {
+ /*
+ * an entry for this field exists, verify the wrapped stream has not
+ * changed. if it has not, reuse it, otherwise wrap the new stream.
+ */
+ TokenStream result = delegate.reusableTokenStream(fieldName, reader);
+ if (result == streams.wrapped) {
+ /* the wrapped analyzer reused the stream */
+ streams.withStopFilter.reset();
+ } else {
+ /*
+ * the wrapped analyzer did not. if there are any stopwords for the
+ * field, create a new StopFilter around the new stream
+ */
+ streams.wrapped = result;
+ HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
+ if (stopWords != null)
+ streams.withStopFilter = new StopFilter(streams.wrapped, stopWords);
+ else
+ streams.withStopFilter = streams.wrapped;
+ }
+ }
+
+ return streams.withStopFilter;
+ }
/**
* Provides information on which stop words have been identified for a field
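
Unlike the single-chain analyzers above, QueryAutoStopWordAnalyzer derives a different stop-word set per field, so the object stored with setPreviousTokenStream() is a Map of field name to SavedStreams rather than a single holder, and addStopWords() evicts the entry for the field it just recomputed. A hedged usage sketch, assuming a plain WhitespaceAnalyzer delegate and made-up field names; no stop words have been derived yet, so each returned stream is simply the delegate's stream:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;

    public class PerFieldReuseSketch {
      public static void main(String[] args) throws IOException {
        QueryAutoStopWordAnalyzer analyzer =
            new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());

        // Fields are analyzed one at a time, as during indexing; each field name
        // gets its own SavedStreams entry in the per-thread map.
        TokenStream title = analyzer.reusableTokenStream("title", new StringReader("a b c"));
        while (title.incrementToken()) { /* index the title tokens */ }

        TokenStream body = analyzer.reusableTokenStream("body", new StringReader("x y z"));
        while (body.incrementToken()) { /* index the body tokens */ }
      }
    }
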
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
@@ -25,6 +26,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
/**
* Analyzer for Russian language. Supports an external list of stopwords (words that
@@ -246,7 +248,7 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a RussianLetterTokenizer filtered with
+ * @return A TokenStream built from a RussianLetterTokenizer filtered with
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader)
@@ -257,4 +259,32 @@
result = new RussianStemFilter(result, charset);
return result;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) TokenStream which tokenizes all the text
+ * in the provided Reader.
+ *
+ * @return A TokenStream built from a RussianLetterTokenizer filtered with
+ * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new RussianLetterTokenizer(reader, charset);
+ streams.result = new RussianLowerCaseFilter(streams.source, charset);
+ streams.result = new StopFilter(streams.result, stopSet);
+ streams.result = new RussianStemFilter(streams.result, charset);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sun Aug 16 12:37:05 2009
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
@@ -36,6 +37,7 @@
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
this.defaultAnalyzer = defaultAnalyzer;
+ setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
@@ -49,6 +51,7 @@
public ShingleAnalyzerWrapper() {
super();
this.defaultAnalyzer = new StandardAnalyzer();
+ setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
}
public ShingleAnalyzerWrapper(int nGramSize) {
@@ -90,10 +93,50 @@
}
public TokenStream tokenStream(String fieldName, Reader reader) {
- ShingleFilter filter = new ShingleFilter(defaultAnalyzer.tokenStream(
- fieldName, reader));
+ TokenStream wrapped;
+ try {
+ wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader);
+ } catch (IOException e) {
+ wrapped = defaultAnalyzer.tokenStream(fieldName, reader);
+ }
+ ShingleFilter filter = new ShingleFilter(wrapped);
filter.setMaxShingleSize(maxShingleSize);
filter.setOutputUnigrams(outputUnigrams);
return filter;
}
+
+ private class SavedStreams {
+ TokenStream wrapped;
+ ShingleFilter shingle;
+ };
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ if (overridesTokenStreamMethod) {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return tokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader);
+ streams.shingle = new ShingleFilter(streams.wrapped);
+ setPreviousTokenStream(streams);
+ } else {
+ TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader);
+ if (result == streams.wrapped) {
+ /* the wrapped analyzer reused the stream */
+ streams.shingle.reset();
+ } else {
+ /* the wrapped analyzer did not, create a new shingle around the new one */
+ streams.wrapped = result;
+ streams.shingle = new ShingleFilter(streams.wrapped);
+ }
+ }
+ streams.shingle.setMaxShingleSize(maxShingleSize);
+ streams.shingle.setOutputUnigrams(outputUnigrams);
+ return streams.shingle;
+ }
}
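
ShingleAnalyzerWrapper faces the same question as QueryAutoStopWordAnalyzer: the wrapped analyzer may or may not hand back its previous stream, so the wrapper compares the returned instance with the one it saved and either resets the existing ShingleFilter or builds a new one around the new stream. A hedged usage sketch, assuming a WhitespaceAnalyzer delegate, an invented class name, and placeholder field name and text:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

    public class ShingleReuseSketch {
      public static void main(String[] args) throws IOException {
        ShingleAnalyzerWrapper analyzer =
            new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3);

        // First call builds a ShingleFilter around the delegate's stream.
        TokenStream first = analyzer.reusableTokenStream("f", new StringReader("please divide this sentence"));
        while (first.incrementToken()) { /* unigrams plus shingles up to size 3 */ }

        // Second call: if the delegate reused its stream, only ShingleFilter.reset()
        // runs; otherwise a fresh ShingleFilter wraps the delegate's new stream.
        TokenStream second = analyzer.reusableTokenStream("f", new StringReader("divide this sentence again"));
        while (second.incrementToken()) { /* consume the reused chain */ }
      }
    }
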
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sun Aug 16 12:37:05 2009
@@ -336,4 +336,14 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset() throws IOException {
+ super.reset();
+ nextToken = null;
+ shingleBufferPosition = 0;
+ shingleBuf.clear();
+ numFillerTokensToInsert = 0;
+ currentToken = null;
+ hasCurrentToken = false;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -16,11 +16,13 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -29,6 +31,11 @@
* @version 0.2
*/
public class ThaiAnalyzer extends Analyzer {
+
+ public ThaiAnalyzer() {
+ setOverridesTokenStreamMethod(ThaiAnalyzer.class);
+ }
+
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = new StandardTokenizer(reader);
ts = new StandardFilter(ts);
@@ -36,4 +43,32 @@
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return ts;
}
+
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ if (overridesTokenStreamMethod) {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return tokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new ThaiWordFilter(streams.result);
+ streams.result = new StopFilter(streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ setPreviousTokenStream(streams);
+ } else {
+ streams.source.reset(reader);
+ streams.result.reset(); // reset the ThaiWordFilter's state
+ }
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Sun Aug 16 12:37:05 2009
@@ -93,4 +93,9 @@
public final Token next() throws java.io.IOException {
return super.next();
}
+
+ public void reset() throws IOException {
+ super.reset();
+ thaiState = null;
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -57,6 +57,15 @@
assertAnalyzesTo(a, "Ù
ا Ù
ÙÙت Ø£ÙÙ
اÙÙÙ
", new String[] { "Ù
ÙÙت", "اÙÙ
اÙÙÙ
"});
assertAnalyzesTo(a, "اÙØ°ÙÙ Ù
ÙÙت Ø£ÙÙ
اÙÙÙ
", new String[] { "Ù
ÙÙت", "اÙÙ
اÙÙÙ
" }); // stopwords
}
+
+ /**
+ * Simple tests to show things are getting reset correctly, etc.
+ */
+ public void testReusableTokenStream() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ assertAnalyzesToReuse(a, "ÙبÙر", new String[] { "ÙبÙر" });
+ assertAnalyzesToReuse(a, "ÙبÙرة", new String[] { "ÙبÙر" }); // feminine marker
+ }
/**
* Non-arabic text gets treated in a similar way as SimpleAnalyzer.
@@ -80,5 +89,18 @@
assertFalse(ts.incrementToken());
ts.close();
}
+
+ private void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
+ throws Exception {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts
+ .getAttribute(TermAttribute.class);
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
+ }
+
+ assertFalse(ts.incrementToken());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Sun Aug 16 12:37:05 2009
@@ -117,6 +117,14 @@
check("quinzena", "quinzen");
check("quiosque", "quiosqu");
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new BrazilianAnalyzer();
+ checkReuse(a, "boa", "boa");
+ checkReuse(a, "boainain", "boainain");
+ checkReuse(a, "boas", "boas");
+ checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
+ }
private void check(final String input, final String expected) throws IOException {
@@ -128,5 +136,13 @@
assertFalse(stream.incrementToken());
stream.close();
}
+
+ private void checkReuse(Analyzer analyzer, final String input, final String expected) throws IOException {
+ TokenStream stream = analyzer.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, text.term());
+ assertFalse(stream.incrementToken());
+ }
}
\ No newline at end of file
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Sun Aug 16 12:37:05 2009
@@ -22,6 +22,8 @@
import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -60,6 +62,21 @@
assertFalse(tokenizer.incrementToken());
}
+ public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+ for (int i = 0; i < out_tokens.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(termAtt.term(), out_tokens[i].termText);
+ assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
+ assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
+ assertEquals(typeAtt.type(), out_tokens[i].type);
+ }
+ assertFalse(ts.incrementToken());
+ }
+
public void testJa1() throws IOException {
String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
@@ -151,4 +168,38 @@
};
checkCJKToken(str, out_tokens);
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer analyzer = new CJKAnalyzer();
+ String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
+
+ TestToken[] out_tokens = {
+ newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+ };
+ checkCJKTokenReusable(analyzer, str, out_tokens);
+
+ str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
+ TestToken[] out_tokens2 = {
+ newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+ };
+ checkCJKTokenReusable(analyzer, str, out_tokens2);
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java Sun Aug 16 12:37:05 2009
@@ -22,7 +22,10 @@
import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public class TestChineseTokenizer extends TestCase
@@ -42,4 +45,32 @@
correctEndOffset++;
}
}
+
+ public void testReusableTokenStream() throws Exception
+ {
+ Analyzer a = new ChineseAnalyzer();
+ assertAnalyzesToReuse(a, "中华人民共和国",
+ new String[] { "中", "华", "人", "民", "共", "和", "国" },
+ new int[] { 0, 1, 2, 3, 4, 5, 6 },
+ new int[] { 1, 2, 3, 4, 5, 6, 7 });
+ assertAnalyzesToReuse(a, "北京市",
+ new String[] { "北", "京", "市" },
+ new int[] { 0, 1, 2 },
+ new int[] { 1, 2, 3 });
+ }
+
+ private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
+ int startOffsets[], int endOffsets[])
+ throws Exception {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts
+ .getAttribute(TermAttribute.class);
+
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
+ }
+
+ assertFalse(ts.incrementToken());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Sun Aug 16 12:37:05 2009
@@ -34,6 +34,7 @@
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -151,6 +152,38 @@
14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
0, 0 });
}
+
+ public void testReset() throws Exception {
+ String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
+ "Aufgabe", "Ãberwachung" };
+
+ Reader reader = getHyphenationReader("de_DR.xml");
+ if (reader == null) {
+ // we gracefully die if we have no reader
+ return;
+ }
+
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+ .getHyphenationTree(reader);
+
+ Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
+ "Rindfleischüberwachungsgesetz"));
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+ wsTokenizer, hyphenator, dict,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+ CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+ TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class);
+ assertTrue(tf.incrementToken());
+ assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
+ assertTrue(tf.incrementToken());
+ assertEquals("Rind", termAtt.term());
+ wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
+ tf.reset();
+ assertTrue(tf.incrementToken());
+ assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
+ }
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
int[] endOffset, int[] posIncr) throws Exception {
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -36,6 +36,12 @@
public void testStopWord() throws Exception {
assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer analyzer = new CzechAnalyzer();
+ assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
+ assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
+ }
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
@@ -47,4 +53,14 @@
assertFalse(ts.incrementToken());
ts.close();
}
+
+ private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ for (int i=0; i<output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(text.term(), output[i]);
+ }
+ assertFalse(ts.incrementToken());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Sun Aug 16 12:37:05 2009
@@ -22,10 +22,14 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.io.StringReader;
import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -64,6 +68,26 @@
fail();
}
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new GermanAnalyzer();
+ checkReuse(a, "Tisch", "tisch");
+ checkReuse(a, "Tische", "tisch");
+ checkReuse(a, "Tischen", "tisch");
+ }
+
+ /**
+ * subclass that acts just like whitespace analyzer for testing
+ */
+ private class GermanSubclassAnalyzer extends GermanAnalyzer {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new WhitespaceTokenizer(reader);
+ }
+ }
+
+ public void testLUCENE1678BWComp() throws Exception {
+ checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
+ }
private void check(final String input, final String expected) throws IOException {
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
@@ -73,5 +97,12 @@
assertEquals(expected, termAtt.term());
filter.close();
}
-
+
+ private void checkReuse(Analyzer a, String input, String expected) throws IOException {
+ TokenStream stream = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, text.term());
+ assertFalse(stream.incrementToken());
+ }
}
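
A note on the testLUCENE1678BWComp cases above (and the matching Dutch and QueryAutoStopWord ones below): they guard backwards compatibility, so that a subclass which overrides only tokenStream() is never served tokens from its superclass's cached reusable chain. A minimal sketch of the guard, assuming the overridesTokenStreamMethod flag and setOverridesTokenStreamMethod(Class) helper that LUCENE-1678 added to Analyzer; BackCompatReusableAnalyzer is a hypothetical class that repeats the caching from the earlier sketch and adds only the guard, not code from this commit:

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class BackCompatReusableAnalyzer extends Analyzer {

  private static class SavedStreams {
    Tokenizer source;
    TokenStream result;
  }

  public BackCompatReusableAnalyzer() {
    // records whether a subclass overrides tokenStream(String, Reader)
    setOverridesTokenStreamMethod(BackCompatReusableAnalyzer.class);
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseFilter(new WhitespaceTokenizer(reader));
  }

  public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
    if (overridesTokenStreamMethod) {
      // LUCENE-1678: a subclass overrode tokenStream() but not this method,
      // so honour its chain instead of reusing the cached one
      return tokenStream(fieldName, reader);
    }
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      streams.source = new WhitespaceTokenizer(reader);
      streams.result = new LowerCaseFilter(streams.source);
      setPreviousTokenStream(streams);
    } else {
      streams.source.reset(reader);
    }
    return streams.result;
  }
}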
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java Sun Aug 16 12:37:05 2009
@@ -49,6 +49,16 @@
assertFalse(ts.incrementToken());
ts.close();
}
+
+ private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ for (int i=0; i<output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(termAtt.term(), output[i]);
+ }
+ assertFalse(ts.incrementToken());
+ }
/**
* Test the analysis of various greek strings.
@@ -70,5 +80,20 @@
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
-
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new GreekAnalyzer();
+ // Verify the correct analysis of capitals and small accented letters
+ assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
+ new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
+ "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
+ // Verify the correct analysis of small letters with diaeresis and the elimination
+ // of punctuation marks
+ assertAnalyzesToReuse(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
+ new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
+ // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
+ // as well as the elimination of stop words
+ assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
+ new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -84,6 +84,19 @@
assertFalse(ts.incrementToken());
ts.close();
}
+
+ public void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
+ throws Exception {
+
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(termAtt.term(), output[i]);
+ }
+ assertFalse(ts.incrementToken());
+ }
public void testAnalyzer() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer();
@@ -186,5 +199,26 @@
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
}
+
+ public void testReusableTokenStream() throws Exception {
+ FrenchAnalyzer fa = new FrenchAnalyzer();
+ // stopwords
+ assertAnalyzesToReuse(
+ fa,
+ "le la chien les aux chat du des à cheval",
+ new String[] { "chien", "chat", "cheval" });
+
+ // some nouns and adjectives
+ assertAnalyzesToReuse(
+ fa,
+ "lances chismes habitable chiste éléments captifs",
+ new String[] {
+ "lanc",
+ "chism",
+ "habit",
+ "chist",
+ "élément",
+ "captif" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Sun Aug 16 12:37:05 2009
@@ -120,4 +120,18 @@
assertEquals("(fgh,0,3)", termAtt.toString());
assertFalse(tokenizer.incrementToken());
}
+
+ public void testReset() throws Exception {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+ assertTrue(filter.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ assertTrue(filter.incrementToken());
+ assertEquals("(ab,0,2)", termAtt.toString());
+ tokenizer.reset(new StringReader("abcde"));
+ filter.reset();
+ assertTrue(filter.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Sun Aug 16 12:37:05 2009
@@ -108,4 +108,16 @@
assertEquals("(cde,2,5)", termAtt.toString());
assertFalse(tokenizer.incrementToken());
}
+
+ public void testReset() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+ TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(ab,0,2)", termAtt.toString());
+ tokenizer.reset(new StringReader("abcde"));
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Sun Aug 16 12:37:05 2009
@@ -108,4 +108,18 @@
checkStream(filter, exp);
}
+
+ public void testReset() throws Exception {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 3);
+ TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
+ assertTrue(filter.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ assertTrue(filter.incrementToken());
+ assertEquals("(b,1,2)", termAtt.toString());
+ tokenizer.reset(new StringReader("abcde"));
+ filter.reset();
+ assertTrue(filter.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Sun Aug 16 12:37:05 2009
@@ -99,4 +99,16 @@
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
assertFalse(tokenizer.incrementToken());
}
+
+ public void testReset() throws Exception {
+ NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+ TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(b,1,2)", termAtt.toString());
+ tokenizer.reset(new StringReader("abcde"));
+ assertTrue(tokenizer.incrementToken());
+ assertEquals("(a,0,1)", termAtt.toString());
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Sun Aug 16 12:37:05 2009
@@ -18,12 +18,14 @@
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -116,6 +118,31 @@
check("ophoping", "ophop");
check("ophouden", "ophoud");
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new DutchAnalyzer();
+ checkReuse(a, "lichaamsziek", "lichaamsziek");
+ checkReuse(a, "lichamelijk", "licham");
+ checkReuse(a, "lichamelijke", "licham");
+ checkReuse(a, "lichamelijkheden", "licham");
+ }
+
+ /**
+ * subclass that acts just like whitespace analyzer for testing
+ */
+ private class DutchSubclassAnalyzer extends DutchAnalyzer {
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new WhitespaceTokenizer(reader);
+ }
+ }
+
+ public void testLUCENE1678BWComp() throws Exception {
+ Analyzer a = new DutchSubclassAnalyzer();
+ checkReuse(a, "lichaamsziek", "lichaamsziek");
+ checkReuse(a, "lichamelijk", "lichamelijk");
+ checkReuse(a, "lichamelijke", "lichamelijke");
+ checkReuse(a, "lichamelijkheden", "lichamelijkheden");
+ }
private void check(final String input, final String expected) throws IOException {
@@ -127,5 +154,16 @@
assertFalse(stream.incrementToken());
stream.close();
}
+
+ private void checkReuse(Analyzer a, final String input, final String expected)
+ throws IOException {
+ TokenStream stream = a
+ .reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute text = (TermAttribute) stream
+ .getAttribute(TermAttribute.class);
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, text.term());
+ assertFalse(stream.incrementToken());
+ }
}
\ No newline at end of file
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java Sun Aug 16 12:37:05 2009
@@ -18,7 +18,9 @@
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -32,6 +34,7 @@
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
+import java.io.Reader;
public class QueryAutoStopWordAnalyzerTest extends TestCase {
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
@@ -139,6 +142,24 @@
assertTrue("Filter should not prevent stopwords in one field being used in another ", h.length() > 0);
}
-
-
+
+ /**
+ * subclass that acts just like whitespace analyzer for testing
+ */
+ private class QueryAutoStopWordSubclassAnalyzer extends QueryAutoStopWordAnalyzer {
+ public QueryAutoStopWordSubclassAnalyzer() {
+ super(new WhitespaceAnalyzer());
+ }
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new WhitespaceTokenizer(reader);
+ }
+ }
+
+ public void testLUCENE1678BWComp() throws Exception {
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer();
+ a.addStopWords(reader, "repetitiveField", 10);
+ Hits h = search(a, "repetitiveField:boring");
+ assertFalse(h.length() == 0);
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=804680&r1=804679&r2=804680&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Sun Aug 16 12:37:05 2009
@@ -26,6 +26,7 @@
import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -187,5 +188,22 @@
fail("unexpected IOException");
}
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new RussianAnalyzer();
+ assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+ new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
+ assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
+ new String[] { "знан", "хран", "тайн" });
+ }
+ private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
+ TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ for (int i=0; i<output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(termAtt.term(), output[i]);
+ }
+ assertFalse(ts.incrementToken());
+ }
}