You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2010/03/14 21:58:34 UTC
svn commit: r922957 [1/3] - in /lucene/solr/branches/solr: ./ lib/
src/common/org/apache/solr/common/util/ src/java/org/apache/solr/analysis/
src/java/org/apache/solr/handler/ src/java/org/apache/solr/handler/admin/
src/java/org/apache/solr/handler/com...
Author: markrmiller
Date: Sun Mar 14 20:58:32 2010
New Revision: 922957
URL: http://svn.apache.org/viewvc?rev=922957&view=rev
Log:
a hackey commit of stuff needed to get on lucene 3.0.1
Added:
lucene/solr/branches/solr/lib/lucene-analyzers-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-collation-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-core-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-fast-vector-highlighter-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-highlighter-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-memory-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-misc-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-queries-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-snowball-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-spatial-3.0.1.jar (with props)
lucene/solr/branches/solr/lib/lucene-spellchecker-3.0.1.jar (with props)
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
Removed:
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/LengthFilter.java
Modified:
lucene/solr/branches/solr/common-build.xml
lucene/solr/branches/solr/src/common/org/apache/solr/common/util/ConcurrentLRUCache.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianCommon.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java
lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java
lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java
lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java
lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java
lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java
lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/QueryResultKey.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/SolrIndexReader.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/SolrQueryParser.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/WildcardFilter.java
lucene/solr/branches/solr/src/java/org/apache/solr/search/function/FunctionQuery.java
lucene/solr/branches/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
lucene/solr/branches/solr/src/java/org/apache/solr/spelling/FileBasedSpellChecker.java
lucene/solr/branches/solr/src/java/org/apache/solr/spelling/IndexBasedSpellChecker.java
lucene/solr/branches/solr/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
lucene/solr/branches/solr/src/java/org/apache/solr/tst/OldRequestHandler.java
lucene/solr/branches/solr/src/java/org/apache/solr/tst/TestRequestHandler.java
lucene/solr/branches/solr/src/java/org/apache/solr/update/SolrIndexWriter.java
lucene/solr/branches/solr/src/java/org/apache/solr/update/UpdateHandler.java
lucene/solr/branches/solr/src/test/org/apache/solr/BasicFunctionalityTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
lucene/solr/branches/solr/src/test/org/apache/solr/core/AlternateDirectoryTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/core/TestArbitraryIndexDir.java
lucene/solr/branches/solr/src/test/org/apache/solr/highlight/HighlighterTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/spelling/SimpleQueryConverter.java
lucene/solr/branches/solr/src/test/org/apache/solr/update/DirectUpdateHandlerOptimizeTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/update/DirectUpdateHandlerTest.java
lucene/solr/branches/solr/src/test/org/apache/solr/util/TestCharArrayMap.java
lucene/solr/branches/solr/src/test/org/apache/solr/util/TestOpenBitSet.java
Modified: lucene/solr/branches/solr/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/common-build.xml?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/common-build.xml (original)
+++ lucene/solr/branches/solr/common-build.xml Sun Mar 14 20:58:32 2010
@@ -114,7 +114,7 @@
The version suffix of the Lucene artifacts checked into "lib"
IF YOU CHANGE THIS, SANITY CHECK "javadoc.link.lucene"
-->
- <property name="lucene_version" value="2.9.2"/>
+ <property name="lucene_version" value="3.0.1"/>
<!-- The version number to assign to the Maven artifacts. -->
<property name="maven_version" value="1.5-SNAPSHOT"/>
Added: lucene/solr/branches/solr/lib/lucene-analyzers-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-analyzers-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-analyzers-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-collation-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-collation-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-collation-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-core-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-core-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-core-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-fast-vector-highlighter-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-fast-vector-highlighter-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-fast-vector-highlighter-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-highlighter-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-highlighter-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-highlighter-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-memory-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-memory-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-memory-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-misc-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-misc-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-misc-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-queries-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-queries-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-queries-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-snowball-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-snowball-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-snowball-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-spatial-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-spatial-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-spatial-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/solr/branches/solr/lib/lucene-spellchecker-3.0.1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/lib/lucene-spellchecker-3.0.1.jar?rev=922957&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/branches/solr/lib/lucene-spellchecker-3.0.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/solr/branches/solr/src/common/org/apache/solr/common/util/ConcurrentLRUCache.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/common/org/apache/solr/common/util/ConcurrentLRUCache.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/common/org/apache/solr/common/util/ConcurrentLRUCache.java (original)
+++ lucene/solr/branches/solr/src/common/org/apache/solr/common/util/ConcurrentLRUCache.java Sun Mar 14 20:58:32 2010
@@ -366,12 +366,12 @@ public class ConcurrentLRUCache<K,V> {
// necessary because maxSize is private in base class
public Object myInsertWithOverflow(Object element) {
if (size() < myMaxSize) {
- put(element);
+ add(element);
return null;
} else if (size() > 0 && !lessThan(element, heap[1])) {
Object ret = heap[1];
heap[1] = element;
- adjustTop();
+ updateTop();
return ret;
} else {
return element;
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,13 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource; // javadoc @link
import java.io.IOException;
import java.util.LinkedList;
@@ -56,13 +63,23 @@ import java.util.LinkedList;
* responsibility of the implementing subclass. In the "A" "B" => "A" "A" "B"
* example above, the subclass must clone the additional "A" it creates.
*
- * @version $Id$
+ * @deprecated This class does not support custom attributes. Extend TokenFilter instead,
+ * using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState()}
+ * which support all attributes.
*/
+@Deprecated
public abstract class BufferedTokenStream extends TokenFilter {
// in the future, might be faster if we implemented as an array based CircularQueue
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
+ private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+ private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
public BufferedTokenStream(TokenStream input) {
super(input);
}
@@ -77,13 +94,13 @@ public abstract class BufferedTokenStrea
*/
protected abstract Token process(Token t) throws IOException;
- public final Token next() throws IOException {
+ public final boolean incrementToken() throws IOException {
while (true) {
- if (!outQueue.isEmpty()) return outQueue.removeFirst();
+ if (!outQueue.isEmpty()) return writeToken(outQueue.removeFirst());
Token t = read();
- if (null == t) return null;
+ if (null == t) return false;
Token out = process(t);
- if (null != out) return out;
+ if (null != out) return writeToken(out);
// loop back to top in case process() put something on the output queue
}
}
@@ -94,7 +111,7 @@ public abstract class BufferedTokenStrea
*/
protected Token read() throws IOException {
if (inQueue.isEmpty()) {
- Token t = input.next();
+ Token t = readToken();
return t;
}
return inQueue.removeFirst();
@@ -120,13 +137,41 @@ public abstract class BufferedTokenStrea
protected Token peek(int n) throws IOException {
int fillCount = n-inQueue.size();
for (int i=0; i < fillCount; i++) {
- Token t = input.next();
+ Token t = readToken();
if (null==t) return null;
inQueue.addLast(t);
}
return inQueue.get(n-1);
}
+ /** old api emulation for back compat */
+ private Token readToken() throws IOException {
+ if (!input.incrementToken()) {
+ return null;
+ } else {
+ Token token = new Token();
+ token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+ token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+ token.setType(typeAtt.type());
+ token.setFlags(flagsAtt.getFlags());
+ token.setPositionIncrement(posIncAtt.getPositionIncrement());
+ token.setPayload(payloadAtt.getPayload());
+ return token;
+ }
+ }
+
+ /** old api emulation for back compat */
+ private boolean writeToken(Token token) throws IOException {
+ clearAttributes();
+ termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ typeAtt.setType(token.type());
+ flagsAtt.setFlags(token.getFlags());
+ posIncAtt.setPositionIncrement(token.getPositionIncrement());
+ payloadAtt.setPayload(token.getPayload());
+ return true;
+ }
+
/**
* Write a token to the buffered output stream
*/
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java Sun Mar 14 20:58:32 2010
@@ -14,20 +14,22 @@ import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/*
- * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
- * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
- * associated constructors
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
*/
/**
* Construct bigrams for frequently occurring terms while indexing. Single terms
* are still indexed too, with bigrams overlaid. This is achieved through the
- * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
- * of "gram" Example
+ * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
+ * of {@link #GRAM_TYPE} Example:
* <ul>
* <li>input:"the quick brown fox"</li>
* <li>output:|"the","the-quick"|"brown"|"fox"|</li>
@@ -40,14 +42,23 @@ import org.apache.lucene.analysis.TokenS
/*
* Constructors and makeCommonSet based on similar code in StopFilter
*/
+public final class CommonGramsFilter extends TokenFilter {
-public class CommonGramsFilter extends BufferedTokenStream {
-
+ static final String GRAM_TYPE = "gram";
private static final char SEPARATOR = '_';
private final CharArraySet commonWords;
- private StringBuilder buffer = new StringBuilder();
+ private final StringBuilder buffer = new StringBuilder();
+
+ private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+ private int lastStartOffset;
+ private boolean lastWasCommon;
+ private State savedState;
/**
* Construct a token stream filtering the given input using a Set of common
@@ -57,7 +68,6 @@ public class CommonGramsFilter extends B
*
* @param input TokenStream input in filter chain
* @param commonWords The set of common words.
- *
*/
public CommonGramsFilter(TokenStream input, Set commonWords) {
this(input, commonWords, false);
@@ -80,8 +90,7 @@ public class CommonGramsFilter extends B
* @param commonWords The set of common words.
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
- public CommonGramsFilter(TokenStream input, Set commonWords,
- boolean ignoreCase) {
+ public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
super(input);
if (commonWords instanceof CharArraySet) {
this.commonWords = (CharArraySet) commonWords;
@@ -89,7 +98,6 @@ public class CommonGramsFilter extends B
this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
this.commonWords.addAll(commonWords);
}
- init();
}
/**
@@ -101,7 +109,6 @@ public class CommonGramsFilter extends B
*/
public CommonGramsFilter(TokenStream input, String[] commonWords) {
this(input, commonWords, false);
- init();
}
/**
@@ -112,33 +119,21 @@ public class CommonGramsFilter extends B
* @param commonWords words to be used in constructing bigrams
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
- public CommonGramsFilter(TokenStream input, String[] commonWords,
- boolean ignoreCase) {
+ public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
super(input);
- this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
- init();
- }
-
- // Here for future moving to 2.9 api See StopFilter code
-
- public void init() {
- /**
- * termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
- * =(PositionIncrementAttribute)
- * addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
- * addAttribute(TypeAttribute.class);
- */
+ this.commonWords = makeCommonSet(commonWords, ignoreCase);
}
/**
* Build a CharArraySet from an array of common words, appropriate for passing
* into the CommonGramsFilter constructor. This permits this commonWords
* construction to be cached once when an Analyzer is constructed.
- *
- * @see #makeCommonSet(java.lang.String[], boolean) passing false to
- * ignoreCase
+ *
+ * @param commonWords Array of common words which will be converted into the CharArraySet
+ * @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
+ * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
*/
- public static final CharArraySet makeCommonSet(String[] commonWords) {
+ public static CharArraySet makeCommonSet(String[] commonWords) {
return makeCommonSet(commonWords, false);
}
@@ -147,12 +142,11 @@ public class CommonGramsFilter extends B
* into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
* false.
*
- * @param commonWords
+ * @param commonWords Array of common words which will be converted into the CharArraySet
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
*/
- public static final CharArraySet makeCommonSet(String[] commonWords,
- boolean ignoreCase) {
+ public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
commonSet.addAll(Arrays.asList(commonWords));
return commonSet;
@@ -163,61 +157,95 @@ public class CommonGramsFilter extends B
* output the token. If the token and/or the following token are in the list
* of common words also output a bigram with position increment 0 and
* type="gram"
- */
- /*
- * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
- * Token.next() TODO:Consider adding an option to not emit unigram stopwords
+ *
+ * TODO:Consider adding an option to not emit unigram stopwords
* as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
- * changed to work with this. TODO: Consider optimizing for the case of three
+ * changed to work with this.
+ *
+ * TODO: Consider optimizing for the case of three
* commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
* "of-the", "the-year" but with proper management of positions we could
* eliminate the middle bigram "of-the"and save a disk seek and a whole set of
* position lookups.
*/
- public Token process(Token token) throws IOException {
- Token next = peek(1);
- // if this is the last token just spit it out. Any commongram would have
- // been output in the previous call
- if (next == null) {
- return token;
+ public boolean incrementToken() throws IOException {
+ // get the next piece of input
+ if (savedState != null) {
+ restoreState(savedState);
+ savedState = null;
+ saveTermBuffer();
+ return true;
+ } else if (!input.incrementToken()) {
+ return false;
}
-
- /**
- * if this token or next are common then construct a bigram with type="gram"
- * position increment = 0, and put it in the output queue. It will be
- * returned when super.next() is called, before this method gets called with
- * a new token from the input stream See implementation of next() in
- * BufferedTokenStream
+
+ /* We build n-grams before and after stopwords.
+ * When valid, the buffer always contains at least the separator.
+ * If its empty, there is nothing before this stopword.
*/
-
- if (isCommon(token) || isCommon(next)) {
- Token gram = gramToken(token, next);
- write(gram);
+ if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
+ savedState = captureState();
+ gramToken();
+ return true;
}
- // we always return the unigram token
- return token;
+
+ saveTermBuffer();
+ return true;
}
- /** True if token is for a common term. */
- private boolean isCommon(Token token) {
- return commonWords != null
- && commonWords.contains(token.termBuffer(), 0, token.termLength());
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ lastWasCommon = false;
+ savedState = null;
+ buffer.setLength(0);
}
- /** Construct a compound token. */
- private Token gramToken(Token first, Token second) {
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Determines if the current token is a common term
+ *
+ * @return {@code true} if the current token is a common term, {@code false} otherwise
+ */
+ private boolean isCommon() {
+ return commonWords != null && commonWords.contains(termAttribute.termBuffer(), 0, termAttribute.termLength());
+ }
+
+ /**
+ * Saves this information to form the left part of a gram
+ */
+ private void saveTermBuffer() {
buffer.setLength(0);
- buffer.append(first.termText());
+ buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
buffer.append(SEPARATOR);
- buffer.append(second.termText());
- Token result = new Token(buffer.toString(), first.startOffset(), second
- .endOffset(), "gram");
- result.setPositionIncrement(0);
- return result;
+ lastStartOffset = offsetAttribute.startOffset();
+ lastWasCommon = isCommon();
}
-
- public void reset() throws IOException {
- super.reset();
+
+ /**
+ * Constructs a compound token.
+ */
+ private void gramToken() {
+ buffer.append(termAttribute.termBuffer(), 0, termAttribute.termLength());
+ int endOffset = offsetAttribute.endOffset();
+
+ clearAttributes();
+
+ int length = buffer.length();
+ char termText[] = termAttribute.termBuffer();
+ if (length > termText.length) {
+ termText = termAttribute.resizeTermBuffer(length);
+ }
+
+ buffer.getChars(0, length, termText, 0);
+ termAttribute.setTermLength(length);
+ posIncAttribute.setPositionIncrement(0);
+ offsetAttribute.setOffset(lastStartOffset, endOffset);
+ typeAttribute.setType(GRAM_TYPE);
buffer.setLength(0);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -57,7 +57,7 @@ public class CommonGramsFilterFactory ex
throw new RuntimeException(e);
}
} else {
- commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+ commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java Sun Mar 14 20:58:32 2010
@@ -18,8 +18,11 @@ package org.apache.solr.analysis;
import java.io.IOException;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
/**
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
@@ -36,33 +39,36 @@ import org.apache.lucene.analysis.Token;
*/
/*
- * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
- * 2.9 lucene TokenStream api, make necessary changes here.
* See: http://hudson.zones
* .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache
* /lucene/analysis/TokenStream.html and
* http://svn.apache.org/viewvc/lucene/java
* /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
*/
-public class CommonGramsQueryFilter extends BufferedTokenStream {
- //private CharArraySet commonWords;
- private Token prev;
+public final class CommonGramsQueryFilter extends TokenFilter {
+
+ private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+ private State previous;
+ private String previousType;
/**
- * Constructor
- *
- * @param input must be a CommonGramsFilter!
+ * Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter
*
+ * @param input CommonGramsFilter the QueryFilter will use
*/
-
public CommonGramsQueryFilter(CommonGramsFilter input) {
super(input);
- prev = new Token();
}
-
+
+ /**
+ * {@inheritDoc}
+ */
public void reset() throws IOException {
super.reset();
- prev = new Token();
+ previous = null;
+ previousType = null;
}
/**
@@ -71,68 +77,47 @@ public class CommonGramsQueryFilter exte
* <ul>
* <li>input: "the rain in spain falls mainly"
* <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
+ * </ul>
*/
-
- public Token process(Token token) throws IOException {
- Token next = peek(1);
- /*
- * Deal with last token (next=null when current token is the last word) Last
- * token will be a unigram. If previous token was a bigram, then we already
- * output the last token as part of the unigram and should not additionally
- * output the unigram. <p> Example: If the end of the input to the
- * CommonGramsFilter is "...the plain" <ul> <li>current token = "plain"</li>
- * <li>next token = null</li> <li>previous token = "the-plain" (bigram)</li>
- * <li> Since the word "plain" was already output as part of the bigram we
- * don't output it.</li> </ul> Example: If the end of the input to the
- * CommonGramsFilter is "falls mainly" <ul> <li>current token =
- * "mainly"</li> <li>next token = null</li> <li>previous token = "falls"
- * (unigram)</li> <li>Since we haven't yet output the current token, we
- * output it</li> </ul>
- */
-
- // Deal with special case of last token
- if (next == null) {
- if (prev == null) {
- // This is the first and only token i.e. one word query
- return token;
- }
- if (prev != null && prev.type() != "gram") {
- // If previous token was a unigram, output the current token
- return token;
- } else {
- // If previous token was a bigram, we already output it and this token
- // was output as part of the bigram so we are done.
- return null;
+ public boolean incrementToken() throws IOException {
+ while (input.incrementToken()) {
+ State current = captureState();
+
+ if (previous != null && !isGramType()) {
+ restoreState(previous);
+ previous = current;
+ previousType = typeAttribute.type();
+
+ if (isGramType()) {
+ posIncAttribute.setPositionIncrement(1);
+ }
+ return true;
}
+
+ previous = current;
}
- /*
- * Possible cases are: |token |next 1|word |gram 2|word |word The
- * CommonGramsFilter we are wrapping always outputs the unigram word prior
- * to outputting an optional bigram: "the sound of" gets output as |"the",
- * "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the
- * input stream and output it rather than the current token This means that
- * the call to super.next() which reads a token from input and passes it on
- * to this process method will always get a token of type word
- */
- if (next != null && next.type() == "gram") {
- // consume "next" token from list and output it
- token = read();
- // use this to clone the token because clone requires all these args but
- // won't take the token.type
- // see
- // http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
- prev.reinit(token.termBuffer(), 0, token.termLength(), token
- .startOffset(), token.endOffset(), token.type());
- token.setPositionIncrement(1);
- return token;
+ if (previous == null || GRAM_TYPE.equals(previousType)) {
+ return false;
}
+
+ restoreState(previous);
+ previous = null;
+
+ if (isGramType()) {
+ posIncAttribute.setPositionIncrement(1);
+ }
+ return true;
+ }
- // if the next token is not a bigram, then output the token
- // see note above regarding this method of copying token to prev
- prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
- token.endOffset(), token.type());
- assert token.type() == "word";
- return token;
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Convenience method to check if the current type is a gram type
+ *
+ * @return {@code true} if the current type is a gram type, {@code false} otherwise
+ */
+ public boolean isGramType() {
+ return GRAM_TYPE.equals(typeAttribute.type());
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -59,8 +59,7 @@ public class CommonGramsQueryFilterFacto
throw new RuntimeException(e);
}
} else {
- commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
- StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+ commonWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -23,7 +23,6 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.el.GreekCharsets;
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@@ -32,40 +31,16 @@ import org.slf4j.LoggerFactory;
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
{
- @Deprecated
- private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
- static {
- CHARSETS.put("UnicodeGreek",GreekCharsets.UnicodeGreek);
- CHARSETS.put("ISO",GreekCharsets.ISO);
- CHARSETS.put("CP1253",GreekCharsets.CP1253);
- }
-
- private char[] charset = GreekCharsets.UnicodeGreek;
private static Logger logger = LoggerFactory.getLogger(GreekLowerCaseFilterFactory.class);
@Override
public void init(Map<String, String> args) {
super.init(args);
- String charsetName = args.get("charset");
- if (null != charsetName) {
- charset = CHARSETS.get(charsetName);
- if (charset.equals(GreekCharsets.UnicodeGreek))
- logger.warn("Specifying UnicodeGreek is no longer required (default). "
- + "Use of the charset parameter will cause an error in Solr 1.5");
- else
- logger.warn("Support for this custom encoding is deprecated. "
- + "Use of the charset parameter will cause an error in Solr 1.5");
- } else {
- charset = GreekCharsets.UnicodeGreek; /* default to unicode */
- }
- if (null == charset) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "Don't understand charset: " + charsetName);
- }
+
}
public GreekLowerCaseFilter create(TokenStream in) {
- return new GreekLowerCaseFilter(in,charset);
+ return new GreekLowerCaseFilter(in);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
import java.io.Reader;
import java.io.IOException;
@@ -31,11 +32,6 @@ import java.io.IOException;
@Deprecated
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
public Tokenizer create(Reader input) {
- return new StandardTokenizer(new HTMLStripReader(input)) {
- @Override
- public void reset(Reader reader) throws IOException {
- super.reset(new HTMLStripReader(reader));
- }
- };
+ return new StandardTokenizer(Version.LUCENE_24, new HTMLStripReader(input));
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,8 @@ package org.apache.solr.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* When the plain text is extracted from documents, we will often have many words hyphenated and broken into
@@ -52,46 +54,89 @@ import org.apache.lucene.analysis.*;
*/
public final class HyphenatedWordsFilter extends TokenFilter {
- public HyphenatedWordsFilter(TokenStream in) {
- super(in);
- }
+ private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+ private final StringBuilder hyphenated = new StringBuilder();
+ private State savedState;
+ /**
+ * Creates a new HyphenatedWordsFilter
+ *
+ * @param in TokenStream that will be filtered
+ */
+ public HyphenatedWordsFilter(TokenStream in) {
+ super(in);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (input.incrementToken()) {
+ char[] term = termAttribute.termBuffer();
+ int termLength = termAttribute.termLength();
+
+ if (termLength > 0 && term[termLength - 1] == '-') {
+ // a hyphenated word
+ // capture the state of the first token only
+ if (savedState == null) {
+ savedState = captureState();
+ }
+ hyphenated.append(term, 0, termLength - 1);
+ } else if (savedState == null) {
+ // not part of a hyphenated word.
+ return true;
+ } else {
+ // the final portion of a hyphenated word
+ hyphenated.append(term, 0, termLength);
+ unhyphenate();
+ return true;
+ }
+ }
+
+ if (savedState != null) {
+ // the final term ends with a hyphen
+ // add back the hyphen, for backwards compatibility.
+ hyphenated.append('-');
+ unhyphenate();
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ hyphenated.setLength(0);
+ savedState = null;
+ }
+ // ================================================= Helper Methods ================================================
/**
- * @inheritDoc
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
- public final Token next(Token in) throws IOException {
- StringBuilder termText = new StringBuilder(25);
- int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
- Token lastToken = null;
- for (Token token = input.next(in); token != null; token = input.next()) {
- termText.append(token.termBuffer(), 0, token.termLength());
- //current token ends with hyphen -> grab the next token and glue them together
- if (termText.charAt(termText.length() - 1) == '-') {
- wordsMerged++;
- //remove the hyphen
- termText.setLength(termText.length()-1);
- if (startOffset == -1) {
- startOffset = token.startOffset();
- firstPositionIncrement = token.getPositionIncrement();
- }
- lastToken = token;
- } else {
- //shortcut returns token
- if (wordsMerged == 0)
- return token;
- Token mergedToken = new Token(termText.toString(), startOffset, token.endOffset(), token.type());
- mergedToken.setPositionIncrement(firstPositionIncrement);
- return mergedToken;
- }
- }
- //last token ending with hyphen? - we know that we have only one token in
- //this situation, so we can safely return firstToken
- if (startOffset != -1)
- return lastToken;
- else
- return null; //end of token stream
- }
+ * Writes the joined unhyphenated term
+ */
+ private void unhyphenate() {
+ int endOffset = offsetAttribute.endOffset();
+
+ restoreState(savedState);
+ savedState = null;
+
+ char term[] = termAttribute.termBuffer();
+ int length = hyphenated.length();
+ if (length > termAttribute.termLength()) {
+ term = termAttribute.resizeTermBuffer(length);
+ }
+
+ hyphenated.getChars(0, length, term, 0);
+ termAttribute.setTermLength(length);
+ offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
+ hyphenated.setLength(0);
+ }
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -75,7 +75,7 @@ public class KeepWordFilterFactory exten
}
public KeepWordFilter create(TokenStream input) {
- return new KeepWordFilter(input, words, ignoreCase);
+ return new KeepWordFilter(input, (Set)words, ignoreCase);
}
public CharArraySet getWords() {
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java Sun Mar 14 20:58:32 2010
@@ -17,41 +17,69 @@
package org.apache.solr.analysis;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.solr.util.ArraysUtils;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.util.CharArrayMap;
import java.io.IOException;
/**
- * A TokenFilter which filters out Tokens at the same position and Term
- * text as the previous token in the stream.
+ * A TokenFilter which filters out Tokens at the same position and Term text as the previous token in the stream.
*/
-public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
- public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
- protected Token process(Token t) throws IOException {
- Token tok = read();
- while (tok != null && tok.getPositionIncrement()==0) {
- if (null != t) {
- write(t);
- t = null;
- }
- boolean dup=false;
- for (Token outTok : output()) {
- int tokLen = tok.termLength();
- if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
- dup=true;
- //continue;;
- }
+public final class RemoveDuplicatesTokenFilter extends TokenFilter {
+
+ private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+ // keep a seen 'set' after each term with posInc > 0
+ // for now use CharArrayMap vs CharArraySet, as it has clear()
+ private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
+
+ /**
+ * Creates a new RemoveDuplicatesTokenFilter
+ *
+ * @param in TokenStream that will be filtered
+ */
+ public RemoveDuplicatesTokenFilter(TokenStream in) {
+ super(in);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (input.incrementToken()) {
+ final char term[] = termAttribute.termBuffer();
+ final int length = termAttribute.termLength();
+ final int posIncrement = posIncAttribute.getPositionIncrement();
+
+ if (posIncrement > 0) {
+ previous.clear();
}
- if (!dup){
- write(tok);
+
+ boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
+
+ // clone the term, and add to the set of seen terms.
+ char saved[] = new char[length];
+ System.arraycopy(term, 0, saved, 0, length);
+ previous.put(saved, Boolean.TRUE);
+
+ if (!duplicate) {
+ return true;
}
- tok = read();
- }
- if (tok != null) {
- pushBack(tok);
}
- return t;
+ return false;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ previous.clear();
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianCommon.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianCommon.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianCommon.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianCommon.java Sun Mar 14 20:58:32 2010
@@ -16,46 +16,46 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
-import org.apache.lucene.analysis.ru.*;
-import java.util.Map;
-import java.util.HashMap;
-import org.apache.solr.core.SolrConfig;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@Deprecated
-public class RussianCommon {
-
- private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
-
- private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
- static {
- CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
- CHARSETS.put("KOI8",RussianCharsets.KOI8);
- CHARSETS.put("CP1251",RussianCharsets.CP1251);
- }
-
- public static char[] getCharset(String name) {
- if (null == name)
- return RussianCharsets.UnicodeRussian;
-
- char[] charset = CHARSETS.get(name);
-
- if (charset.equals(RussianCharsets.UnicodeRussian))
- logger.warn("Specifying UnicodeRussian is no longer required (default). "
- + "Use of the charset parameter will cause an error in Solr 1.5");
- else
- logger.warn("Support for this custom encoding is deprecated. "
- + "Use of the charset parameter will cause an error in Solr 1.5");
-
- if (null == charset) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "Don't understand charset: " + name);
- }
- return charset;
- }
-}
+//package org.apache.solr.analysis;
+//import org.apache.lucene.analysis.ru.*;
+//import java.util.Map;
+//import java.util.HashMap;
+//import org.apache.solr.core.SolrConfig;
+//import org.apache.solr.common.SolrException;
+//import org.apache.solr.common.SolrException.ErrorCode;
+//import org.slf4j.Logger;
+//import org.slf4j.LoggerFactory;
+//
+//@Deprecated
+//public class RussianCommon {
+//
+// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
+//
+// private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
+// static {
+// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
+// CHARSETS.put("KOI8",RussianCharsets.KOI8);
+// CHARSETS.put("CP1251",RussianCharsets.CP1251);
+// }
+//
+// public static char[] getCharset(String name) {
+// if (null == name)
+// return RussianCharsets.UnicodeRussian;
+//
+// char[] charset = CHARSETS.get(name);
+//
+// if (charset.equals(RussianCharsets.UnicodeRussian))
+// logger.warn("Specifying UnicodeRussian is no longer required (default). "
+// + "Use of the charset parameter will cause an error in Solr 1.5");
+// else
+// logger.warn("Support for this custom encoding is deprecated. "
+// + "Use of the charset parameter will cause an error in Solr 1.5");
+//
+// if (null == charset) {
+// throw new SolrException(ErrorCode.SERVER_ERROR,
+// "Don't understand charset: " + name);
+// }
+// return charset;
+// }
+//}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java Sun Mar 14 20:58:32 2010
@@ -23,17 +23,10 @@ import java.util.Map;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
- @Deprecated
- private char[] charset;
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- charset = RussianCommon.getCharset(args.get("charset"));
- }
+
public RussianLetterTokenizer create(Reader in) {
- return new RussianLetterTokenizer(in,charset);
+ return new RussianLetterTokenizer(in);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -23,17 +23,9 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
- @Deprecated
- private char[] charset;
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- charset = RussianCommon.getCharset(args.get("charset"));
- }
public RussianLowerCaseFilter create(TokenStream in) {
- return new RussianLowerCaseFilter(in,charset);
+ return new RussianLowerCaseFilter(in);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -25,16 +25,10 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.ru.RussianStemFilter;
public class RussianStemFilterFactory extends BaseTokenFilterFactory {
- @Deprecated
- private char[] charset;
-
- public void init(Map<String, String> args) {
- super.init(args);
- charset = RussianCommon.getCharset(args.get("charset"));
- }
+
public RussianStemFilter create(TokenStream in) {
- return new RussianStemFilter(in,charset);
+ return new RussianStemFilter(in);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java Sun Mar 14 20:58:32 2010
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
import java.io.Reader;
@@ -28,6 +29,6 @@ import java.io.Reader;
public class StandardTokenizerFactory extends BaseTokenizerFactory {
public StandardTokenizer create(Reader input) {
- return new StandardTokenizer(input);
+ return new StandardTokenizer(Version.LUCENE_24, input);
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -58,7 +58,7 @@ public class StopFilterFactory extends B
throw new RuntimeException(e);
}
} else {
- stopWords = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
+ stopWords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
@@ -79,8 +79,7 @@ public class StopFilterFactory extends B
}
public StopFilter create(TokenStream input) {
- StopFilter stopFilter = new StopFilter(input,stopWords,ignoreCase);
- stopFilter.setEnablePositionIncrements(enablePositionIncrements);
+ StopFilter stopFilter = new StopFilter(enablePositionIncrements, input,stopWords,ignoreCase);
return stopFilter;
}
}
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilter.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,12 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
@@ -39,11 +45,16 @@ import java.util.LinkedList;
public class SynonymFilter extends TokenFilter {
private final SynonymMap map; // Map<String, SynonymMap>
- private Iterator<Token> replacement; // iterator over generated tokens
+ private Iterator<AttributeSource> replacement; // iterator over generated tokens
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
this.map = map;
+ // just ensuring these attributes exist...
+ addAttribute(TermAttribute.class);
+ addAttribute(PositionIncrementAttribute.class);
+ addAttribute(OffsetAttribute.class);
+ addAttribute(TypeAttribute.class);
}
@@ -65,74 +76,100 @@ public class SynonymFilter extends Token
* - preserve original positionIncrement of first matched token
*/
@Override
- public Token next(Token target) throws IOException {
+ public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement!=null && replacement.hasNext()) {
- return replacement.next();
+ copy(this, replacement.next());
+ return true;
}
// common case fast-path of first token not matching anything
- Token firstTok = nextTok(target);
- if (firstTok == null) return null;
- SynonymMap result = map.submap!=null ? map.submap.get(firstTok.termBuffer(), 0, firstTok.termLength()) : null;
- if (result == null) return firstTok;
+ AttributeSource firstTok = nextTok();
+ if (firstTok == null) return false;
+ TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
+ SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
+ if (result == null) {
+ copy(this, firstTok);
+ return true;
+ }
+ // fast-path failed, clone ourselves if needed
+ if (firstTok == this)
+ firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
- matched = new LinkedList<Token>();
+ matched = new LinkedList<AttributeSource>();
result = match(result);
if (result==null) {
// no match, simply return the first token read.
- return firstTok;
+ copy(this, firstTok);
+ return true;
}
// reuse, or create new one each time?
- ArrayList<Token> generated = new ArrayList<Token>(result.synonyms.length + matched.size() + 1);
+ ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
- Token lastTok = matched.isEmpty() ? firstTok : matched.getLast();
+ AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
- Token origTok = includeOrig ? firstTok : null;
- int origPos = firstTok.getPositionIncrement(); // position of origTok in the original stream
+ AttributeSource origTok = includeOrig ? firstTok : null;
+ PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
+ int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream
for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
- Token newTok = new Token(firstTok.startOffset(), lastTok.endOffset(), firstTok.type());
- newTok.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
+ AttributeSource newTok = firstTok.cloneAttributes();
+ TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
+ OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
+ TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
+ PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
+
+ OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
+
+ newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
+ newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
repPos += repTok.getPositionIncrement();
if (i==0) repPos=origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
- origTok.setPositionIncrement(origPos-pos);
+ PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+ origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
- pos += origTok.getPositionIncrement();
+ pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
- if (origTok != null) origPos += origTok.getPositionIncrement();
+ if (origTok != null) {
+ origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+ origPos += origPosInc.getPositionIncrement();
+ }
}
- newTok.setPositionIncrement(repPos - pos);
+ newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
- pos += newTok.getPositionIncrement();
+ pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok!=null) {
- origTok.setPositionIncrement(origPos-pos);
+ PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+ origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
- pos += origTok.getPositionIncrement();
+ pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
- if (origTok != null) origPos += origTok.getPositionIncrement();
+ if (origTok != null) {
+ origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+ origPos += origPosInc.getPositionIncrement();
+ }
}
// what if we replaced a longer sequence with a shorter one?
@@ -151,27 +188,22 @@ public class SynonymFilter extends Token
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
- private LinkedList<Token> buffer;
- private LinkedList<Token> matched;
-
- private Token nextTok() throws IOException {
- if (buffer!=null && !buffer.isEmpty()) {
- return buffer.removeFirst();
- } else {
- return input.next();
- }
- }
+ private LinkedList<AttributeSource> buffer;
+ private LinkedList<AttributeSource> matched;
- private Token nextTok(Token target) throws IOException {
+ private AttributeSource nextTok() throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
- return input.next(target);
+ if (input.incrementToken()) {
+ return this;
+ } else
+ return null;
}
}
- private void pushTok(Token t) {
- if (buffer==null) buffer=new LinkedList<Token>();
+ private void pushTok(AttributeSource t) {
+ if (buffer==null) buffer=new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
@@ -179,15 +211,20 @@ public class SynonymFilter extends Token
SynonymMap result = null;
if (map.submap != null) {
- Token tok = nextTok();
+ AttributeSource tok = nextTok();
if (tok != null) {
+ // clone ourselves.
+ if (tok == this)
+ tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
- SynonymMap subMap = map.submap.get(tok.termBuffer(), 0, tok.termLength());
+ TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
+ SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());
if (subMap != null) {
// recurse
result = match(subMap);
}
+;
if (result != null) {
matched.addFirst(tok);
} else {
@@ -205,6 +242,15 @@ public class SynonymFilter extends Token
return result;
}
+ private void copy(AttributeSource target, AttributeSource source) {
+ if (target == source)
+ return;
+ for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
+ sourceIt.hasNext();) {
+ sourceIt.next().copyTo(targetIt.next());
+ }
+ }
+
@Override
public void reset() throws IOException {
input.reset();
Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java Sun Mar 14 20:58:32 2010
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
@@ -135,8 +136,9 @@ public class SynonymFilterFactory extend
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
- for( Token token = ts.next(); token != null; token = ts.next() ){
- String text = new String(token.termBuffer(), 0, token.termLength());
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ while (ts.incrementToken()){
+ String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
if( text.length() > 0 )
tokList.add( text );
}