You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ry...@apache.org on 2009/08/24 20:58:23 UTC
svn commit: r807338 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/
src/test/test-files/solr/conf/
Author: ryan
Date: Mon Aug 24 18:58:22 2009
New Revision: 807338
URL: http://svn.apache.org/viewvc?rev=807338&view=rev
Log:
SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
rather than a TokenStream (that may or may not be a Tokenizer). This change
is required to take advantage of the Token reuse improvements in lucene 2.9.
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java
lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java
lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Aug 24 18:58:22 2009
@@ -42,6 +42,12 @@
for your request handlers in solrconfig.xml, see the example solrconfig.xml for
sample syntax.)
+The TokenizerFactory API has changed to explicitly return a Tokenizer rather than
+a TokenStream (that may or may not be a Tokenizer). This change is required
+to take advantage of the Token reuse improvements in lucene 2.9. For more
+information, see SOLR-1377.
+
+
Versions of Major Components
----------------------------
Apache Lucene 2.9-dev r804692
@@ -615,6 +621,10 @@
45. SOLR-1276: Added StatsComponentTest (Rafał Kuć, gsingers)
+46. SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
+    rather than a TokenStream (that may or may not be a Tokenizer). This change
+    is required to take advantage of the Token reuse improvements in lucene 2.9. (ryan)
+
Build
----------------------
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -18,9 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
+import java.io.IOException;
/**
* @version $Id$
@@ -28,7 +30,12 @@
*/
@Deprecated
public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
- public TokenStream create(Reader input) {
- return new StandardTokenizer(new HTMLStripReader(input));
+ public Tokenizer create(Reader input) {
+ return new StandardTokenizer(new HTMLStripReader(input)) {
+ @Override
+ public void reset(Reader reader) throws IOException {
+ super.reset(new HTMLStripReader(reader));
+ }
+ };
}
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -18,9 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.Reader;
+import java.io.IOException;
/**
* @version $Id$
@@ -28,7 +30,12 @@
*/
@Deprecated
public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
- public TokenStream create(Reader input) {
- return new WhitespaceTokenizer(new HTMLStripReader(input));
+ public Tokenizer create(Reader input) {
+ return new WhitespaceTokenizer(new HTMLStripReader(input)) {
+ @Override
+ public void reset(Reader input) throws IOException {
+ super.reset(new HTMLStripReader(input));
+ }
+ };
}
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -17,16 +17,6 @@
package org.apache.solr.analysis;
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.core.SolrConfig;
-
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
@@ -36,6 +26,11 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.solr.common.SolrException;
+
/**
* This tokenizer uses regex pattern matching to construct distinct tokens
@@ -103,41 +98,44 @@
/**
* Split the input using configured pattern
*/
- public TokenStream create(Reader input) {
+ public Tokenizer create(final Reader in) {
try {
- // Read the input into a single string
- String str = IOUtils.toString( input );
-
- Matcher matcher = pattern.matcher( str );
- List<Token> tokens = (group < 0 )
- ? split( matcher, str )
- : group( matcher, str, group );
-
- final Iterator<Token> iter = tokens.iterator();
- return new TokenStream() {
- @Override
- public boolean incrementToken() throws IOException {
- return super.incrementToken();
- }
+ return new Tokenizer(in) {
+ {init();}
+
+ List<Token> tokens;
+ Iterator<Token> iter;
+
+ void init() throws IOException {
+ // Read the input into a single string
+ String str = IOUtils.toString( input );
+
+ Matcher matcher = pattern.matcher( str );
+ tokens = (group < 0 )
+ ? split( matcher, str )
+ : group( matcher, str, group );
+ iter = tokens.iterator();
+ }
+
+// @Override
+// public boolean incrementToken() throws IOException {
+// return super.incrementToken();
+// }
@Override
public void end() throws IOException {
super.end();
}
- @Override
- public Token next(Token reusableToken) throws IOException {
- return super.next(reusableToken);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- }
+// @Override
+// public Token next(Token reusableToken) throws IOException {
+// return super.next(reusableToken);
+// }
@Override
- public void close() throws IOException {
- super.close();
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ init();
}
@Override
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -65,6 +65,6 @@
public Map<String,String> getArgs();
/** Creates a TokenStream of the specified input */
- public TokenStream create(Reader input);
+ public Tokenizer create(Reader input);
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -47,7 +47,7 @@
this.precisionStep = precisionStep;
}
- public TokenStream create(Reader input) {
+ public Tokenizer create(Reader input) {
return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
}
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java Mon Aug 24 18:58:22 2009
@@ -466,6 +466,25 @@
}
+
+ public void testTokenizer() {
+
+ assertU(adoc("id", "4055",
+ "patterntok", "Hello,There"));
+ assertU(adoc("id", "4056",
+ "patterntok", "Goodbye,Now"));
+ assertU(commit());
+
+ assertQ("make sure it split ok",
+ req("patterntok:Hello")
+ ,"*[count(//doc)=1]"
+ );
+ assertQ("make sure it split ok",
+ req("patterntok:Goodbye")
+ ,"*[count(//doc)=1]"
+ );
+ }
+
public void testConfigDefaults() {
assertU(adoc("id", "42",
"name", "Zapp Brannigan"));
Modified: lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml Mon Aug 24 18:58:22 2009
@@ -205,6 +205,11 @@
<tokenizer class="solr.KeywordTokenizerFactory"/>
</analyzer>
</fieldtype>
+ <fieldtype name="patterntok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
+ </analyzer>
+ </fieldtype>
<fieldtype name="porterfilt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -422,6 +427,7 @@
<field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
<field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
<field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+ <field name="patterntok" type="patterntok" indexed="true" stored="true"/>
<field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
<field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
<field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>