Posted to solr-commits@lucene.apache.org by ry...@apache.org on 2009/08/24 20:58:23 UTC

svn commit: r807338 - in /lucene/solr/trunk: ./ src/java/org/apache/solr/analysis/ src/test/org/apache/solr/ src/test/test-files/solr/conf/

Author: ryan
Date: Mon Aug 24 18:58:22 2009
New Revision: 807338

URL: http://svn.apache.org/viewvc?rev=807338&view=rev
Log:
SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
rather than a TokenStream (which may or may not be a Tokenizer).  This change
is required to take advantage of the Token reuse improvements in Lucene 2.9.
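
As a rough illustration of the new contract (the class name
MyWhitespaceTokenizerFactory is hypothetical and not part of this commit;
Tokenizer, WhitespaceTokenizer and BaseTokenizerFactory are the existing
Lucene/Solr types), a factory now looks roughly like this:

  package org.apache.solr.analysis;

  import java.io.Reader;

  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.WhitespaceTokenizer;

  // Hypothetical example factory, shown only to illustrate the new signature.
  public class MyWhitespaceTokenizerFactory extends BaseTokenizerFactory {
    // Old signature: public TokenStream create(Reader input)
    // New signature: the concrete Tokenizer type is exposed so callers can
    // invoke reset(Reader) and reuse one instance across documents.
    public Tokenizer create(Reader input) {
      return new WhitespaceTokenizer(input);
    }
  }

Because the factory now returns a Tokenizer, consumers can rebind the same
instance to a new Reader instead of building a fresh TokenStream per document,
which is what the Lucene 2.9 token reuse path relies on.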

Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java
    lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java
    lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java
    lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Aug 24 18:58:22 2009
@@ -42,6 +42,12 @@
 for your request handlers in solrconfig.xml, see the example solrconfig.xml for
 sample syntax.) 
 
+The TokenizerFactory API has changed to explicitly return a Tokenizer rather than
+a TokenStream (which may or may not be a Tokenizer).  This change is required
+to take advantage of the Token reuse improvements in Lucene 2.9.  For more
+information, see SOLR-1377.
+
+
 Versions of Major Components
 ----------------------------
 Apache Lucene 2.9-dev r804692
@@ -615,6 +621,10 @@
 
 45. SOLR1276: Added StatsComponentTest (Rafał Kuć, gsingers)
 
+46. SOLR-1377: The TokenizerFactory API has changed to explicitly return a Tokenizer
+    rather than a TokenStream (which may or may not be a Tokenizer).  This change
+    is required to take advantage of the Token reuse improvements in Lucene 2.9. (ryan)
+    
 
 Build
 ----------------------

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripStandardTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -28,7 +30,12 @@
  */
 @Deprecated
 public class HTMLStripStandardTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new StandardTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new StandardTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader reader) throws IOException {
+        super.reset(new HTMLStripReader(reader));
+      }
+    };
   }
 }
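
The reset(Reader) override above is what keeps HTML stripping in place when the
tokenizer is reused: a plain super.reset(reader) would hand the raw, un-stripped
markup of the next document straight to StandardTokenizer.  A rough sketch of the
reuse flow this supports (the driver class and input strings below are
illustrative only, not part of the patch):

  import java.io.IOException;
  import java.io.StringReader;

  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.solr.analysis.HTMLStripStandardTokenizerFactory;

  public class HtmlStripReuseSketch {
    public static void main(String[] args) throws IOException {
      HTMLStripStandardTokenizerFactory factory = new HTMLStripStandardTokenizerFactory();
      // First document: create() wraps the Reader in HTMLStripReader once.
      Tokenizer tok = factory.create(new StringReader("<b>first</b> doc"));
      // ... consume tokens for the first document ...

      // Second document: the overridden reset(Reader) re-wraps the new Reader
      // in HTMLStripReader, so markup is still stripped after reuse.
      tok.reset(new StringReader("<i>second</i> doc"));
      // ... consume tokens for the second document ...
    }
  }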

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -18,9 +18,11 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 
 import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -28,7 +30,12 @@
  */
 @Deprecated
 public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
-  public TokenStream create(Reader input) {
-    return new WhitespaceTokenizer(new HTMLStripReader(input));
+  public Tokenizer create(Reader input) {
+    return new WhitespaceTokenizer(new HTMLStripReader(input)) {
+      @Override
+      public void reset(Reader input) throws IOException {
+        super.reset(new HTMLStripReader(input));
+      }
+    };
   }
 }

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -17,16 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.core.SolrConfig;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
@@ -36,6 +26,11 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.solr.common.SolrException;
+
 
 /**
  * This tokenizer uses regex pattern matching to construct distinct tokens
@@ -103,41 +98,44 @@
   /**
    * Split the input using configured pattern
    */
-  public TokenStream create(Reader input) {
+  public Tokenizer create(final Reader in) {
     try {
-      // Read the input into a single string
-      String str = IOUtils.toString( input );
-      
-      Matcher matcher = pattern.matcher( str );
-      List<Token> tokens = (group < 0 ) 
-        ? split( matcher, str )
-        : group( matcher, str, group );
-        
-      final Iterator<Token> iter = tokens.iterator();
-      return new TokenStream() {
-        @Override
-        public boolean incrementToken() throws IOException {
-          return super.incrementToken();
-        }
+      return new Tokenizer(in) {
+        {init();}
+
+        List<Token> tokens;
+        Iterator<Token> iter;
+
+        void init() throws IOException {
+          // Read the input into a single string
+          String str = IOUtils.toString( input );
+
+          Matcher matcher = pattern.matcher( str );
+          tokens = (group < 0 )
+                  ? split( matcher, str )
+                  : group( matcher, str, group );
+          iter = tokens.iterator();
+        }
+
+//        @Override
+//        public boolean incrementToken() throws IOException {
+//          return super.incrementToken();
+//        }
 
         @Override
         public void end() throws IOException {
           super.end();
         }
 
-        @Override
-        public Token next(Token reusableToken) throws IOException {
-          return super.next(reusableToken);
-        }
-
-        @Override
-        public void reset() throws IOException {
-          super.reset();
-        }
+//        @Override
+//        public Token next(Token reusableToken) throws IOException {
+//          return super.next(reusableToken);
+//        }
 
         @Override
-        public void close() throws IOException {
-          super.close();
+        public void reset(Reader input) throws IOException {
+          super.reset(input);
+          init();
         }
 
         @Override

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -65,6 +65,6 @@
   public Map<String,String> getArgs();
   
   /** Creates a TokenStream of the specified input */
-  public TokenStream create(Reader input);
+  public Tokenizer create(Reader input);
 }
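
Since create(Reader) is now declared to return Tokenizer, callers can cache one
instance and rebind it to each new Reader via reset(Reader) instead of building a
new TokenStream per document.  A rough, illustrative sketch (ReusingConsumer is a
hypothetical class, not added by this commit, and assumes single-threaded use):

  import java.io.IOException;
  import java.io.Reader;

  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.solr.analysis.TokenizerFactory;

  public class ReusingConsumer {
    private final TokenizerFactory factory;
    private Tokenizer cached;  // one cached instance, reused across documents

    public ReusingConsumer(TokenizerFactory factory) {
      this.factory = factory;
    }

    public Tokenizer tokenizerFor(Reader reader) throws IOException {
      if (cached == null) {
        cached = factory.create(reader);  // first use: build the Tokenizer
      } else {
        cached.reset(reader);             // later uses: rebind to the new Reader
      }
      return cached;
    }
  }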
 

Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java Mon Aug 24 18:58:22 2009
@@ -47,7 +47,7 @@
     this.precisionStep = precisionStep;
   }
 
-  public TokenStream create(Reader input) {
+  public Tokenizer create(Reader input) {
     return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
   }
 }

Modified: lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/BasicFunctionalityTest.java Mon Aug 24 18:58:22 2009
@@ -466,6 +466,25 @@
     
   }
 
+
+  public void testTokenizer() {
+
+    assertU(adoc("id",  "4055",
+            "patterntok", "Hello,There"));
+    assertU(adoc("id",  "4056",
+            "patterntok", "Goodbye,Now"));
+    assertU(commit());
+
+    assertQ("make sure it split ok",
+            req("patterntok:Hello")
+            ,"*[count(//doc)=1]"
+    );
+    assertQ("make sure it split ok",
+            req("patterntok:Goodbye")
+            ,"*[count(//doc)=1]"
+    );
+  }
+
   public void testConfigDefaults() {
     assertU(adoc("id", "42",
                  "name", "Zapp Brannigan"));

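The test above relies on the patterntok fieldtype added to the test schema below,
which configures solr.PatternTokenizerFactory with pattern="," so that a value
like "Hello,There" is split into the tokens "Hello" and "There".  A rough sketch
of driving the factory directly (the driver class is hypothetical; it assumes the
factory's init(Map) accepts the same "pattern" argument the schema passes as an
attribute):

  import java.io.StringReader;
  import java.util.HashMap;
  import java.util.Map;

  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.solr.analysis.PatternTokenizerFactory;

  public class PatternTokSketch {
    public static void main(String[] args) throws Exception {
      PatternTokenizerFactory factory = new PatternTokenizerFactory();
      Map<String,String> init = new HashMap<String,String>();
      init.put("pattern", ",");   // same config as the patterntok fieldtype
      factory.init(init);
      Tokenizer tok = factory.create(new StringReader("Hello,There"));
      // Expected tokens: "Hello" and "There", which is why patterntok:Hello
      // matches exactly one document in the test above.
    }
  }
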
Modified: lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=807338&r1=807337&r2=807338&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml Mon Aug 24 18:58:22 2009
@@ -205,6 +205,11 @@
         <tokenizer class="solr.KeywordTokenizerFactory"/>
       </analyzer>
     </fieldtype>
+    <fieldtype name="patterntok" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
+      </analyzer>
+    </fieldtype>
     <fieldtype name="porterfilt" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -422,6 +427,7 @@
    <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
    <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
    <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+   <field name="patterntok" type="patterntok" indexed="true" stored="true"/>
    <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
    <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
    <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>