You are viewing a plain-text version of this content; the canonical link to the original was not preserved in this copy.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/15 21:24:26 UTC

svn commit: r1049693 - in /lucene/dev/trunk/solr: ./ src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/

Author: sarowe
Date: Wed Dec 15 20:24:26 2010
New Revision: 1049693

URL: http://svn.apache.org/viewvc?rev=1049693&view=rev
Log:
SOLR-2188: provide maxTokenLength arg for Classic, Standard, and UAX29URLEmail tokenizer factories

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Dec 15 20:24:26 2010
@@ -314,6 +314,8 @@ New Features
   Adding a parameter NOW=<time_in_ms> to the request will override the
   current time.  (Peter Sturge, yonik)
 
+* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer,
+  StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe)
 
 Optimizations
 ----------------------

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
 import java.util.Map;
@@ -28,13 +30,20 @@ import java.util.Map;
  */
 
 public class ClassicTokenizerFactory extends BaseTokenizerFactory {
+
+  private int maxTokenLength;
+
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
+    maxTokenLength = getInt("maxTokenLength", 
+                            StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
   }
 
   public Tokenizer create(Reader input) {
-    return new ClassicTokenizer(luceneMatchVersion, input);
+    ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input); 
+    tokenizer.setMaxTokenLength(maxTokenLength);
+    return tokenizer;
   }
 }

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -17,6 +17,7 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
@@ -27,13 +28,21 @@ import java.util.Map;
  */
 
 public class StandardTokenizerFactory extends BaseTokenizerFactory {
+  
+  private int maxTokenLength;
+  
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
+    maxTokenLength = getInt("maxTokenLength", 
+                            StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
   }
 
   public StandardTokenizer create(Reader input) {
-    return new StandardTokenizer(luceneMatchVersion, input);
+    StandardTokenizer tokenizer
+      = new StandardTokenizer(luceneMatchVersion, input); 
+    tokenizer.setMaxTokenLength(maxTokenLength);
+    return tokenizer;
   }
 }

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 
 
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 
 import java.io.Reader;
@@ -31,13 +32,20 @@ import java.util.Map;
  */
 
 public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
+
+  private int maxTokenLength;
+
   @Override
   public void init(Map<String,String> args) {
     super.init(args);
     assureMatchVersion();
+    maxTokenLength = getInt("maxTokenLength",
+                            StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
   }
 
   public UAX29URLEmailTokenizer create(Reader input) {
-    return new UAX29URLEmailTokenizer(input);
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input); 
+    tokenizer.setMaxTokenLength(maxTokenLength);
+    return tokenizer;
   }
 }

Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -40,6 +42,24 @@ public class TestStandardFactories exten
         new String[] {"Wha\u0301t's", "this", "thing", "do" });
   }
   
+  public void testStandardTokenizerMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    StandardTokenizerFactory factory = new StandardTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
+  
   /**
    * Test ClassicTokenizerFactory
    */
@@ -52,6 +72,24 @@ public class TestStandardFactories exten
         new String[] {"What's", "this", "thing", "do" });
   }
   
+  public void testClassicTokenizerMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
+  
   /**
    * Test ClassicFilterFactory
    */

Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,9 @@ package org.apache.solr.analysis;
 
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
@@ -152,4 +155,22 @@ public class TestUAX29URLEmailTokenizerF
         }
     );
   }
+
+  public void testMaxTokenLength() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0 ; i < 100 ; ++i) {
+      builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+    }
+    String longWord = builder.toString();
+    String content = "one two three " + longWord + " four five six";
+    Reader reader = new StringReader(content);
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+    args.put("maxTokenLength", "1000");
+    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+    factory.init(args);
+    Tokenizer stream = factory.create(reader);
+    assertTokenStreamContents(stream, 
+        new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+  }
 }