You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/12/15 21:24:26 UTC
svn commit: r1049693 - in /lucene/dev/trunk/solr: ./
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
Author: sarowe
Date: Wed Dec 15 20:24:26 2010
New Revision: 1049693
URL: http://svn.apache.org/viewvc?rev=1049693&view=rev
Log:
SOLR-2188: provide maxTokenLength arg for Classic, Standard, and UAX29URLEmail tokenizer factories
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Dec 15 20:24:26 2010
@@ -314,6 +314,8 @@ New Features
Adding a parameter NOW=<time_in_ms> to the request will override the
current time. (Peter Sturge, yonik)
+* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer,
+ StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe)
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.util.Map;
@@ -28,13 +30,20 @@ import java.util.Map;
*/
public class ClassicTokenizerFactory extends BaseTokenizerFactory {
+
+ private int maxTokenLength;
+
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
+ maxTokenLength = getInt("maxTokenLength",
+ StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}
public Tokenizer create(Reader input) {
- return new ClassicTokenizer(luceneMatchVersion, input);
+ ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input);
+ tokenizer.setMaxTokenLength(maxTokenLength);
+ return tokenizer;
}
}
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -17,6 +17,7 @@
package org.apache.solr.analysis;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
@@ -27,13 +28,21 @@ import java.util.Map;
*/
public class StandardTokenizerFactory extends BaseTokenizerFactory {
+
+ private int maxTokenLength;
+
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
+ maxTokenLength = getInt("maxTokenLength",
+ StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}
public StandardTokenizer create(Reader input) {
- return new StandardTokenizer(luceneMatchVersion, input);
+ StandardTokenizer tokenizer
+ = new StandardTokenizer(luceneMatchVersion, input);
+ tokenizer.setMaxTokenLength(maxTokenLength);
+ return tokenizer;
}
}
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import java.io.Reader;
@@ -31,13 +32,20 @@ import java.util.Map;
*/
public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
+
+ private int maxTokenLength;
+
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
+ maxTokenLength = getInt("maxTokenLength",
+ StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}
public UAX29URLEmailTokenizer create(Reader input) {
- return new UAX29URLEmailTokenizer(input);
+ UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input);
+ tokenizer.setMaxTokenLength(maxTokenLength);
+ return tokenizer;
}
}
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,8 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -40,6 +42,24 @@ public class TestStandardFactories exten
new String[] {"Wha\u0301t's", "this", "thing", "do" });
}
+ public void testStandardTokenizerMaxTokenLength() throws Exception {
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0 ; i < 100 ; ++i) {
+ builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+ }
+ String longWord = builder.toString();
+ String content = "one two three " + longWord + " four five six";
+ Reader reader = new StringReader(content);
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+ args.put("maxTokenLength", "1000");
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
+ factory.init(args);
+ Tokenizer stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+ }
+
/**
* Test ClassicTokenizerFactory
*/
@@ -52,6 +72,24 @@ public class TestStandardFactories exten
new String[] {"What's", "this", "thing", "do" });
}
+ public void testClassicTokenizerMaxTokenLength() throws Exception {
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0 ; i < 100 ; ++i) {
+ builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+ }
+ String longWord = builder.toString();
+ String content = "one two three " + longWord + " four five six";
+ Reader reader = new StringReader(content);
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+ args.put("maxTokenLength", "1000");
+ ClassicTokenizerFactory factory = new ClassicTokenizerFactory();
+ factory.init(args);
+ Tokenizer stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+ }
+
/**
* Test ClassicFilterFactory
*/
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java?rev=1049693&r1=1049692&r2=1049693&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java Wed Dec 15 20:24:26 2010
@@ -19,6 +19,9 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
import org.apache.lucene.analysis.Tokenizer;
/**
@@ -152,4 +155,22 @@ public class TestUAX29URLEmailTokenizerF
}
);
}
+
+ public void testMaxTokenLength() throws Exception {
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0 ; i < 100 ; ++i) {
+ builder.append("abcdefg"); // 7 * 100 = 700 char "word"
+ }
+ String longWord = builder.toString();
+ String content = "one two three " + longWord + " four five six";
+ Reader reader = new StringReader(content);
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
+ args.put("maxTokenLength", "1000");
+ UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
+ factory.init(args);
+ Tokenizer stream = factory.create(reader);
+ assertTokenStreamContents(stream,
+ new String[] {"one", "two", "three", longWord, "four", "five", "six" });
+ }
}