You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/04/11 19:41:11 UTC
lucene-solr:branch_6x: LUCENE-7760: improve setMaxTokenLength
javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 5987a2a73 -> 740d96767
LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/740d9676
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/740d9676
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/740d9676
Branch: refs/heads/branch_6x
Commit: 740d96767b37aac31f1e99ed1bab301f5e915f3a
Parents: 5987a2a
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Apr 11 15:37:42 2017 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Apr 11 15:39:43 2017 -0400
----------------------------------------------------------------------
.../standard/UAX29URLEmailAnalyzer.java | 11 ++++++----
.../standard/UAX29URLEmailTokenizer.java | 23 ++++++++++++++++----
.../standard/TestUAX29URLEmailAnalyzer.java | 23 ++++++++++++++++++++
.../standard/TestUAX29URLEmailTokenizer.java | 2 +-
.../analysis/standard/StandardAnalyzer.java | 11 ++++++----
.../analysis/standard/StandardTokenizer.java | 6 ++++-
.../analysis/standard/TestStandardAnalyzer.java | 23 ++++++++++++++++++++
7 files changed, 85 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
index fe71b7e..282c2e7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
return new TokenStreamComponents(src, tok) {
@Override
protected void setReader(final Reader reader) {
+ // So that if maxTokenLength was changed, the change takes
+ // effect next time tokenStream is called:
src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
super.setReader(reader);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
index d2b02e4..842ae51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
@@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
"<URL>",
"<EMAIL>",
};
+
+ /** Absolute maximum sized token */
+ public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
- /** Set the max allowed token length. Any token longer
- * than this is skipped. */
+ /**
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+ *
+ * @throws IllegalArgumentException if the given length is outside of the
+ * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+ */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+ } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+ throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+ }
+ if (length != maxTokenLength) {
+ this.maxTokenLength = length;
+ scanner.setBufferSize(length);
}
- this.maxTokenLength = length;
- scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
}
/** @see #setMaxTokenLength */
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
index 14a5165..b932178 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
+
+ public void testMaxTokenLengthDefault() throws Exception {
+
+ StringBuilder bToken = new StringBuilder();
+ // exact max length:
+ for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ bToken.append('b');
+ }
+
+ String bString = bToken.toString();
+ // first bString is exact max default length; next one is 1 too long
+ String input = "x " + bString + " " + bString + "b";
+ assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+ a.close();
+ }
+
+ public void testMaxTokenLengthNonDefault() throws Exception {
+ UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+ a.setMaxTokenLength(5);
+ assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+ a.close();
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
index eaa5a44..cfe31c9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
- tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
+ tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index fb57573..8afffd8 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
return new TokenStreamComponents(src, tok) {
@Override
protected void setReader(final Reader reader) {
+ // So that if maxTokenLength was changed, the change takes
+ // effect next time tokenStream is called:
src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
super.setReader(reader);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 5b8fc75..ed52f03 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/**
- * Set the max allowed token length. No tokens longer than this are emitted.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*
* @throws IllegalArgumentException if the given length is outside of the
* range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 2cc9274..6abbc2b 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
Analyzer a = new StandardAnalyzer();
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
}
+
+ public void testMaxTokenLengthDefault() throws Exception {
+ StandardAnalyzer a = new StandardAnalyzer();
+
+ StringBuilder bToken = new StringBuilder();
+ // exact max length:
+ for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ bToken.append('b');
+ }
+
+ String bString = bToken.toString();
+ // first bString is exact max default length; next one is 1 too long
+ String input = "x " + bString + " " + bString + "b";
+ assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+ a.close();
+ }
+
+ public void testMaxTokenLengthNonDefault() throws Exception {
+ StandardAnalyzer a = new StandardAnalyzer();
+ a.setMaxTokenLength(5);
+ assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+ a.close();
+ }
}