You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/04/11 19:41:11 UTC
lucene-solr:branch_6x: LUCENE-7760: improve setMaxTokenLength
javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 5987a2a73 -> 740d96767
LUCENE-7760: improve setMaxTokenLength javadocs for StandardAnalyzer/Tokenizer and UAX29URLEmailAnalyzer/Tokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/740d9676
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/740d9676
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/740d9676
Branch: refs/heads/branch_6x
Commit: 740d96767b37aac31f1e99ed1bab301f5e915f3a
Parents: 5987a2a
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Apr 11 15:37:42 2017 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Apr 11 15:39:43 2017 -0400
----------------------------------------------------------------------
.../standard/UAX29URLEmailAnalyzer.java | 11 ++++++----
.../standard/UAX29URLEmailTokenizer.java | 23 ++++++++++++++++----
.../standard/TestUAX29URLEmailAnalyzer.java | 23 ++++++++++++++++++++
.../standard/TestUAX29URLEmailTokenizer.java | 2 +-
.../analysis/standard/StandardAnalyzer.java | 11 ++++++----
.../analysis/standard/StandardTokenizer.java | 6 ++++-
.../analysis/standard/TestStandardAnalyzer.java | 23 ++++++++++++++++++++
7 files changed, 85 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
index fe71b7e..282c2e7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
@@ -66,10 +66,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
@@ -92,6 +93,8 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
return new TokenStreamComponents(src, tok) {
@Override
protected void setReader(final Reader reader) {
+ // So that if maxTokenLength was changed, the change takes
+ // effect next time tokenStream is called:
src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength);
super.setReader(reader);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
index d2b02e4..842ae51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
@@ -72,19 +72,34 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
"<URL>",
"<EMAIL>",
};
+
+ /** Absolute maximum sized token */
+ public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
- /** Set the max allowed token length. Any token longer
- * than this is skipped. */
+ /**
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link UAX29URLEmailAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+ *
+ * @throws IllegalArgumentException if the given length is outside of the
+ * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+ */
public void setMaxTokenLength(int length) {
if (length < 1) {
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+ } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+ throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+ }
+ if (length != maxTokenLength) {
+ this.maxTokenLength = length;
+ scanner.setBufferSize(length);
}
- this.maxTokenLength = length;
- scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
}
/** @see #setMaxTokenLength */
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
index 14a5165..b932178 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
@@ -357,4 +357,27 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
+
+ public void testMaxTokenLengthDefault() throws Exception {
+
+ StringBuilder bToken = new StringBuilder();
+ // exact max length:
+ for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ bToken.append('b');
+ }
+
+ String bString = bToken.toString();
+ // first bString is exact max default length; next one is 1 too long
+ String input = "x " + bString + " " + bString + "b";
+ assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+ a.close();
+ }
+
+ public void testMaxTokenLengthNonDefault() throws Exception {
+ UAX29URLEmailAnalyzer a = new UAX29URLEmailAnalyzer();
+ a.setMaxTokenLength(5);
+ assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+ a.close();
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
index eaa5a44..cfe31c9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
@@ -105,7 +105,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
- tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
+ tokenizer.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index fb57573..8afffd8 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -81,10 +81,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
}
/**
- * Set maximum allowed token length. If a token is seen
- * that exceeds this length then it is discarded. This
- * setting only takes effect the next time tokenStream or
- * tokenStream is called.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
@@ -107,6 +108,8 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
return new TokenStreamComponents(src, tok) {
@Override
protected void setReader(final Reader reader) {
+ // So that if maxTokenLength was changed, the change takes
+ // effect next time tokenStream is called:
src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
super.setReader(reader);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 5b8fc75..ed52f03 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -105,7 +105,11 @@ public final class StandardTokenizer extends Tokenizer {
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/**
- * Set the max allowed token length. No tokens longer than this are emitted.
+ * Set the max allowed token length. Tokens larger than this will be chopped
+ * up at this token length and emitted as multiple tokens. If you need to
+ * skip such large tokens, you could increase this max length, and then
+ * use {@code LengthFilter} to remove long tokens. The default is
+ * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
*
* @throws IllegalArgumentException if the given length is outside of the
* range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/740d9676/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 2cc9274..6abbc2b 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -393,4 +393,27 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
Analyzer a = new StandardAnalyzer();
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
}
+
+ public void testMaxTokenLengthDefault() throws Exception {
+ StandardAnalyzer a = new StandardAnalyzer();
+
+ StringBuilder bToken = new StringBuilder();
+ // exact max length:
+ for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ bToken.append('b');
+ }
+
+ String bString = bToken.toString();
+ // first bString is exact max default length; next one is 1 too long
+ String input = "x " + bString + " " + bString + "b";
+ assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+ a.close();
+ }
+
+ public void testMaxTokenLengthNonDefault() throws Exception {
+ StandardAnalyzer a = new StandardAnalyzer();
+ a.setMaxTokenLength(5);
+ assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
+ a.close();
+ }
}