You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/09/17 05:58:23 UTC

svn commit: r1625458 - in /lucene/dev/branches/lucene_solr_4_9: ./ dev-tools/ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ lucene/analysis/common/src/java/org/apache/lucene/a...

Author: sarowe
Date: Wed Sep 17 03:58:21 2014
New Revision: 1625458

URL: http://svn.apache.org/r1625458
Log:
LUCENE-5897, LUCENE-5400: JFlex-based tokenizers StandardTokenizer and UAX29URLEmailTokenizer tokenize extremely slowly over long sequences of text partially matching certain grammar rules.  The scanner default buffer size was reduced, and scanner buffer growth was disabled, resulting in much, much faster tokenization for these text sequences. (merged branch_4x r1619773)

Modified:
    lucene/dev/branches/lucene_solr_4_9/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/dev-tools/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/README.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/build.xml
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/ASCIITLD.jflex-macro   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/SUPPLEMENTARY.jflex-macro   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/package.html   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/backwards/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/benchmark/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/classification/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/classification/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/classification/ivy.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/classification/src/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/codecs/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/common-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions2.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/search/TestSort.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/demo/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/expressions/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/facet/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/grouping/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/highlighter/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/ivy-ignore-conflicts.properties   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/ivy-versions.properties   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/join/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/licenses/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/memory/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/misc/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/module-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/queries/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionQuerySort.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/queryparser/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/replicator/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/sandbox/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/site/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/suggest/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/test-framework/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/lucene/tools/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/README.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/cloud-dev/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/common-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/contrib/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/core/src/test/org/apache/solr/core/TestConfig.java   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/example/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/scripts/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/site/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/solrj/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/test-framework/   (props changed)
    lucene/dev/branches/lucene_solr_4_9/solr/webapp/   (props changed)

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/CHANGES.txt?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/CHANGES.txt Wed Sep 17 03:58:21 2014
@@ -34,6 +34,13 @@ Bug fixes
 * LUCENE-5827: Make all Directory implementations correctly fail with
   IllegalArgumentException if slices are out of bounds.  (Uwe SChindler)
 
+* LUCENE-5897, LUCENE-5400: JFlex-based tokenizers StandardTokenizer and
+  UAX29URLEmailTokenizer tokenize extremely slowly over long sequences of
+  text partially matching certain grammar rules.  The scanner default
+  buffer size was reduced, and scanner buffer growth was disabled, resulting
+  in much, much faster tokenization for these text sequences.  
+  (Chris Geeringh, Robert Muir, Steve Rowe)
+
 ======================= Lucene 4.9.0 =======================
 
 Changes in Runtime Behavior

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/build.xml?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/build.xml (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/build.xml Wed Sep 17 03:58:21 2014
@@ -66,7 +66,8 @@
   </target>
 
   <target name="-jflex-StandardAnalyzer" depends="init,-install-jflex">
-    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+    <run-jflex-and-disable-buffer-expansion
+        dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard/std31" name="StandardTokenizerImpl31"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard/std34" name="StandardTokenizerImpl34"/>
@@ -74,7 +75,8 @@
   </target>
 
   <target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
-    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
+    <run-jflex-and-disable-buffer-expansion
+        dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard/std31" name="UAX29URLEmailTokenizerImpl31"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard/std34" name="UAX29URLEmailTokenizerImpl34"/>
     <run-jflex dir="src/java/org/apache/lucene/analysis/standard/std36" name="UAX29URLEmailTokenizerImpl36"/>
@@ -89,6 +91,25 @@
     </sequential>
   </macrodef>
 
+  <macrodef name="run-jflex-and-disable-buffer-expansion">
+    <attribute name="dir"/>
+    <attribute name="name"/>
+    <sequential>
+      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+      <!-- LUCENE-5897: Disallow scanner buffer expansion -->
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
+                     replace="" flags="s" />
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="private static final int ZZ_BUFFERSIZE ="
+                     replace="private int ZZ_BUFFERSIZE ="/>
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="(// unlikely but not impossible:.*?if \(numRead == 0)"
+                     replace="\1 &amp;&amp; zzBuffer.length - zzEndRead > 0"
+                     flags="s" />
+    </sequential>
+  </macrodef>
+
   <target name="clean-jflex">
     <delete>
       <fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Wed Sep 17 03:58:21 2014
@@ -366,6 +366,9 @@ public final void getText(CharTermAttrib
   t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 
 
   /**

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex Wed Sep 17 03:58:21 2014
@@ -67,6 +67,9 @@ public final void getText(CharTermAttrib
   t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 THAI       = [\u0E00-\u0E59]

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Wed Sep 17 03:58:21 2014
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-// Generated using ICU4J 52.1.0.0
+// Generated using ICU4J 53.1.0.0
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
 

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Wed Sep 17 03:58:21 2014
@@ -117,6 +117,9 @@ public final class StandardTokenizer ext
       throw new IllegalArgumentException("maxTokenLength must be greater than zero");
     }
     this.maxTokenLength = length;
+    if (scanner instanceof StandardTokenizerImpl) {
+      scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+    }
   }
 
   /** @see #setMaxTokenLength */

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Wed Sep 17 03:58:21 2014
@@ -45,7 +45,7 @@ public final class StandardTokenizerImpl
   public static final int YYEOF = -1;
 
   /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 4096;
+  private int ZZ_BUFFERSIZE = 255;
 
   /** lexical states */
   public static final int YYINITIAL = 0;
@@ -958,6 +958,16 @@ public final class StandardTokenizerImpl
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 
 
   /**
@@ -1011,13 +1021,6 @@ public final class StandardTokenizerImpl
       zzStartRead = 0;
     }
 
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzCurrentPos*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-    }
 
     /* finally: fill the buffer with new input */
     int numRead = zzReader.read(zzBuffer, zzEndRead,
@@ -1028,7 +1031,7 @@ public final class StandardTokenizerImpl
       return false;
     }
     // unlikely but not impossible: read 0 characters, but not at end of stream    
-    if (numRead == 0) {
+    if (numRead == 0 && zzBuffer.length - zzEndRead > 0) {
       int c = zzReader.read();
       if (c == -1) {
         return true;

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Wed Sep 17 03:58:21 2014
@@ -46,7 +46,7 @@ import org.apache.lucene.analysis.tokena
 %implements StandardTokenizerInterface
 %function getNextToken
 %char
-%buffer 4096
+%buffer 255
 
 %include SUPPLEMENTARY.jflex-macro
 ALetter           = (\p{WB:ALetter}                                     | {ALetterSupp})
@@ -120,6 +120,16 @@ RegionalIndicatorEx = {RegionalIndicator
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java Wed Sep 17 03:58:21 2014
@@ -67,4 +67,8 @@ public interface StandardTokenizerInterf
    */
   public int getNextToken() throws IOException;
 
+  /**
+   * Sets the scanner buffer size in chars
+   */
+  public void setBufferSize(int numChars);
 }

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java Wed Sep 17 03:58:21 2014
@@ -98,6 +98,9 @@ public final class UAX29URLEmailTokenize
       throw new IllegalArgumentException("maxTokenLength must be greater than zero");
     }
     this.maxTokenLength = length;
+    if (scanner instanceof UAX29URLEmailTokenizerImpl) {
+      scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+    }
   }
 
   /** @see #setMaxTokenLength */

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java Wed Sep 17 03:58:21 2014
@@ -48,7 +48,7 @@ public final class UAX29URLEmailTokenize
   public static final int YYEOF = -1;
 
   /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 4096;
+  private int ZZ_BUFFERSIZE = 255;
 
   /** lexical states */
   public static final int YYINITIAL = 0;
@@ -9135,6 +9135,16 @@ public final class UAX29URLEmailTokenize
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 
 
   /**
@@ -9188,13 +9198,6 @@ public final class UAX29URLEmailTokenize
       zzStartRead = 0;
     }
 
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzCurrentPos*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-    }
 
     /* finally: fill the buffer with new input */
     int numRead = zzReader.read(zzBuffer, zzEndRead,
@@ -9205,7 +9208,7 @@ public final class UAX29URLEmailTokenize
       return false;
     }
     // unlikely but not impossible: read 0 characters, but not at end of stream    
-    if (numRead == 0) {
+    if (numRead == 0 && zzBuffer.length - zzEndRead > 0) {
       int c = zzReader.read();
       if (c == -1) {
         return true;

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Wed Sep 17 03:58:21 2014
@@ -50,7 +50,7 @@ import org.apache.lucene.analysis.tokena
 %function getNextToken
 %char
 %xstate AVOID_BAD_URL
-%buffer 4096
+%buffer 255
 
 %include SUPPLEMENTARY.jflex-macro
 ALetter           = (\p{WB:ALetter}                                     | {ALetterSupp})
@@ -207,6 +207,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java Wed Sep 17 03:58:21 2014
@@ -723,6 +723,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex Wed Sep 17 03:58:21 2014
@@ -103,6 +103,10 @@ ExtendNumLetEx = {ExtendNumLet}         
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java Wed Sep 17 03:58:21 2014
@@ -3298,6 +3298,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex Wed Sep 17 03:58:21 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java Wed Sep 17 03:58:21 2014
@@ -740,6 +740,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex Wed Sep 17 03:58:21 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java Wed Sep 17 03:58:21 2014
@@ -3386,6 +3386,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex Wed Sep 17 03:58:21 2014
@@ -188,6 +188,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java Wed Sep 17 03:58:21 2014
@@ -3822,6 +3822,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex Wed Sep 17 03:58:21 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java Wed Sep 17 03:58:21 2014
@@ -847,6 +847,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex Wed Sep 17 03:58:21 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java Wed Sep 17 03:58:21 2014
@@ -4034,6 +4034,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex Wed Sep 17 03:58:21 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Wed Sep 17 03:58:21 2014
@@ -30,10 +30,76 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.Version;
 
 public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
-  
+
+  // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
+  public void testLargePartiallyMatchingToken() throws Exception {
+    // TODO: get these lists of chars matching a property from ICU4J
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    int[] WordBreak_Format_chars // only the first char in ranges 
+        = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
+            0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    int[] WordBreak_Extend_chars // only the first char in ranges
+        = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
+             0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
+             0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
+             0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
+             0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
+             0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
+             0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
+             0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
+             0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
+             0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
+             0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
+             0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
+             0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
+             0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
+             0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
+             0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
+             0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
+             0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
+             0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
+             0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
+    StringBuilder builder = new StringBuilder();
+    int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
+    for (int i = 0 ; i < numChars ; ) {
+      builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
+      ++i;
+      if (random().nextBoolean()) {
+        int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
+        for (int j = 0; j < numFormatExtendChars; ++j) {
+          int codepoint;
+          if (random().nextBoolean()) {
+            codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
+          } else {
+            codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
+          }
+          char[] chars = Character.toChars(codepoint);
+          builder.append(chars);
+          i += chars.length;
+        }
+      }
+    }
+    StandardTokenizer ts = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
+    ts.reset();
+    while (ts.incrementToken()) { }
+    ts.end();
+    ts.close();
+    int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+    ts.setMaxTokenLength(newBufferSize); // try a different buffer size
+    ts.setReader(new StringReader(builder.toString()));
+    ts.reset();
+    while (ts.incrementToken()) { }
+    ts.end();
+    ts.close();
+  }
+
   public void testHugeDoc() throws IOException {
     StringBuilder sb = new StringBuilder();
     char whitespace[] = new char[4094];

Modified: lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1625458&r1=1625457&r2=1625458&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_9/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Wed Sep 17 03:58:21 2014
@@ -8,6 +8,7 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.Version;
 
 import java.io.BufferedReader;
@@ -20,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Random;
+import java.util.regex.Pattern;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -39,7 +41,41 @@ import java.util.Random;
  */
 
 public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
-  
+
+  // LUCENE-5400: extremely slow tokenization of text matching email <local-part> (before the '@')
+  public void testLongEMAILatomText() throws Exception {
+    // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
+    char[] emailAtomChars
+        = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
+    StringBuilder builder = new StringBuilder();
+    int numChars = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
+    for (int i = 0 ; i < numChars ; ++i) {
+      builder.append(emailAtomChars[random().nextInt(emailAtomChars.length)]);
+    }
+    int tokenCount = 0;
+    String text = builder.toString();
+    UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
+    ts.reset();
+    while (ts.incrementToken()) {
+      tokenCount++;
+    }
+    ts.end();
+    ts.close();
+    assertTrue(tokenCount > 0);
+
+    tokenCount = 0;
+    int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+    ts.setMaxTokenLength(newBufferSize);
+    ts.setReader(new StringReader(text));
+    ts.reset();
+    while (ts.incrementToken()) {
+      tokenCount++;
+    }
+    ts.end();
+    ts.close();
+    assertTrue(tokenCount > 0);
+  }
+
   public void testHugeDoc() throws IOException {
     StringBuilder sb = new StringBuilder();
     char whitespace[] = new char[4094];