You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/08/22 17:28:36 UTC

svn commit: r1619836 - in /lucene/dev/branches/lucene_solr_4_10: ./ dev-tools/ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/analysis/common/src/java/org/apache/lucene/ana...

Author: rjernst
Date: Fri Aug 22 15:28:33 2014
New Revision: 1619836

URL: http://svn.apache.org/r1619836
Log:
LUCENE-5897, LUCENE-5400: JFlex-based tokenizers StandardTokenizer and UAX29URLEmailTokenizer tokenize extremely slowly over long sequences of text partially matching certain grammar rules. The scanner default buffer size was reduced, and scanner buffer growth was disabled, resulting in much, much faster tokenization for these text sequences. (merged branch_4x r1619773)

Modified:
    lucene/dev/branches/lucene_solr_4_10/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/dev-tools/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/CHANGES.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/README.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/ASCIITLD.jflex-macro   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/SUPPLEMENTARY.jflex-macro   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/package.html   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/backwards/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/benchmark/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/classification/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/classification/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/classification/ivy.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/classification/src/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/codecs/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/common-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions2.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSort.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/demo/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/expressions/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/facet/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/grouping/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/highlighter/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/ivy-ignore-conflicts.properties   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/ivy-versions.properties   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/join/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/licenses/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/memory/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/misc/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/module-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/queries/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionQuerySort.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/queryparser/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/replicator/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/sandbox/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/site/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/java/org/apache/lucene/spatial/util/ShapeAreaValueSource.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test-files/data/simple-bbox.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test-files/simple-Queries-BBox.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test/org/apache/lucene/spatial/bbox/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/suggest/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/test-framework/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/lucene/tools/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/README.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/cloud-dev/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/common-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/contrib/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/core/src/test/org/apache/solr/core/TestConfig.java   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/example/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/scripts/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/site/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/solrj/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/test-framework/   (props changed)
    lucene/dev/branches/lucene_solr_4_10/solr/webapp/   (props changed)

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml Fri Aug 22 15:28:33 2014
@@ -59,11 +59,13 @@
   </target>
 
   <target name="-jflex-StandardAnalyzer" depends="init,-install-jflex">
-    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+    <run-jflex-and-disable-buffer-expansion
+        dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
   </target>
 
   <target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
-    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
+    <run-jflex-and-disable-buffer-expansion
+        dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
   </target>
   
   <macrodef name="run-jflex">
@@ -74,6 +76,27 @@
     </sequential>
   </macrodef>
 
+  <macrodef name="run-jflex-and-disable-buffer-expansion">
+    <attribute name="dir"/>
+    <attribute name="name"/>
+    <sequential>
+      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+      <!-- LUCENE-5897: Disallow scanner buffer expansion -->
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
+                     replace="" flags="s" />
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="private static final int ZZ_BUFFERSIZE ="
+                     replace="private int ZZ_BUFFERSIZE ="/>
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="int requested = zzBuffer.length - zzEndRead;"
+                     replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
+      <replaceregexp file="@{dir}/@{name}.java"
+                     match="(zzFinalHighSurrogate = 1;)(\r?\n)"
+                     replace="\1\2          if (totalRead == 1) { return true; }\2"/>
+    </sequential>
+  </macrodef>
+
   <target name="clean-jflex">
     <delete>
       <fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java Fri Aug 22 15:28:33 2014
@@ -29,6 +29,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
+import org.apache.lucene.util.Version;
 
 /**
  * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
@@ -29839,7 +29840,7 @@ public final class HTMLStripCharFilter e
     upperCaseVariantsAccepted.put("amp", "AMP");
   }
   private static final CharArrayMap<Character> entityValues
-      = new CharArrayMap<>(253, false);
+      = new CharArrayMap<>(Version.LUCENE_CURRENT, 253, false);
   static {
     String[] entities = {
       "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Fri Aug 22 15:28:33 2014
@@ -27,6 +27,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
+import org.apache.lucene.util.Version;
 
 /**
  * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -366,6 +366,9 @@ public final void getText(CharTermAttrib
   t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 
 
   /**

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -67,6 +67,9 @@ public final void getText(CharTermAttrib
   t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 THAI       = [\u0E00-\u0E59]

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Aug 22 15:28:33 2014
@@ -116,6 +116,9 @@ public final class StandardTokenizer ext
       throw new IllegalArgumentException("maxTokenLength must be greater than zero");
     }
     this.maxTokenLength = length;
+    if (scanner instanceof StandardTokenizerImpl) {
+      scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+    }
   }
 
   /** @see #setMaxTokenLength */

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -45,7 +45,7 @@ public final class StandardTokenizerImpl
   public static final int YYEOF = -1;
 
   /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 4096;
+  private int ZZ_BUFFERSIZE = 255;
 
   /** lexical states */
   public static final int YYINITIAL = 0;
@@ -454,6 +454,16 @@ public final class StandardTokenizerImpl
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 
 
   /**
@@ -509,18 +519,9 @@ public final class StandardTokenizerImpl
       zzStartRead = 0;
     }
 
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzBuffer.length*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-      zzEndRead += zzFinalHighSurrogate;
-      zzFinalHighSurrogate = 0;
-    }
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead;           
+    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;           
     int totalRead = 0;
     while (totalRead < requested) {
       int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
@@ -536,6 +537,7 @@ public final class StandardTokenizerImpl
         if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
           --zzEndRead;
           zzFinalHighSurrogate = 1;
+          if (totalRead == 1) { return true; }
         }
       }
       return false;

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -46,7 +46,7 @@ import org.apache.lucene.analysis.tokena
 %implements StandardTokenizerInterface
 %function getNextToken
 %char
-%buffer 4096
+%buffer 255
 
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
@@ -101,6 +101,16 @@ ComplexContextEx    = \p{LB:Complex_Cont
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java Fri Aug 22 15:28:33 2014
@@ -67,4 +67,8 @@ public interface StandardTokenizerInterf
    */
   public int getNextToken() throws IOException;
 
+  /**
+   * Sets the scanner buffer size in chars
+   */
+  public void setBufferSize(int numChars);
 }

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java Fri Aug 22 15:28:33 2014
@@ -97,6 +97,9 @@ public final class UAX29URLEmailTokenize
       throw new IllegalArgumentException("maxTokenLength must be greater than zero");
     }
     this.maxTokenLength = length;
+    if (scanner instanceof UAX29URLEmailTokenizerImpl) {
+      scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+    }
   }
 
   /** @see #setMaxTokenLength */

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -48,7 +48,7 @@ public final class UAX29URLEmailTokenize
   public static final int YYEOF = -1;
 
   /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 4096;
+  private int ZZ_BUFFERSIZE = 255;
 
   /** lexical states */
   public static final int YYINITIAL = 0;
@@ -6820,6 +6820,16 @@ public final class UAX29URLEmailTokenize
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 
 
   /**
@@ -6875,18 +6885,9 @@ public final class UAX29URLEmailTokenize
       zzStartRead = 0;
     }
 
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzBuffer.length*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-      zzEndRead += zzFinalHighSurrogate;
-      zzFinalHighSurrogate = 0;
-    }
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead;           
+    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;           
     int totalRead = 0;
     while (totalRead < requested) {
       int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
@@ -6902,6 +6903,7 @@ public final class UAX29URLEmailTokenize
         if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
           --zzEndRead;
           zzFinalHighSurrogate = 1;
+          if (totalRead == 1) { return true; }
         }
       }
       return false;

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -50,7 +50,7 @@ import org.apache.lucene.analysis.tokena
 %function getNextToken
 %char
 %xstate AVOID_BAD_URL
-%buffer 4096
+%buffer 255
 
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
@@ -189,6 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java Fri Aug 22 15:28:33 2014
@@ -723,6 +723,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ ExtendNumLetEx = {ExtendNumLet}         
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java Fri Aug 22 15:28:33 2014
@@ -3298,6 +3298,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java Fri Aug 22 15:28:33 2014
@@ -740,6 +740,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java Fri Aug 22 15:28:33 2014
@@ -3386,6 +3386,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex Fri Aug 22 15:28:33 2014
@@ -188,6 +188,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java Fri Aug 22 15:28:33 2014
@@ -3822,6 +3822,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java Fri Aug 22 15:28:33 2014
@@ -847,6 +847,10 @@ public final class StandardTokenizerImpl
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java Fri Aug 22 15:28:33 2014
@@ -4034,6 +4034,10 @@ public final class UAX29URLEmailTokenize
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
 
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
+
 
   /**
    * Creates a new scanner

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   public final void getText(CharTermAttribute t) {
     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
   }
+
+   public final void setBufferSize(int numChars) {
+     throw new UnsupportedOperationException();
+   }
 %}
 
 %%

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Fri Aug 22 15:28:33 2014
@@ -30,10 +30,76 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.Version;
 
 public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
-  
+
+  // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
+  public void testLargePartiallyMatchingToken() throws Exception {
+    // TODO: get these lists of chars matching a property from ICU4J
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    int[] WordBreak_Format_chars // only the first char in ranges 
+        = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
+            0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
+    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+    int[] WordBreak_Extend_chars // only the first char in ranges
+        = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
+             0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
+             0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
+             0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
+             0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
+             0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
+             0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
+             0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
+             0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
+             0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
+             0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
+             0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
+             0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
+             0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
+             0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
+             0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
+             0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
+             0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
+             0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
+             0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
+    StringBuilder builder = new StringBuilder();
+    int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
+    for (int i = 0 ; i < numChars ; ) {
+      builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
+      ++i;
+      if (random().nextBoolean()) {
+        int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
+        for (int j = 0; j < numFormatExtendChars; ++j) {
+          int codepoint;
+          if (random().nextBoolean()) {
+            codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
+          } else {
+            codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
+          }
+          char[] chars = Character.toChars(codepoint);
+          builder.append(chars);
+          i += chars.length;
+        }
+      }
+    }
+    StandardTokenizer ts = new StandardTokenizer(new StringReader(builder.toString()));
+    ts.reset();
+    while (ts.incrementToken()) { }
+    ts.end();
+    ts.close();
+    int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+    ts.setMaxTokenLength(newBufferSize); // try a different buffer size
+    ts.setReader(new StringReader(builder.toString()));
+    ts.reset();
+    while (ts.incrementToken()) { }
+    ts.end();
+    ts.close();
+  }
+
   public void testHugeDoc() throws IOException {
     StringBuilder sb = new StringBuilder();
     char whitespace[] = new char[4094];

Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Fri Aug 22 15:28:33 2014
@@ -7,6 +7,7 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.Version;
 
 import java.io.BufferedReader;
@@ -19,6 +20,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Random;
+import java.util.regex.Pattern;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -38,7 +40,41 @@ import java.util.Random;
  */
 
 public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
-  
+
+  // LUCENE-5400: extremely slow tokenization of text matching email <local-part> (before the '@')
+  public void testLongEMAILatomText() throws Exception {
+    // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
+    char[] emailAtomChars
+        = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
+    StringBuilder builder = new StringBuilder();
+    int numChars = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
+    for (int i = 0 ; i < numChars ; ++i) {
+      builder.append(emailAtomChars[random().nextInt(emailAtomChars.length)]);
+    }
+    int tokenCount = 0;
+    String text = builder.toString();
+    UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(new StringReader(text));
+    ts.reset();
+    while (ts.incrementToken()) {
+      tokenCount++;
+    }
+    ts.end();
+    ts.close();
+    assertTrue(tokenCount > 0);
+
+    tokenCount = 0;
+    int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+    ts.setMaxTokenLength(newBufferSize);
+    ts.setReader(new StringReader(text));
+    ts.reset();
+    while (ts.incrementToken()) {
+      tokenCount++;
+    }
+    ts.end();
+    ts.close();
+    assertTrue(tokenCount > 0);
+  }
+
   public void testHugeDoc() throws IOException {
     StringBuilder sb = new StringBuilder();
     char whitespace[] = new char[4094];