You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/08/22 17:28:36 UTC
svn commit: r1619836 - in /lucene/dev/branches/lucene_solr_4_10: ./
dev-tools/ lucene/ lucene/analysis/ lucene/analysis/common/
lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/analysis/common/src/java/org/apache/lucene/ana...
Author: rjernst
Date: Fri Aug 22 15:28:33 2014
New Revision: 1619836
URL: http://svn.apache.org/r1619836
Log:
LUCENE-5897, LUCENE-5400: JFlex-based tokenizers StandardTokenizer and UAX29URLEmailTokenizer tokenize extremely slowly over long sequences of text partially matching certain grammar rules. The scanner default buffer size was reduced, and scanner buffer growth was disabled, resulting in much, much faster tokenization for these text sequences. (merged branch_4x r1619773)
Modified:
lucene/dev/branches/lucene_solr_4_10/ (props changed)
lucene/dev/branches/lucene_solr_4_10/dev-tools/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/BUILD.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/CHANGES.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/JRE_VERSION_MIGRATION.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/LICENSE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/MIGRATE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/NOTICE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/README.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/ASCIITLD.jflex-macro (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/SUPPLEMENTARY.jflex-macro (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java (contents, props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex (contents, props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java (contents, props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex (contents, props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/package.html (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/backwards/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/benchmark/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/classification/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/classification/build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/classification/ivy.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/classification/src/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/codecs/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/common-build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions2.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSort.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/demo/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/expressions/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/facet/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/grouping/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/highlighter/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/ivy-ignore-conflicts.properties (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/ivy-settings.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/ivy-versions.properties (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/join/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/licenses/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/memory/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/misc/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/module-build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/queries/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionQuerySort.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/queryparser/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/replicator/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/sandbox/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/site/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/java/org/apache/lucene/spatial/util/ShapeAreaValueSource.java (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test-files/data/simple-bbox.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test-files/simple-Queries-BBox.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/spatial/src/test/org/apache/lucene/spatial/bbox/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/suggest/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/test-framework/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/ (props changed)
lucene/dev/branches/lucene_solr_4_10/lucene/tools/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/CHANGES.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/LICENSE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/NOTICE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/README.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/cloud-dev/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/common-build.xml (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/contrib/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/core/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/core/src/test/org/apache/solr/core/TestConfig.java (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/example/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpclient-LICENSE-ASL.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpclient-NOTICE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpcore-LICENSE-ASL.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpcore-NOTICE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpmime-LICENSE-ASL.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/licenses/httpmime-NOTICE.txt (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/scripts/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/site/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/solrj/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/test-framework/ (props changed)
lucene/dev/branches/lucene_solr_4_10/solr/webapp/ (props changed)
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/build.xml Fri Aug 22 15:28:33 2014
@@ -59,11 +59,13 @@
</target>
<target name="-jflex-StandardAnalyzer" depends="init,-install-jflex">
- <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
+ <run-jflex-and-disable-buffer-expansion
+ dir="src/java/org/apache/lucene/analysis/standard" name="StandardTokenizerImpl"/>
</target>
<target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
- <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
+ <run-jflex-and-disable-buffer-expansion
+ dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<macrodef name="run-jflex">
@@ -74,6 +76,27 @@
</sequential>
</macrodef>
+ <macrodef name="run-jflex-and-disable-buffer-expansion">
+ <attribute name="dir"/>
+ <attribute name="name"/>
+ <sequential>
+ <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+ <!-- LUCENE-5897: Disallow scanner buffer expansion -->
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
+ replace="" flags="s" />
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="private static final int ZZ_BUFFERSIZE ="
+ replace="private int ZZ_BUFFERSIZE ="/>
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="int requested = zzBuffer.length - zzEndRead;"
+ replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
+ <replaceregexp file="@{dir}/@{name}.java"
+ match="(zzFinalHighSurrogate = 1;)(\r?\n)"
+ replace="\1\2 if (totalRead == 1) { return true; }\2"/>
+ </sequential>
+ </macrodef>
+
<target name="clean-jflex">
<delete>
<fileset dir="src/java/org/apache/lucene/analysis/charfilter" includes="*.java">
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java Fri Aug 22 15:28:33 2014
@@ -29,6 +29,7 @@ import java.util.Set;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
+import org.apache.lucene.util.Version;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
@@ -29839,7 +29840,7 @@ public final class HTMLStripCharFilter e
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
- = new CharArrayMap<>(253, false);
+ = new CharArrayMap<>(Version.LUCENE_CURRENT, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Fri Aug 22 15:28:33 2014
@@ -27,6 +27,7 @@ import java.util.Set;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
+import org.apache.lucene.util.Version;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -366,6 +366,9 @@ public final void getText(CharTermAttrib
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) {
+ throw new UnsupportedOperationException();
+ }
/**
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -67,6 +67,9 @@ public final void getText(CharTermAttrib
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) {
+ throw new UnsupportedOperationException();
+ }
%}
THAI = [\u0E00-\u0E59]
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Aug 22 15:28:33 2014
@@ -116,6 +116,9 @@ public final class StandardTokenizer ext
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
}
this.maxTokenLength = length;
+ if (scanner instanceof StandardTokenizerImpl) {
+ scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+ }
}
/** @see #setMaxTokenLength */
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -45,7 +45,7 @@ public final class StandardTokenizerImpl
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
- private static final int ZZ_BUFFERSIZE = 4096;
+ private int ZZ_BUFFERSIZE = 255;
/** lexical states */
public static final int YYINITIAL = 0;
@@ -454,6 +454,16 @@ public final class StandardTokenizerImpl
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
/**
@@ -509,18 +519,9 @@ public final class StandardTokenizerImpl
zzStartRead = 0;
}
- /* is the buffer big enough? */
- if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
- /* if not: blow it up */
- char newBuffer[] = new char[zzBuffer.length*2];
- System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
- zzBuffer = newBuffer;
- zzEndRead += zzFinalHighSurrogate;
- zzFinalHighSurrogate = 0;
- }
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead;
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
@@ -536,6 +537,7 @@ public final class StandardTokenizerImpl
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
+ if (totalRead == 1) { return true; }
}
}
return false;
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -46,7 +46,7 @@ import org.apache.lucene.analysis.tokena
%implements StandardTokenizerInterface
%function getNextToken
%char
-%buffer 4096
+%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
//
@@ -101,6 +101,16 @@ ComplexContextEx = \p{LB:Complex_Cont
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java Fri Aug 22 15:28:33 2014
@@ -67,4 +67,8 @@ public interface StandardTokenizerInterf
*/
public int getNextToken() throws IOException;
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public void setBufferSize(int numChars);
}
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java Fri Aug 22 15:28:33 2014
@@ -97,6 +97,9 @@ public final class UAX29URLEmailTokenize
throw new IllegalArgumentException("maxTokenLength must be greater than zero");
}
this.maxTokenLength = length;
+ if (scanner instanceof UAX29URLEmailTokenizerImpl) {
+ scanner.setBufferSize(Math.min(length, 1024 * 1024)); // limit buffer size to 1M chars
+ }
}
/** @see #setMaxTokenLength */
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java Fri Aug 22 15:28:33 2014
@@ -48,7 +48,7 @@ public final class UAX29URLEmailTokenize
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
- private static final int ZZ_BUFFERSIZE = 4096;
+ private int ZZ_BUFFERSIZE = 255;
/** lexical states */
public static final int YYINITIAL = 0;
@@ -6820,6 +6820,16 @@ public final class UAX29URLEmailTokenize
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
/**
@@ -6875,18 +6885,9 @@ public final class UAX29URLEmailTokenize
zzStartRead = 0;
}
- /* is the buffer big enough? */
- if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
- /* if not: blow it up */
- char newBuffer[] = new char[zzBuffer.length*2];
- System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
- zzBuffer = newBuffer;
- zzEndRead += zzFinalHighSurrogate;
- zzFinalHighSurrogate = 0;
- }
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead;
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
@@ -6902,6 +6903,7 @@ public final class UAX29URLEmailTokenize
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
+ if (totalRead == 1) { return true; }
}
}
return false;
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Fri Aug 22 15:28:33 2014
@@ -50,7 +50,7 @@ import org.apache.lucene.analysis.tokena
%function getNextToken
%char
%xstate AVOID_BAD_URL
-%buffer 4096
+%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
//
@@ -189,6 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java Fri Aug 22 15:28:33 2014
@@ -723,6 +723,10 @@ public final class StandardTokenizerImpl
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) {
+ throw new UnsupportedOperationException();
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ ExtendNumLetEx = {ExtendNumLet}
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java Fri Aug 22 15:28:33 2014
@@ -3298,6 +3298,10 @@ public final class UAX29URLEmailTokenize
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.java Fri Aug 22 15:28:33 2014
@@ -740,6 +740,10 @@ public final class StandardTokenizerImpl
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/StandardTokenizerImpl34.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.java Fri Aug 22 15:28:33 2014
@@ -3386,6 +3386,10 @@ public final class UAX29URLEmailTokenize
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex Fri Aug 22 15:28:33 2014
@@ -188,6 +188,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.java Fri Aug 22 15:28:33 2014
@@ -3822,6 +3822,10 @@ public final class UAX29URLEmailTokenize
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std36/UAX29URLEmailTokenizerImpl36.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.java Fri Aug 22 15:28:33 2014
@@ -847,6 +847,10 @@ public final class StandardTokenizerImpl
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/StandardTokenizerImpl40.jflex Fri Aug 22 15:28:33 2014
@@ -103,6 +103,10 @@ HiraganaEx = {Hiragana} ({Format} | {Ext
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.java Fri Aug 22 15:28:33 2014
@@ -4034,6 +4034,10 @@ public final class UAX29URLEmailTokenize
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
+
/**
* Creates a new scanner
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizerImpl40.jflex Fri Aug 22 15:28:33 2014
@@ -185,6 +185,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); // source zzBuffer, offset zzStartRead, length zzMarkedPos-zzStartRead
}
+
+ public final void setBufferSize(int numChars) { // Unsupported in this scanner; numChars is never read.
+ throw new UnsupportedOperationException(); // thrown unconditionally on every call
+ }
%}
%%
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Fri Aug 22 15:28:33 2014
@@ -30,10 +30,76 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
-
+
+ // LUCENE-5897: slow tokenization of strings of the form (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
+ public void testLargePartiallyMatchingToken() throws Exception {
+ // TODO: get these lists of chars matching a property from ICU4J
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ char[] WordBreak_ExtendNumLet_chars = "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ int[] WordBreak_Format_chars // only the first char in ranges
+ = { 0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF,
+ 0xFFF9, 0x110BD, 0x1D173, 0xE0001, 0xE0020 };
+ // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
+ int[] WordBreak_Extend_chars // only the first char in ranges
+ = { 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df,
+ 0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4,
+ 0x900, 0x93a, 0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2,
+ 0xa01, 0xa3c, 0xa3e, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7,
+ 0xacb, 0xae2, 0xb01, 0xb3c, 0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6,
+ 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6,
+ 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46, 0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf,
+ 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1, 0xeb4, 0xebb, 0xec8, 0xf18, 0xf35,
+ 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102b, 0x1056, 0x105e, 0x1062,
+ 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4,
+ 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8, 0x1a17, 0x1a55, 0x1a60, 0x1a7f,
+ 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24, 0x1cd0, 0x1cd4, 0x1ced, 0x1cf2,
+ 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674,
+ 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880, 0xa8b4, 0xa8e0, 0xa926, 0xa947,
+ 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1,
+ 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20, 0xff9e, 0x101fd, 0x10a01,
+ 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038, 0x11080, 0x11082,
+ 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180, 0x11182,
+ 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
+ 0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100 };
+ StringBuilder builder = new StringBuilder();
+ int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
+ for (int i = 0 ; i < numChars ; ) {
+ builder.append(WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
+ ++i;
+ if (random().nextBoolean()) {
+ int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
+ for (int j = 0; j < numFormatExtendChars; ++j) {
+ int codepoint;
+ if (random().nextBoolean()) {
+ codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
+ } else {
+ codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
+ }
+ char[] chars = Character.toChars(codepoint);
+ builder.append(chars);
+ i += chars.length;
+ }
+ }
+ }
+ StandardTokenizer ts = new StandardTokenizer(new StringReader(builder.toString()));
+ ts.reset();
+ while (ts.incrementToken()) { }
+ ts.end();
+ ts.close();
+ int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+ ts.setMaxTokenLength(newBufferSize); // try a different buffer size
+ ts.setReader(new StringReader(builder.toString()));
+ ts.reset();
+ while (ts.incrementToken()) { }
+ ts.end();
+ ts.close();
+ }
+
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];
Modified: lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1619836&r1=1619835&r2=1619836&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene_solr_4_10/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Fri Aug 22 15:28:33 2014
@@ -7,6 +7,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
@@ -19,6 +20,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
+import java.util.regex.Pattern;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -38,7 +40,41 @@ import java.util.Random;
*/
public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
-
+
+ // LUCENE-5400: extremely slow tokenization of text matching email <local-part> (before the '@')
+ /**
+  * Tokenizes one long random run of EMAILatomText characters with UAX29URLEmailTokenizer and
+  * checks that at least one token comes out, first with the tokenizer as constructed and then
+  * again, on the same instance and text, after setMaxTokenLength() has been given the value of
+  * TestUtil.nextInt(random(), 200, 8192).
+  */
+ public void testLongEMAILatomText() throws Exception {
+   // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
+   // ("+-/" inside the class is a range, which is why ',' and '.' appear in the alphabet below)
+   final char[] alphabet
+       = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
+   final StringBuilder sb = new StringBuilder();
+   final int targetLength = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
+   int remaining = targetLength;
+   while (remaining-- > 0) {
+     sb.append(alphabet[random().nextInt(alphabet.length)]);
+   }
+   final String input = sb.toString();
+   final UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
+
+   // Pass 1: tokenizer exactly as constructed.
+   int firstPassTokens = 0;
+   tokenizer.reset();
+   for (; tokenizer.incrementToken(); ++firstPassTokens) { }
+   tokenizer.end();
+   tokenizer.close();
+   assertTrue(firstPassTokens > 0);
+
+   // Pass 2: same instance and text, after changing the max token length.
+   int secondPassTokens = 0;
+   final int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
+   tokenizer.setMaxTokenLength(newBufferSize);
+   tokenizer.setReader(new StringReader(input));
+   tokenizer.reset();
+   for (; tokenizer.incrementToken(); ++secondPassTokens) { }
+   tokenizer.end();
+   tokenizer.close();
+   assertTrue(secondPassTokens > 0);
+ }
+
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];