You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:38 UTC
[24/24] lucene-solr:branch_7x: LUCENE-8527: Upgrade JFlex to 1.7.0.
StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e8c65da6
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e8c65da6
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e8c65da6
Branch: refs/heads/branch_7x
Commit: e8c65da6bb8be626242cfba18989e497180e82aa
Parents: 612a1d0
Author: Steve Rowe <sa...@apache.org>
Authored: Tue Jan 8 13:33:49 2019 -0500
Committer: Steve Rowe <sa...@apache.org>
Committed: Tue Jan 8 13:34:37 2019 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 8 +
lucene/analysis/common/build.xml | 32 +-
.../charfilter/HTMLStripCharFilter.java | 834 +-
.../charfilter/HTMLStripCharFilter.jflex | 22 +-
.../analysis/standard/ClassicTokenizerImpl.java | 141 +-
.../standard/UAX29URLEmailTokenizer.java | 44 +-
.../standard/UAX29URLEmailTokenizerImpl.java | 74906 +++++++++--------
.../standard/UAX29URLEmailTokenizerImpl.jflex | 216 +-
.../wikipedia/WikipediaTokenizerImpl.java | 465 +-
.../charfilter/HTMLStripCharFilterTest.java | 2 +-
.../standard/TestUAX29URLEmailAnalyzer.java | 4 +-
.../standard/TestUAX29URLEmailTokenizer.java | 76 +-
lucene/common-build.xml | 21 +-
.../src/data/jflex/UnicodeEmojiProperties.jflex | 25 +
.../src/data/jflex/getUnicodeEmojiProperties.pl | 168 +
lucene/core/src/data/jflex/skeleton.default | 342 +
.../jflex/skeleton.disable.buffer.expansion.txt | 348 +
.../standard/StandardTokenizerImpl.java | 637 +-
.../standard/StandardTokenizerImpl.jflex | 206 +-
.../analysis/standard/TestStandardAnalyzer.java | 131 +-
.../EmojiTokenizationTestUnicode_11_0.java | 10756 +++
.../standard/WordBreakTestUnicode_6_3_0.java | 5537 --
.../standard/WordBreakTestUnicode_9_0_0.java | 8276 ++
.../standard/generateEmojiTokenizationTest.pl | 150 +
.../generateJavaUnicodeWordBreakTest.pl | 41 +-
25 files changed, 62395 insertions(+), 40993 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 52c3939..f18e76b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -5,6 +5,11 @@ http://s.apache.org/luceneversions
======================= Lucene 7.7.0 =======================
+Changes in Runtime Behavior
+
+* LUCENE-8527: StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
+ and provide Unicode UTS#51 v11.0 Emoji tokenization with the "<EMOJI>" token type.
+
Build
* LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core
@@ -57,6 +62,9 @@ Improvements
* LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
(Ignacio Vera, Nick Knize, Adrien Grand)
+
+* LUCENE-8527: Upgrade JFlex dependency to 1.7.0; in StandardTokenizer and UAX29URLEmailTokenizer,
+ increase supported Unicode version from 6.3 to 9.0, and support Unicode UTS#51 v11.0 Emoji tokenization.
Optimizations
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/analysis/common/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/build.xml b/lucene/analysis/common/build.xml
index b8eb37a..f752ecc 100644
--- a/lucene/analysis/common/build.xml
+++ b/lucene/analysis/common/build.xml
@@ -33,18 +33,14 @@
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
- <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
- -jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
-
- <target name="-jflex-HTMLStripCharFilter"
- depends="init,generate-jflex-html-char-entities">
- <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
- outdir="src/java/org/apache/lucene/analysis/charfilter"
- nobak="on" inputstreamctor="false"/>
- <!-- Remove the inappropriate JFlex-generated constructor -->
- <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
- match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
- replace="" flags="s"/>
+ <!-- Because of a bug in JFlex's ant task, HTMLStripCharFilter has to be generated last. -->
+ <!-- Otherwise the "%apiprivate" option used in its specification will leak into following -->
+ <!-- ant task invocations. -->
+ <target name="jflex" depends="init,clean-jflex,-jflex-wiki-tokenizer,-jflex-ClassicAnalyzer,
+ -jflex-UAX29URLEmailTokenizer,-jflex-HTMLStripCharFilter"/>
+
+ <target name="-jflex-HTMLStripCharFilter" depends="-install-jflex,generate-jflex-html-char-entities">
+ <run-jflex dir="src/java/org/apache/lucene/analysis/charfilter" name="HTMLStripCharFilter"/>
</target>
<target name="generate-jflex-html-char-entities">
@@ -58,17 +54,17 @@
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
</target>
- <target name="-jflex-wiki-tokenizer" depends="init,-install-jflex">
+ <target name="-jflex-wiki-tokenizer" depends="-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
</target>
- <target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
- <run-jflex-and-disable-buffer-expansion
- dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
+ <target name="-jflex-ClassicAnalyzer" depends="-install-jflex">
+ <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target>
- <target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
- <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
+ <target name="-jflex-UAX29URLEmailTokenizer" depends="-install-jflex">
+ <run-jflex-and-disable-buffer-expansion
+ dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<target name="clean-jflex">
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
index a236497..ae67bde 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -95,127 +95,152 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
"\32\0\1\41\11\0\1\1\12\0\1\1\1\0\1\2\2\0\1\1"+
"\5\0\27\1\1\0\37\1\1\0\u01ca\1\4\0\14\1\16\0\5\1"+
"\7\0\1\1\1\0\1\1\21\0\160\2\5\1\1\0\2\1\2\0"+
- "\4\1\10\0\1\1\1\2\3\1\1\0\1\1\1\0\24\1\1\0"+
- "\123\1\1\0\213\1\1\0\5\2\2\0\236\1\11\0\46\1\2\0"+
- "\1\1\7\0\47\1\11\0\55\2\1\0\1\2\1\0\2\2\1\0"+
- "\2\2\1\0\1\2\10\0\33\1\5\0\3\1\35\0\13\2\5\0"+
- "\53\1\25\2\12\111\4\0\2\1\1\2\143\1\1\0\1\1\7\2"+
- "\2\0\6\2\2\1\2\2\1\0\4\2\2\1\12\111\3\1\2\0"+
- "\1\1\20\0\1\1\1\2\36\1\33\2\2\0\131\1\13\2\1\1"+
- "\16\0\12\111\41\1\11\2\2\1\4\0\1\1\5\0\26\1\4\2"+
- "\1\1\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\104\0"+
- "\1\1\1\0\13\1\67\0\33\2\1\0\4\2\66\1\3\2\1\1"+
- "\22\2\1\1\7\2\12\1\2\2\2\0\12\111\1\0\7\1\1\0"+
- "\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
- "\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1\7\2\2\0"+
- "\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1\1\0\3\1"+
- "\2\2\2\0\12\111\2\1\17\0\3\2\1\0\6\1\4\0\2\1"+
- "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1\1\0\2\1"+
- "\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2\3\0\1\2"+
- "\7\0\4\1\1\0\1\1\7\0\12\111\2\2\3\1\1\2\13\0"+
- "\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0\7\1\1\0"+
- "\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0\3\2\1\0"+
- "\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\111\21\0\3\2"+
- "\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1"+
- "\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2\2\0\3\2"+
- "\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0\12\111\1\0"+
- "\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1\1\0\4\1"+
- "\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1\3\0\3\1"+
- "\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2\2\0\1\1"+
- "\6\0\1\2\16\0\12\111\21\0\3\2\1\0\10\1\1\0\3\1"+
- "\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1\7\2\1\0"+
- "\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0\2\1\2\2"+
- "\2\0\12\111\22\0\2\2\1\0\10\1\1\0\3\1\1\0\27\1"+
+ "\4\1\1\0\1\1\6\0\1\1\1\2\3\1\1\0\1\1\1\0"+
+ "\24\1\1\0\123\1\1\0\213\1\1\0\5\2\2\0\246\1\1\0"+
+ "\46\1\2\0\1\1\7\0\47\1\11\0\55\2\1\0\1\2\1\0"+
+ "\2\2\1\0\2\2\1\0\1\2\10\0\33\1\5\0\3\1\35\0"+
+ "\13\2\5\0\53\1\25\2\12\111\4\0\2\1\1\2\143\1\1\0"+
+ "\1\1\7\2\2\0\6\2\2\1\2\2\1\0\4\2\2\1\12\111"+
+ "\3\1\2\0\1\1\20\0\1\1\1\2\36\1\33\2\2\0\131\1"+
+ "\13\2\1\1\16\0\12\111\41\1\11\2\2\1\4\0\1\1\5\0"+
+ "\26\1\4\2\1\1\11\2\1\1\3\2\1\1\5\2\22\0\31\1"+
+ "\3\2\104\0\25\1\1\0\10\1\26\0\16\2\1\0\41\2\66\1"+
+ "\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\111\1\0"+
+ "\20\1\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
+ "\1\0\1\1\3\0\4\1\2\0\1\2\1\1\7\2\2\0\2\2"+
+ "\2\0\3\2\1\1\10\0\1\2\4\0\2\1\1\0\3\1\2\2"+
+ "\2\0\12\111\2\1\17\0\3\2\1\0\6\1\4\0\2\1\2\0"+
+ "\26\1\1\0\7\1\1\0\2\1\1\0\2\1\1\0\2\1\2\0"+
+ "\1\2\1\0\5\2\4\0\2\2\2\0\3\2\3\0\1\2\7\0"+
+ "\4\1\1\0\1\1\7\0\12\111\2\2\3\1\1\2\13\0\3\2"+
+ "\1\0\11\1\1\0\3\1\1\0\26\1\1\0\7\1\1\0\2\1"+
+ "\1\0\5\1\2\0\1\2\1\1\10\2\1\0\3\2\1\0\3\2"+
+ "\2\0\1\1\17\0\2\1\2\2\2\0\12\111\11\0\1\1\7\0"+
+ "\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0"+
+ "\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2\2\0"+
+ "\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0\12\111"+
+ "\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1\1\0"+
+ "\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1\3\0"+
+ "\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2\2\0"+
+ "\1\1\6\0\1\2\16\0\12\111\20\0\4\2\1\0\10\1\1\0"+
+ "\3\1\1\0\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2"+
+ "\1\0\4\2\7\0\2\2\1\0\3\1\5\0\2\1\2\2\2\0"+
+ "\12\111\20\0\1\1\3\2\1\0\10\1\1\0\3\1\1\0\27\1"+
"\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2\1\0\3\2"+
"\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1\2\2\2\0"+
- "\12\111\1\0\2\1\17\0\2\2\1\0\10\1\1\0\3\1\1\0"+
- "\51\1\2\0\1\1\7\2\1\0\3\2\1\0\4\2\1\1\10\0"+
- "\1\2\10\0\2\1\2\2\2\0\12\111\12\0\6\1\2\0\2\2"+
- "\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0\7\1"+
- "\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0\2\2"+
- "\15\0\60\1\1\2\2\1\7\2\5\0\7\1\10\2\1\0\12\111"+
- "\47\0\2\1\1\0\1\1\2\0\2\1\1\0\1\1\2\0\1\1"+
- "\6\0\4\1\1\0\7\1\1\0\3\1\1\0\1\1\1\0\1\1"+
- "\2\0\2\1\1\0\4\1\1\2\2\1\6\2\1\0\2\2\1\1"+
- "\2\0\5\1\1\0\1\1\1\0\6\2\2\0\12\111\2\0\4\1"+
- "\40\0\1\1\27\0\2\2\6\0\12\111\13\0\1\2\1\0\1\2"+
- "\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2\1\0"+
- "\2\2\5\1\13\2\1\0\44\2\11\0\1\2\71\0\53\1\24\2"+
- "\1\1\12\111\6\0\6\1\4\2\4\1\3\2\1\1\3\2\2\1"+
- "\7\2\3\1\4\2\15\1\14\2\1\1\1\2\12\111\4\2\2\0"+
- "\46\1\1\0\1\1\5\0\1\1\2\0\53\1\1\0\u014d\1\1\0"+
- "\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0"+
- "\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0"+
- "\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0"+
- "\3\2\11\0\11\2\16\0\20\1\20\0\125\1\14\0\u026c\1\2\0"+
- "\21\1\1\41\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
- "\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
- "\1\0\3\1\1\0\2\2\14\0\64\1\40\2\3\0\1\1\4\0"+
- "\1\1\1\2\2\0\12\111\41\0\3\2\2\0\12\111\6\0\130\1"+
- "\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
- "\4\0\14\2\12\0\12\111\36\1\2\0\5\1\13\0\54\1\4\0"+
- "\21\2\7\1\2\2\6\0\12\111\1\2\45\0\27\1\5\2\4\0"+
- "\65\1\12\2\1\0\35\2\2\0\1\2\12\111\6\0\12\111\15\0"+
- "\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\111\21\0\11\2"+
- "\14\0\3\2\36\1\15\2\2\1\12\111\54\1\16\2\14\0\44\1"+
- "\24\2\10\0\12\111\3\0\3\1\12\111\44\1\122\0\3\2\1\0"+
- "\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2\25\0"+
- "\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1"+
- "\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1"+
- "\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1"+
- "\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41"+
- "\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41"+
- "\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2"+
- "\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1"+
- "\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1"+
- "\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1"+
- "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1"+
- "\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2"+
- "\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0"+
- "\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1"+
- "\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0"+
- "\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0"+
- "\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\111"+
- "\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0\1\2"+
- "\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
- "\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2\27\1"+
- "\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\111\6\0"+
- "\22\2\6\1\3\0\1\1\4\0\12\111\34\1\10\2\2\0\27\1"+
- "\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\111"+
- "\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\111"+
- "\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1\3\2"+
- "\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1\2\0"+
- "\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1\2\0"+
- "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
- "\2\0\12\111\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u016e\1"+
- "\2\0\152\1\46\0\7\1\14\0\5\1\5\0\1\1\1\2\12\1"+
- "\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1\1\0\2\1"+
- "\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1"+
- "\4\0\20\2\20\0\7\2\14\0\2\2\30\0\3\2\40\0\5\1"+
- "\1\0\207\1\23\0\12\111\7\0\32\1\4\0\1\2\1\0\32\1"+
- "\13\0\131\1\3\0\6\1\2\0\6\1\2\0\6\1\2\0\3\1"+
- "\43\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1\1\0\17\1"+
- "\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\2\202\0\35\1"+
- "\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1\2\0\44\1"+
- "\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\111\u0356\0\6\1"+
- "\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1\2\0\27\1"+
- "\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1\100\0\1\1"+
- "\3\2\1\0\2\2\5\0\4\2\4\1\1\0\3\1\1\0\33\1"+
- "\4\0\3\2\4\0\1\2\40\0\35\1\203\0\66\1\12\0\26\1"+
- "\12\0\23\1\215\0\111\1\u03b7\0\3\2\65\1\17\2\37\0\12\111"+
- "\20\0\3\2\55\1\13\2\25\0\31\1\7\0\12\111\6\0\3\2"+
- "\44\1\16\2\1\0\12\111\100\0\3\2\60\1\16\2\4\1\13\0"+
- "\12\111\u04a6\0\53\1\15\2\10\0\12\111\u0936\0\u036f\1\221\0\143\1"+
- "\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1\13\0\1\1\56\2\20\0"+
- "\4\2\15\1\u4060\0\2\1\u2163\0\5\2\3\0\6\2\10\0\10\2"+
- "\2\0\7\2\36\0\4\2\224\0\3\2\u01bb\0\125\1\1\0\107\1"+
- "\1\0\2\1\2\0\1\1\2\0\2\1\2\0\4\1\1\0\14\1"+
- "\1\0\1\1\1\0\7\1\1\0\101\1\1\0\4\1\2\0\10\1"+
- "\1\0\7\1\1\0\34\1\1\0\4\1\1\0\5\1\1\0\1\1"+
- "\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0\31\1\1\0\37\1"+
- "\1\0\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1"+
- "\1\0\37\1\1\0\31\1\1\0\10\1\2\0\62\111\u1600\0\4\1"+
+ "\12\111\1\0\2\1\16\0\3\2\1\0\10\1\1\0\3\1\1\0"+
+ "\51\1\2\0\1\1\7\2\1\0\3\2\1\0\4\2\1\1\5\0"+
+ "\3\1\1\2\7\0\3\1\2\2\2\0\12\111\12\0\6\1\2\0"+
+ "\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
+ "\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\6\0"+
+ "\12\111\2\0\2\2\15\0\60\1\1\2\2\1\7\2\5\0\7\1"+
+ "\10\2\1\0\12\111\47\0\2\1\1\0\1\1\2\0\2\1\1\0"+
+ "\1\1\2\0\1\1\6\0\4\1\1\0\7\1\1\0\3\1\1\0"+
+ "\1\1\1\0\1\1\2\0\2\1\1\0\4\1\1\2\2\1\6\2"+
+ "\1\0\2\2\1\1\2\0\5\1\1\0\1\1\1\0\6\2\2\0"+
+ "\12\111\2\0\4\1\40\0\1\1\27\0\2\2\6\0\12\111\13\0"+
+ "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
+ "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
+ "\71\0\53\1\24\2\1\1\12\111\6\0\6\1\4\2\4\1\3\2"+
+ "\1\1\3\2\2\1\7\2\3\1\4\2\15\1\14\2\1\1\1\2"+
+ "\12\111\4\2\2\0\46\1\1\0\1\1\5\0\1\1\2\0\53\1"+
+ "\1\0\u014d\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1"+
+ "\2\0\51\1\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1"+
+ "\1\0\1\1\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1"+
+ "\2\0\103\1\2\0\3\2\11\0\11\2\16\0\20\1\20\0\126\1"+
+ "\2\0\6\1\3\0\u026c\1\2\0\21\1\1\41\32\1\5\0\113\1"+
+ "\3\0\13\1\7\0\15\1\1\0\4\1\3\2\13\0\22\1\3\2"+
+ "\13\0\22\1\2\2\14\0\15\1\1\0\3\1\1\0\2\2\14\0"+
+ "\64\1\40\2\3\0\1\1\4\0\1\1\1\2\2\0\12\111\41\0"+
+ "\3\2\2\0\12\111\6\0\130\1\10\0\51\1\1\2\1\1\5\0"+
+ "\106\1\12\0\37\1\1\0\14\2\4\0\14\2\12\0\12\111\36\1"+
+ "\2\0\5\1\13\0\54\1\4\0\32\1\6\0\12\111\1\2\45\0"+
+ "\27\1\5\2\4\0\65\1\12\2\1\0\35\2\2\0\1\2\12\111"+
+ "\6\0\12\111\15\0\1\1\10\0\16\2\102\0\5\2\57\1\21\2"+
+ "\7\1\4\0\12\111\21\0\11\2\14\0\3\2\36\1\15\2\2\1"+
+ "\12\111\54\1\16\2\14\0\44\1\24\2\10\0\12\111\3\0\3\1"+
+ "\12\111\44\1\2\0\11\1\107\0\3\2\1\0\25\2\4\1\1\2"+
+ "\4\1\3\2\2\1\1\0\2\2\6\0\300\1\66\2\5\0\5\2"+
+ "\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0"+
+ "\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0"+
+ "\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0"+
+ "\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41\35\0"+
+ "\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41\21\0"+
+ "\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2\3\0"+
+ "\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1\2\0"+
+ "\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1\2\0"+
+ "\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1\1\0"+
+ "\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1\1\0"+
+ "\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2\27\1"+
+ "\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
+ "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0\1\41"+
+ "\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1\4\0"+
+ "\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0\51\1"+
+ "\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0\u51d6\1"+
+ "\52\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\111\2\1"+
+ "\24\0\57\1\1\2\4\0\12\2\1\0\37\1\2\2\120\1\2\2"+
+ "\45\0\11\1\2\0\147\1\2\0\44\1\1\0\10\1\77\0\13\1"+
+ "\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0\64\1\14\0"+
+ "\2\2\62\1\22\2\12\0\12\111\6\0\22\2\6\1\3\0\1\1"+
+ "\1\0\1\1\2\0\12\111\34\1\10\2\2\0\27\1\15\2\14\0"+
+ "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\111\6\0\5\1"+
+ "\1\2\12\1\12\111\5\1\1\0\51\1\16\2\11\0\3\1\1\2"+
+ "\10\1\2\2\2\0\12\111\6\0\27\1\3\0\1\1\3\2\62\1"+
+ "\1\2\1\1\3\2\2\1\2\2\5\1\2\2\1\1\1\2\1\1"+
+ "\30\0\3\1\2\0\13\1\5\2\2\0\3\1\2\2\12\0\6\1"+
+ "\2\0\6\1\2\0\6\1\11\0\7\1\1\0\7\1\1\0\53\1"+
+ "\1\0\12\1\12\0\163\1\10\2\1\0\2\2\2\0\12\111\6\0"+
+ "\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u016e\1\2\0\152\1\46\0"+
+ "\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0"+
+ "\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0"+
+ "\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2\20\0"+
+ "\20\2\3\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1\23\0"+
+ "\12\111\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1\3\0"+
+ "\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0\14\1\1\0"+
+ "\32\1\1\0\23\1\1\0\2\1\1\0\17\1\2\0\16\1\42\0"+
+ "\173\1\105\0\65\1\210\0\1\2\202\0\35\1\3\0\61\1\17\0"+
+ "\1\2\37\0\40\1\20\0\33\1\5\0\46\1\5\2\5\0\36\1"+
+ "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\111"+
+ "\6\0\44\1\4\0\44\1\4\0\50\1\10\0\64\1\234\0\u0137\1"+
+ "\11\0\26\1\12\0\10\1\230\0\6\1\2\0\1\1\1\0\54\1"+
+ "\1\0\2\1\3\0\1\1\2\0\27\1\12\0\27\1\11\0\37\1"+
+ "\101\0\23\1\1\0\2\1\12\0\26\1\12\0\32\1\106\0\70\1"+
+ "\6\0\2\1\100\0\1\1\3\2\1\0\2\2\5\0\4\2\4\1"+
+ "\1\0\3\1\1\0\33\1\4\0\3\2\4\0\1\2\40\0\35\1"+
+ "\3\0\35\1\43\0\10\1\1\0\34\1\2\2\31\0\66\1\12\0"+
+ "\26\1\12\0\23\1\15\0\22\1\156\0\111\1\67\0\63\1\15\0"+
+ "\63\1\u030d\0\3\2\65\1\17\2\37\0\12\111\17\0\4\2\55\1"+
+ "\13\2\25\0\31\1\7\0\12\111\6\0\3\2\44\1\16\2\1\0"+
+ "\12\111\20\0\43\1\1\2\2\0\1\1\11\0\3\2\60\1\16\2"+
+ "\4\1\5\0\3\2\3\0\12\111\1\1\1\0\1\1\43\0\22\1"+
+ "\1\0\31\1\14\2\6\0\1\2\101\0\7\1\1\0\1\1\1\0"+
+ "\4\1\1\0\17\1\1\0\12\1\7\0\57\1\14\2\5\0\12\111"+
+ "\6\0\4\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
+ "\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
+ "\2\0\3\2\2\0\1\1\6\0\1\2\5\0\5\1\2\2\2\0"+
+ "\7\2\3\0\5\2\213\0\65\1\22\2\4\1\5\0\12\111\46\0"+
+ "\60\1\24\2\2\1\1\0\1\1\10\0\12\111\246\0\57\1\7\2"+
+ "\2\0\11\2\27\0\4\1\2\2\42\0\60\1\21\2\3\0\1\1"+
+ "\13\0\12\111\46\0\53\1\15\2\10\0\12\111\66\0\32\1\3\0"+
+ "\17\2\4\0\12\111\u0166\0\100\1\12\111\25\0\1\1\u01c0\0\71\1"+
+ "\u0107\0\11\1\1\0\45\1\10\2\1\0\10\2\1\1\17\0\12\111"+
+ "\30\0\36\1\2\0\26\2\1\0\16\2\u0349\0\u039a\1\146\0\157\1"+
+ "\21\0\304\1\u0abc\0\u042f\1\u0fd1\0\u0247\1\u21b9\0\u0239\1\7\0\37\1"+
+ "\1\0\12\111\146\0\36\1\2\0\5\2\13\0\60\1\7\2\11\0"+
+ "\4\1\14\0\12\111\11\0\25\1\5\0\23\1\u0370\0\105\1\13\0"+
+ "\1\1\56\2\20\0\4\2\15\1\100\0\1\1\37\0\u17ed\1\23\0"+
+ "\u02f3\1\u250d\0\2\1\u0bfe\0\153\1\5\0\15\1\3\0\11\1\7\0"+
+ "\12\1\3\0\2\2\u14c6\0\5\2\3\0\6\2\10\0\10\2\2\0"+
+ "\7\2\36\0\4\2\224\0\3\2\u01bb\0\125\1\1\0\107\1\1\0"+
+ "\2\1\2\0\1\1\2\0\2\1\2\0\4\1\1\0\14\1\1\0"+
+ "\1\1\1\0\7\1\1\0\101\1\1\0\4\1\2\0\10\1\1\0"+
+ "\7\1\1\0\34\1\1\0\4\1\1\0\5\1\1\0\1\1\3\0"+
+ "\7\1\1\0\u0154\1\2\0\31\1\1\0\31\1\1\0\37\1\1\0"+
+ "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
+ "\37\1\1\0\31\1\1\0\10\1\2\0\62\111\u0200\0\67\2\4\0"+
+ "\62\2\10\0\1\2\16\0\1\2\26\0\5\2\1\0\17\2\u0550\0"+
+ "\7\2\1\0\21\2\2\0\7\2\1\0\2\2\1\0\5\2\u07d5\0"+
+ "\305\1\13\0\7\2\51\0\104\1\7\2\5\0\12\111\u04a6\0\4\1"+
"\1\0\33\1\1\0\2\1\1\0\1\1\2\0\1\1\1\0\12\1"+
"\1\0\4\1\1\0\1\1\1\0\1\1\6\0\1\1\4\0\1\1"+
"\1\0\1\1\1\0\1\1\1\0\3\1\1\0\2\1\1\0\1\1"+
@@ -223,7 +248,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
"\1\0\2\1\1\0\1\1\2\0\4\1\1\0\7\1\1\0\4\1"+
"\1\0\4\1\1\0\1\1\1\0\12\1\1\0\21\1\5\0\3\1"+
"\1\0\5\1\1\0\21\1\u1144\0\ua6d7\1\51\0\u1035\1\13\0\336\1"+
- "\u3fe2\0\u021e\1\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u06ed\0"+
+ "\2\0\u1682\1\u295e\0\u021e\1\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u06ed\0"+
"\360\2\uffff\0\uffff\0\ufe12\0";
/**
@@ -29654,7 +29679,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
- "Unkown internal scanner error",
+ "Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@@ -29809,11 +29834,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
private int yycolumn;
/**
- * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
- /** zzAtEOF == true <=> the scanner is at the EOF */
+ /** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@@ -29950,24 +29975,14 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
private TextSegment entitySegment = new TextSegment(2);
/**
- * Creates a new HTMLStripCharFilter over the provided Reader.
- * @param source Reader to strip html tags from.
- */
- public HTMLStripCharFilter(Reader source) {
- super(source);
- this.zzReader = source;
- }
-
- /**
* Creates a new HTMLStripCharFilter over the provided Reader
* with the specified start and end tags.
- * @param source Reader to strip html tags from.
+ * @param in Reader to strip html tags from.
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
- public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
- super(source);
- this.zzReader = source;
+ public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
+ this(in);
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {
@@ -30059,7 +30074,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
-
+ /**
+ * Creates a new scanner
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public HTMLStripCharFilter(java.io.Reader in) {
+ super(in);
+ this.zzReader = in;
+ }
/**
@@ -30072,7 +30095,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
- while (i < 2836) {
+ while (i < 3340) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -30116,28 +30139,29 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead;
- int totalRead = 0;
- while (totalRead < requested) {
- int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
- if (numRead == -1) {
- break;
- }
- totalRead += numRead;
- }
+ int requested = zzBuffer.length - zzEndRead;
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
- if (totalRead > 0) {
- zzEndRead += totalRead;
- if (totalRead == requested) { /* possibly more input available */
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ /* If numRead == requested, we might have requested to few chars to
+ encode a full Unicode character. We assume that a Reader would
+ otherwise never return half characters. */
+ if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
+ /* potentially more input available */
return false;
}
- // totalRead = 0: End of stream
+ /* numRead < 0 ==> end of stream */
return true;
}
@@ -30420,43 +30444,55 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
// store back cached position
zzMarkedPos = zzMarkedPosL;
- switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { if (yylength() == 1) {
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ zzDoEOF();
+ { return eofReturnValue;
+ }
+ }
+ else {
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { if (yylength() == 1) {
return zzBuffer[zzStartRead];
} else {
outputSegment.append(yytext()); return outputSegment.nextChar();
}
- }
- case 55: break;
- case 2:
- { inputStart = yychar;
+ }
+ // fall through
+ case 55: break;
+ case 2:
+ { inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
- }
- case 56: break;
- case 3:
- { inputStart = yychar;
+ }
+ // fall through
+ case 56: break;
+ case 3:
+ { inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
- }
- case 57: break;
- case 4:
- { yypushback(yylength());
+ }
+ // fall through
+ case 57: break;
+ case 4:
+ { yypushback(yylength());
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
- }
- case 58: break;
- case 5:
- { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
- }
- case 59: break;
- case 6:
- { int matchLength = yylength();
+ }
+ // fall through
+ case 58: break;
+ case 5:
+ { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
+ }
+ // fall through
+ case 59: break;
+ case 6:
+ { int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
@@ -30487,180 +30523,206 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
- }
- case 60: break;
- case 7:
- { // add (previously matched input length) + (this match length) - (substitution length)
+ }
+ // fall through
+ case 60: break;
+ case 7:
+ { // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
- }
- case 61: break;
- case 8:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 61: break;
+ case 8:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
- }
- case 62: break;
- case 9:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 62: break;
+ case 9:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
- }
- case 63: break;
- case 10:
- { inputSegment.append('!'); yybegin(BANG);
- }
- case 64: break;
- case 11:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 63: break;
+ case 10:
+ { inputSegment.append('!'); yybegin(BANG);
+ }
+ // fall through
+ case 64: break;
+ case 11:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
- }
- case 65: break;
- case 12:
- { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
- }
- case 66: break;
- case 13:
- { inputSegment.append(yytext());
- }
- case 67: break;
- case 14:
- { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+ }
+ // fall through
+ case 65: break;
+ case 12:
+ { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
+ }
+ // fall through
+ case 66: break;
+ case 13:
+ { inputSegment.append(yytext());
+ }
+ // fall through
+ case 67: break;
+ case 14:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
- }
- case 68: break;
- case 15:
- {
- }
- case 69: break;
- case 16:
- { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
- }
- case 70: break;
- case 17:
- { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
- }
- case 71: break;
- case 18:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 68: break;
+ case 15:
+ {
+ }
+ // fall through
+ case 69: break;
+ case 16:
+ { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
+ }
+ // fall through
+ case 70: break;
+ case 17:
+ { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
+ }
+ // fall through
+ case 71: break;
+ case 18:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
- }
- case 72: break;
- case 19:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 72: break;
+ case 19:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
- }
- case 73: break;
- case 20:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
- }
- case 74: break;
- case 21:
- { if (yylength() == 1) {
+ }
+ // fall through
+ case 73: break;
+ case 20:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 74: break;
+ case 21:
+ { if (yylength() == 1) {
return zzBuffer[zzStartRead];
} else {
outputSegment.append(yytext()); return outputSegment.nextChar();
}
- }
- case 75: break;
- case 22:
- { previousRestoreState = restoreState;
+ }
+ // fall through
+ case 75: break;
+ case 22:
+ { previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
- }
- case 76: break;
- case 23:
- { previousRestoreState = restoreState;
+ }
+ // fall through
+ case 76: break;
+ case 23:
+ { previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
- }
- case 77: break;
- case 24:
- { yybegin(restoreState); restoreState = previousRestoreState;
- }
- case 78: break;
- case 25:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ }
+ // fall through
+ case 77: break;
+ case 24:
+ { yybegin(restoreState); restoreState = previousRestoreState;
+ }
+ // fall through
+ case 78: break;
+ case 25:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
- }
- case 79: break;
- case 26:
- { // add (previously matched input length) + (this match length) - (substitution length)
+ }
+ // fall through
+ case 79: break;
+ case 26:
+ { // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
- }
- case 80: break;
- case 27:
- { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+ }
+ // fall through
+ case 80: break;
+ case 27:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
- }
- case 81: break;
- case 28:
- { // add (previously matched input length) + (this match length) - (substitution length)
+ }
+ // fall through
+ case 81: break;
+ case 28:
+ { // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
- }
- case 82: break;
- case 29:
- { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
- }
- case 83: break;
- case 30:
- { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
- }
- case 84: break;
- case 31:
- { int length = yylength();
+ }
+ // fall through
+ case 82: break;
+ case 29:
+ { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
+ }
+ // fall through
+ case 83: break;
+ case 30:
+ { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
+ }
+ // fall through
+ case 84: break;
+ case 31:
+ { int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
- }
- case 85: break;
- case 32:
- { int matchLength = yylength();
+ }
+ // fall through
+ case 85: break;
+ case 32:
+ { int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 6) { // 10FFFF: max 6 hex chars
String hexCharRef
@@ -30692,18 +30754,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
- }
- case 86: break;
- case 33:
- { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
+ }
+ // fall through
+ case 86: break;
+ case 33:
+ { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
inputSegment.append(yytext());
} else {
yybegin(COMMENT);
}
- }
- case 87: break;
- case 34:
- { yybegin(YYINITIAL);
+ }
+ // fall through
+ case 87: break;
+ case 34:
+ { yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
@@ -30716,23 +30780,26 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
- }
- case 88: break;
- case 35:
- { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
+ }
+ // fall through
+ case 88: break;
+ case 35:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
- }
- case 89: break;
- case 36:
- { yybegin(SCRIPT);
- }
- case 90: break;
- case 37:
- { yybegin(YYINITIAL);
+ }
+ // fall through
+ case 89: break;
+ case 36:
+ { yybegin(SCRIPT);
+ }
+ // fall through
+ case 90: break;
+ case 37:
+ { yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
@@ -30745,66 +30812,77 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
- }
- case 91: break;
- case 38:
- { // add (this match length) [ - (substitution length) = 0 ]
+ }
+ // fall through
+ case 91: break;
+ case 38:
+ { // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
- }
- case 92: break;
- case 39:
- { yybegin(restoreState);
- }
- case 93: break;
- case 40:
- { yybegin(STYLE);
- }
- case 94: break;
- case 41:
- { yybegin(SCRIPT_COMMENT);
- }
- case 95: break;
- case 42:
- { yybegin(STYLE_COMMENT);
- }
- case 96: break;
- case 43:
- { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
- }
- case 97: break;
- case 44:
- { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
- }
- case 98: break;
- case 45:
- { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
- }
- case 99: break;
- case 46:
- { yybegin(STYLE);
+ }
+ // fall through
+ case 92: break;
+ case 39:
+ { yybegin(restoreState);
+ }
+ // fall through
+ case 93: break;
+ case 40:
+ { yybegin(STYLE);
+ }
+ // fall through
+ case 94: break;
+ case 41:
+ { yybegin(SCRIPT_COMMENT);
+ }
+ // fall through
+ case 95: break;
+ case 42:
+ { yybegin(STYLE_COMMENT);
+ }
+ // fall through
+ case 96: break;
+ case 43:
+ { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+ }
+ // fall through
+ case 97: break;
+ case 44:
+ { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+ }
+ // fall through
+ case 98: break;
+ case 45:
+ { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+ }
+ // fall through
+ case 99: break;
+ case 46:
+ { yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
- }
- case 100: break;
- case 47:
- { yybegin(SCRIPT);
+ }
+ // fall through
+ case 100: break;
+ case 47:
+ { yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
- }
- case 101: break;
- case 48:
- { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
+ }
+ // fall through
+ case 101: break;
+ case 48:
+ { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
inputSegment.append(yytext());
} else {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
@@ -30814,10 +30892,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.clear();
yybegin(CDATA);
}
- }
- case 102: break;
- case 49:
- { inputSegment.clear();
+ }
+ // fall through
+ case 102: break;
+ case 49:
+ { inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
@@ -30837,10 +30916,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
- }
- case 103: break;
- case 50:
- { inputSegment.clear();
+ }
+ // fall through
+ case 103: break;
+ case 50:
+ { inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
@@ -30860,10 +30940,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
- }
- case 104: break;
- case 51:
- { // Handle paired UTF-16 surrogates.
+ }
+ // fall through
+ case 104: break;
+ case 51:
+ { // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
@@ -30888,10 +30969,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
- }
- case 105: break;
- case 52:
- { // Handle paired UTF-16 surrogates.
+ }
+ // fall through
+ case 105: break;
+ case 52:
+ { // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
@@ -30922,10 +31004,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
- }
- case 106: break;
- case 53:
- { // Handle paired UTF-16 surrogates.
+ }
+ // fall through
+ case 106: break;
+ case 53:
+ { // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
@@ -30955,10 +31038,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
- }
- case 107: break;
- case 54:
- { // Handle paired UTF-16 surrogates.
+ }
+ // fall through
+ case 107: break;
+ case 54:
+ { // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
@@ -30991,18 +31075,12 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
- }
- case 108: break;
- default:
- if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
- zzAtEOF = true;
- zzDoEOF();
- { return eofReturnValue;
- }
- }
- else {
+ }
+ // fall through
+ case 108: break;
+ default:
zzScanError(ZZ_NO_MATCH);
- }
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
index d810d79..8b83de0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
@SuppressWarnings("fallthrough")
%%
-%unicode 6.3
+%unicode 9.0
%apiprivate
%type int
%final
@@ -50,6 +50,10 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
+%init{
+ super(in);
+%init}
+
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
@@ -166,24 +170,14 @@ InlineElment = ( [aAbBiIqQsSuU] |
private TextSegment entitySegment = new TextSegment(2);
/**
- * Creates a new HTMLStripCharFilter over the provided Reader.
- * @param source Reader to strip html tags from.
- */
- public HTMLStripCharFilter(Reader source) {
- super(source);
- this.zzReader = source;
- }
-
- /**
* Creates a new HTMLStripCharFilter over the provided Reader
* with the specified start and end tags.
- * @param source Reader to strip html tags from.
+ * @param in Reader to strip html tags from.
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
- public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
- super(source);
- this.zzReader = source;
+ public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
+ this(in);
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
index 7e5105d..31d3d96 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -251,7 +251,7 @@ class ClassicTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
- "Unkown internal scanner error",
+ "Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@@ -323,11 +323,11 @@ class ClassicTokenizerImpl {
private int yycolumn;
/**
- * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
- /** zzAtEOF == true <=> the scanner is at the EOF */
+ /** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@@ -436,28 +436,29 @@ public final void getText(CharTermAttribute t) {
}
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead;
- int totalRead = 0;
- while (totalRead < requested) {
- int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
- if (numRead == -1) {
- break;
- }
- totalRead += numRead;
- }
+ int requested = zzBuffer.length - zzEndRead;
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
- if (totalRead > 0) {
- zzEndRead += totalRead;
- if (totalRead == requested) { /* possibly more input available */
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ /* If numRead == requested, we might have requested too few chars to
+ encode a full Unicode character. We assume that a Reader would
+ otherwise never return half characters. */
+ if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
+ /* potentially more input available */
return false;
}
- // totalRead = 0: End of stream
+ /* numRead < 0 ==> end of stream */
return true;
}
@@ -681,55 +682,65 @@ public final void getText(CharTermAttribute t) {
// store back cached position
zzMarkedPos = zzMarkedPosL;
- switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore */
- }
- case 11: break;
- case 2:
- { return ALPHANUM;
- }
- case 12: break;
- case 3:
- { return CJ;
- }
- case 13: break;
- case 4:
- { return HOST;
- }
- case 14: break;
- case 5:
- { return NUM;
- }
- case 15: break;
- case 6:
- { return APOSTROPHE;
- }
- case 16: break;
- case 7:
- { return COMPANY;
- }
- case 17: break;
- case 8:
- { return ACRONYM_DEP;
- }
- case 18: break;
- case 9:
- { return ACRONYM;
- }
- case 19: break;
- case 10:
- { return EMAIL;
- }
- case 20: break;
- default:
- if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
- zzAtEOF = true;
- return YYEOF;
- }
- else {
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else {
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break;/* ignore */
+ }
+ // fall through
+ case 11: break;
+ case 2:
+ { return ALPHANUM;
+ }
+ // fall through
+ case 12: break;
+ case 3:
+ { return CJ;
+ }
+ // fall through
+ case 13: break;
+ case 4:
+ { return HOST;
+ }
+ // fall through
+ case 14: break;
+ case 5:
+ { return NUM;
+ }
+ // fall through
+ case 15: break;
+ case 6:
+ { return APOSTROPHE;
+ }
+ // fall through
+ case 16: break;
+ case 7:
+ { return COMPANY;
+ }
+ // fall through
+ case 17: break;
+ case 8:
+ { return ACRONYM_DEP;
+ }
+ // fall through
+ case 18: break;
+ case 9:
+ { return ACRONYM;
+ }
+ // fall through
+ case 19: break;
+ case 10:
+ { return EMAIL;
+ }
+ // fall through
+ case 20: break;
+ default:
zzScanError(ZZ_NO_MATCH);
- }
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8c65da6/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
index 842ae51..65848f2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
@@ -32,33 +32,32 @@ import org.apache.lucene.util.AttributeFactory;
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* URLs and email addresses are also tokenized according to the relevant RFCs.
- * <p>
- * Tokens produced are of the following types:
- * <ul>
- * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
- * <li><NUM>: A number</li>
- * <li><URL>: A URL</li>
- * <li><EMAIL>: An email address</li>
- * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
- * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
- * <li><HIRAGANA>: A single hiragana character</li>
- * </ul>
*/
public final class UAX29URLEmailTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final UAX29URLEmailTokenizerImpl scanner;
-
- public static final int ALPHANUM = 0;
- public static final int NUM = 1;
- public static final int SOUTHEAST_ASIAN = 2;
- public static final int IDEOGRAPHIC = 3;
- public static final int HIRAGANA = 4;
- public static final int KATAKANA = 5;
- public static final int HANGUL = 6;
- public static final int URL = 7;
- public static final int EMAIL = 8;
+
+ /** Alpha/numeric token type */
+ public static final int ALPHANUM = 0;
+ /** Numeric token type */
+ public static final int NUM = 1;
+ /** Southeast Asian token type */
+ public static final int SOUTHEAST_ASIAN = 2;
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC = 3;
+ /** Hiragana token type */
+ public static final int HIRAGANA = 4;
+ /** Katakana token type */
+ public static final int KATAKANA = 5;
+ /** Hangul token type */
+ public static final int HANGUL = 6;
+ /** URL token type */
+ public static final int URL = 7;
+ /** Email token type */
+ public static final int EMAIL = 8;
+ /** Emoji token type. */
+ public static final int EMOJI = 9;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@@ -71,6 +70,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
"<URL>",
"<EMAIL>",
+ StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
};
/** Absolute maximum sized token */