You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:22 UTC

[08/24] lucene-solr:master: LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.

LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/283b19a8
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/283b19a8
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/283b19a8

Branch: refs/heads/master
Commit: 283b19a8da6ab9e0b7e9a75b132d3067218d5502
Parents: 7db4121
Author: Steve Rowe <sa...@apache.org>
Authored: Tue Jan 8 13:33:49 2019 -0500
Committer: Steve Rowe <sa...@apache.org>
Committed: Tue Jan 8 13:33:49 2019 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |     8 +
 lucene/analysis/common/build.xml                |    32 +-
 .../charfilter/HTMLStripCharFilter.java         |   834 +-
 .../charfilter/HTMLStripCharFilter.jflex        |    22 +-
 .../analysis/standard/ClassicTokenizerImpl.java |   141 +-
 .../standard/UAX29URLEmailTokenizer.java        |    44 +-
 .../standard/UAX29URLEmailTokenizerImpl.java    | 74906 +++++++++--------
 .../standard/UAX29URLEmailTokenizerImpl.jflex   |   216 +-
 .../wikipedia/WikipediaTokenizerImpl.java       |   465 +-
 .../charfilter/HTMLStripCharFilterTest.java     |     2 +-
 .../standard/TestUAX29URLEmailAnalyzer.java     |     4 +-
 .../standard/TestUAX29URLEmailTokenizer.java    |    76 +-
 lucene/common-build.xml                         |    21 +-
 .../src/data/jflex/UnicodeEmojiProperties.jflex |    25 +
 .../src/data/jflex/getUnicodeEmojiProperties.pl |   168 +
 lucene/core/src/data/jflex/skeleton.default     |   342 +
 .../jflex/skeleton.disable.buffer.expansion.txt |   348 +
 .../standard/StandardTokenizerImpl.java         |   637 +-
 .../standard/StandardTokenizerImpl.jflex        |   206 +-
 .../analysis/standard/TestStandardAnalyzer.java |   131 +-
 .../EmojiTokenizationTestUnicode_11_0.java      | 10756 +++
 .../standard/WordBreakTestUnicode_6_3_0.java    |  5537 --
 .../standard/WordBreakTestUnicode_9_0_0.java    |  8276 ++
 .../standard/generateEmojiTokenizationTest.pl   |   150 +
 .../generateJavaUnicodeWordBreakTest.pl         |    41 +-
 25 files changed, 62395 insertions(+), 40993 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7169cf6..3b98955 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -241,6 +241,11 @@ Optimizations
 
 ======================= Lucene 7.7.0 =======================
 
+Changes in Runtime Behavior
+
+* LUCENE-8527: StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
+  and provide Unicode UTS#51 v11.0 Emoji tokenization with the "<EMOJI>" token type. 
+
 Build
 
 * LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core 
@@ -293,6 +298,9 @@ Improvements
 
 * LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
   (Ignacio Vera, Nick Knize, Adrien Grand)
+  
+* LUCENE-8527: Upgrade JFlex dependency to 1.7.0; in StandardTokenizer and UAX29URLEmailTokenizer,
+  increase supported Unicode version from 6.3 to 9.0, and support Unicode UTS#51 v11.0 Emoji tokenization.
 
 Optimizations
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/analysis/common/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/build.xml b/lucene/analysis/common/build.xml
index b8eb37a..f752ecc 100644
--- a/lucene/analysis/common/build.xml
+++ b/lucene/analysis/common/build.xml
@@ -33,18 +33,14 @@
 
   <property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
 
-  <target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
-                                -jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
-
-  <target name="-jflex-HTMLStripCharFilter"
-          depends="init,generate-jflex-html-char-entities">
-    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
-           outdir="src/java/org/apache/lucene/analysis/charfilter"
-           nobak="on" inputstreamctor="false"/>
-    <!-- Remove the inappropriate JFlex-generated constructor -->
-    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
-                   match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
-                   replace="" flags="s"/>
+  <!-- Because of a bug in JFlex's ant task, HTMLStripCharFilter has to be generated last.   -->
+  <!-- Otherwise the "%apiprivate" option used in its specification will leak into following -->
+  <!-- ant task invocations.                                                                 -->
+  <target name="jflex" depends="init,clean-jflex,-jflex-wiki-tokenizer,-jflex-ClassicAnalyzer,
+                                -jflex-UAX29URLEmailTokenizer,-jflex-HTMLStripCharFilter"/>
+
+  <target name="-jflex-HTMLStripCharFilter" depends="-install-jflex,generate-jflex-html-char-entities">
+    <run-jflex dir="src/java/org/apache/lucene/analysis/charfilter" name="HTMLStripCharFilter"/>
   </target>
 
   <target name="generate-jflex-html-char-entities">
@@ -58,17 +54,17 @@
     <fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
   </target>
 
-  <target name="-jflex-wiki-tokenizer" depends="init,-install-jflex">
+  <target name="-jflex-wiki-tokenizer" depends="-install-jflex">
     <run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
   </target>
 
-  <target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
-    <run-jflex-and-disable-buffer-expansion
-        dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
+  <target name="-jflex-ClassicAnalyzer" depends="-install-jflex">
+    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
   </target>
 
-  <target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
-    <run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
+  <target name="-jflex-UAX29URLEmailTokenizer" depends="-install-jflex">
+    <run-jflex-and-disable-buffer-expansion
+        dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
   </target>
 
   <target name="clean-jflex">

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
index a236497..ae67bde 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -95,127 +95,152 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     "\32\0\1\41\11\0\1\1\12\0\1\1\1\0\1\2\2\0\1\1"+
     "\5\0\27\1\1\0\37\1\1\0\u01ca\1\4\0\14\1\16\0\5\1"+
     "\7\0\1\1\1\0\1\1\21\0\160\2\5\1\1\0\2\1\2\0"+
-    "\4\1\10\0\1\1\1\2\3\1\1\0\1\1\1\0\24\1\1\0"+
-    "\123\1\1\0\213\1\1\0\5\2\2\0\236\1\11\0\46\1\2\0"+
-    "\1\1\7\0\47\1\11\0\55\2\1\0\1\2\1\0\2\2\1\0"+
-    "\2\2\1\0\1\2\10\0\33\1\5\0\3\1\35\0\13\2\5\0"+
-    "\53\1\25\2\12\111\4\0\2\1\1\2\143\1\1\0\1\1\7\2"+
-    "\2\0\6\2\2\1\2\2\1\0\4\2\2\1\12\111\3\1\2\0"+
-    "\1\1\20\0\1\1\1\2\36\1\33\2\2\0\131\1\13\2\1\1"+
-    "\16\0\12\111\41\1\11\2\2\1\4\0\1\1\5\0\26\1\4\2"+
-    "\1\1\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\104\0"+
-    "\1\1\1\0\13\1\67\0\33\2\1\0\4\2\66\1\3\2\1\1"+
-    "\22\2\1\1\7\2\12\1\2\2\2\0\12\111\1\0\7\1\1\0"+
-    "\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
-    "\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1\7\2\2\0"+
-    "\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1\1\0\3\1"+
-    "\2\2\2\0\12\111\2\1\17\0\3\2\1\0\6\1\4\0\2\1"+
-    "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1\1\0\2\1"+
-    "\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2\3\0\1\2"+
-    "\7\0\4\1\1\0\1\1\7\0\12\111\2\2\3\1\1\2\13\0"+
-    "\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0\7\1\1\0"+
-    "\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0\3\2\1\0"+
-    "\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\111\21\0\3\2"+
-    "\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1"+
-    "\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2\2\0\3\2"+
-    "\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0\12\111\1\0"+
-    "\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1\1\0\4\1"+
-    "\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1\3\0\3\1"+
-    "\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2\2\0\1\1"+
-    "\6\0\1\2\16\0\12\111\21\0\3\2\1\0\10\1\1\0\3\1"+
-    "\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1\7\2\1\0"+
-    "\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0\2\1\2\2"+
-    "\2\0\12\111\22\0\2\2\1\0\10\1\1\0\3\1\1\0\27\1"+
+    "\4\1\1\0\1\1\6\0\1\1\1\2\3\1\1\0\1\1\1\0"+
+    "\24\1\1\0\123\1\1\0\213\1\1\0\5\2\2\0\246\1\1\0"+
+    "\46\1\2\0\1\1\7\0\47\1\11\0\55\2\1\0\1\2\1\0"+
+    "\2\2\1\0\2\2\1\0\1\2\10\0\33\1\5\0\3\1\35\0"+
+    "\13\2\5\0\53\1\25\2\12\111\4\0\2\1\1\2\143\1\1\0"+
+    "\1\1\7\2\2\0\6\2\2\1\2\2\1\0\4\2\2\1\12\111"+
+    "\3\1\2\0\1\1\20\0\1\1\1\2\36\1\33\2\2\0\131\1"+
+    "\13\2\1\1\16\0\12\111\41\1\11\2\2\1\4\0\1\1\5\0"+
+    "\26\1\4\2\1\1\11\2\1\1\3\2\1\1\5\2\22\0\31\1"+
+    "\3\2\104\0\25\1\1\0\10\1\26\0\16\2\1\0\41\2\66\1"+
+    "\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\111\1\0"+
+    "\20\1\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
+    "\1\0\1\1\3\0\4\1\2\0\1\2\1\1\7\2\2\0\2\2"+
+    "\2\0\3\2\1\1\10\0\1\2\4\0\2\1\1\0\3\1\2\2"+
+    "\2\0\12\111\2\1\17\0\3\2\1\0\6\1\4\0\2\1\2\0"+
+    "\26\1\1\0\7\1\1\0\2\1\1\0\2\1\1\0\2\1\2\0"+
+    "\1\2\1\0\5\2\4\0\2\2\2\0\3\2\3\0\1\2\7\0"+
+    "\4\1\1\0\1\1\7\0\12\111\2\2\3\1\1\2\13\0\3\2"+
+    "\1\0\11\1\1\0\3\1\1\0\26\1\1\0\7\1\1\0\2\1"+
+    "\1\0\5\1\2\0\1\2\1\1\10\2\1\0\3\2\1\0\3\2"+
+    "\2\0\1\1\17\0\2\1\2\2\2\0\12\111\11\0\1\1\7\0"+
+    "\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0"+
+    "\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2\2\0"+
+    "\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0\12\111"+
+    "\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1\1\0"+
+    "\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1\3\0"+
+    "\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2\2\0"+
+    "\1\1\6\0\1\2\16\0\12\111\20\0\4\2\1\0\10\1\1\0"+
+    "\3\1\1\0\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2"+
+    "\1\0\4\2\7\0\2\2\1\0\3\1\5\0\2\1\2\2\2\0"+
+    "\12\111\20\0\1\1\3\2\1\0\10\1\1\0\3\1\1\0\27\1"+
     "\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2\1\0\3\2"+
     "\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1\2\2\2\0"+
-    "\12\111\1\0\2\1\17\0\2\2\1\0\10\1\1\0\3\1\1\0"+
-    "\51\1\2\0\1\1\7\2\1\0\3\2\1\0\4\2\1\1\10\0"+
-    "\1\2\10\0\2\1\2\2\2\0\12\111\12\0\6\1\2\0\2\2"+
-    "\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0\7\1"+
-    "\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0\2\2"+
-    "\15\0\60\1\1\2\2\1\7\2\5\0\7\1\10\2\1\0\12\111"+
-    "\47\0\2\1\1\0\1\1\2\0\2\1\1\0\1\1\2\0\1\1"+
-    "\6\0\4\1\1\0\7\1\1\0\3\1\1\0\1\1\1\0\1\1"+
-    "\2\0\2\1\1\0\4\1\1\2\2\1\6\2\1\0\2\2\1\1"+
-    "\2\0\5\1\1\0\1\1\1\0\6\2\2\0\12\111\2\0\4\1"+
-    "\40\0\1\1\27\0\2\2\6\0\12\111\13\0\1\2\1\0\1\2"+
-    "\1\0\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2\1\0"+
-    "\2\2\5\1\13\2\1\0\44\2\11\0\1\2\71\0\53\1\24\2"+
-    "\1\1\12\111\6\0\6\1\4\2\4\1\3\2\1\1\3\2\2\1"+
-    "\7\2\3\1\4\2\15\1\14\2\1\1\1\2\12\111\4\2\2\0"+
-    "\46\1\1\0\1\1\5\0\1\1\2\0\53\1\1\0\u014d\1\1\0"+
-    "\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0"+
-    "\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0"+
-    "\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0"+
-    "\3\2\11\0\11\2\16\0\20\1\20\0\125\1\14\0\u026c\1\2\0"+
-    "\21\1\1\41\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
-    "\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
-    "\1\0\3\1\1\0\2\2\14\0\64\1\40\2\3\0\1\1\4\0"+
-    "\1\1\1\2\2\0\12\111\41\0\3\2\2\0\12\111\6\0\130\1"+
-    "\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
-    "\4\0\14\2\12\0\12\111\36\1\2\0\5\1\13\0\54\1\4\0"+
-    "\21\2\7\1\2\2\6\0\12\111\1\2\45\0\27\1\5\2\4\0"+
-    "\65\1\12\2\1\0\35\2\2\0\1\2\12\111\6\0\12\111\15\0"+
-    "\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\111\21\0\11\2"+
-    "\14\0\3\2\36\1\15\2\2\1\12\111\54\1\16\2\14\0\44\1"+
-    "\24\2\10\0\12\111\3\0\3\1\12\111\44\1\122\0\3\2\1\0"+
-    "\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2\25\0"+
-    "\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1"+
-    "\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1"+
-    "\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1"+
-    "\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41"+
-    "\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41"+
-    "\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2"+
-    "\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1"+
-    "\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1"+
-    "\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1"+
-    "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1"+
-    "\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2"+
-    "\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
-    "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0"+
-    "\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1"+
-    "\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0"+
-    "\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0"+
-    "\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\111"+
-    "\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0\1\2"+
-    "\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
-    "\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2\27\1"+
-    "\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\111\6\0"+
-    "\22\2\6\1\3\0\1\1\4\0\12\111\34\1\10\2\2\0\27\1"+
-    "\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\111"+
-    "\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\111"+
-    "\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1\3\2"+
-    "\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1\2\0"+
-    "\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1\2\0"+
-    "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
-    "\2\0\12\111\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u016e\1"+
-    "\2\0\152\1\46\0\7\1\14\0\5\1\5\0\1\1\1\2\12\1"+
-    "\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1\1\0\2\1"+
-    "\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1"+
-    "\4\0\20\2\20\0\7\2\14\0\2\2\30\0\3\2\40\0\5\1"+
-    "\1\0\207\1\23\0\12\111\7\0\32\1\4\0\1\2\1\0\32\1"+
-    "\13\0\131\1\3\0\6\1\2\0\6\1\2\0\6\1\2\0\3\1"+
-    "\43\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1\1\0\17\1"+
-    "\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\2\202\0\35\1"+
-    "\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1\2\0\44\1"+
-    "\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\111\u0356\0\6\1"+
-    "\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1\2\0\27\1"+
-    "\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1\100\0\1\1"+
-    "\3\2\1\0\2\2\5\0\4\2\4\1\1\0\3\1\1\0\33\1"+
-    "\4\0\3\2\4\0\1\2\40\0\35\1\203\0\66\1\12\0\26\1"+
-    "\12\0\23\1\215\0\111\1\u03b7\0\3\2\65\1\17\2\37\0\12\111"+
-    "\20\0\3\2\55\1\13\2\25\0\31\1\7\0\12\111\6\0\3\2"+
-    "\44\1\16\2\1\0\12\111\100\0\3\2\60\1\16\2\4\1\13\0"+
-    "\12\111\u04a6\0\53\1\15\2\10\0\12\111\u0936\0\u036f\1\221\0\143\1"+
-    "\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1\13\0\1\1\56\2\20\0"+
-    "\4\2\15\1\u4060\0\2\1\u2163\0\5\2\3\0\6\2\10\0\10\2"+
-    "\2\0\7\2\36\0\4\2\224\0\3\2\u01bb\0\125\1\1\0\107\1"+
-    "\1\0\2\1\2\0\1\1\2\0\2\1\2\0\4\1\1\0\14\1"+
-    "\1\0\1\1\1\0\7\1\1\0\101\1\1\0\4\1\2\0\10\1"+
-    "\1\0\7\1\1\0\34\1\1\0\4\1\1\0\5\1\1\0\1\1"+
-    "\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0\31\1\1\0\37\1"+
-    "\1\0\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1"+
-    "\1\0\37\1\1\0\31\1\1\0\10\1\2\0\62\111\u1600\0\4\1"+
+    "\12\111\1\0\2\1\16\0\3\2\1\0\10\1\1\0\3\1\1\0"+
+    "\51\1\2\0\1\1\7\2\1\0\3\2\1\0\4\2\1\1\5\0"+
+    "\3\1\1\2\7\0\3\1\2\2\2\0\12\111\12\0\6\1\2\0"+
+    "\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
+    "\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\6\0"+
+    "\12\111\2\0\2\2\15\0\60\1\1\2\2\1\7\2\5\0\7\1"+
+    "\10\2\1\0\12\111\47\0\2\1\1\0\1\1\2\0\2\1\1\0"+
+    "\1\1\2\0\1\1\6\0\4\1\1\0\7\1\1\0\3\1\1\0"+
+    "\1\1\1\0\1\1\2\0\2\1\1\0\4\1\1\2\2\1\6\2"+
+    "\1\0\2\2\1\1\2\0\5\1\1\0\1\1\1\0\6\2\2\0"+
+    "\12\111\2\0\4\1\40\0\1\1\27\0\2\2\6\0\12\111\13\0"+
+    "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
+    "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
+    "\71\0\53\1\24\2\1\1\12\111\6\0\6\1\4\2\4\1\3\2"+
+    "\1\1\3\2\2\1\7\2\3\1\4\2\15\1\14\2\1\1\1\2"+
+    "\12\111\4\2\2\0\46\1\1\0\1\1\5\0\1\1\2\0\53\1"+
+    "\1\0\u014d\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1"+
+    "\2\0\51\1\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1"+
+    "\1\0\1\1\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1"+
+    "\2\0\103\1\2\0\3\2\11\0\11\2\16\0\20\1\20\0\126\1"+
+    "\2\0\6\1\3\0\u026c\1\2\0\21\1\1\41\32\1\5\0\113\1"+
+    "\3\0\13\1\7\0\15\1\1\0\4\1\3\2\13\0\22\1\3\2"+
+    "\13\0\22\1\2\2\14\0\15\1\1\0\3\1\1\0\2\2\14\0"+
+    "\64\1\40\2\3\0\1\1\4\0\1\1\1\2\2\0\12\111\41\0"+
+    "\3\2\2\0\12\111\6\0\130\1\10\0\51\1\1\2\1\1\5\0"+
+    "\106\1\12\0\37\1\1\0\14\2\4\0\14\2\12\0\12\111\36\1"+
+    "\2\0\5\1\13\0\54\1\4\0\32\1\6\0\12\111\1\2\45\0"+
+    "\27\1\5\2\4\0\65\1\12\2\1\0\35\2\2\0\1\2\12\111"+
+    "\6\0\12\111\15\0\1\1\10\0\16\2\102\0\5\2\57\1\21\2"+
+    "\7\1\4\0\12\111\21\0\11\2\14\0\3\2\36\1\15\2\2\1"+
+    "\12\111\54\1\16\2\14\0\44\1\24\2\10\0\12\111\3\0\3\1"+
+    "\12\111\44\1\2\0\11\1\107\0\3\2\1\0\25\2\4\1\1\2"+
+    "\4\1\3\2\2\1\1\0\2\2\6\0\300\1\66\2\5\0\5\2"+
+    "\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0"+
+    "\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0"+
+    "\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0"+
+    "\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41\35\0"+
+    "\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41\21\0"+
+    "\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2\3\0"+
+    "\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1\2\0"+
+    "\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1\2\0"+
+    "\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1\1\0"+
+    "\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1\1\0"+
+    "\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2\27\1"+
+    "\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
+    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0\1\41"+
+    "\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1\4\0"+
+    "\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0\51\1"+
+    "\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0\u51d6\1"+
+    "\52\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\111\2\1"+
+    "\24\0\57\1\1\2\4\0\12\2\1\0\37\1\2\2\120\1\2\2"+
+    "\45\0\11\1\2\0\147\1\2\0\44\1\1\0\10\1\77\0\13\1"+
+    "\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0\64\1\14\0"+
+    "\2\2\62\1\22\2\12\0\12\111\6\0\22\2\6\1\3\0\1\1"+
+    "\1\0\1\1\2\0\12\111\34\1\10\2\2\0\27\1\15\2\14\0"+
+    "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\111\6\0\5\1"+
+    "\1\2\12\1\12\111\5\1\1\0\51\1\16\2\11\0\3\1\1\2"+
+    "\10\1\2\2\2\0\12\111\6\0\27\1\3\0\1\1\3\2\62\1"+
+    "\1\2\1\1\3\2\2\1\2\2\5\1\2\2\1\1\1\2\1\1"+
+    "\30\0\3\1\2\0\13\1\5\2\2\0\3\1\2\2\12\0\6\1"+
+    "\2\0\6\1\2\0\6\1\11\0\7\1\1\0\7\1\1\0\53\1"+
+    "\1\0\12\1\12\0\163\1\10\2\1\0\2\2\2\0\12\111\6\0"+
+    "\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u016e\1\2\0\152\1\46\0"+
+    "\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0"+
+    "\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0"+
+    "\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2\20\0"+
+    "\20\2\3\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1\23\0"+
+    "\12\111\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1\3\0"+
+    "\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0\14\1\1\0"+
+    "\32\1\1\0\23\1\1\0\2\1\1\0\17\1\2\0\16\1\42\0"+
+    "\173\1\105\0\65\1\210\0\1\2\202\0\35\1\3\0\61\1\17\0"+
+    "\1\2\37\0\40\1\20\0\33\1\5\0\46\1\5\2\5\0\36\1"+
+    "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\111"+
+    "\6\0\44\1\4\0\44\1\4\0\50\1\10\0\64\1\234\0\u0137\1"+
+    "\11\0\26\1\12\0\10\1\230\0\6\1\2\0\1\1\1\0\54\1"+
+    "\1\0\2\1\3\0\1\1\2\0\27\1\12\0\27\1\11\0\37\1"+
+    "\101\0\23\1\1\0\2\1\12\0\26\1\12\0\32\1\106\0\70\1"+
+    "\6\0\2\1\100\0\1\1\3\2\1\0\2\2\5\0\4\2\4\1"+
+    "\1\0\3\1\1\0\33\1\4\0\3\2\4\0\1\2\40\0\35\1"+
+    "\3\0\35\1\43\0\10\1\1\0\34\1\2\2\31\0\66\1\12\0"+
+    "\26\1\12\0\23\1\15\0\22\1\156\0\111\1\67\0\63\1\15\0"+
+    "\63\1\u030d\0\3\2\65\1\17\2\37\0\12\111\17\0\4\2\55\1"+
+    "\13\2\25\0\31\1\7\0\12\111\6\0\3\2\44\1\16\2\1\0"+
+    "\12\111\20\0\43\1\1\2\2\0\1\1\11\0\3\2\60\1\16\2"+
+    "\4\1\5\0\3\2\3\0\12\111\1\1\1\0\1\1\43\0\22\1"+
+    "\1\0\31\1\14\2\6\0\1\2\101\0\7\1\1\0\1\1\1\0"+
+    "\4\1\1\0\17\1\1\0\12\1\7\0\57\1\14\2\5\0\12\111"+
+    "\6\0\4\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
+    "\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
+    "\2\0\3\2\2\0\1\1\6\0\1\2\5\0\5\1\2\2\2\0"+
+    "\7\2\3\0\5\2\213\0\65\1\22\2\4\1\5\0\12\111\46\0"+
+    "\60\1\24\2\2\1\1\0\1\1\10\0\12\111\246\0\57\1\7\2"+
+    "\2\0\11\2\27\0\4\1\2\2\42\0\60\1\21\2\3\0\1\1"+
+    "\13\0\12\111\46\0\53\1\15\2\10\0\12\111\66\0\32\1\3\0"+
+    "\17\2\4\0\12\111\u0166\0\100\1\12\111\25\0\1\1\u01c0\0\71\1"+
+    "\u0107\0\11\1\1\0\45\1\10\2\1\0\10\2\1\1\17\0\12\111"+
+    "\30\0\36\1\2\0\26\2\1\0\16\2\u0349\0\u039a\1\146\0\157\1"+
+    "\21\0\304\1\u0abc\0\u042f\1\u0fd1\0\u0247\1\u21b9\0\u0239\1\7\0\37\1"+
+    "\1\0\12\111\146\0\36\1\2\0\5\2\13\0\60\1\7\2\11\0"+
+    "\4\1\14\0\12\111\11\0\25\1\5\0\23\1\u0370\0\105\1\13\0"+
+    "\1\1\56\2\20\0\4\2\15\1\100\0\1\1\37\0\u17ed\1\23\0"+
+    "\u02f3\1\u250d\0\2\1\u0bfe\0\153\1\5\0\15\1\3\0\11\1\7\0"+
+    "\12\1\3\0\2\2\u14c6\0\5\2\3\0\6\2\10\0\10\2\2\0"+
+    "\7\2\36\0\4\2\224\0\3\2\u01bb\0\125\1\1\0\107\1\1\0"+
+    "\2\1\2\0\1\1\2\0\2\1\2\0\4\1\1\0\14\1\1\0"+
+    "\1\1\1\0\7\1\1\0\101\1\1\0\4\1\2\0\10\1\1\0"+
+    "\7\1\1\0\34\1\1\0\4\1\1\0\5\1\1\0\1\1\3\0"+
+    "\7\1\1\0\u0154\1\2\0\31\1\1\0\31\1\1\0\37\1\1\0"+
+    "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
+    "\37\1\1\0\31\1\1\0\10\1\2\0\62\111\u0200\0\67\2\4\0"+
+    "\62\2\10\0\1\2\16\0\1\2\26\0\5\2\1\0\17\2\u0550\0"+
+    "\7\2\1\0\21\2\2\0\7\2\1\0\2\2\1\0\5\2\u07d5\0"+
+    "\305\1\13\0\7\2\51\0\104\1\7\2\5\0\12\111\u04a6\0\4\1"+
     "\1\0\33\1\1\0\2\1\1\0\1\1\2\0\1\1\1\0\12\1"+
     "\1\0\4\1\1\0\1\1\1\0\1\1\6\0\1\1\4\0\1\1"+
     "\1\0\1\1\1\0\1\1\1\0\3\1\1\0\2\1\1\0\1\1"+
@@ -223,7 +248,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     "\1\0\2\1\1\0\1\1\2\0\4\1\1\0\7\1\1\0\4\1"+
     "\1\0\4\1\1\0\1\1\1\0\12\1\1\0\21\1\5\0\3\1"+
     "\1\0\5\1\1\0\21\1\u1144\0\ua6d7\1\51\0\u1035\1\13\0\336\1"+
-    "\u3fe2\0\u021e\1\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u06ed\0"+
+    "\2\0\u1682\1\u295e\0\u021e\1\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u06ed\0"+
     "\360\2\uffff\0\uffff\0\ufe12\0";
 
   /** 
@@ -29654,7 +29679,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
 
   /* error messages for the codes above */
   private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
+    "Unknown internal scanner error",
     "Error: could not match input",
     "Error: pushback value was too large"
   };
@@ -29809,11 +29834,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
   private int yycolumn;
 
   /** 
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
    */
   private boolean zzAtBOL = true;
 
-  /** zzAtEOF == true <=> the scanner is at the EOF */
+  /** zzAtEOF == true iff the scanner is at the EOF */
   private boolean zzAtEOF;
 
   /** denotes if the user-EOF-code has already been executed */
@@ -29950,24 +29975,14 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
   private TextSegment entitySegment = new TextSegment(2);
 
   /**
-   * Creates a new HTMLStripCharFilter over the provided Reader.
-   * @param source Reader to strip html tags from.
-   */
-  public HTMLStripCharFilter(Reader source) {
-    super(source);
-    this.zzReader = source;
-  }
-
-  /**
    * Creates a new HTMLStripCharFilter over the provided Reader
    * with the specified start and end tags.
-   * @param source Reader to strip html tags from.
+   * @param in Reader to strip html tags from.
    * @param escapedTags Tags in this set (both start and end tags)
    *  will not be filtered out.
    */
-  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
-    super(source);
-    this.zzReader = source;
+  public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
+    this(in);
     if (null != escapedTags) {
       for (String tag : escapedTags) {
         if (tag.equalsIgnoreCase("BR")) {
@@ -30059,7 +30074,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
   }
 
 
-  
+  /**
+   * Creates a new scanner
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public HTMLStripCharFilter(java.io.Reader in) {
+    super(in);
+    this.zzReader = in;
+  }
 
 
   /** 
@@ -30072,7 +30095,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     char [] map = new char[0x110000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 2836) {
+    while (i < 3340) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -30116,28 +30139,29 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     }
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead;           
-    int totalRead = 0;
-    while (totalRead < requested) {
-      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
-      if (numRead == -1) {
-        break;
-      }
-      totalRead += numRead;
-    }
+    int requested = zzBuffer.length - zzEndRead;
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
 
-    if (totalRead > 0) {
-      zzEndRead += totalRead;
-      if (totalRead == requested) { /* possibly more input available */
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      /* If numRead == requested, we might have requested too few chars to
+         encode a full Unicode character. We assume that a Reader would
+         otherwise never return half characters. */
+      if (numRead == requested) {
         if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
           --zzEndRead;
           zzFinalHighSurrogate = 1;
         }
       }
+      /* potentially more input available */
       return false;
     }
 
-    // totalRead = 0: End of stream
+    /* numRead < 0 ==> end of stream */
     return true;
   }
 
@@ -30420,43 +30444,55 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       // store back cached position
       zzMarkedPos = zzMarkedPosL;
 
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 1: 
-          { if (yylength() == 1) {
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
+            zzDoEOF();
+          {   return eofReturnValue;
+ }
+      }
+      else {
+        switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+          case 1: 
+            { if (yylength() == 1) {
     return zzBuffer[zzStartRead];
   } else {
     outputSegment.append(yytext()); return outputSegment.nextChar();
   }
-          }
-        case 55: break;
-        case 2: 
-          { inputStart = yychar;
+            } 
+            // fall through
+          case 55: break;
+          case 2: 
+            { inputStart = yychar;
   inputSegment.clear();
   inputSegment.append('<');
   yybegin(LEFT_ANGLE_BRACKET);
-          }
-        case 56: break;
-        case 3: 
-          { inputStart = yychar;
+            } 
+            // fall through
+          case 56: break;
+          case 3: 
+            { inputStart = yychar;
   inputSegment.clear();
   inputSegment.append('&');
   yybegin(AMPERSAND);
-          }
-        case 57: break;
-        case 4: 
-          { yypushback(yylength());
+            } 
+            // fall through
+          case 57: break;
+          case 4: 
+            { yypushback(yylength());
     outputSegment = inputSegment;
     outputSegment.restart();
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
-          }
-        case 58: break;
-        case 5: 
-          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
-          }
-        case 59: break;
-        case 6: 
-          { int matchLength = yylength();
+            } 
+            // fall through
+          case 58: break;
+          case 5: 
+            { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
+            } 
+            // fall through
+          case 59: break;
+          case 6: 
+            { int matchLength = yylength();
     inputSegment.write(zzBuffer, zzStartRead, matchLength);
     if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
       String decimalCharRef = yytext();
@@ -30487,180 +30523,206 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       yybegin(YYINITIAL);
       return outputSegment.nextChar();
     }
-          }
-        case 60: break;
-        case 7: 
-          { // add (previously matched input length) + (this match length) - (substitution length)
+            } 
+            // fall through
+          case 60: break;
+          case 7: 
+            { // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
     // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
-          }
-        case 61: break;
-        case 8: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 61: break;
+          case 8: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
         && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
       yybegin(START_TAG_TAIL_INCLUDE);
     } else {
       yybegin(START_TAG_TAIL_SUBSTITUTE);
     }
-          }
-        case 62: break;
-        case 9: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 62: break;
+          case 9: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
         && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
       yybegin(START_TAG_TAIL_INCLUDE);
     } else {
       yybegin(START_TAG_TAIL_EXCLUDE);
     }
-          }
-        case 63: break;
-        case 10: 
-          { inputSegment.append('!'); yybegin(BANG);
-          }
-        case 64: break;
-        case 11: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 63: break;
+          case 10: 
+            { inputSegment.append('!'); yybegin(BANG);
+            } 
+            // fall through
+          case 64: break;
+          case 11: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
     yybegin(LEFT_ANGLE_BRACKET_SPACE);
-          }
-        case 65: break;
-        case 12: 
-          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
-          }
-        case 66: break;
-        case 13: 
-          { inputSegment.append(yytext());
-          }
-        case 67: break;
-        case 14: 
-          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+            } 
+            // fall through
+          case 65: break;
+          case 12: 
+            { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
+            } 
+            // fall through
+          case 66: break;
+          case 13: 
+            { inputSegment.append(yytext());
+            } 
+            // fall through
+          case 67: break;
+          case 14: 
+            { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
     // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
-          }
-        case 68: break;
-        case 15: 
-          { 
-          }
-        case 69: break;
-        case 16: 
-          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
-          }
-        case 70: break;
-        case 17: 
-          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
-          }
-        case 71: break;
-        case 18: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 68: break;
+          case 15: 
+            { 
+            } 
+            // fall through
+          case 69: break;
+          case 16: 
+            { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
+            } 
+            // fall through
+          case 70: break;
+          case 17: 
+            { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
+            } 
+            // fall through
+          case 71: break;
+          case 18: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
         && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
       yybegin(END_TAG_TAIL_INCLUDE);
     } else {
       yybegin(END_TAG_TAIL_SUBSTITUTE);
     }
-          }
-        case 72: break;
-        case 19: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 72: break;
+          case 19: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
         && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
       yybegin(END_TAG_TAIL_INCLUDE);
     } else {
       yybegin(END_TAG_TAIL_EXCLUDE);
     }
-          }
-        case 73: break;
-        case 20: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
-          }
-        case 74: break;
-        case 21: 
-          { if (yylength() == 1) {
+            } 
+            // fall through
+          case 73: break;
+          case 20: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 74: break;
+          case 21: 
+            { if (yylength() == 1) {
       return zzBuffer[zzStartRead];
     } else {
       outputSegment.append(yytext()); return outputSegment.nextChar();
     }
-          }
-        case 75: break;
-        case 22: 
-          { previousRestoreState = restoreState;
+            } 
+            // fall through
+          case 75: break;
+          case 22: 
+            { previousRestoreState = restoreState;
     restoreState = SERVER_SIDE_INCLUDE;
     yybegin(SINGLE_QUOTED_STRING);
-          }
-        case 76: break;
-        case 23: 
-          { previousRestoreState = restoreState;
+            } 
+            // fall through
+          case 76: break;
+          case 23: 
+            { previousRestoreState = restoreState;
     restoreState = SERVER_SIDE_INCLUDE;
     yybegin(DOUBLE_QUOTED_STRING);
-          }
-        case 77: break;
-        case 24: 
-          { yybegin(restoreState); restoreState = previousRestoreState;
-          }
-        case 78: break;
-        case 25: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+            } 
+            // fall through
+          case 77: break;
+          case 24: 
+            { yybegin(restoreState); restoreState = previousRestoreState;
+            } 
+            // fall through
+          case 78: break;
+          case 25: 
+            { inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
-          }
-        case 79: break;
-        case 26: 
-          { // add (previously matched input length) + (this match length) - (substitution length)
+            } 
+            // fall through
+          case 79: break;
+          case 26: 
+            { // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - 1;
     // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
     return BLOCK_LEVEL_END_TAG_REPLACEMENT;
-          }
-        case 80: break;
-        case 27: 
-          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+            } 
+            // fall through
+          case 80: break;
+          case 27: 
+            { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
     // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     outputSegment = inputSegment;
     yybegin(YYINITIAL);
-          }
-        case 81: break;
-        case 28: 
-          { // add (previously matched input length) + (this match length) - (substitution length)
+            } 
+            // fall through
+          case 81: break;
+          case 28: 
+            { // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - 1;
     // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
     return BLOCK_LEVEL_START_TAG_REPLACEMENT;
-          }
-        case 82: break;
-        case 29: 
-          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
-          }
-        case 83: break;
-        case 30: 
-          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
-          }
-        case 84: break;
-        case 31: 
-          { int length = yylength();
+            } 
+            // fall through
+          case 82: break;
+          case 29: 
+            { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
+            } 
+            // fall through
+          case 83: break;
+          case 30: 
+            { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
+            } 
+            // fall through
+          case 84: break;
+          case 31: 
+            { int length = yylength();
     inputSegment.write(zzBuffer, zzStartRead, length);
     entitySegment.clear();
     char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
     entitySegment.append(ch);
     outputSegment = entitySegment;
     yybegin(CHARACTER_REFERENCE_TAIL);
-          }
-        case 85: break;
-        case 32: 
-          { int matchLength = yylength();
+            } 
+            // fall through
+          case 85: break;
+          case 32: 
+            { int matchLength = yylength();
     inputSegment.write(zzBuffer, zzStartRead, matchLength);
     if (matchLength <= 6) { // 10FFFF: max 6 hex chars
       String hexCharRef
@@ -30692,18 +30754,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       yybegin(YYINITIAL);
       return outputSegment.nextChar();
     }
-          }
-        case 86: break;
-        case 33: 
-          { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
+            } 
+            // fall through
+          case 86: break;
+          case 33: 
+            { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a comment
       inputSegment.append(yytext());
     } else {
       yybegin(COMMENT);
     }
-          }
-        case 87: break;
-        case 34: 
-          { yybegin(YYINITIAL);
+            } 
+            // fall through
+          case 87: break;
+          case 34: 
+            { yybegin(YYINITIAL);
     if (escapeBR) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
@@ -30716,23 +30780,26 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       inputSegment.reset();
       return BR_START_TAG_REPLACEMENT;
     }
-          }
-        case 88: break;
-        case 35: 
-          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
+            } 
+            // fall through
+          case 88: break;
+          case 35: 
+            { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
     cumulativeDiff += yychar - inputStart + yylength();
     // position the correction at (already output length) [ + (substitution length) = 0]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
-          }
-        case 89: break;
-        case 36: 
-          { yybegin(SCRIPT);
-          }
-        case 90: break;
-        case 37: 
-          { yybegin(YYINITIAL);
+            } 
+            // fall through
+          case 89: break;
+          case 36: 
+            { yybegin(SCRIPT);
+            } 
+            // fall through
+          case 90: break;
+          case 37: 
+            { yybegin(YYINITIAL);
     if (escapeBR) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
@@ -30745,66 +30812,77 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       inputSegment.reset();
       return BR_END_TAG_REPLACEMENT;
     }
-          }
-        case 91: break;
-        case 38: 
-          { // add (this match length) [ - (substitution length) = 0 ]
+            } 
+            // fall through
+          case 91: break;
+          case 38: 
+            { // add (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += yylength();
     // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     yybegin(YYINITIAL);
-          }
-        case 92: break;
-        case 39: 
-          { yybegin(restoreState);
-          }
-        case 93: break;
-        case 40: 
-          { yybegin(STYLE);
-          }
-        case 94: break;
-        case 41: 
-          { yybegin(SCRIPT_COMMENT);
-          }
-        case 95: break;
-        case 42: 
-          { yybegin(STYLE_COMMENT);
-          }
-        case 96: break;
-        case 43: 
-          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
-          }
-        case 97: break;
-        case 44: 
-          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
-          }
-        case 98: break;
-        case 45: 
-          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
-          }
-        case 99: break;
-        case 46: 
-          { yybegin(STYLE);
+            } 
+            // fall through
+          case 92: break;
+          case 39: 
+            { yybegin(restoreState);
+            } 
+            // fall through
+          case 93: break;
+          case 40: 
+            { yybegin(STYLE);
+            } 
+            // fall through
+          case 94: break;
+          case 41: 
+            { yybegin(SCRIPT_COMMENT);
+            } 
+            // fall through
+          case 95: break;
+          case 42: 
+            { yybegin(STYLE_COMMENT);
+            } 
+            // fall through
+          case 96: break;
+          case 43: 
+            { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+            } 
+            // fall through
+          case 97: break;
+          case 44: 
+            { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+            } 
+            // fall through
+          case 98: break;
+          case 45: 
+            { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+            } 
+            // fall through
+          case 99: break;
+          case 46: 
+            { yybegin(STYLE);
     if (escapeSTYLE) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       inputStart += 1 + yylength();
       return outputSegment.nextChar();
     }
-          }
-        case 100: break;
-        case 47: 
-          { yybegin(SCRIPT);
+            } 
+            // fall through
+          case 100: break;
+          case 47: 
+            { yybegin(SCRIPT);
     if (escapeSCRIPT) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       inputStart += 1 + yylength();
       return outputSegment.nextChar();
     }
-          }
-        case 101: break;
-        case 48: 
-          { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
+            } 
+            // fall through
+          case 101: break;
+          case 48: 
+            { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section
       inputSegment.append(yytext());
     } else {
       // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
@@ -30814,10 +30892,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       inputSegment.clear();
       yybegin(CDATA);
     }
-          }
-        case 102: break;
-        case 49: 
-          { inputSegment.clear();
+            } 
+            // fall through
+          case 102: break;
+          case 49: 
+            { inputSegment.clear();
     yybegin(YYINITIAL);
     // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
@@ -30837,10 +30916,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     }
     addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
-          }
-        case 103: break;
-        case 50: 
-          { inputSegment.clear();
+            } 
+            // fall through
+          case 103: break;
+          case 50: 
+            { inputSegment.clear();
     yybegin(YYINITIAL);
     // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
@@ -30860,10 +30940,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     }
     addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
-          }
-        case 104: break;
-        case 51: 
-          { // Handle paired UTF-16 surrogates.
+            } 
+            // fall through
+          case 104: break;
+          case 51: 
+            { // Handle paired UTF-16 surrogates.
     outputSegment = entitySegment;
     outputSegment.clear();
     String surrogatePair = yytext();
@@ -30888,10 +30969,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     inputSegment.clear();
     yybegin(YYINITIAL);
     return highSurrogate;
-          }
-        case 105: break;
-        case 52: 
-          { // Handle paired UTF-16 surrogates.
+            } 
+            // fall through
+          case 105: break;
+          case 52: 
+            { // Handle paired UTF-16 surrogates.
     String surrogatePair = yytext();
     char highSurrogate = '\u0000';
     char lowSurrogate = '\u0000';
@@ -30922,10 +31004,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     yypushback(surrogatePair.length() - 1); // Consume only '#'
     inputSegment.append('#');
     yybegin(NUMERIC_CHARACTER);
-          }
-        case 106: break;
-        case 53: 
-          { // Handle paired UTF-16 surrogates.
+            } 
+            // fall through
+          case 106: break;
+          case 53: 
+            { // Handle paired UTF-16 surrogates.
     String surrogatePair = yytext();
     char highSurrogate = '\u0000';
     try { // High surrogates are in decimal range [55296, 56319]
@@ -30955,10 +31038,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     yypushback(surrogatePair.length() - 1); // Consume only '#'
     inputSegment.append('#');
     yybegin(NUMERIC_CHARACTER);
-          }
-        case 107: break;
-        case 54: 
-          { // Handle paired UTF-16 surrogates.
+            } 
+            // fall through
+          case 107: break;
+          case 54: 
+            { // Handle paired UTF-16 surrogates.
     String surrogatePair = yytext();
     char highSurrogate = '\u0000';
     try { // High surrogates are in decimal range [55296, 56319]
@@ -30991,18 +31075,12 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     yypushback(surrogatePair.length() - 1); // Consume only '#'
     inputSegment.append('#');
     yybegin(NUMERIC_CHARACTER);
-          }
-        case 108: break;
-        default: 
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
-            zzDoEOF();
-              {   return eofReturnValue;
- }
-          } 
-          else {
+            } 
+            // fall through
+          case 108: break;
+          default:
             zzScanError(ZZ_NO_MATCH);
-          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
index d810d79..8b83de0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 @SuppressWarnings("fallthrough")
 %%
 
-%unicode 6.3
+%unicode 9.0
 %apiprivate
 %type int
 %final
@@ -50,6 +50,10 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 %xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
 %xstate STYLE, STYLE_COMMENT
 
+%init{
+  super(in);
+%init}
+
 // From XML 1.0 <http://www.w3.org/TR/xml/>:
 //
 //    [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
@@ -166,24 +170,14 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
   private TextSegment entitySegment = new TextSegment(2);
 
   /**
-   * Creates a new HTMLStripCharFilter over the provided Reader.
-   * @param source Reader to strip html tags from.
-   */
-  public HTMLStripCharFilter(Reader source) {
-    super(source);
-    this.zzReader = source;
-  }
-
-  /**
    * Creates a new HTMLStripCharFilter over the provided Reader
    * with the specified start and end tags.
-   * @param source Reader to strip html tags from.
+   * @param in Reader to strip html tags from.
    * @param escapedTags Tags in this set (both start and end tags)
    *  will not be filtered out.
    */
-  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
-    super(source);
-    this.zzReader = source;
+  public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
+    this(in);
     if (null != escapedTags) {
       for (String tag : escapedTags) {
         if (tag.equalsIgnoreCase("BR")) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
index 7e5105d..31d3d96 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -251,7 +251,7 @@ class ClassicTokenizerImpl {
 
   /* error messages for the codes above */
   private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
+    "Unknown internal scanner error",
     "Error: could not match input",
     "Error: pushback value was too large"
   };
@@ -323,11 +323,11 @@ class ClassicTokenizerImpl {
   private int yycolumn;
 
   /** 
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
    */
   private boolean zzAtBOL = true;
 
-  /** zzAtEOF == true <=> the scanner is at the EOF */
+  /** zzAtEOF == true iff the scanner is at the EOF */
   private boolean zzAtEOF;
 
   /** denotes if the user-EOF-code has already been executed */
@@ -436,28 +436,29 @@ public final void getText(CharTermAttribute t) {
     }
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead;           
-    int totalRead = 0;
-    while (totalRead < requested) {
-      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
-      if (numRead == -1) {
-        break;
-      }
-      totalRead += numRead;
-    }
+    int requested = zzBuffer.length - zzEndRead;
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
 
-    if (totalRead > 0) {
-      zzEndRead += totalRead;
-      if (totalRead == requested) { /* possibly more input available */
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      /* If numRead == requested, we might have requested too few chars to
+         encode a full Unicode character. We assume that a Reader would
+         otherwise never return half characters. */
+      if (numRead == requested) {
         if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
           --zzEndRead;
           zzFinalHighSurrogate = 1;
         }
       }
+      /* potentially more input available */
       return false;
     }
 
-    // totalRead = 0: End of stream
+    /* numRead < 0 ==> end of stream */
     return true;
   }
 
@@ -681,55 +682,65 @@ public final void getText(CharTermAttribute t) {
       // store back cached position
       zzMarkedPos = zzMarkedPosL;
 
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 1: 
-          { /* Break so we don't hit fall-through warning: */ break;/* ignore */
-          }
-        case 11: break;
-        case 2: 
-          { return ALPHANUM;
-          }
-        case 12: break;
-        case 3: 
-          { return CJ;
-          }
-        case 13: break;
-        case 4: 
-          { return HOST;
-          }
-        case 14: break;
-        case 5: 
-          { return NUM;
-          }
-        case 15: break;
-        case 6: 
-          { return APOSTROPHE;
-          }
-        case 16: break;
-        case 7: 
-          { return COMPANY;
-          }
-        case 17: break;
-        case 8: 
-          { return ACRONYM_DEP;
-          }
-        case 18: break;
-        case 9: 
-          { return ACRONYM;
-          }
-        case 19: break;
-        case 10: 
-          { return EMAIL;
-          }
-        case 20: break;
-        default: 
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
-            return YYEOF;
-          } 
-          else {
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
+        return YYEOF;
+      }
+      else {
+        switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+          case 1: 
+            { /* Break so we don't hit fall-through warning: */ break;/* ignore */
+            } 
+            // fall through
+          case 11: break;
+          case 2: 
+            { return ALPHANUM;
+            } 
+            // fall through
+          case 12: break;
+          case 3: 
+            { return CJ;
+            } 
+            // fall through
+          case 13: break;
+          case 4: 
+            { return HOST;
+            } 
+            // fall through
+          case 14: break;
+          case 5: 
+            { return NUM;
+            } 
+            // fall through
+          case 15: break;
+          case 6: 
+            { return APOSTROPHE;
+            } 
+            // fall through
+          case 16: break;
+          case 7: 
+            { return COMPANY;
+            } 
+            // fall through
+          case 17: break;
+          case 8: 
+            { return ACRONYM_DEP;
+            } 
+            // fall through
+          case 18: break;
+          case 9: 
+            { return ACRONYM;
+            } 
+            // fall through
+          case 19: break;
+          case 10: 
+            { return EMAIL;
+            } 
+            // fall through
+          case 20: break;
+          default:
             zzScanError(ZZ_NO_MATCH);
-          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
index 842ae51..65848f2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
@@ -32,33 +32,32 @@ import org.apache.lucene.util.AttributeFactory;
  * algorithm, as specified in 
  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
  * URLs and email addresses are also tokenized according to the relevant RFCs.
- * <p>
- * Tokens produced are of the following types:
- * <ul>
- *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
- *   <li>&lt;NUM&gt;: A number</li>
- *   <li>&lt;URL&gt;: A URL</li>
- *   <li>&lt;EMAIL&gt;: An email address</li>
- *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
- *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
- *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
- * </ul>
  */
 
 public final class UAX29URLEmailTokenizer extends Tokenizer {
   /** A private instance of the JFlex-constructed scanner */
   private final UAX29URLEmailTokenizerImpl scanner;
-  
-  public static final int ALPHANUM          = 0;
-  public static final int NUM               = 1;
-  public static final int SOUTHEAST_ASIAN   = 2;
-  public static final int IDEOGRAPHIC       = 3;
-  public static final int HIRAGANA          = 4;
-  public static final int KATAKANA          = 5;
-  public static final int HANGUL            = 6;
-  public static final int URL               = 7;
-  public static final int EMAIL             = 8;
+
+  /** Alpha/numeric token type */
+  public static final int ALPHANUM = 0;
+  /** Numeric token type */
+  public static final int NUM = 1;
+  /** Southeast Asian token type */
+  public static final int SOUTHEAST_ASIAN = 2;
+  /** Ideographic token type */
+  public static final int IDEOGRAPHIC = 3;
+  /** Hiragana token type */
+  public static final int HIRAGANA = 4;
+  /** Katakana token type */
+  public static final int KATAKANA = 5;
+  /** Hangul token type */
+  public static final int HANGUL = 6;
+  /** URL token type */
+  public static final int URL = 7;
+  /** Email token type */
+  public static final int EMAIL = 8;
+  /** Emoji token type. */
+  public static final int EMOJI = 9;
 
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -71,6 +70,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
     "<URL>",
     "<EMAIL>",
+    StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
   };
 
   /** Absolute maximum sized token */