You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:19 UTC
[05/24] lucene-solr:master: LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
index 8b288c2..a2ad394 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
  *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ *   <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
  * </ul>
  */
 @SuppressWarnings("fallthrough")
@@ -65,147 +66,212 @@ public final class StandardTokenizerImpl {
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
-    "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
-    "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
-    "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
-    "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
-    "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
-    "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
-    "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
-    "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
-    "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
-    "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
-    "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
-    "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
-    "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
-    "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
-    "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
-    "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
-    "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
-    "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
-    "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
-    "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
-    "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
-    "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
-    "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
-    "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
-    "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
-    "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
-    "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
-    "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
-    "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
-    "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
-    "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
-    "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
-    "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
-    "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
-    "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
-    "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
-    "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
-    "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
-    "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
-    "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
-    "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
-    "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
-    "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
-    "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
-    "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
-    "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
-    "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
-    "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
-    "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
-    "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
-    "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
-    "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
-    "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
-    "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
-    "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
-    "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
-    "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
-    "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
-    "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
-    "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
-    "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
-    "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
-    "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
-    "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
-    "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
-    "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
-    "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
-    "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
-    "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
-    "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
-    "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
-    "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
-    "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
-    "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
-    "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
-    "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
-    "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
-    "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
-    "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
-    "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
-    "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
-    "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
-    "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
-    "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
-    "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
-    "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
-    "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
-    "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
-    "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
-    "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
-    "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
-    "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
-    "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
-    "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
-    "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
-    "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
-    "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
-    "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
-    "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
-    "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
-    "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
-    "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
-    "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
-    "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
-    "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
-    "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
-    "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
-    "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
-    "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
-    "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
-    "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
-    "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
-    "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
-    "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
-    "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
-    "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
-    "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
-    "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
-    "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
-    "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
-    "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
-    "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
-    "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
-    "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
-    "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
-    "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
-    "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
-    "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
-    "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
-    "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
-    "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
-    "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
-    "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
-    "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
-    "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
-    "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
-    "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
-    "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
-    "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
-    "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
+    "\42\0\1\32\1\7\3\0\1\31\2\0\1\7\1\0\1\24\1\0"+
+    "\1\25\1\0\12\21\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
+    "\32\15\56\0\1\4\1\15\2\0\1\5\1\4\6\0\1\15\1\0"+
+    "\1\23\2\0\1\15\5\0\27\15\1\0\37\15\1\0\u01ca\15\4\0"+
+    "\14\15\5\0\1\23\10\0\5\15\7\0\1\15\1\0\1\15\21\0"+
+    "\160\5\5\15\1\0\2\15\2\0\4\15\1\24\1\15\6\0\1\15"+
+    "\1\23\3\15\1\0\1\15\1\0\24\15\1\0\123\15\1\0\213\15"+
+    "\1\0\7\5\246\15\1\0\46\15\2\0\1\15\7\0\47\15\1\0"+
+    "\1\24\7\0\55\5\1\0\1\5\1\0\2\5\1\0\2\5\1\0"+
+    "\1\5\10\0\33\33\5\0\3\33\1\15\1\23\13\0\6\5\6\0"+
+    "\2\24\2\0\13\5\1\0\1\5\3\0\53\15\25\5\12\20\1\0"+
+    "\1\20\1\24\1\0\2\15\1\5\143\15\1\0\1\15\10\5\1\0"+
+    "\6\5\2\15\2\5\1\0\4\5\2\15\12\20\3\15\2\0\1\15"+
+    "\17\0\1\5\1\15\1\5\36\15\33\5\2\0\131\15\13\5\1\15"+
+    "\16\0\12\20\41\15\11\5\2\15\2\0\1\24\1\0\1\15\5\0"+
+    "\26\15\4\5\1\15\11\5\1\15\3\5\1\15\5\5\22\0\31\15"+
+    "\3\5\104\0\25\15\1\0\10\15\26\0\60\5\66\15\3\5\1\15"+
+    "\22\5\1\15\7\5\12\15\2\5\2\0\12\20\1\0\20\15\3\5"+
+    "\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\1\15"+
+    "\3\0\4\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5"+
+    "\1\15\10\0\1\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20"+
+    "\2\15\17\0\3\5\1\0\6\15\4\0\2\15\2\0\26\15\1\0"+
+    "\7\15\1\0\2\15\1\0\2\15\1\0\2\15\2\0\1\5\1\0"+
+    "\5\5\4\0\2\5\2\0\3\5\3\0\1\5\7\0\4\15\1\0"+
+    "\1\15\7\0\12\20\2\5\3\15\1\5\13\0\3\5\1\0\11\15"+
+    "\1\0\3\15\1\0\26\15\1\0\7\15\1\0\2\15\1\0\5\15"+
+    "\2\0\1\5\1\15\10\5\1\0\3\5\1\0\3\5\2\0\1\15"+
+    "\17\0\2\15\2\5\2\0\12\20\11\0\1\15\7\0\3\5\1\0"+
+    "\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\2\15\1\0"+
+    "\5\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5\10\0"+
+    "\2\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20\1\0\1\15"+
+    "\20\0\1\5\1\15\1\0\6\15\3\0\3\15\1\0\4\15\3\0"+
+    "\2\15\1\0\1\15\1\0\2\15\3\0\2\15\3\0\3\15\3\0"+
+    "\14\15\4\0\5\5\3\0\3\5\1\0\4\5\2\0\1\15\6\0"+
+    "\1\5\16\0\12\20\20\0\4\5\1\0\10\15\1\0\3\15\1\0"+
+    "\27\15\1\0\20\15\3\0\1\15\7\5\1\0\3\5\1\0\4\5"+
+    "\7\0\2\5\1\0\3\15\5\0\2\15\2\5\2\0\12\20\20\0"+
+    "\1\15\3\5\1\0\10\15\1\0\3\15\1\0\27\15\1\0\12\15"+
+    "\1\0\5\15\2\0\1\5\1\15\7\5\1\0\3\5\1\0\4\5"+
+    "\7\0\2\5\7\0\1\15\1\0\2\15\2\5\2\0\12\20\1\0"+
+    "\2\15\16\0\3\5\1\0\10\15\1\0\3\15\1\0\51\15\2\0"+
+    "\1\15\7\5\1\0\3\5\1\0\4\5\1\15\5\0\3\15\1\5"+
+    "\7\0\3\15\2\5\2\0\12\20\12\0\6\15\2\0\2\5\1\0"+
+    "\22\15\3\0\30\15\1\0\11\15\1\0\1\15\2\0\7\15\3\0"+
+    "\1\5\4\0\6\5\1\0\1\5\1\0\10\5\6\0\12\20\2\0"+
+    "\2\5\15\0\60\34\1\35\2\34\7\35\5\0\7\34\10\35\1\0"+
+    "\12\20\47\0\2\34\1\0\1\34\2\0\2\34\1\0\1\34\2\0"+
+    "\1\34\6\0\4\34\1\0\7\34\1\0\3\34\1\0\1\34\1\0"+
+    "\1\34\2\0\2\34\1\0\4\34\1\35\2\34\6\35\1\0\2\35"+
+    "\1\34\2\0\5\34\1\0\1\34\1\0\6\35\2\0\12\20\2\0"+
+    "\4\34\40\0\1\15\27\0\2\5\6\0\12\20\13\0\1\5\1\0"+
+    "\1\5\1\0\1\5\4\0\2\5\10\15\1\0\44\15\4\0\24\5"+
+    "\1\0\2\5\5\15\13\5\1\0\44\5\11\0\1\5\71\0\53\34"+
+    "\24\35\1\34\12\20\6\0\6\34\4\35\4\34\3\35\1\34\3\35"+
+    "\2\34\7\35\3\34\4\35\15\34\14\35\1\34\1\35\12\20\4\35"+
+    "\2\34\46\15\1\0\1\15\5\0\1\15\2\0\53\15\1\0\4\15"+
+    "\u0100\17\111\15\1\0\4\15\2\0\7\15\1\0\1\15\1\0\4\15"+
+    "\2\0\51\15\1\0\4\15\2\0\41\15\1\0\4\15\2\0\7\15"+
+    "\1\0\1\15\1\0\4\15\2\0\17\15\1\0\71\15\1\0\4\15"+
+    "\2\0\103\15\2\0\3\5\40\0\20\15\20\0\126\15\2\0\6\15"+
+    "\3\0\u026c\15\2\0\21\15\1\0\32\15\5\0\113\15\3\0\13\15"+
+    "\7\0\15\15\1\0\4\15\3\5\13\0\22\15\3\5\13\0\22\15"+
+    "\2\5\14\0\15\15\1\0\3\15\1\0\2\5\14\0\64\34\40\35"+
+    "\3\0\1\34\4\0\1\34\1\35\2\0\12\20\41\0\4\5\1\0"+
+    "\12\20\6\0\130\15\10\0\5\15\2\5\42\15\1\5\1\15\5\0"+
+    "\106\15\12\0\37\15\1\0\14\5\4\0\14\5\12\0\12\20\36\34"+
+    "\2\0\5\34\13\0\54\34\4\0\32\34\6\0\12\20\1\34\3\0"+
+    "\2\34\40\0\27\15\5\5\4\0\65\34\12\35\1\0\35\35\2\0"+
+    "\1\5\12\20\6\0\12\20\6\0\16\34\2\0\17\5\101\0\5\5"+
+    "\57\15\21\5\7\15\4\0\12\20\21\0\11\5\14\0\3\5\36\15"+
+    "\15\5\2\15\12\20\54\15\16\5\14\0\44\15\24\5\10\0\12\20"+
+    "\3\0\3\15\12\20\44\15\2\0\11\15\107\0\3\5\1\0\25\5"+
+    "\4\15\1\5\4\15\3\5\2\15\1\0\2\5\6\0\300\15\66\5"+
+    "\5\0\5\5\u0116\15\2\0\6\15\2\0\46\15\2\0\6\15\2\0"+
+    "\10\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\37\15\2\0"+
+    "\65\15\1\0\7\15\1\0\1\15\3\0\3\15\1\0\7\15\3\0"+
+    "\4\15\2\0\6\15\4\0\15\15\5\0\3\15\1\0\7\15\17\0"+
+    "\1\5\1\12\2\5\10\0\2\25\12\0\1\25\2\0\1\23\2\0"+
+    "\5\5\1\26\14\0\1\4\2\0\2\26\3\0\1\24\4\0\1\4"+
+    "\12\0\1\26\13\0\5\5\1\0\12\5\1\0\1\15\15\0\1\15"+
+    "\20\0\15\15\63\0\23\5\1\10\15\5\21\0\1\15\4\0\1\15"+
+    "\2\0\12\15\1\0\1\15\3\0\5\15\4\0\1\4\1\0\1\15"+
+    "\1\0\1\15\1\0\1\15\1\0\4\15\1\0\12\15\1\16\2\0"+
+    "\4\15\5\0\5\15\4\0\1\15\21\0\51\15\13\0\6\4\17\0"+
+    "\2\4\u016f\0\2\4\14\0\1\4\137\0\1\4\106\0\1\4\31\0"+
+    "\13\4\4\0\3\4\273\0\14\15\1\16\47\15\300\0\2\4\12\0"+
+    "\1\4\11\0\1\4\72\0\4\4\1\0\5\4\1\4\1\0\7\4"+
+    "\1\4\2\4\1\4\1\4\1\0\2\4\2\4\1\4\4\4\1\3"+
+    "\2\4\1\4\1\4\2\4\2\4\1\4\3\4\1\4\3\4\2\4"+
+    "\10\4\3\4\5\4\1\4\1\4\1\4\5\4\14\4\13\4\2\4"+
+    "\2\4\1\4\1\4\2\4\1\4\1\4\22\4\1\4\2\4\2\4"+
+    "\6\4\12\0\2\4\6\4\1\4\1\4\1\4\2\4\3\4\2\4"+
+    "\10\4\2\4\4\4\2\4\13\4\2\4\5\4\2\4\2\4\1\4"+
+    "\5\4\2\4\1\4\1\4\1\4\2\4\24\4\2\4\5\4\6\4"+
+    "\1\4\2\4\1\3\1\4\2\4\1\4\4\4\1\4\2\4\1\4"+
+    "\2\0\2\4\4\3\1\4\1\4\2\4\1\4\1\0\1\4\1\0"+
+    "\1\4\6\0\1\4\3\0\1\4\6\0\1\4\12\0\2\4\17\0"+
+    "\1\4\2\0\1\4\4\0\1\4\1\0\1\4\4\0\3\4\1\0"+
+    "\1\4\13\0\2\4\3\4\55\0\3\4\11\0\1\4\16\0\1\4"+
+    "\16\0\1\4\u0174\0\2\4\u01cf\0\3\4\23\0\2\4\63\0\1\4"+
+    "\4\0\1\4\252\0\57\15\1\0\57\15\1\0\205\15\6\0\4\15"+
+    "\3\5\2\15\14\0\46\15\1\0\1\15\5\0\1\15\2\0\70\15"+
+    "\7\0\1\15\17\0\1\5\27\15\11\0\7\15\1\0\7\15\1\0"+
+    "\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0"+
+    "\7\15\1\0\40\5\57\0\1\15\120\0\32\27\1\0\131\27\14\0"+
+    "\326\27\57\0\1\15\1\0\1\27\31\0\11\27\6\5\1\4\5\22"+
+    "\2\0\3\27\1\15\1\15\1\4\3\0\126\30\2\0\2\5\2\22"+
+    "\3\30\133\22\1\0\4\22\5\0\51\15\3\0\136\17\21\0\33\15"+
+    "\65\0\20\22\227\0\1\4\1\0\1\4\66\0\57\22\1\0\130\22"+
+    "\250\0\u19b6\27\112\0\u51d6\27\52\0\u048d\15\103\0\56\15\2\0\u010d\15"+
+    "\3\0\20\15\12\20\2\15\24\0\57\15\4\5\1\0\12\5\1\0"+
+    "\37\15\2\5\120\15\2\5\45\0\11\15\2\0\147\15\2\0\44\15"+
+    "\1\0\10\15\77\0\13\15\1\5\3\15\1\5\4\15\1\5\27\15"+
+    "\5\5\30\0\64\15\14\0\2\5\62\15\22\5\12\0\12\20\6\0"+
+    "\22\5\6\15\3\0\1\15\1\0\1\15\2\0\12\20\34\15\10\5"+
+    "\2\0\27\15\15\5\14\0\35\17\3\0\4\5\57\15\16\5\16\0"+
+    "\1\15\12\20\6\0\5\34\1\35\12\34\12\20\5\34\1\0\51\15"+
+    "\16\5\11\0\3\15\1\5\10\15\2\5\2\0\12\20\6\0\33\34"+
+    "\3\35\62\34\1\35\1\34\3\35\2\34\2\35\5\34\2\35\1\34"+
+    "\1\35\1\34\30\0\5\34\13\15\5\5\2\0\3\15\2\5\12\0"+
+    "\6\15\2\0\6\15\2\0\6\15\11\0\7\15\1\0\7\15\1\0"+
+    "\53\15\1\0\12\15\12\0\163\15\10\5\1\0\2\5\2\0\12\20"+
+    "\6\0\u2ba4\17\14\0\27\17\4\0\61\17\u2104\0\u016e\27\2\0\152\27"+
+    "\46\0\7\15\14\0\5\15\5\0\1\33\1\5\12\33\1\0\15\33"+
+    "\1\0\5\33\1\0\1\33\1\0\2\33\1\0\2\33\1\0\12\33"+
+    "\142\15\41\0\u016b\15\22\0\100\15\2\0\66\15\50\0\14\15\4\0"+
+    "\16\5\1\6\1\11\1\24\2\0\1\23\1\24\13\0\20\5\3\0"+
+    "\2\26\30\0\3\26\1\24\1\0\1\25\1\0\1\24\1\23\32\0"+
+    "\5\15\1\0\207\15\2\0\1\5\7\0\1\25\4\0\1\24\1\0"+
+    "\1\25\1\0\12\20\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
+    "\32\15\13\0\70\22\2\5\37\17\3\0\6\17\2\0\6\17\2\0"+
+    "\6\17\2\0\3\17\34\0\3\5\4\0\14\15\1\0\32\15\1\0"+
+    "\23\15\1\0\2\15\1\0\17\15\2\0\16\15\42\0\173\15\105\0"+
+    "\65\15\210\0\1\5\202\0\35\15\3\0\61\15\17\0\1\5\37\0"+
+    "\40\15\20\0\33\15\5\0\46\15\5\5\5\0\36\15\2\0\44\15"+
+    "\4\0\10\15\1\0\5\15\52\0\236\15\2\0\12\20\6\0\44\15"+
+    "\4\0\44\15\4\0\50\15\10\0\64\15\234\0\u0137\15\11\0\26\15"+
+    "\12\0\10\15\230\0\6\15\2\0\1\15\1\0\54\15\1\0\2\15"+
+    "\3\0\1\15\2\0\27\15\12\0\27\15\11\0\37\15\101\0\23\15"+
+    "\1\0\2\15\12\0\26\15\12\0\32\15\106\0\70\15\6\0\2\15"+
+    "\100\0\1\15\3\5\1\0\2\5\5\0\4\5\4\15\1\0\3\15"+
+    "\1\0\33\15\4\0\3\5\4\0\1\5\40\0\35\15\3\0\35\15"+
+    "\43\0\10\15\1\0\34\15\2\5\31\0\66\15\12\0\26\15\12\0"+
+    "\23\15\15\0\22\15\156\0\111\15\67\0\63\15\15\0\63\15\u030d\0"+
+    "\3\5\65\15\17\5\37\0\12\20\17\0\4\5\55\15\13\5\2\0"+
+    "\1\5\22\0\31\15\7\0\12\20\6\0\3\5\44\15\16\5\1\0"+
+    "\12\20\20\0\43\15\1\5\2\0\1\15\11\0\3\5\60\15\16\5"+
+    "\4\15\5\0\3\5\3\0\12\20\1\15\1\0\1\15\43\0\22\15"+
+    "\1\0\31\15\14\5\6\0\1\5\101\0\7\15\1\0\1\15\1\0"+
+    "\4\15\1\0\17\15\1\0\12\15\7\0\57\15\14\5\5\0\12\20"+
+    "\6\0\4\5\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15"+
+    "\1\0\2\15\1\0\5\15\2\0\1\5\1\15\7\5\2\0\2\5"+
+    "\2\0\3\5\2\0\1\15\6\0\1\5\5\0\5\15\2\5\2\0"+
+    "\7\5\3\0\5\5\213\0\65\15\22\5\4\15\5\0\12\20\46\0"+
+    "\60\15\24\5\2\15\1\0\1\15\10\0\12\20\246\0\57\15\7\5"+
+    "\2\0\11\5\27\0\4\15\2\5\42\0\60\15\21\5\3\0\1\15"+
+    "\13\0\12\20\46\0\53\15\15\5\10\0\12\20\66\0\32\34\3\0"+
+    "\17\35\4\0\12\20\2\34\3\0\1\34\u0160\0\100\15\12\20\25\0"+
+    "\1\15\u01c0\0\71\15\u0107\0\11\15\1\0\45\15\10\5\1\0\10\5"+
+    "\1\15\17\0\12\20\30\0\36\15\2\0\26\5\1\0\16\5\u0349\0"+
+    "\u039a\15\146\0\157\15\21\0\304\15\u0abc\0\u042f\15\u0fd1\0\u0247\15\u21b9\0"+
+    "\u0239\15\7\0\37\15\1\0\12\20\146\0\36\15\2\0\5\5\13\0"+
+    "\60\15\7\5\11\0\4\15\14\0\12\20\11\0\25\15\5\0\23\15"+
+    "\u0370\0\105\15\13\0\1\15\56\5\20\0\4\5\15\15\100\0\1\15"+
+    "\u401f\0\1\22\1\30\u0bfe\0\153\15\5\0\15\15\3\0\11\15\7\0"+
+    "\12\15\3\0\2\5\1\0\4\5\u14c1\0\5\5\3\0\26\5\2\0"+
+    "\7\5\36\0\4\5\224\0\3\5\u01bb\0\125\15\1\0\107\15\1\0"+
+    "\2\15\2\0\1\15\2\0\2\15\2\0\4\15\1\0\14\15\1\0"+
+    "\1\15\1\0\7\15\1\0\101\15\1\0\4\15\2\0\10\15\1\0"+
+    "\7\15\1\0\34\15\1\0\4\15\1\0\5\15\1\0\1\15\3\0"+
+    "\7\15\1\0\u0154\15\2\0\31\15\1\0\31\15\1\0\37\15\1\0"+
+    "\31\15\1\0\37\15\1\0\31\15\1\0\37\15\1\0\31\15\1\0"+
+    "\37\15\1\0\31\15\1\0\10\15\2\0\62\20\u0200\0\67\5\4\0"+
+    "\62\5\10\0\1\5\16\0\1\5\26\0\5\5\1\0\17\5\u0550\0"+
+    "\7\5\1\0\21\5\2\0\7\5\1\0\2\5\1\0\5\5\u07d5\0"+
+    "\305\15\13\0\7\5\51\0\104\15\7\5\5\0\12\20\u04a6\0\4\15"+
+    "\1\0\33\15\1\0\2\15\1\0\1\15\2\0\1\15\1\0\12\15"+
+    "\1\0\4\15\1\0\1\15\1\0\1\15\6\0\1\15\4\0\1\15"+
+    "\1\0\1\15\1\0\1\15\1\0\3\15\1\0\2\15\1\0\1\15"+
+    "\2\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15"+
+    "\1\0\2\15\1\0\1\15\2\0\4\15\1\0\7\15\1\0\4\15"+
+    "\1\0\4\15\1\0\1\15\1\0\12\15\1\0\21\15\5\0\3\15"+
+    "\1\0\5\15\1\0\21\15\u0144\0\4\4\1\4\312\4\1\4\60\4"+
+    "\15\0\3\4\37\0\1\4\32\15\6\0\32\15\2\0\4\4\2\16"+
+    "\14\15\2\16\12\15\4\0\1\4\2\0\12\4\22\0\71\4\32\1"+
+    "\1\30\2\4\15\4\12\0\1\4\24\0\1\4\2\0\11\4\1\0"+
+    "\4\4\11\0\7\4\2\4\256\4\42\4\2\4\141\4\1\3\16\4"+
+    "\2\4\2\4\1\4\3\4\2\4\44\4\3\3\2\4\1\3\2\4"+
+    "\3\3\44\4\2\4\3\4\1\4\4\4\5\2\102\4\2\3\2\4"+
+    "\13\3\25\4\4\3\4\4\1\3\1\4\11\3\3\4\1\3\4\4"+
+    "\3\3\1\4\3\3\42\4\1\3\123\4\1\4\77\4\10\0\3\4"+
+    "\6\4\1\4\30\4\7\4\2\4\2\4\1\4\2\3\4\4\1\3"+
+    "\14\4\1\4\2\4\4\4\2\4\1\3\4\4\2\3\15\4\2\4"+
+    "\2\4\1\4\10\4\2\4\11\4\1\4\5\4\3\4\14\4\3\4"+
+    "\10\4\3\4\2\4\1\4\1\4\1\4\4\4\1\4\6\4\1\4"+
+    "\3\4\1\4\6\4\113\4\3\3\3\4\5\3\60\0\43\4\1\3"+
+    "\20\4\3\3\11\4\1\3\5\4\5\4\1\4\1\3\6\4\15\4"+
+    "\6\4\3\4\1\4\1\4\2\4\3\4\1\4\2\4\7\4\6\4"+
+    "\164\0\14\4\125\0\53\4\14\0\4\4\70\0\10\4\12\0\6\4"+
+    "\50\0\10\4\36\0\122\4\14\0\4\4\10\4\5\3\1\4\2\3"+
+    "\6\4\1\3\11\4\12\3\1\4\1\0\1\4\2\3\1\4\6\4"+
+    "\1\0\52\4\2\4\4\4\3\4\1\4\1\4\47\4\15\4\5\4"+
+    "\2\3\1\4\2\3\6\4\3\4\15\4\1\4\15\3\42\4\u05fe\4"+
+    "\2\0\ua6d7\27\51\0\u1035\27\13\0\336\27\2\0\u1682\27\u295e\0\u021e\27"+
+    "\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
+    "\1\5\36\0\137\13\1\14\200\0\360\5\uffff\0\uffff\0\ufe12\0";
 
   /** 
    * Translates characters to character classes
@@ -218,12 +284,15 @@ public final class StandardTokenizerImpl {
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
-    "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
-    "\1\4\1\0\2\2\2\0\1\1\1\0";
+    "\1\0\2\1\3\2\2\1\1\3\1\2\1\4\2\5"+
+    "\1\6\1\1\1\7\1\10\1\3\1\11\1\2\1\0"+
+    "\4\2\1\0\1\2\2\0\1\3\1\0\1\3\2\2"+
+    "\1\0\1\5\1\2\1\5\1\0\2\3\1\0\2\2"+
+    "\2\0\1\2\1\0\2\3\5\2\1\0\1\2\1\3"+
+    "\3\2";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[24];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -248,12 +317,17 @@ public final class StandardTokenizerImpl {
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
-    "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
-    "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
+    "\0\0\0\36\0\74\0\132\0\170\0\226\0\264\0\322"+
+    "\0\360\0\u010e\0\u012c\0\u014a\0\u0168\0\u0186\0\u01a4\0\u01c2"+
+    "\0\u01e0\0\u01fe\0\u021c\0\u023a\0\74\0\u0258\0\u0276\0\u0294"+
+    "\0\u02b2\0\264\0\u02d0\0\u02ee\0\322\0\u030c\0\u032a\0\u0348"+
+    "\0\u0366\0\u0384\0\u03a2\0\u03c0\0\u03de\0\u03fc\0\u01a4\0\u041a"+
+    "\0\u0438\0\u0456\0\u0474\0\u0492\0\u04b0\0\u04ce\0\u04ec\0\u050a"+
+    "\0\u0528\0\u0546\0\u0564\0\u0582\0\u05a0\0\u05be\0\u05dc\0\u05fa"+
+    "\0\36\0\u0618\0\360\0\u0636\0\u0654";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[24];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -276,33 +350,94 @@ public final class StandardTokenizerImpl {
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
-    "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
-    "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
-    "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
-    "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
-    "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
-    "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
-    "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
-    "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
-    "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
-    "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
-    "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
-    "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
-    "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
-    "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
-    "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
-    "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
-    "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
-    "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
-    "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
-    "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
-    "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
-    "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
-    "\1\30\1\15\14\0\1\30";
+    "\1\2\1\3\1\4\1\5\1\6\2\2\1\7\2\2"+
+    "\1\10\2\2\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\3\2\1\17\1\20\1\21\2\2\1\22\2\23\37\0"+
+    "\1\24\3\0\2\25\1\0\5\25\20\0\1\25\5\0"+
+    "\1\4\2\0\1\4\1\0\1\26\2\4\20\0\1\4"+
+    "\2\0\1\4\2\0\1\5\2\0\1\5\1\27\1\30"+
+    "\2\5\20\0\1\5\5\0\1\6\2\0\1\6\1\27"+
+    "\1\31\2\6\20\0\1\6\5\0\1\32\2\0\1\33"+
+    "\1\34\3\32\20\0\1\32\3\0\1\5\1\6\5\0"+
+    "\1\35\3\0\1\6\24\0\2\11\1\0\10\11\2\36"+
+    "\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
+    "\1\22\1\0\1\11\5\0\1\12\1\11\1\0\1\12"+
+    "\1\41\1\42\2\12\3\11\2\36\1\0\1\37\1\0"+
+    "\1\37\1\40\2\0\1\37\1\0\1\22\1\0\1\12"+
+    "\5\0\2\13\1\0\5\13\2\11\1\13\2\36\1\0"+
+    "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+    "\1\0\1\13\5\0\2\14\1\0\5\14\3\11\2\14"+
+    "\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
+    "\1\14\5\0\1\15\1\14\1\0\1\45\1\46\3\15"+
+    "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+    "\1\22\1\0\1\15\5\0\2\16\1\0\5\16\5\0"+
+    "\1\16\3\0\1\40\6\0\1\16\5\0\2\47\1\0"+
+    "\5\47\3\11\2\14\1\50\3\0\1\47\4\0\1\22"+
+    "\1\0\1\47\5\0\2\20\1\0\5\20\20\0\1\20"+
+    "\5\0\2\21\1\0\5\21\20\0\1\21\5\0\2\22"+
+    "\1\0\5\22\3\11\2\36\1\0\1\37\1\0\1\37"+
+    "\1\40\2\0\1\51\1\52\1\22\1\0\1\22\5\0"+
+    "\2\23\1\0\5\23\17\0\2\23\5\0\2\24\1\0"+
+    "\5\24\20\0\1\24\2\0\1\4\1\53\1\54\1\4"+
+    "\2\0\1\4\1\0\1\26\2\4\1\0\1\54\16\0"+
+    "\1\4\12\0\1\55\1\56\24\0\1\4\1\53\1\54"+
+    "\1\5\2\0\1\5\1\27\1\30\2\5\1\0\1\54"+
+    "\16\0\1\5\2\0\1\4\1\53\1\54\1\6\2\0"+
+    "\1\6\1\27\1\31\2\6\1\0\1\54\16\0\1\6"+
+    "\5\0\1\33\2\0\1\33\1\34\3\33\20\0\1\33"+
+    "\10\0\1\57\32\0\2\36\1\0\5\36\3\11\2\36"+
+    "\2\0\2\60\1\40\2\0\1\60\1\0\1\22\1\0"+
+    "\1\36\5\0\2\37\1\0\5\37\3\11\13\0\1\11"+
+    "\1\0\1\37\5\0\2\40\1\0\5\40\3\11\2\36"+
+    "\1\50\3\0\1\40\4\0\1\22\1\0\1\40\5\0"+
+    "\2\11\1\0\2\11\1\61\1\62\4\11\2\36\1\0"+
+    "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+    "\1\0\1\11\2\0\1\4\1\53\1\54\1\12\1\11"+
+    "\1\0\1\12\1\41\1\42\2\12\1\11\1\63\1\11"+
+    "\2\36\1\0\1\37\1\0\1\37\1\40\2\0\1\37"+
+    "\1\0\1\22\1\0\1\12\5\0\2\43\1\0\5\43"+
+    "\3\0\2\14\13\0\1\43\5\0\2\44\1\0\5\44"+
+    "\3\11\2\14\1\50\3\0\1\44\4\0\1\22\1\0"+
+    "\1\44\5\0\1\45\1\14\1\0\1\45\1\46\3\45"+
+    "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+    "\1\22\1\0\1\45\5\0\2\14\1\0\1\64\4\14"+
+    "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+    "\1\22\1\0\1\14\5\0\2\50\1\0\5\50\5\0"+
+    "\1\50\3\0\1\40\6\0\1\50\5\0\2\51\1\0"+
+    "\5\51\3\11\2\36\4\0\1\40\4\0\1\22\1\0"+
+    "\1\51\5\0\2\52\1\0\5\52\16\0\1\51\1\0"+
+    "\1\52\2\0\1\4\2\0\1\53\2\0\1\53\1\65"+
+    "\1\66\2\53\20\0\1\53\5\0\1\54\2\0\1\54"+
+    "\1\65\1\67\2\54\20\0\1\54\2\0\1\4\1\53"+
+    "\1\54\5\0\1\70\3\0\1\54\32\0\1\56\1\71"+
+    "\26\0\1\57\2\0\1\57\1\0\3\57\20\0\1\57"+
+    "\5\0\2\60\1\0\5\60\3\0\2\36\13\0\1\60"+
+    "\2\0\1\4\1\53\1\54\2\11\1\0\2\11\1\72"+
+    "\3\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
+    "\1\40\2\0\1\37\1\0\1\22\1\0\1\11\5\0"+
+    "\2\11\1\0\3\11\1\62\1\73\3\11\2\36\1\0"+
+    "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+    "\1\0\1\11\5\0\1\63\1\11\1\0\1\63\1\74"+
+    "\1\75\2\63\3\11\2\36\1\0\1\37\1\0\1\37"+
+    "\1\40\2\0\1\37\1\0\1\22\1\0\1\63\5\0"+
+    "\1\64\1\14\1\0\1\64\1\14\3\64\3\11\2\14"+
+    "\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
+    "\1\64\12\0\1\55\25\0\1\4\1\53\1\54\1\53"+
+    "\2\0\1\53\1\65\1\66\2\53\1\0\1\54\16\0"+
+    "\1\53\2\0\1\4\1\53\2\54\2\0\1\54\1\65"+
+    "\1\67\2\54\1\0\1\54\16\0\1\54\3\0\1\53"+
+    "\1\54\5\0\1\70\3\0\1\54\22\0\1\53\1\54"+
+    "\2\11\1\0\2\11\1\72\3\11\1\63\1\11\2\36"+
+    "\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
+    "\1\22\1\0\1\11\5\0\2\11\1\0\2\11\1\61"+
+    "\5\11\2\36\1\0\1\37\1\0\1\37\1\40\2\0"+
+    "\1\37\1\0\1\22\1\0\1\11\2\0\1\4\1\53"+
+    "\1\54\1\63\1\11\1\0\1\63\1\74\1\75\2\63"+
+    "\1\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
+    "\1\40\2\0\1\37\1\0\1\22\1\0\1\63";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[396];
+    int [] result = new int[1650];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -329,7 +464,7 @@ public final class StandardTokenizerImpl {
 
   /* error messages for the codes above */
   private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
+    "Unknown internal scanner error",
     "Error: could not match input",
     "Error: pushback value was too large"
   };
@@ -340,11 +475,12 @@ public final class StandardTokenizerImpl {
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
-    "\2\1\2\0\1\1\1\0";
+    "\1\0\1\11\22\1\1\0\4\1\1\0\1\1\2\0"+
+    "\1\1\1\0\3\1\1\0\3\1\1\0\2\1\1\0"+
+    "\2\1\2\0\1\1\1\0\7\1\1\0\1\11\4\1";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[24];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -401,11 +537,11 @@ public final class StandardTokenizerImpl {
   private int yycolumn;
 
   /** 
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
    */
   private boolean zzAtBOL = true;
 
-  /** zzAtEOF == true <=> the scanner is at the EOF */
+  /** zzAtEOF == true iff the scanner is at the EOF */
   private boolean zzAtEOF;
 
   /** denotes if the user-EOF-code has already been executed */
@@ -447,6 +583,9 @@ public final class StandardTokenizerImpl {
 
   /** Hangul token type */
   public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+  
+  /** Emoji token type */
+  public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
 
   /** Character count processed so far */
   public final int yychar()
@@ -492,7 +631,7 @@ public final class StandardTokenizerImpl {
     char [] map = new char[0x110000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 2836) {
+    while (i < 4122) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -500,6 +639,8 @@ public final class StandardTokenizerImpl {
     return map;
   }
 
+/* -------------------------------------------------------------------------------- */
+/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
 
   /**
    * Refills the input buffer.
@@ -527,32 +668,45 @@ public final class StandardTokenizerImpl {
 
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;           
-    int totalRead = 0;
-    while (totalRead < requested) {
-      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
-      if (numRead == -1) {
-        break;
-      }
-      totalRead += numRead;
+    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
+    if (requested == 0) {
+      return true;
     }
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
 
-    if (totalRead > 0) {
-      zzEndRead += totalRead;
-      if (totalRead == requested) { /* possibly more input available */
-        if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+        if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
           --zzEndRead;
           zzFinalHighSurrogate = 1;
-          if (totalRead == 1) { return true; }
+          if (numRead == 1) {
+            return true;
+          }
+        } else {                    // There is room in the buffer for at least one more char
+          int c = zzReader.read();  // Expecting to read a low surrogate char
+          if (c == -1) {
+            return true;
+          } else {
+            zzBuffer[zzEndRead++] = (char)c;
+            return false;
+          }
         }
       }
+      /* potentially more input available */
       return false;
     }
 
-    // totalRead = 0: End of stream
+    /* numRead < 0 ==> end of stream */
     return true;
   }
 
+/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+/* ------------------------------------------------------------------------------ */
     
   /**
    * Closes the input stream.
@@ -773,49 +927,62 @@ public final class StandardTokenizerImpl {
       // store back cached position
       zzMarkedPos = zzMarkedPosL;
 
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 1: 
-          { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
-          }
-        case 9: break;
-        case 2: 
-          { return WORD_TYPE;
-          }
-        case 10: break;
-        case 3: 
-          { return HANGUL_TYPE;
-          }
-        case 11: break;
-        case 4: 
-          { return NUMERIC_TYPE;
-          }
-        case 12: break;
-        case 5: 
-          { return KATAKANA_TYPE;
-          }
-        case 13: break;
-        case 6: 
-          { return IDEOGRAPHIC_TYPE;
-          }
-        case 14: break;
-        case 7: 
-          { return HIRAGANA_TYPE;
-          }
-        case 15: break;
-        case 8: 
-          { return SOUTH_EAST_ASIAN_TYPE;
-          }
-        case 16: break;
-        default: 
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
               {
                 return YYEOF;
               }
-          } 
-          else {
+      }
+      else {
+        switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+          case 1: 
+            { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */
+            } 
+            // fall through
+          case 10: break;
+          case 2: 
+            { return EMOJI_TYPE;
+            } 
+            // fall through
+          case 11: break;
+          case 3: 
+            { return WORD_TYPE;
+            } 
+            // fall through
+          case 12: break;
+          case 4: 
+            { return HANGUL_TYPE;
+            } 
+            // fall through
+          case 13: break;
+          case 5: 
+            { return NUMERIC_TYPE;
+            } 
+            // fall through
+          case 14: break;
+          case 6: 
+            { return KATAKANA_TYPE;
+            } 
+            // fall through
+          case 15: break;
+          case 7: 
+            { return IDEOGRAPHIC_TYPE;
+            } 
+            // fall through
+          case 16: break;
+          case 8: 
+            { return HIRAGANA_TYPE;
+            } 
+            // fall through
+          case 17: break;
+          case 9: 
+            { return SOUTH_EAST_ASIAN_TYPE;
+            } 
+            // fall through
+          case 18: break;
+          default:
             zzScanError(ZZ_NO_MATCH);
-          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
index a1e7b17..e95a9b4 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -34,12 +34,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
  *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ *   <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
  * </ul>
  */
 @SuppressWarnings("fallthrough")
 %%
 
-%unicode 6.3
+%unicode 9.0
 %integer
 %final
 %public
@@ -48,22 +49,67 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %char
 %buffer 255
 
-// UAX#29 WB4. X (Extend | Format)* --> X
-//
-HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
-HebrewOrALetterEx   = [\p{WB:HebrewLetter}\p{WB:ALetter}]                       [\p{WB:Format}\p{WB:Extend}]*
-NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx          = \p{WB:Katakana}                                           [\p{WB:Format}\p{WB:Extend}]* 
-MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      [\p{WB:Format}\p{WB:Extend}]* 
-MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       [\p{WB:Format}\p{WB:Extend}]*
-HanEx               = \p{Script:Han}                                            [\p{WB:Format}\p{WB:Extend}]*
-HiraganaEx          = \p{Script:Hiragana}                                       [\p{WB:Format}\p{WB:Extend}]*
-SingleQuoteEx       = \p{WB:Single_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
-DoubleQuoteEx       = \p{WB:Double_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
-HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      [\p{WB:Format}\p{WB:Extend}]*
-RegionalIndicatorEx = \p{WB:RegionalIndicator}                                  [\p{WB:Format}\p{WB:Extend}]*
-ComplexContextEx    = \p{LB:Complex_Context}                                    [\p{WB:Format}\p{WB:Extend}]*
+
+//////////////////////////////////////////////////////////////////////////
+// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
+
+// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
+%include ../../../../../../data/jflex/UnicodeEmojiProperties.jflex
+
+// UAX#29 WB4.  X (Extend | Format | ZWJ)* --> X
+//
+//   \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
+//   - are explicitly excluded here so that we can properly handle Emoji sequences.
+//
+ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
+
+KeyCapBaseChar = [0-9#*]
+KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
+KeyCap = \u20E3
+KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
+
+// \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
+AccidentalEmoji = [©®™\u3030\u303D]
+EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
+
+// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
+// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
+// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
+EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
+
+EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
+
+EmojiCharEx         = {EmojiChar}           {ExtFmtZwjSansPresSel}
+EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
+EmojiModifierEx     = {Emoji_Modifier}      {ExtFmtZwjSansPresSel}
+
+EmojiPresentationSelector = \uFE0F
+EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
+TagSpec = [\u{E0020}-\u{E007E}]
+TagTerm = \u{E007F}
+
+// End Emoji Macros
+//////////////////////////////////////////////////////////////////////////
+
+
+// UAX#29 WB4.  X (Extend | Format | ZWJ)* --> X
+//
+ExtFmtZwj           = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
+AHLetterEx          = [\p{WB:ALetter}\p{WB:Hebrew_Letter}]                      {ExtFmtZwj}
+NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        {ExtFmtZwj}
+KatakanaEx          = \p{WB:Katakana}                                           {ExtFmtZwj} 
+MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      {ExtFmtZwj} 
+MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         {ExtFmtZwj}
+ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       {ExtFmtZwj}
+HanEx               = \p{Script:Han}                                            {ExtFmtZwj}
+HiraganaEx          = \p{Script:Hiragana}                                       {ExtFmtZwj}
+SingleQuoteEx       = \p{WB:Single_Quote}                                       {ExtFmtZwj}
+DoubleQuoteEx       = \p{WB:Double_Quote}                                       {ExtFmtZwj}
+HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      {ExtFmtZwj}
+RegionalIndicatorEx = \p{WB:Regional_Indicator}                                 {ExtFmtZwj}
+ComplexContextEx    = \p{LB:Complex_Context}                                    {ExtFmtZwj}
 
 %{
   /** Alphanumeric sequences */
@@ -93,6 +139,9 @@ ComplexContextEx    = \p{LB:Complex_Context}
 
   /** Hangul token type */
   public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+  
+  /** Emoji token type */
+  public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
 
   /** Character count processed so far */
   public final int yychar()
@@ -120,18 +169,64 @@ ComplexContextEx    = \p{LB:Complex_Context}
 
 %%
 
-// UAX#29 WB1.   sot   ÷
-//        WB2.     ÷   eot
+// UAX#29 WB1.    sot ÷ Any
+//        WB2.    Any ÷ eot
 //
 <<EOF>> { return YYEOF; }
 
-// UAX#29 WB8.   Numeric × Numeric
-//        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
-//        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
-//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 
+// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
+//                          WB14. (E_Base | EBG) × E_Modifier
+//                          WB15. ^ (RI RI)* RI × RI
+//                          WB16. [^RI] (RI RI)* RI × RI
+//
+// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
+// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
+// 
+// emoji_sequence :=
+//    Top-level EBNF           Expanded #1                       Expanded #2                       Expanded #3
+//    ---------------------    ----------------------------      -----------------------------     ----------------------------------------------
+//      emoji_core_sequence      emoji_combining_sequence          emoji_character                 ( \p{Emoji}
+//                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+//                                                               | emoji_keycap_sequence           | [0-9#*] \u{FE0F 20E3}      [1]
+//                             | emoji_modifier_sequence                                           | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
+//                             | emoji_flag_sequence                                               | \p{WB:Regional_Indicator}{2}               )
+//
+//    | emoji_zwj_sequence       emoji_zwj_element                 emoji_character                 ( \p{Emoji}
+//                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+//                                                               | emoji_modifier_sequence         | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+//                             ( ZWJ emoji_zwj_element )+                                          ( \p{WB:ZWJ} ^^ )+
+// 
+//    | emoji_tag_sequence     tag_base                            emoji_character                 ( \p{Emoji}
+//                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+//                                                               | emoji_modifier_sequence         | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+//                             tag_spec                                                            [\u{E0020}-\u{E007E}]+
+//                             tag_term                                                            \u{E007F}
+//
+// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences 
+//     WITHOUT \uFE0F (emoji presentation selector), annotating them as "non-fully-qualified";
+//     TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
+//     choose whether to support them for segmentation.  This implementation will
+//     recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji. 
+//
+// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
+//           https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
+//
+//     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
+//
+//         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+//
+  {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
+| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
+| {RegionalIndicatorEx}{2} 
+  { return EMOJI_TYPE; }
+
+// UAX#29 WB8.    Numeric × Numeric
+//        WB11.   Numeric (MidNum | MidNumLetQ) × Numeric
+//        WB12.   Numeric × (MidNum | MidNumLetQ) Numeric
+//        WB13a.  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b.  ExtendNumLet × (AHLetter | Numeric | Katakana)
 //
-{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}* 
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
   { return NUMERIC_TYPE; }
 
 // subset of the below for typing purposes only!
@@ -141,28 +236,28 @@ ComplexContextEx    = \p{LB:Complex_Context}
 {KatakanaEx}+
   { return KATAKANA_TYPE; }
 
-// UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
-//        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-//        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
-//        WB7a.  Hebrew_Letter × Single_Quote
-//        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
-//        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
-//        WB9.   (ALetter | Hebrew_Letter) × Numeric
-//        WB10.  Numeric × (ALetter | Hebrew_Letter)
-//        WB13.  Katakana × Katakana
-//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 
-//
-{ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
-                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
-                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
-                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+// UAX#29 WB5.    AHLetter × AHLetter
+//        WB6.    AHLetter × (MidLetter | MidNumLetQ) AHLetter
+//        WB7.    AHLetter (MidLetter | MidNumLetQ) × AHLetter
+//        WB7a.   Hebrew_Letter × Single_Quote
+//        WB7b.   Hebrew_Letter × Double_Quote Hebrew_Letter
+//        WB7c.   Hebrew_Letter Double_Quote × Hebrew_Letter
+//        WB9.    AHLetter × Numeric
+//        WB10.   Numeric × AHLetter
+//        WB13.   Katakana × Katakana
+//        WB13a.  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b.  ExtendNumLet × (AHLetter | Numeric | Katakana)
+//
+{ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                        )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx} )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}      )*
+                     | {AHLetterEx}        ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {AHLetterEx}     )*
                      )+
                    )
-({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
-                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
-                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
-                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                        )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx} )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}      )*
+                     | {AHLetterEx}        ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {AHLetterEx}     )*
                      )+
                    )
 )*
@@ -172,13 +267,13 @@ ComplexContextEx    = \p{LB:Complex_Context}
 
 // From UAX #29:
 //
-//    [C]haracters with the Line_Break property values of Contingent_Break (CB), 
-//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word 
+//    [C]haracters with the Line_Break property values of Contingent_Break (CB),
+//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
 //    boundary property values based on criteria outside of the scope of this
 //    annex.  That means that satisfactory treatment of languages like Chinese
 //    or Thai requires special handling.
 // 
-// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
 // property: U+FFFC ( ￼ ) OBJECT REPLACEMENT CHARACTER.
 //
 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -191,17 +286,14 @@ ComplexContextEx    = \p{LB:Complex_Context}
 //
 {ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
 
-// UAX#29 WB14.  Any ÷ Any
+// UAX#29 WB999.  Any ÷ Any
 //
 {HanEx} { return IDEOGRAPHIC_TYPE; }
 {HiraganaEx} { return HIRAGANA_TYPE; }
 
-
-// UAX#29 WB3.   CR × LF
-//        WB3a.  (Newline | CR | LF) ÷
-//        WB3b.  ÷ (Newline | CR | LF)
-//        WB13c. Regional_Indicator × Regional_Indicator
-//        WB14.  Any ÷ Any
+// UAX#29 WB3.    CR × LF
+//        WB3a.   (Newline | CR | LF) ÷
+//        WB3b.   ÷ (Newline | CR | LF)
+//        WB999.  Any ÷ Any
 //
-{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
-  { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 6abbc2b..615b565 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -18,8 +18,11 @@ package org.apache.lucene.analysis.standard;
 
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -27,6 +30,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.TestUtil;
 
@@ -282,7 +286,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   }
   
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
+    WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
     wordBreakTest.test(a);
   }
   
@@ -358,8 +362,80 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
   }
 
-
-
+  /** simple emoji */
+  public void testEmoji() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
+        new String[] { "💩", "💩", "💩" },
+        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence */
+  public void testEmojiSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
+        new String[] { "👩‍❤️‍👩" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence with fitzpatrick modifier */
+  public void testEmojiSequenceWithModifier() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
+        new String[] { "👨🏼‍⚕️" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** regional indicator */
+  public void testEmojiRegionalIndicator() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
+        new String[] { "🇺🇸", "🇺🇸" },
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** variation sequence */
+  public void testEmojiVariationSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+        new String[] { "#️⃣" },
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+        new String[] { "3️⃣",},
+        new String[] { "<EMOJI>" });
+
+    // text presentation sequences
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
+        new String[] { },
+        new String[] { });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E",  // \uFE0E is included in \p{WB:Extend}
+        new String[] { "3\uFE0E",},
+        new String[] { "<NUM>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E",     // \u2B55 = HEAVY BLACK CIRCLE
+        new String[] { "\u2B55",},
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
+        new String[] { "\u2B55", "\u200D\u2B55"},
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+
+  public void testEmojiTagSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
+        new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
+        new String[] { "<EMOJI>" });
+  }
+
+  public void testEmojiTokenization() throws Exception {
+    // simple emoji around latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
+        new String[] { "poo", "💩", "poo" },
+        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+    // simple emoji around non-latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
+        new String[] { "💩", "中", "國", "💩" },
+        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+  }
+  
+  public void testUnicodeEmojiTests() throws Exception {
+    EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
+    emojiTest.test(a);
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer analyzer = new StandardAnalyzer();
@@ -416,4 +492,53 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
     a.close();
   }
+
+  public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
+    String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)
+    
+    // Collect tokens with normal reader
+    StandardAnalyzer a = new StandardAnalyzer();
+    TokenStream ts = a.tokenStream("dummy", text);
+    List<String> tokens = new ArrayList<>();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    ts.reset();
+    while (ts.incrementToken()) {
+      tokens.add(termAtt.toString());
+    }
+    ts.end();
+    ts.close();
+
+    // Tokens from a spoon-feed reader should be the same as from a normal reader
+    // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
+    Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
+    ts = a.tokenStream("dummy", reader);
+    termAtt = ts.addAttribute(CharTermAttribute.class);
+    ts.reset();
+    for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
+      assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
+    }
+    ts.end();
+    ts.close();
+  }
+}
+
+class SpoonFeedMaxCharsReaderWrapper extends Reader {
+  private final Reader in;
+  private final int maxChars; 
+
+  public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
+    this.in = in;
+    this.maxChars = maxChars;
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  /** Returns the configured number of chars if available */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return in.read(cbuf, off, Math.min(maxChars, len));
+  }
 }