You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:28 UTC
[14/24] lucene-solr:branch_8x: LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '' token type.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
index 292f2ad..e4b10af 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
@@ -37,12 +37,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
  *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ *   <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
  * </ul>
  */
 @SuppressWarnings("fallthrough")
 %%
 
-%unicode 6.3
+%unicode 9.0
 %integer
 %final
 %public
@@ -52,22 +53,73 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %xstate AVOID_BAD_URL
 %buffer 255
 
-// UAX#29 WB4. X (Extend | Format)* --> X
+
+// UAX#29 WB4.  X (Extend | Format | ZWJ)* --> X
+//
+ExtFmtZwj           = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+
+//////////////////////////////////////////////////////////////////////////
+// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
+
+// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
+%include ../../../../../../../../../core/src/data/jflex/UnicodeEmojiProperties.jflex
+
+// UAX#29 WB4.  X (Extend | Format | ZWJ)* --> X
+//
+//   \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
+//   - are explicitly excluded here so that we can properly handle Emoji sequences.
+//
+ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
+
+KeyCapBaseChar = [0-9#*]
+KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
+KeyCap = \u20E3
+KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
+
+// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
+AccidentalEmoji = [©®™\u3030\u303D]
+EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
+
+// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
+// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
+// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
+EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
+
+EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
+
+EmojiCharEx         = {EmojiChar}           {ExtFmtZwjSansPresSel}
+EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
+EmojiModifierEx     = {Emoji_Modifier}      {ExtFmtZwjSansPresSel}
+
+EmojiPresentationSelector = \uFE0F
+EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
+TagSpec = [\u{E0020}-\u{E007E}]
+TagTerm = \u{E007F}
+
+// End Emoji Macros
+//////////////////////////////////////////////////////////////////////////
+
+
+// UAX#29 WB4.  X (Extend | Format | ZWJ)* --> X
 //
-HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
-HebrewOrALetterEx   = [\p{WB:HebrewLetter}\p{WB:ALetter}]                       [\p{WB:Format}\p{WB:Extend}]*
-NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx          = \p{WB:Katakana}                                           [\p{WB:Format}\p{WB:Extend}]* 
-MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      [\p{WB:Format}\p{WB:Extend}]* 
-MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       [\p{WB:Format}\p{WB:Extend}]*
-HanEx               = \p{Script:Han}                                            [\p{WB:Format}\p{WB:Extend}]*
-HiraganaEx          = \p{Script:Hiragana}                                       [\p{WB:Format}\p{WB:Extend}]*
-SingleQuoteEx       = \p{WB:Single_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
-DoubleQuoteEx       = \p{WB:Double_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
-HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      [\p{WB:Format}\p{WB:Extend}]*
-RegionalIndicatorEx = \p{WB:RegionalIndicator}                                  [\p{WB:Format}\p{WB:Extend}]*
-ComplexContextEx    = \p{LB:Complex_Context}                                    [\p{WB:Format}\p{WB:Extend}]*
+ExtFmtZwj           = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
+AHLetterEx          = [\p{WB:ALetter}\p{WB:Hebrew_Letter}]                      {ExtFmtZwj}
+NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        {ExtFmtZwj}
+KatakanaEx          = \p{WB:Katakana}                                           {ExtFmtZwj} 
+MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      {ExtFmtZwj} 
+MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         {ExtFmtZwj}
+ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       {ExtFmtZwj}
+HanEx               = \p{Script:Han}                                            {ExtFmtZwj}
+HiraganaEx          = \p{Script:Hiragana}                                       {ExtFmtZwj}
+SingleQuoteEx       = \p{WB:Single_Quote}                                       {ExtFmtZwj}
+DoubleQuoteEx       = \p{WB:Double_Quote}                                       {ExtFmtZwj}
+HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      {ExtFmtZwj}
+RegionalIndicatorEx = \p{WB:Regional_Indicator}                                 {ExtFmtZwj}
+ComplexContextEx    = \p{LB:Complex_Context}                                    {ExtFmtZwj}
+
 
 // URL and E-mail syntax specifications:
 //
@@ -174,18 +226,28 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
    */
   public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
   
+  /** Ideographic token type */
   public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
   
+  /** Hiragana token type */
   public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
   
+  /** Katakana token type */
   public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
   
+  /** Hangul token type */
   public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
   
+  /** Email token type */
   public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
   
+  /** URL token type */
   public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
 
+  /** Emoji token type */
+  public static final int EMOJI_TYPE = UAX29URLEmailTokenizer.EMOJI;
+
+  /** Character count processed so far */
   public final int yychar()
   {
     return yychar;
@@ -213,11 +275,11 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 
 <YYINITIAL, AVOID_BAD_URL> {
 
-// UAX#29 WB1.   sot   ÷
-//        WB2.     ÷   eot
+// UAX#29 WB1.    sot ÷ Any
+//        WB2.    Any ÷ eot
 //
   <<EOF>> { return YYEOF; }
-
+  
   {URL}   { yybegin(YYINITIAL); return URL_TYPE; }
 
   // LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
@@ -244,14 +306,61 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 
   {EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
 
-  // UAX#29 WB8.   Numeric × Numeric
-  //        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
-  //        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
-  //        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-  //        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+
+  // Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
+  //                          WB14. (E_Base | EBG) × E_Modifier
+  //                          WB15. ^ (RI RI)* RI × RI
+  //                          WB16. [^RI] (RI RI)* RI × RI
+  //
+  // We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
+  // and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
+  // 
+  // emoji_sequence :=
+  //    Top-level EBNF           Expanded #1                       Expanded #2                       Expanded #3
+  //    ---------------------    ----------------------------      -----------------------------     ----------------------------------------------
+  //      emoji_core_sequence      emoji_combining_sequence          emoji_character                 ( \p{Emoji}
+  //                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+  //                                                               | emoji_keycap_sequence           | [0-9#*] \u{FE0F 20E3}      [1]
+  //                             | emoji_modifier_sequence                                           | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
+  //                             | emoji_flag_sequence                                               | \p{WB:Regional_Indicator}{2}               )
+  //
+  //    | emoji_zwj_sequence       emoji_zwj_element                 emoji_character                 ( \p{Emoji}
+  //                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+  //                                                               | emoji_modifier_sequence         | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+  //                             ( ZWJ emoji_zwj_element )+                                          ( \p{WB:ZWJ} ^^ )+
+  // 
+  //    | emoji_tag_sequence     tag_base                            emoji_character                 ( \p{Emoji}
+  //                                                               | emoji_presentation_sequence     | \p{Emoji} \uFE0F
+  //                                                               | emoji_modifier_sequence         | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+  //                             tag_spec                                                            [\u{E0020}-\u{E007E}]+
+  //                             tag_term                                                            \u{E007F}
+  //
+  // [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences 
+  //     WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
+  //     TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
+  //     choose whether to support them for segmentation.  This implementation will
+  //     recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji. 
+  //
+  // See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
+  //           https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
+  //
+  //     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
+  //
+  //         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+  //
+    {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
+  | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
+  | {RegionalIndicatorEx}{2} 
+    { yybegin(YYINITIAL); return EMOJI_TYPE; }
+
+  // UAX#29 WB8.    Numeric × Numeric
+  //        WB11.   Numeric (MidNum | MidNumLetQ) × Numeric
+  //        WB12.   Numeric × (MidNum | MidNumLetQ) Numeric
+  //        WB13a.  (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+  //        WB13b.  ExtendNumLet × (AHLetter | Numeric | Katakana)
   //
   {ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
-    {  yybegin(YYINITIAL); return NUMERIC_TYPE; }
+    { yybegin(YYINITIAL); return NUMERIC_TYPE; }
 
   // subset of the below for typing purposes only!
   {HangulEx}+
@@ -260,32 +369,32 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
   {KatakanaEx}+
     { yybegin(YYINITIAL); return KATAKANA_TYPE; }
 
-  // UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
-  //        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-  //        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
-  //        WB7a.  Hebrew_Letter × Single_Quote
-  //        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
-  //        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
-  //        WB9.   (ALetter | Hebrew_Letter) × Numeric
-  //        WB10.  Numeric × (ALetter | Hebrew_Letter)
-  //        WB13.  Katakana × Katakana
-  //        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-  //        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+  // UAX#29 WB5.    AHLetter × AHLetter
+  //        WB6.    AHLetter × (MidLetter | MidNumLetQ) AHLetter
+  //        WB7.    AHLetter (MidLetter | MidNumLetQ) × AHLetter
+  //        WB7a.   Hebrew_Letter × Single_Quote
+  //        WB7b.   Hebrew_Letter × Double_Quote Hebrew_Letter
+  //        WB7c.   Hebrew_Letter Double_Quote × Hebrew_Letter
+  //        WB9.    AHLetter × Numeric
+  //        WB10.   Numeric × AHLetter
+  //        WB13.   Katakana × Katakana
+  //        WB13a.  (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+  //        WB13b.  ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 
   //
-  {ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
-                     | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
-                       | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
-                       | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+  {ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                        )*
+                     | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx} )
+                       | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}      )*
+                       | {AHLetterEx}        ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {AHLetterEx}     )*
                        )+
                      )
-  ({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
-                     | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
-                       | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
-                       | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+  ({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                        )*
+                     | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx} )
+                       | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}      )*
+                       | {AHLetterEx}        ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {AHLetterEx}     )*
                        )+
                      )
   )*
-  {ExtendNumLetEx}*
+  {ExtendNumLetEx}* 
     { yybegin(YYINITIAL); return WORD_TYPE; }
 
 
@@ -297,7 +406,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
   //    annex.  That means that satisfactory treatment of languages like Chinese
   //    or Thai requires special handling.
   //
-  // In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+  // In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
   // property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
   //
   // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -310,18 +419,15 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
   //
   {ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
 
-  // UAX#29 WB14.  Any ÷ Any
+  // UAX#29 WB999.  Any ÷ Any
   //
   {HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
   {HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
 
-
-  // UAX#29 WB3.   CR × LF
-  //        WB3a.  (Newline | CR | LF) ÷
-  //        WB3b.  ÷ (Newline | CR | LF)
-  //        WB13c. Regional_Indicator × Regional_Indicator
-  //        WB14.  Any ÷ Any
+  // UAX#29 WB3.    CR × LF
+  //        WB3a.   (Newline | CR | LF) ÷
+  //        WB3b.   ÷ (Newline | CR | LF)
+  //        WB999.  Any ÷ Any
   //
-  {RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
-    { yybegin(YYINITIAL); /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+  [^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
index 7f9227f..9295e1c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -341,7 +341,7 @@ class WikipediaTokenizerImpl {
 
   /* error messages for the codes above */
   private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
+    "Unknown internal scanner error",
     "Error: could not match input",
     "Error: pushback value was too large"
   };
@@ -419,11 +419,11 @@ class WikipediaTokenizerImpl {
   private int yycolumn;
 
   /** 
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
    */
   private boolean zzAtBOL = true;
 
-  /** zzAtEOF == true <=> the scanner is at the EOF */
+  /** zzAtEOF == true iff the scanner is at the EOF */
   private boolean zzAtEOF;
 
   /** denotes if the user-EOF-code has already been executed */
@@ -575,28 +575,29 @@ final void reset() {
     }
 
     /* fill the buffer with new input */
-    int requested = zzBuffer.length - zzEndRead;           
-    int totalRead = 0;
-    while (totalRead < requested) {
-      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
-      if (numRead == -1) {
-        break;
-      }
-      totalRead += numRead;
-    }
+    int requested = zzBuffer.length - zzEndRead;
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
 
-    if (totalRead > 0) {
-      zzEndRead += totalRead;
-      if (totalRead == requested) { /* possibly more input available */
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      /* If numRead == requested, we might have requested to few chars to
+         encode a full Unicode character. We assume that a Reader would
+         otherwise never return half characters. */
+      if (numRead == requested) {
         if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
           --zzEndRead;
           zzFinalHighSurrogate = 1;
         }
       }
+      /* potentially more input available */
       return false;
     }
 
-    // totalRead = 0: End of stream
+    /* numRead < 0 ==> end of stream */
     return true;
   }
 
@@ -820,199 +821,245 @@ final void reset() {
       // store back cached position
       zzMarkedPos = zzMarkedPosL;
 
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 1: 
-          { numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 47: break;
-        case 2: 
-          { positionInc = 1; return ALPHANUM;
-          }
-        case 48: break;
-        case 3: 
-          { positionInc = 1; return CJ;
-          }
-        case 49: break;
-        case 4: 
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 50: break;
-        case 5: 
-          { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 51: break;
-        case 6: 
-          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
-          }
-        case 52: break;
-        case 7: 
-          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
-          }
-        case 53: break;
-        case 8: 
-          { /* Break so we don't hit fall-through warning: */ break;/* ignore */
-          }
-        case 54: break;
-        case 9: 
-          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
-          }
-        case 55: break;
-        case 10: 
-          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 56: break;
-        case 11: 
-          { currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 57: break;
-        case 12: 
-          { currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/
-          }
-        case 58: break;
-        case 13: 
-          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 59: break;
-        case 14: 
-          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
-          }
-        case 60: break;
-        case 15: 
-          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 61: break;
-        case 16: 
-          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
-          }
-        case 62: break;
-        case 17: 
-          { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
-          }
-        case 63: break;
-        case 18: 
-          { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
-          }
-        case 64: break;
-        case 19: 
-          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
-          }
-        case 65: break;
-        case 20: 
-          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 66: break;
-        case 21: 
-          { yybegin(STRING); return currentTokType;/*pipe*/
-          }
-        case 67: break;
-        case 22: 
-          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 68: break;
-        case 23: 
-          { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 69: break;
-        case 24: 
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 70: break;
-        case 25: 
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 71: break;
-        case 26: 
-          { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 72: break;
-        case 27: 
-          { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 73: break;
-        case 28: 
-          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 74: break;
-        case 29: 
-          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 75: break;
-        case 30: 
-          { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 76: break;
-        case 31: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
-          }
-        case 77: break;
-        case 32: 
-          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 78: break;
-        case 33: 
-          { positionInc = 1; return APOSTROPHE;
-          }
-        case 79: break;
-        case 34: 
-          { positionInc = 1; return HOST;
-          }
-        case 80: break;
-        case 35: 
-          { positionInc = 1; return NUM;
-          }
-        case 81: break;
-        case 36: 
-          { positionInc = 1; return COMPANY;
-          }
-        case 82: break;
-        case 37: 
-          { currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 83: break;
-        case 38: 
-          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
-          }
-        case 84: break;
-        case 39: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
-          }
-        case 85: break;
-        case 40: 
-          { positionInc = 1; return ACRONYM;
-          }
-        case 86: break;
-        case 41: 
-          { positionInc = 1; return EMAIL;
-          }
-        case 87: break;
-        case 42: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
-          }
-        case 88: break;
-        case 43: 
-          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
-          }
-        case 89: break;
-        case 44: 
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
-          }
-        case 90: break;
-        case 45: 
-          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 91: break;
-        case 46: 
-          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
-          }
-        case 92: break;
-        default: 
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
-            return YYEOF;
-          } 
-          else {
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
+        return YYEOF;
+      }
+      else {
+        switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+          case 1: 
+            { numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 47: break;
+          case 2: 
+            { positionInc = 1; return ALPHANUM;
+            } 
+            // fall through
+          case 48: break;
+          case 3: 
+            { positionInc = 1; return CJ;
+            } 
+            // fall through
+          case 49: break;
+          case 4: 
+            { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 50: break;
+          case 5: 
+            { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 51: break;
+          case 6: 
+            { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+            } 
+            // fall through
+          case 52: break;
+          case 7: 
+            { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
+            } 
+            // fall through
+          case 53: break;
+          case 8: 
+            { /* Break so we don't hit fall-through warning: */ break;/* ignore */
+            } 
+            // fall through
+          case 54: break;
+          case 9: 
+            { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+            } 
+            // fall through
+          case 55: break;
+          case 10: 
+            { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 56: break;
+          case 11: 
+            { currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 57: break;
+          case 12: 
+            { currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/
+            } 
+            // fall through
+          case 58: break;
+          case 13: 
+            { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 59: break;
+          case 14: 
+            { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
+            } 
+            // fall through
+          case 60: break;
+          case 15: 
+            { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 61: break;
+          case 16: 
+            { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
+            } 
+            // fall through
+          case 62: break;
+          case 17: 
+            { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
+            } 
+            // fall through
+          case 63: break;
+          case 18: 
+            { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
+            } 
+            // fall through
+          case 64: break;
+          case 19: 
+            { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
+            } 
+            // fall through
+          case 65: break;
+          case 20: 
+            { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 66: break;
+          case 21: 
+            { yybegin(STRING); return currentTokType;/*pipe*/
+            } 
+            // fall through
+          case 67: break;
+          case 22: 
+            { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 68: break;
+          case 23: 
+            { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 69: break;
+          case 24: 
+            { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 70: break;
+          case 25: 
+            { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 71: break;
+          case 26: 
+            { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 72: break;
+          case 27: 
+            { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 73: break;
+          case 28: 
+            { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 74: break;
+          case 29: 
+            { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 75: break;
+          case 30: 
+            { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 76: break;
+          case 31: 
+            { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
+            } 
+            // fall through
+          case 77: break;
+          case 32: 
+            { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 78: break;
+          case 33: 
+            { positionInc = 1; return APOSTROPHE;
+            } 
+            // fall through
+          case 79: break;
+          case 34: 
+            { positionInc = 1; return HOST;
+            } 
+            // fall through
+          case 80: break;
+          case 35: 
+            { positionInc = 1; return NUM;
+            } 
+            // fall through
+          case 81: break;
+          case 36: 
+            { positionInc = 1; return COMPANY;
+            } 
+            // fall through
+          case 82: break;
+          case 37: 
+            { currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 83: break;
+          case 38: 
+            { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
+            } 
+            // fall through
+          case 84: break;
+          case 39: 
+            { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
+            } 
+            // fall through
+          case 85: break;
+          case 40: 
+            { positionInc = 1; return ACRONYM;
+            } 
+            // fall through
+          case 86: break;
+          case 41: 
+            { positionInc = 1; return EMAIL;
+            } 
+            // fall through
+          case 87: break;
+          case 42: 
+            { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
+            } 
+            // fall through
+          case 88: break;
+          case 43: 
+            { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+            } 
+            // fall through
+          case 89: break;
+          case 44: 
+            { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 90: break;
+          case 45: 
+            { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 91: break;
+          case 46: 
+            { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+            } 
+            // fall through
+          case 92: break;
+          default:
             zzScanError(ZZ_NO_MATCH);
-          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
index cf6c65a..758d5d2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@@ -499,7 +499,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
     String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
         = TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
-    String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
+    String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString2 +"-[CDATA[";
 
     String[] testGold = {
         "one<![CDATA[<one><two>three<four></four></two></one>]]>two",

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
index b3b0ce1..507eb09 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
@@ -361,14 +361,14 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
 
     StringBuilder bToken = new StringBuilder();
     // exact max length:
-    for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+    for(int i=0;i<UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
       bToken.append('b');
     }
 
     String bString = bToken.toString();
     // first bString is exact max default length; next one is 1 too long
     String input = "x " + bString + " " + bString + "b";
-    assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+    assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
     a.close();
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
index d9d8381..76c5d55 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
@@ -467,7 +467,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
+    WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
     wordBreakTest.test(a);
   }
   
@@ -545,6 +545,80 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
 
+  /** simple emoji */
+  public void testEmoji() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
+        new String[] { "💩", "💩", "💩" },
+        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence */
+  public void testEmojiSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
+        new String[] { "👩‍❤️‍👩" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence with fitzpatrick modifier */
+  public void testEmojiSequenceWithModifier() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
+        new String[] { "👨🏼‍⚕️" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** regional indicator */
+  public void testEmojiRegionalIndicator() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
+        new String[] { "🇺🇸", "🇺🇸" },
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** variation sequence */
+  public void testEmojiVariationSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+        new String[] { "#️⃣" },
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+        new String[] { "3️⃣",},
+        new String[] { "<EMOJI>" });
+
+    // text presentation sequences
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
+        new String[] { },
+        new String[] { });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E",  // \uFE0E is included in \p{WB:Extend}
+        new String[] { "3\uFE0E",},
+        new String[] { "<NUM>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E",     // \u2B55 = HEAVY BLACK CIRCLE
+        new String[] { "\u2B55",},
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
+        new String[] { "\u2B55", "\u200D\u2B55"},
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+
+  public void testEmojiTagSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
+        new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
+        new String[] { "<EMOJI>" });
+  }
+
+  public void testEmojiTokenization() throws Exception {
+    // simple emoji around latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
+        new String[] { "poo", "💩", "poo" },
+        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+    // simple emoji around non-latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
+        new String[] { "💩", "中", "國", "💩" },
+        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+  }
+
+  public void testUnicodeEmojiTests() throws Exception {
+    EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
+    emojiTest.test(a);
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/common-build.xml
----------------------------------------------------------------------
diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index 789fc5f..0dc3884 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -2388,7 +2388,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
 
   <!-- JFlex task -->
   <target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
-    <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
+    <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.7.0"
                    inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
     <taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
     <property name="jflex.loaded" value="true"/>
@@ -2645,7 +2645,11 @@ The following arguments can be provided to ant to alter its behaviour and target
     <attribute name="dir"/>
     <attribute name="name"/>
     <sequential>
-      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+      <!-- The default skeleton is specified here to work around a JFlex ant task bug:    -->
+      <!-- invocations with a non-default skeleton will cause following invocations to    -->
+      <!-- use the same skeleton, though not specified, unless the default is configured. -->
+      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
+             skeleton="${common.dir}/core/src/data/jflex/skeleton.default"/>
     </sequential>
   </macrodef>
 
@@ -2653,20 +2657,13 @@ The following arguments can be provided to ant to alter its behaviour and target
     <attribute name="dir"/>
     <attribute name="name"/>
     <sequential>
-      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
       <!-- LUCENE-5897: Disallow scanner buffer expansion -->
-      <replaceregexp file="@{dir}/@{name}.java"
-                     match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
-                     replace="" flags="s" />
+      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
+             skeleton="${common.dir}/core/src/data/jflex/skeleton.disable.buffer.expansion.txt"/>
+      <!-- Since the ZZ_BUFFERSIZE declaration is generated rather than in the skeleton, we have to transform it here. -->
       <replaceregexp file="@{dir}/@{name}.java"
                      match="private static final int ZZ_BUFFERSIZE ="
                      replace="private int ZZ_BUFFERSIZE ="/>
-      <replaceregexp file="@{dir}/@{name}.java"
-                     match="int requested = zzBuffer.length - zzEndRead;"
-                     replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
-      <replaceregexp file="@{dir}/@{name}.java"
-                     match="(zzFinalHighSurrogate = 1;)(\r?\n)"
-                     replace="\1\2          if (totalRead == 1) { return true; }\2"/>
     </sequential>
   </macrodef>
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex b/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
new file mode 100644
index 0000000..c631dee
--- /dev/null
+++ b/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file was automatically generated by getUnicodeEmojiProperties.pl
+// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt 
+
+Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{
 2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
+Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
+Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
+Extended_Pictographic = [\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{2388}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2605}\u{2607}-\u{2612}\u{2614}-\u{2685}\u{2690}-\u{2705}\u{2708}-\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2767}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F000}-\u{1F0FF}\u{1F10D}-\u{1F10F}\u{1F12F}\u{1F16C}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AD}-\u{1F1E5}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F3FA}\u{1F400}-\u{1F53D}\u{1F546}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{1F774}-\u{1F77F}\u{1F7D5}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE
 }-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1FFFD}]
+

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl b/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
new file mode 100644
index 0000000..e818b64
--- /dev/null
+++ b/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
@@ -0,0 +1,168 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use warnings;
+use strict;
+use File::Spec;
+use Getopt::Long;
+use LWP::UserAgent;
+
+my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
+
+my $version = '';
+unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
+    print STDERR "Usage: $script_name -v <version>\n";
+    print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
+        if ($version);
+    exit 1;
+}
+my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
+my $output_filename = "UnicodeEmojiProperties.jflex";
+my $header =<<"__HEADER__";
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file was automatically generated by ${script_name}
+// from: ${emoji_data_url} 
+
+__HEADER__
+
+my $property_ranges = {};
+my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
+
+parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
+
+my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
+output_jflex_include_file($output_path, $property_ranges);
+
+
+# sub parse_emoji_data_file
+#
+# Downloads and parses the emoji_data.txt file, extracting code point ranges
+# assigned to property values with age not younger than the passed-in version,
+# except for the Extended_Pictographic property, for which all code point ranges
+# are extracted, regardless of age.
+#
+# Parameters:
+#
+#  - Emoji data file URL
+#  - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
+#  - Reference to hash of wanted property names
+#
+sub parse_emoji_data_file {
+    my $url = shift;
+    my $prop_ranges = shift;
+    my $wanted_props = shift;
+    my $content = get_URL_content($url);
+    print STDERR "Parsing '$url'...";
+    my @lines = split /\r?\n/, $content;
+    for (@lines) {
+        ## 231A..231B    ; Emoji_Presentation   #  1.1  [2] (⌚..⌛)    watch..hourglass done
+        ## 1F9C0         ; Emoji_Presentation   #  8.0  [1] (🧀)       cheese wedge
+        ## 1FA00..1FA5F  ; Extended_Pictographic#   NA [96] (🨀️..🩟️)    <reserved-1FA00>..<reserved-1FA5F>
+        if (my ($start,$end,$prop) = /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/) {
+            next unless defined($wanted_props->{$prop});  # Skip unless we want ranges for this property
+            
+            if (not defined($prop_ranges->{$prop})) {
+                $prop_ranges->{$prop} = [];
+            }
+            $end = $start unless defined($end);
+            my $start_dec = hex $start;
+            my $end_dec = hex $end;
+            my $ranges = $prop_ranges->{$prop};
+            if (scalar(@$ranges) == 0 || $start_dec > $ranges->[-1] + 1) { # Can't merge range with previous range
+                # print STDERR "Adding new range ($start, $end)\n";
+                push @$ranges, $start_dec, $end_dec;
+            } else {
+                # printf STDERR "Merging range (%s, %s) with previous range (%X, %X)\n", $start, $end, $ranges->[-2], $ranges->[-1];
+                $ranges->[-1] = $end_dec;
+            }
+        } else {
+            # print STDERR "Skipping line (no data): $_\n";
+        }
+    }
+    print STDERR "done.\n";
+}
+
+# sub get_URL_content
+#
+# Retrieves and returns the content of the given URL.
+#
+# Parameter:
+#
+#  - URL to get content for
+#
+sub get_URL_content {
+    my $url = shift;
+    print STDERR "Retrieving '$url'...";
+    my $user_agent = LWP::UserAgent->new;
+    my $request = HTTP::Request->new(GET => $url);
+    my $response = $user_agent->request($request);
+    unless ($response->is_success) {
+        print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
+        exit 1;
+    }
+    print STDERR "done.\n";
+    return $response->content;
+}
+
+
+# sub output_jflex_include_file
+#
+# Parameters:
+#
+#  - Output path
+#  - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
+#     
+sub output_jflex_include_file {
+    my $path = shift;
+    my $prop_ranges = shift;
+    open OUT, ">$path"
+        || die "Error opening '$path' for writing: $!";
+
+    print STDERR "Writing '$path'...";
+
+    print OUT $header;
+
+    for my $prop (sort keys %$prop_ranges) {
+        my $ranges = $prop_ranges->{$prop};
+        print OUT "$prop = [";
+        for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
+            printf OUT "\\u{%X}", $ranges->[$index];
+            printf OUT "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
+        }
+        print OUT "]\n";
+    }
+
+    print OUT "\n";
+    close OUT;
+    print STDERR "done.\n";
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/skeleton.default
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/skeleton.default b/lucene/core/src/data/jflex/skeleton.default
new file mode 100644
index 0000000..9e08fbb
--- /dev/null
+++ b/lucene/core/src/data/jflex/skeleton.default
@@ -0,0 +1,342 @@
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+--- private static final int ZZ_BUFFERSIZE = ...;
+
+  /** lexical states */
+---  lexical states, charmap
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unknown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+--- isFinal list
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true iff the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+  
+  /** 
+   * The number of occupied positions in zzBuffer beyond zzEndRead.
+   * When a lead/high surrogate has been read from the input stream
+   * into the final zzBuffer position, this will have a value of 1;
+   * otherwise, it will have a value of 0.
+   */
+  private int zzFinalHighSurrogate = 0;
+
+--- user class code
+
+--- constructor declaration
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzBuffer.length*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
+    }
+
+    /* fill the buffer with new input */
+    int requested = zzBuffer.length - zzEndRead;
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
+
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      /* If numRead == requested, we might have requested to few chars to
+         encode a full Unicode character. We assume that a Reader would
+         otherwise never return half characters. */
+      if (numRead == requested) {
+        if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+          --zzEndRead;
+          zzFinalHighSurrogate = 1;
+        }
+      }
+      /* potentially more input available */
+      return false;
+    }
+
+    /* numRead < 0 ==> end of stream */
+    return true;
+  }
+
+    
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * Internal scan buffer is resized down to its initial length, if it has grown.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    zzFinalHighSurrogate = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+    if (zzBuffer.length > ZZ_BUFFERSIZE)
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+--- zzScanError declaration
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+--- throws clause
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by then next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+--- yypushback decl (contains zzScanError exception)
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+--- zzDoEOF
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+--- yylex declaration
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+--- local declarations
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+--- start admin (line, char, col count)
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+--- start admin (lexstate etc)
+
+      zzForAction: {
+        while (true) {
+    
+--- next input, line, col, char count, next transition, isFinal action
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+--- line count update
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+--- char count update
+
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
+--- eofvalue
+      }
+      else {
+--- actions
+          default:
+--- no match
+        }
+      }
+    }
+  }
+
+--- main
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt b/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
new file mode 100644
index 0000000..a9dabcf
--- /dev/null
+++ b/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
@@ -0,0 +1,348 @@
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+--- private static final int ZZ_BUFFERSIZE = ...;
+
+  /** lexical states */
+---  lexical states, charmap
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unknown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+--- isFinal list
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true iff the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true iff the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+  
+  /** 
+   * The number of occupied positions in zzBuffer beyond zzEndRead.
+   * When a lead/high surrogate has been read from the input stream
+   * into the final zzBuffer position, this will have a value of 1;
+   * otherwise, it will have a value of 0.
+   */
+  private int zzFinalHighSurrogate = 0;
+
+--- user class code
+
+--- constructor declaration
+
+/* -------------------------------------------------------------------------------- */
+/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+
+    /* fill the buffer with new input */
+    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
+    if (requested == 0) {
+      return true;
+    }
+    int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
+
+    /* not supposed to occur according to specification of java.io.Reader */
+    if (numRead == 0) {
+      throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+    }
+    if (numRead > 0) {
+      zzEndRead += numRead;
+      if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+        if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
+          --zzEndRead;
+          zzFinalHighSurrogate = 1;
+          if (numRead == 1) {
+            return true;
+          }
+        } else {                    // There is room in the buffer for at least one more char
+          int c = zzReader.read();  // Expecting to read a low surrogate char
+          if (c == -1) {
+            return true;
+          } else {
+            zzBuffer[zzEndRead++] = (char)c;
+            return false;
+          }
+        }
+      }
+      /* potentially more input available */
+      return false;
+    }
+
+    /* numRead < 0 ==> end of stream */
+    return true;
+  }
+
+/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+/* ------------------------------------------------------------------------------ */
+    
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * Internal scan buffer is resized down to its initial length, if it has grown.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    zzFinalHighSurrogate = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+    if (zzBuffer.length > ZZ_BUFFERSIZE)
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+--- zzScanError declaration
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+--- throws clause
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by then next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+--- yypushback decl (contains zzScanError exception)
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+--- zzDoEOF
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+--- yylex declaration
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+--- local declarations
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+--- start admin (line, char, col count)
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+--- start admin (lexstate etc)
+
+      zzForAction: {
+        while (true) {
+    
+--- next input, line, col, char count, next transition, isFinal action
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+--- line count update
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+--- char count update
+
+      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+        zzAtEOF = true;
+--- eofvalue
+      }
+      else {
+--- actions
+          default:
+--- no match
+        }
+      }
+    }
+  }
+
+--- main
+
+}