You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:28 UTC
[14/24] lucene-solr:branch_8x: LUCENE-8527: Upgrade JFlex to 1.7.0.
StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
index 292f2ad..e4b10af 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
@@ -37,12 +37,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ * <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
-%unicode 6.3
+%unicode 9.0
%integer
%final
%public
@@ -52,22 +53,73 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%xstate AVOID_BAD_URL
%buffer 255
-// UAX#29 WB4. X (Extend | Format)* --> X
+
+// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
+//
+ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+
+//////////////////////////////////////////////////////////////////////////
+// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
+
+// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
+%include ../../../../../../../../../core/src/data/jflex/UnicodeEmojiProperties.jflex
+
+// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
+//
+// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
+// - are explicitly excluded here so that we can properly handle Emoji sequences.
+//
+ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
+
+KeyCapBaseChar = [0-9#*]
+KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
+KeyCap = \u20E3
+KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
+
+// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
+AccidentalEmoji = [©®™\u3030\u303D]
+EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
+
+// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
+// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
+// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
+EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
+
+EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
+
+EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
+EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
+EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
+
+EmojiPresentationSelector = \uFE0F
+EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
+TagSpec = [\u{E0020}-\u{E007E}]
+TagTerm = \u{E007F}
+
+// End Emoji Macros
+//////////////////////////////////////////////////////////////////////////
+
+
+// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
-HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
-HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
-NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
-HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
-HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
-SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
-DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
-HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
-RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
-ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
+ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
+AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
+NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
+KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
+MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
+MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
+ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
+HanEx = \p{Script:Han} {ExtFmtZwj}
+HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
+SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
+DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
+HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
+RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
+ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
+
// URL and E-mail syntax specifications:
//
@@ -174,18 +226,28 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
+ /** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
+ /** Hiragana token type */
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
+ /** Katakana token type */
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
+ /** Hangul token type */
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
+ /** Email token type */
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
+ /** URL token type */
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
+ /** Emoji token type */
+ public static final int EMOJI_TYPE = UAX29URLEmailTokenizer.EMOJI;
+
+ /** Character count processed so far */
public final int yychar()
{
return yychar;
@@ -213,11 +275,11 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
<YYINITIAL, AVOID_BAD_URL> {
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
+// UAX#29 WB1. sot ÷ Any
+// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
-
+
{URL} { yybegin(YYINITIAL); return URL_TYPE; }
// LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
@@ -244,14 +306,61 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
- // UAX#29 WB8. Numeric × Numeric
- // WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
- // WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
- // WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
- // WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+
+ // Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
+ // WB14. (E_Base | EBG) × E_Modifier
+ // WB15. ^ (RI RI)* RI × RI
+ // WB16. [^RI] (RI RI)* RI × RI
+ //
+ // We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
+ // and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
+ //
+ // emoji_sequence :=
+ // Top-level EBNF Expanded #1 Expanded #2 Expanded #3
+ // --------------------- ---------------------------- ----------------------------- ----------------------------------------------
+ // emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
+ // | emoji_presentation_sequence | \p{Emoji} \uFE0F
+ // | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
+ // | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
+ // | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
+ //
+ // | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
+ // | emoji_presentation_sequence | \p{Emoji} \uFE0F
+ // | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+ // ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
+ //
+ // | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
+ // | emoji_presentation_sequence | \p{Emoji} \uFE0F
+ // | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+ // tag_spec [\u{E0020}-\u{E007E}]+
+ // tag_term \u{E007F}
+ //
+ // [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
+ // WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
+ // TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
+ // choose whether to support them for segmentation. This implementation will
+ // recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
+ //
+ // See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
+ // https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
+ //
+ // In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
+ //
+ // WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+ //
+ {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
+ | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
+ | {RegionalIndicatorEx}{2}
+ { yybegin(YYINITIAL); return EMOJI_TYPE; }
+
+ // UAX#29 WB8. Numeric × Numeric
+ // WB11. Numeric (MidNum | MidNumLetQ) × Numeric
+ // WB12. Numeric × (MidNum | MidNumLetQ) Numeric
+ // WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+ // WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
- { yybegin(YYINITIAL); return NUMERIC_TYPE; }
+ { yybegin(YYINITIAL); return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
@@ -260,32 +369,32 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{KatakanaEx}+
{ yybegin(YYINITIAL); return KATAKANA_TYPE; }
- // UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
- // WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
- // WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
- // WB7a. Hebrew_Letter × Single_Quote
- // WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
- // WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
- // WB9. (ALetter | Hebrew_Letter) × Numeric
- // WB10. Numeric × (ALetter | Hebrew_Letter)
- // WB13. Katakana × Katakana
- // WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
- // WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+ // UAX#29 WB5. AHLetter × AHLetter
+ // WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
+ // WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
+ // WB7a. Hebrew_Letter × Single_Quote
+ // WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
+ // WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
+ // WB9. AHLetter × Numeric
+ // WB10. Numeric × AHLetter
+ // WB13. Katakana × Katakana
+ // WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+ // WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
- {ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ {ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
- ({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ ({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
- {ExtendNumLetEx}*
+ {ExtendNumLetEx}*
{ yybegin(YYINITIAL); return WORD_TYPE; }
@@ -297,7 +406,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
- // In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+ // In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -310,18 +419,15 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
- // UAX#29 WB14. Any ÷ Any
+ // UAX#29 WB999. Any ÷ Any
//
{HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
-
- // UAX#29 WB3. CR × LF
- // WB3a. (Newline | CR | LF) ÷
- // WB3b. ÷ (Newline | CR | LF)
- // WB13c. Regional_Indicator × Regional_Indicator
- // WB14. Any ÷ Any
+ // UAX#29 WB3. CR × LF
+ // WB3a. (Newline | CR | LF) ÷
+ // WB3b. ÷ (Newline | CR | LF)
+ // WB999. Any ÷ Any
//
- {RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
- { yybegin(YYINITIAL); /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+ [^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
index 7f9227f..9295e1c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -341,7 +341,7 @@ class WikipediaTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
- "Unkown internal scanner error",
+ "Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@@ -419,11 +419,11 @@ class WikipediaTokenizerImpl {
private int yycolumn;
/**
- * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
- /** zzAtEOF == true <=> the scanner is at the EOF */
+ /** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@@ -575,28 +575,29 @@ final void reset() {
}
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead;
- int totalRead = 0;
- while (totalRead < requested) {
- int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
- if (numRead == -1) {
- break;
- }
- totalRead += numRead;
- }
+ int requested = zzBuffer.length - zzEndRead;
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
- if (totalRead > 0) {
- zzEndRead += totalRead;
- if (totalRead == requested) { /* possibly more input available */
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ /* If numRead == requested, we might have requested too few chars to
+ encode a full Unicode character. We assume that a Reader would
+ otherwise never return half characters. */
+ if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
+ /* potentially more input available */
return false;
}
- // totalRead = 0: End of stream
+ /* numRead < 0 ==> end of stream */
return true;
}
@@ -820,199 +821,245 @@ final void reset() {
// store back cached position
zzMarkedPos = zzMarkedPosL;
- switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
- }
- case 47: break;
- case 2:
- { positionInc = 1; return ALPHANUM;
- }
- case 48: break;
- case 3:
- { positionInc = 1; return CJ;
- }
- case 49: break;
- case 4:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
- }
- case 50: break;
- case 5:
- { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
- }
- case 51: break;
- case 6:
- { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
- }
- case 52: break;
- case 7:
- { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
- }
- case 53: break;
- case 8:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore */
- }
- case 54: break;
- case 9:
- { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
- }
- case 55: break;
- case 10:
- { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
- }
- case 56: break;
- case 11:
- { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 57: break;
- case 12:
- { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
- }
- case 58: break;
- case 13:
- { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 59: break;
- case 14:
- { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
- }
- case 60: break;
- case 15:
- { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
- }
- case 61: break;
- case 16:
- { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
- }
- case 62: break;
- case 17:
- { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
- }
- case 63: break;
- case 18:
- { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
- }
- case 64: break;
- case 19:
- { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
- }
- case 65: break;
- case 20:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 66: break;
- case 21:
- { yybegin(STRING); return currentTokType;/*pipe*/
- }
- case 67: break;
- case 22:
- { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
- }
- case 68: break;
- case 23:
- { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
- }
- case 69: break;
- case 24:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
- }
- case 70: break;
- case 25:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
- }
- case 71: break;
- case 26:
- { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
- }
- case 72: break;
- case 27:
- { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
- }
- case 73: break;
- case 28:
- { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 74: break;
- case 29:
- { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 75: break;
- case 30:
- { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
- }
- case 76: break;
- case 31:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
- }
- case 77: break;
- case 32:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 78: break;
- case 33:
- { positionInc = 1; return APOSTROPHE;
- }
- case 79: break;
- case 34:
- { positionInc = 1; return HOST;
- }
- case 80: break;
- case 35:
- { positionInc = 1; return NUM;
- }
- case 81: break;
- case 36:
- { positionInc = 1; return COMPANY;
- }
- case 82: break;
- case 37:
- { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 83: break;
- case 38:
- { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
- }
- case 84: break;
- case 39:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
- }
- case 85: break;
- case 40:
- { positionInc = 1; return ACRONYM;
- }
- case 86: break;
- case 41:
- { positionInc = 1; return EMAIL;
- }
- case 87: break;
- case 42:
- { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
- }
- case 88: break;
- case 43:
- { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
- }
- case 89: break;
- case 44:
- { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
- }
- case 90: break;
- case 45:
- { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 91: break;
- case 46:
- { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
- }
- case 92: break;
- default:
- if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
- zzAtEOF = true;
- return YYEOF;
- }
- else {
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else {
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 47: break;
+ case 2:
+ { positionInc = 1; return ALPHANUM;
+ }
+ // fall through
+ case 48: break;
+ case 3:
+ { positionInc = 1; return CJ;
+ }
+ // fall through
+ case 49: break;
+ case 4:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 50: break;
+ case 5:
+ { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 51: break;
+ case 6:
+ { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+ }
+ // fall through
+ case 52: break;
+ case 7:
+ { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
+ }
+ // fall through
+ case 53: break;
+ case 8:
+ { /* Break so we don't hit fall-through warning: */ break;/* ignore */
+ }
+ // fall through
+ case 54: break;
+ case 9:
+ { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+ }
+ // fall through
+ case 55: break;
+ case 10:
+ { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 56: break;
+ case 11:
+ { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 57: break;
+ case 12:
+ { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
+ }
+ // fall through
+ case 58: break;
+ case 13:
+ { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 59: break;
+ case 14:
+ { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
+ }
+ // fall through
+ case 60: break;
+ case 15:
+ { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 61: break;
+ case 16:
+ { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
+ }
+ // fall through
+ case 62: break;
+ case 17:
+ { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
+ }
+ // fall through
+ case 63: break;
+ case 18:
+ { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
+ }
+ // fall through
+ case 64: break;
+ case 19:
+ { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
+ }
+ // fall through
+ case 65: break;
+ case 20:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 66: break;
+ case 21:
+ { yybegin(STRING); return currentTokType;/*pipe*/
+ }
+ // fall through
+ case 67: break;
+ case 22:
+ { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 68: break;
+ case 23:
+ { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 69: break;
+ case 24:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 70: break;
+ case 25:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 71: break;
+ case 26:
+ { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 72: break;
+ case 27:
+ { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 73: break;
+ case 28:
+ { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 74: break;
+ case 29:
+ { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 75: break;
+ case 30:
+ { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 76: break;
+ case 31:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
+ }
+ // fall through
+ case 77: break;
+ case 32:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 78: break;
+ case 33:
+ { positionInc = 1; return APOSTROPHE;
+ }
+ // fall through
+ case 79: break;
+ case 34:
+ { positionInc = 1; return HOST;
+ }
+ // fall through
+ case 80: break;
+ case 35:
+ { positionInc = 1; return NUM;
+ }
+ // fall through
+ case 81: break;
+ case 36:
+ { positionInc = 1; return COMPANY;
+ }
+ // fall through
+ case 82: break;
+ case 37:
+ { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 83: break;
+ case 38:
+ { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
+ }
+ // fall through
+ case 84: break;
+ case 39:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
+ }
+ // fall through
+ case 85: break;
+ case 40:
+ { positionInc = 1; return ACRONYM;
+ }
+ // fall through
+ case 86: break;
+ case 41:
+ { positionInc = 1; return EMAIL;
+ }
+ // fall through
+ case 87: break;
+ case 42:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
+ }
+ // fall through
+ case 88: break;
+ case 43:
+ { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+ }
+ // fall through
+ case 89: break;
+ case 44:
+ { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 90: break;
+ case 45:
+ { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 91: break;
+ case 46:
+ { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
+ }
+ // fall through
+ case 92: break;
+ default:
zzScanError(ZZ_NO_MATCH);
- }
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
index cf6c65a..758d5d2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@@ -499,7 +499,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
- String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
+ String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString2 +"-[CDATA[";
String[] testGold = {
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
index b3b0ce1..507eb09 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailAnalyzer.java
@@ -361,14 +361,14 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
StringBuilder bToken = new StringBuilder();
// exact max length:
- for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
+ for(int i=0;i<UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
bToken.append('b');
}
String bString = bToken.toString();
// first bString is exact max default length; next one is 1 too long
String input = "x " + bString + " " + bString + "b";
- assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
+ assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
a.close();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
index d9d8381..76c5d55 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizer.java
@@ -467,7 +467,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
+ WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@@ -545,6 +545,80 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
+ /** Standalone emoji: each pile-of-poo becomes its own EMOJI-typed token, with or without separating whitespace. */
+ public void testEmoji() throws Exception {
+   final String input = "💩 💩💩";
+   final String[] expectedTerms = { "💩", "💩", "💩" };
+   final String[] expectedTypes = { "<EMOJI>", "<EMOJI>", "<EMOJI>" };
+   BaseTokenStreamTestCase.assertAnalyzesTo(a, input, expectedTerms, expectedTypes);
+ }
+
+ /** emoji zwj sequence */
+ public void testEmojiSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
+ new String[] { "👩❤️👩" },
+ new String[] { "<EMOJI>" });
+ }
+
+ /** emoji zwj sequence with fitzpatrick modifier */
+ public void testEmojiSequenceWithModifier() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
+ new String[] { "👨🏼⚕️" },
+ new String[] { "<EMOJI>" });
+ }
+
+ /** Regional indicators: two adjacent flags are split into one EMOJI token per indicator pair. */
+ public void testEmojiRegionalIndicator() throws Exception {
+   final String input = "🇺🇸🇺🇸";
+   final String[] expectedTerms = { "🇺🇸", "🇺🇸" };
+   final String[] expectedTypes = { "<EMOJI>", "<EMOJI>" };
+   BaseTokenStreamTestCase.assertAnalyzesTo(a, input, expectedTerms, expectedTypes);
+ }
+
+ /** variation sequence */
+ public void testEmojiVariationSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+ new String[] { "#️⃣" },
+ new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+ new String[] { "3️⃣",},
+ new String[] { "<EMOJI>" });
+
+ // text presentation sequences
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
+ new String[] { },
+ new String[] { });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
+ new String[] { "3\uFE0E",},
+ new String[] { "<NUM>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
+ new String[] { "\u2B55",},
+ new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
+ new String[] { "\u2B55", "\u200D\u2B55"},
+ new String[] { "<EMOJI>", "<EMOJI>" });
+ }
+
+ public void testEmojiTagSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
+ new String[] { "🏴" },
+ new String[] { "<EMOJI>" });
+ }
+
+ /** Emoji embedded in other text: the emoji is split from its neighbours and typed EMOJI. */
+ public void testEmojiTokenization() throws Exception {
+   // simple emoji around latin
+   final String[] latinTerms = { "poo", "💩", "poo" };
+   final String[] latinTypes = { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" };
+   BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo", latinTerms, latinTypes);
+
+   // simple emoji around non-latin
+   final String[] cjkTerms = { "💩", "中", "國", "💩" };
+   final String[] cjkTypes = { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" };
+   BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩", cjkTerms, cjkTypes);
+ }
+
+ /** Runs the EmojiTokenizationTestUnicode_11_0 suite against the analyzer under test. */
+ public void testUnicodeEmojiTests() throws Exception {
+   new EmojiTokenizationTestUnicode_11_0().test(a);
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/common-build.xml
----------------------------------------------------------------------
diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index 789fc5f..0dc3884 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -2388,7 +2388,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- JFlex task -->
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
- <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
+ <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.7.0"
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
<property name="jflex.loaded" value="true"/>
@@ -2645,7 +2645,11 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
- <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
+ <!-- The default skeleton is specified here to work around a JFlex ant task bug: -->
+ <!-- invocations with a non-default skeleton will cause following invocations to -->
+ <!-- use the same skeleton, though not specified, unless the default is configured. -->
+ <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
+ skeleton="${common.dir}/core/src/data/jflex/skeleton.default"/>
</sequential>
</macrodef>
@@ -2653,20 +2657,13 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
- <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
<!-- LUCENE-5897: Disallow scanner buffer expansion -->
- <replaceregexp file="@{dir}/@{name}.java"
- match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
- replace="" flags="s" />
+ <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
+ skeleton="${common.dir}/core/src/data/jflex/skeleton.disable.buffer.expansion.txt"/>
+ <!-- Since the ZZ_BUFFERSIZE declaration is generated rather than in the skeleton, we have to transform it here. -->
<replaceregexp file="@{dir}/@{name}.java"
match="private static final int ZZ_BUFFERSIZE ="
replace="private int ZZ_BUFFERSIZE ="/>
- <replaceregexp file="@{dir}/@{name}.java"
- match="int requested = zzBuffer.length - zzEndRead;"
- replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
- <replaceregexp file="@{dir}/@{name}.java"
- match="(zzFinalHighSurrogate = 1;)(\r?\n)"
- replace="\1\2 if (totalRead == 1) { return true; }\2"/>
</sequential>
</macrodef>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex b/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
new file mode 100644
index 0000000..c631dee
--- /dev/null
+++ b/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file was automatically generated by getUnicodeEmojiProperties.pl
+// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
+
+Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{
2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
+Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
+Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
+Extended_Pictographic = [\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{2388}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2605}\u{2607}-\u{2612}\u{2614}-\u{2685}\u{2690}-\u{2705}\u{2708}-\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2767}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F000}-\u{1F0FF}\u{1F10D}-\u{1F10F}\u{1F12F}\u{1F16C}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AD}-\u{1F1E5}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F3FA}\u{1F400}-\u{1F53D}\u{1F546}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{1F774}-\u{1F77F}\u{1F7D5}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE
}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1FFFD}]
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl b/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
new file mode 100644
index 0000000..e818b64
--- /dev/null
+++ b/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
@@ -0,0 +1,168 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use warnings;
+use strict;
+use File::Spec;
+use Getopt::Long;
+use LWP::UserAgent;
+
+my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
+
+# The version is interpolated verbatim into the download URL, so it must be
+# exactly of the form X.Y.  The pattern is anchored: an unanchored match would
+# also accept values such as "v9.0" or "9.0/../x" that merely contain a
+# version-like substring.
+my $version = '';
+unless (GetOptions("version=s" => \$version) && $version =~ /\A\d+\.\d+\z/) {
+  print STDERR "Usage: $script_name -v <version>\n";
+  # Only explain the expected format when a (malformed) version was supplied.
+  print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
+    if ($version);
+  exit 1;
+}
+# Location of the emoji property data for the requested emoji version.
+my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
+# Name of the generated JFlex include file; it is written next to this script.
+my $output_filename = "UnicodeEmojiProperties.jflex";
+# License and provenance banner emitted at the top of the generated file.
+# The heredoc interpolates the script name and the download URL.
+my $header =<<"__HEADER__";
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file was automatically generated by ${script_name}
+// from: ${emoji_data_url}
+
+__HEADER__
+
+# Property name => array ref of alternating (start, end) code points; filled in by the parser.
+my $property_ranges = {};
+# Only these properties are extracted from the data file.
+my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
+
+parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
+
+# Build the output path from this script's own volume and directory.
+my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
+output_jflex_include_file($output_path, $property_ranges);
+
+
+# sub parse_emoji_data_file
+#
+# Downloads and parses the emoji-data.txt file, extracting the code point
+# ranges listed for each wanted property.  Every data line of a wanted property
+# is used; the age given in a line's trailing comment is not consulted.
+# A range that touches or overlaps the previously recorded range of the same
+# property is merged into it.
+#
+# Parameters:
+#
+# - Emoji data file URL
+# - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
+#   (populated by this sub)
+# - Reference to hash of wanted property names
+#
+sub parse_emoji_data_file {
+  my ($url, $prop_ranges, $wanted_props) = @_;
+  my $content = get_URL_content($url);
+  print STDERR "Parsing '$url'...";
+  for (split /\r?\n/, $content) {
+    ## 231A..231B ; Emoji_Presentation # 1.1 [2] (⌚..⌛) watch..hourglass done
+    ## 1F9C0 ; Emoji_Presentation # 8.0 [1] (🧀) cheese wedge
+    ## 1FA00..1FA5F ; Extended_Pictographic# NA [96] (🨀️..️) <reserved-1FA00>..<reserved-1FA5F>
+    # Code points are written with 4 to 6 hex digits (up to 10FFFF); lines that
+    # do not look like data (comments, blanks) are skipped silently.
+    if (my ($start, $end, $prop) = /^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*([^\s#]+)/) {
+      next unless defined($wanted_props->{$prop}); # Skip unless we want ranges for this property
+
+      $prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
+      $end = $start unless defined($end); # Single code point: degenerate range
+      my $start_dec = hex $start;
+      my $end_dec = hex $end;
+      my $ranges = $prop_ranges->{$prop};
+      if (scalar(@$ranges) == 0 || $start_dec > $ranges->[-1] + 1) {
+        # Can't merge range with previous range: start a new one
+        push @$ranges, $start_dec, $end_dec;
+      } elsif ($end_dec > $ranges->[-1]) {
+        # Adjacent or overlapping: extend the previous range, never shrink it
+        $ranges->[-1] = $end_dec;
+      }
+    }
+  }
+  print STDERR "done.\n";
+}
+
+# sub get_URL_content
+#
+# Issues an HTTP GET for the given URL and returns the response content.
+# Progress is reported on STDERR; if the request is not successful the
+# status line is printed and the script exits with status 1.
+#
+# Parameter:
+#
+# - URL to get content for
+#
+sub get_URL_content {
+  my ($url) = @_;
+  print STDERR "Retrieving '$url'...";
+  my $response = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
+  if (not $response->is_success) {
+    print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
+    exit 1;
+  }
+  print STDERR "done.\n";
+  return $response->content;
+}
+
+
+# sub output_jflex_include_file
+#
+# Writes the JFlex include file: the file-level $header banner, followed by one
+# "Property = [ranges]" character class definition per property, in sorted
+# property name order.  Dies if the file cannot be opened or closed.
+#
+# Parameters:
+#
+# - Output path
+# - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
+#
+sub output_jflex_include_file {
+  my ($path, $prop_ranges) = @_;
+  # Low-precedence "or" is required here: with "||" the die would bind to the
+  # (always true) filename argument and an open failure would go unnoticed.
+  open(my $out, '>', $path)
+    or die "Error opening '$path' for writing: $!";
+
+  print STDERR "Writing '$path'...";
+
+  print {$out} $header;
+
+  for my $prop (sort keys %$prop_ranges) {
+    my $ranges = $prop_ranges->{$prop};
+    print {$out} "$prop = [";
+    for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
+      printf {$out} "\\u{%X}", $ranges->[$index];
+      # Emit "-end" only for real ranges; a single code point stands alone.
+      printf {$out} "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
+    }
+    print {$out} "]\n";
+  }
+
+  print {$out} "\n";
+  # Buffered write errors only surface on close, so check it.
+  close($out)
+    or die "Error closing '$path': $!";
+  print STDERR "done.\n";
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/skeleton.default
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/skeleton.default b/lucene/core/src/data/jflex/skeleton.default
new file mode 100644
index 0000000..9e08fbb
--- /dev/null
+++ b/lucene/core/src/data/jflex/skeleton.default
@@ -0,0 +1,342 @@
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+--- private static final int ZZ_BUFFERSIZE = ...;
+
+ /** lexical states */
+--- lexical states, charmap
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unknown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+--- isFinal list
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true iff the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead.
+ * When a lead/high surrogate has been read from the input stream
+ * into the final zzBuffer position, this will have a value of 1;
+ * otherwise, it will have a value of 0.
+ */
+ private int zzFinalHighSurrogate = 0;
+
+--- user class code
+
+--- constructor declaration
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input;
+ *         <code>true</code> when the reader signalled end of stream.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs, or if the
+ *            reader returns 0 characters
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ /* a parked high surrogate becomes part of the readable region again */
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzBuffer.length*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
+ }
+
+ /* fill the buffer with new input */
+ int requested = zzBuffer.length - zzEndRead;
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
+
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ /* If numRead == requested, we might have requested too few chars to
+ encode a full Unicode character. We assume that a Reader would
+ otherwise never return half characters. */
+ if (numRead == requested) {
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ /* park the lone high surrogate just beyond zzEndRead until the
+ rest of the character has been read */
+ --zzEndRead;
+ zzFinalHighSurrogate = 1;
+ }
+ }
+ /* potentially more input available */
+ return false;
+ }
+
+ /* numRead < 0 ==> end of stream */
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ *
+ * Marks the scanner as being at EOF and invalidates the buffer before
+ * closing the reader, if one is set.
+ *
+ * @exception java.io.IOException if closing the reader fails
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * <b>cannot</b> be reused (internal buffer is discarded and lost).
+ * Lexical state is set to <tt>YYINITIAL</tt>.
+ *
+ * Internal scan buffer is resized down to its initial length, if it has grown.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ zzFinalHighSurrogate = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ /* drop a grown buffer so the scanner starts again at its initial size */
+ if (zzBuffer.length > ZZ_BUFFERSIZE)
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ *
+ * @return the current lexical state
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ *
+ * @return a new String holding the buffer contents from zzStartRead
+ *         up to (but excluding) zzMarkedPos
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position <tt>pos</tt> from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster.
+ * No range check is made against the length of the matched text.
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ *
+ * @return the number of chars between zzStartRead and zzMarkedPos
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+--- zzScanError declaration
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+--- throws clause
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+--- yypushback decl (contains zzScanError exception)
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+--- zzDoEOF
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+--- yylex declaration
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+--- local declarations
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+--- start admin (line, char, col count)
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+--- start admin (lexstate etc)
+
+ zzForAction: {
+ while (true) {
+
+--- next input, line, col, char count, next transition, isFinal action
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+--- line count update
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+--- char count update
+
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+--- eofvalue
+ }
+ else {
+--- actions
+ default:
+--- no match
+ }
+ }
+ }
+ }
+
+--- main
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0e903cab/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
----------------------------------------------------------------------
diff --git a/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt b/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
new file mode 100644
index 0000000..a9dabcf
--- /dev/null
+++ b/lucene/core/src/data/jflex/skeleton.disable.buffer.expansion.txt
@@ -0,0 +1,348 @@
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+--- private static final int ZZ_BUFFERSIZE = ...;
+
+ /** lexical states */
+--- lexical states, charmap
+
+ /* error codes; each value is also the index of its text in ZZ_ERROR_MSG */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above, in the same order as the codes */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unknown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+--- isFinal list
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the text position at the last accepting state; the text returned by
+ yytext() ends just before this index */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead is the index just past the last character in the buffer that
+ has been read from input and is available for scanning; zzRefill()
+ stores new input starting at this index */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true iff the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead.
+ * When a lead/high surrogate has been read from the input stream
+ * into the final zzBuffer position, this will have a value of 1;
+ * otherwise, it will have a value of 0.
+ * While it is 1, the surrogate is stored at index zzEndRead itself.
+ */
+ private int zzFinalHighSurrogate = 0;
+
+--- user class code
+
+--- constructor declaration
+
+/* -------------------------------------------------------------------------------- */
+/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+
+ /**
+ * Refills the input buffer.
+ *
+ * Unlike a growing buffer, zzBuffer is never enlarged here: the method only
+ * shifts the unread text to the front of the buffer and then reads into
+ * whatever free space remains.
+ *
+ * <code>true</code> is returned when
+ * the buffer has no free space left,
+ * the reader is at end of stream,
+ * the only character read was a high surrogate placed in the last buffer
+ * position, or
+ * the stream ended directly after a high surrogate.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ zzEndRead += zzFinalHighSurrogate; // count a held-back high surrogate again so it is moved with the rest
+ zzFinalHighSurrogate = 0;
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+
+ /* fill the buffer with new input */
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate; // free positions, not counting a held-back high surrogate
+ if (requested == 0) { // buffer is full and is not expanded by this method
+ return true;
+ }
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
+
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
+ --zzEndRead; // keep the high surrogate stored, but outside the readable range
+ zzFinalHighSurrogate = 1;
+ if (numRead == 1) { // that surrogate was the only char read: nothing new to scan
+ return true;
+ }
+ } else { // There is room in the buffer for at least one more char
+ int c = zzReader.read(); // Expecting to read a low surrogate char
+ if (c == -1) { // NOTE(review): zzEndRead has already advanced by numRead here, yet "no new input" is returned - confirm this is intended
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char)c;
+ return false;
+ }
+ }
+ }
+ /* potentially more input available */
+ return false;
+ }
+
+ /* numRead < 0 ==> end of stream */
+ return true;
+ }
+
+/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+/* ------------------------------------------------------------------------------ */
+
+ /**
+ * Marks the scanner as finished and closes the underlying reader, if any.
+ *
+ * @exception java.io.IOException if closing the reader fails
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzEndRead = zzStartRead; /* nothing left to scan in the buffer */
+ zzAtEOF = true; /* scanner is now at end of file */
+
+ if (zzReader == null) {
+ return;
+ }
+ zzReader.close();
+ }
+
+
+ /**
+ * Points the scanner at a new input stream and returns it to its initial
+ * state. The previous reader is not closed.
+ *
+ * Every position, counter and flag is cleared, so the old input stream
+ * <b>cannot</b> be reused (whatever was still buffered from it is lost).
+ * The lexical state becomes <tt>YYINITIAL</tt>.
+ *
+ * If the scan buffer is longer than ZZ_BUFFERSIZE it is replaced by a new
+ * buffer of that size.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+
+ // end-of-input and beginning-of-line flags
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzAtBOL = true;
+
+ // buffer positions
+ zzStartRead = 0;
+ zzEndRead = 0;
+ zzMarkedPos = 0;
+ zzCurrentPos = 0;
+ zzFinalHighSurrogate = 0;
+
+ // line, character and column counters
+ yycolumn = 0;
+ yychar = 0;
+ yyline = 0;
+
+ zzLexicalState = YYINITIAL;
+
+ if (zzBuffer.length > ZZ_BUFFERSIZE) {
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+ }
+
+
+ /**
+ * Reports which lexical state the scanner is in.
+ *
+ * @return the current lexical state
+ */
+ public final int yystate() {
+ return this.zzLexicalState;
+ }
+
+
+ /**
+ * Switches the scanner to another lexical state.
+ *
+ * @param newState the lexical state to enter
+ */
+ public final void yybegin(int newState) {
+ this.zzLexicalState = newState;
+ }
+
+
+ /**
+ * Copies the text matched by the current regular expression out of the
+ * scan buffer.
+ *
+ * @return the matched text as a new String
+ */
+ public final String yytext() {
+ final int matchLength = zzMarkedPos - zzStartRead;
+ return new String(zzBuffer, zzStartRead, matchLength);
+ }
+
+
+ /**
+ * Fetches a single character of the matched text straight from the scan
+ * buffer.
+ *
+ * The result is the same as yytext().charAt(pos), without building the
+ * intermediate String.
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ final int bufferIndex = zzStartRead + pos;
+ return zzBuffer[bufferIndex];
+ }
+
+
+ /**
+ * Computes how many chars the matched text region spans.
+ *
+ * @return the distance from the start of the match to the marked position
+ */
+ public final int yylength() {
+ final int matchStart = zzStartRead;
+ return zzMarkedPos - matchStart;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a well-formed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display;
+ * it is used as an index into ZZ_ERROR_MSG
+ */
+--- zzScanError declaration
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; // code outside the table: fall back to the generic message
+ }
+
+--- throws clause
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ * A larger value is reported through zzScanError
+ * with ZZ_PUSHBACK_2BIG.
+ */
+--- yypushback decl (contains zzScanError exception)
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number; // shortens the match; the scanning method starts its next match at zzMarkedPos
+ }
+
+
+--- zzDoEOF
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * The lines starting with <tt>---</tt> inside this method are skeleton
+ * section markers, not Java. Presumably the generator emits the code each
+ * marker names at that position, so the Java fragments between the markers
+ * must stay in this order.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+--- yylex declaration
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+--- local declarations
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+--- start admin (line, char, col count)
+ zzAction = -1; // presumably the "nothing accepted yet" value for this match attempt
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; // the new match starts at the current marked position
+
+--- start admin (lexstate etc)
+
+ zzForAction: {
+ while (true) {
+
+--- next input, line, col, char count, next transition, isFinal action
+ zzAction = zzState; // remember the state just reached ...
+ zzMarkedPosL = zzCurrentPosL; // ... and the position reached with it
+--- line count update
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+--- char count update
+
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input, and the current position has not moved past the start of this match
+ zzAtEOF = true;
+--- eofvalue
+ }
+ else {
+--- actions
+ default:
+--- no match
+ }
+ }
+ }
+ }
+
+--- main
+
+}