You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2019/01/08 18:35:19 UTC
[05/24] lucene-solr:master: LUCENE-8527: Upgrade JFlex to 1.7.0.
StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
index 8b288c2..a2ad394 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.6.0 */
+/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li><HIRAGANA>: A single hiragana character</li>
* <li><KATAKANA>: A sequence of katakana characters</li>
* <li><HANGUL>: A sequence of Hangul characters</li>
+ * <li><EMOJI>: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
@@ -65,147 +66,212 @@ public final class StandardTokenizerImpl {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
- "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
- "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
- "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
- "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
- "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
- "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
- "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
- "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
- "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
- "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
- "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
- "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
- "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
- "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
- "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
- "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
- "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
- "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
- "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
- "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
- "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
- "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
- "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
- "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
- "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
- "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
- "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
- "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
- "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
- "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
- "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
- "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
- "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
- "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
- "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
- "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
- "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
- "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
- "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
- "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
- "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
- "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
- "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
- "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
- "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
- "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
- "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
- "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
- "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
- "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
- "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
- "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
- "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
- "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
- "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
- "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
- "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
- "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
- "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
- "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
- "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
- "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
- "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
- "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
- "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
- "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
- "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
- "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
- "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
- "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
- "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
- "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
- "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
- "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
- "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
- "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
- "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
- "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
- "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
- "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
- "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
- "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
- "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
- "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
- "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
- "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
- "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
- "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
- "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
- "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
- "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
- "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
- "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
- "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
- "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
- "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
- "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
- "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
- "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
- "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
- "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
- "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
- "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
- "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
- "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
- "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
- "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
- "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
- "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
- "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
- "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
- "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
- "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
- "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
- "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
- "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
- "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
- "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
- "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
- "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
- "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
- "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
- "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
- "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
- "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
- "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
- "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
- "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
- "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
- "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
- "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
- "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
- "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
- "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
- "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
- "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
- "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
- "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
- "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
- "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
+ "\42\0\1\32\1\7\3\0\1\31\2\0\1\7\1\0\1\24\1\0"+
+ "\1\25\1\0\12\21\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
+ "\32\15\56\0\1\4\1\15\2\0\1\5\1\4\6\0\1\15\1\0"+
+ "\1\23\2\0\1\15\5\0\27\15\1\0\37\15\1\0\u01ca\15\4\0"+
+ "\14\15\5\0\1\23\10\0\5\15\7\0\1\15\1\0\1\15\21\0"+
+ "\160\5\5\15\1\0\2\15\2\0\4\15\1\24\1\15\6\0\1\15"+
+ "\1\23\3\15\1\0\1\15\1\0\24\15\1\0\123\15\1\0\213\15"+
+ "\1\0\7\5\246\15\1\0\46\15\2\0\1\15\7\0\47\15\1\0"+
+ "\1\24\7\0\55\5\1\0\1\5\1\0\2\5\1\0\2\5\1\0"+
+ "\1\5\10\0\33\33\5\0\3\33\1\15\1\23\13\0\6\5\6\0"+
+ "\2\24\2\0\13\5\1\0\1\5\3\0\53\15\25\5\12\20\1\0"+
+ "\1\20\1\24\1\0\2\15\1\5\143\15\1\0\1\15\10\5\1\0"+
+ "\6\5\2\15\2\5\1\0\4\5\2\15\12\20\3\15\2\0\1\15"+
+ "\17\0\1\5\1\15\1\5\36\15\33\5\2\0\131\15\13\5\1\15"+
+ "\16\0\12\20\41\15\11\5\2\15\2\0\1\24\1\0\1\15\5\0"+
+ "\26\15\4\5\1\15\11\5\1\15\3\5\1\15\5\5\22\0\31\15"+
+ "\3\5\104\0\25\15\1\0\10\15\26\0\60\5\66\15\3\5\1\15"+
+ "\22\5\1\15\7\5\12\15\2\5\2\0\12\20\1\0\20\15\3\5"+
+ "\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\1\15"+
+ "\3\0\4\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5"+
+ "\1\15\10\0\1\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20"+
+ "\2\15\17\0\3\5\1\0\6\15\4\0\2\15\2\0\26\15\1\0"+
+ "\7\15\1\0\2\15\1\0\2\15\1\0\2\15\2\0\1\5\1\0"+
+ "\5\5\4\0\2\5\2\0\3\5\3\0\1\5\7\0\4\15\1\0"+
+ "\1\15\7\0\12\20\2\5\3\15\1\5\13\0\3\5\1\0\11\15"+
+ "\1\0\3\15\1\0\26\15\1\0\7\15\1\0\2\15\1\0\5\15"+
+ "\2\0\1\5\1\15\10\5\1\0\3\5\1\0\3\5\2\0\1\15"+
+ "\17\0\2\15\2\5\2\0\12\20\11\0\1\15\7\0\3\5\1\0"+
+ "\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\2\15\1\0"+
+ "\5\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5\10\0"+
+ "\2\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20\1\0\1\15"+
+ "\20\0\1\5\1\15\1\0\6\15\3\0\3\15\1\0\4\15\3\0"+
+ "\2\15\1\0\1\15\1\0\2\15\3\0\2\15\3\0\3\15\3\0"+
+ "\14\15\4\0\5\5\3\0\3\5\1\0\4\5\2\0\1\15\6\0"+
+ "\1\5\16\0\12\20\20\0\4\5\1\0\10\15\1\0\3\15\1\0"+
+ "\27\15\1\0\20\15\3\0\1\15\7\5\1\0\3\5\1\0\4\5"+
+ "\7\0\2\5\1\0\3\15\5\0\2\15\2\5\2\0\12\20\20\0"+
+ "\1\15\3\5\1\0\10\15\1\0\3\15\1\0\27\15\1\0\12\15"+
+ "\1\0\5\15\2\0\1\5\1\15\7\5\1\0\3\5\1\0\4\5"+
+ "\7\0\2\5\7\0\1\15\1\0\2\15\2\5\2\0\12\20\1\0"+
+ "\2\15\16\0\3\5\1\0\10\15\1\0\3\15\1\0\51\15\2\0"+
+ "\1\15\7\5\1\0\3\5\1\0\4\5\1\15\5\0\3\15\1\5"+
+ "\7\0\3\15\2\5\2\0\12\20\12\0\6\15\2\0\2\5\1\0"+
+ "\22\15\3\0\30\15\1\0\11\15\1\0\1\15\2\0\7\15\3\0"+
+ "\1\5\4\0\6\5\1\0\1\5\1\0\10\5\6\0\12\20\2\0"+
+ "\2\5\15\0\60\34\1\35\2\34\7\35\5\0\7\34\10\35\1\0"+
+ "\12\20\47\0\2\34\1\0\1\34\2\0\2\34\1\0\1\34\2\0"+
+ "\1\34\6\0\4\34\1\0\7\34\1\0\3\34\1\0\1\34\1\0"+
+ "\1\34\2\0\2\34\1\0\4\34\1\35\2\34\6\35\1\0\2\35"+
+ "\1\34\2\0\5\34\1\0\1\34\1\0\6\35\2\0\12\20\2\0"+
+ "\4\34\40\0\1\15\27\0\2\5\6\0\12\20\13\0\1\5\1\0"+
+ "\1\5\1\0\1\5\4\0\2\5\10\15\1\0\44\15\4\0\24\5"+
+ "\1\0\2\5\5\15\13\5\1\0\44\5\11\0\1\5\71\0\53\34"+
+ "\24\35\1\34\12\20\6\0\6\34\4\35\4\34\3\35\1\34\3\35"+
+ "\2\34\7\35\3\34\4\35\15\34\14\35\1\34\1\35\12\20\4\35"+
+ "\2\34\46\15\1\0\1\15\5\0\1\15\2\0\53\15\1\0\4\15"+
+ "\u0100\17\111\15\1\0\4\15\2\0\7\15\1\0\1\15\1\0\4\15"+
+ "\2\0\51\15\1\0\4\15\2\0\41\15\1\0\4\15\2\0\7\15"+
+ "\1\0\1\15\1\0\4\15\2\0\17\15\1\0\71\15\1\0\4\15"+
+ "\2\0\103\15\2\0\3\5\40\0\20\15\20\0\126\15\2\0\6\15"+
+ "\3\0\u026c\15\2\0\21\15\1\0\32\15\5\0\113\15\3\0\13\15"+
+ "\7\0\15\15\1\0\4\15\3\5\13\0\22\15\3\5\13\0\22\15"+
+ "\2\5\14\0\15\15\1\0\3\15\1\0\2\5\14\0\64\34\40\35"+
+ "\3\0\1\34\4\0\1\34\1\35\2\0\12\20\41\0\4\5\1\0"+
+ "\12\20\6\0\130\15\10\0\5\15\2\5\42\15\1\5\1\15\5\0"+
+ "\106\15\12\0\37\15\1\0\14\5\4\0\14\5\12\0\12\20\36\34"+
+ "\2\0\5\34\13\0\54\34\4\0\32\34\6\0\12\20\1\34\3\0"+
+ "\2\34\40\0\27\15\5\5\4\0\65\34\12\35\1\0\35\35\2\0"+
+ "\1\5\12\20\6\0\12\20\6\0\16\34\2\0\17\5\101\0\5\5"+
+ "\57\15\21\5\7\15\4\0\12\20\21\0\11\5\14\0\3\5\36\15"+
+ "\15\5\2\15\12\20\54\15\16\5\14\0\44\15\24\5\10\0\12\20"+
+ "\3\0\3\15\12\20\44\15\2\0\11\15\107\0\3\5\1\0\25\5"+
+ "\4\15\1\5\4\15\3\5\2\15\1\0\2\5\6\0\300\15\66\5"+
+ "\5\0\5\5\u0116\15\2\0\6\15\2\0\46\15\2\0\6\15\2\0"+
+ "\10\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\37\15\2\0"+
+ "\65\15\1\0\7\15\1\0\1\15\3\0\3\15\1\0\7\15\3\0"+
+ "\4\15\2\0\6\15\4\0\15\15\5\0\3\15\1\0\7\15\17\0"+
+ "\1\5\1\12\2\5\10\0\2\25\12\0\1\25\2\0\1\23\2\0"+
+ "\5\5\1\26\14\0\1\4\2\0\2\26\3\0\1\24\4\0\1\4"+
+ "\12\0\1\26\13\0\5\5\1\0\12\5\1\0\1\15\15\0\1\15"+
+ "\20\0\15\15\63\0\23\5\1\10\15\5\21\0\1\15\4\0\1\15"+
+ "\2\0\12\15\1\0\1\15\3\0\5\15\4\0\1\4\1\0\1\15"+
+ "\1\0\1\15\1\0\1\15\1\0\4\15\1\0\12\15\1\16\2\0"+
+ "\4\15\5\0\5\15\4\0\1\15\21\0\51\15\13\0\6\4\17\0"+
+ "\2\4\u016f\0\2\4\14\0\1\4\137\0\1\4\106\0\1\4\31\0"+
+ "\13\4\4\0\3\4\273\0\14\15\1\16\47\15\300\0\2\4\12\0"+
+ "\1\4\11\0\1\4\72\0\4\4\1\0\5\4\1\4\1\0\7\4"+
+ "\1\4\2\4\1\4\1\4\1\0\2\4\2\4\1\4\4\4\1\3"+
+ "\2\4\1\4\1\4\2\4\2\4\1\4\3\4\1\4\3\4\2\4"+
+ "\10\4\3\4\5\4\1\4\1\4\1\4\5\4\14\4\13\4\2\4"+
+ "\2\4\1\4\1\4\2\4\1\4\1\4\22\4\1\4\2\4\2\4"+
+ "\6\4\12\0\2\4\6\4\1\4\1\4\1\4\2\4\3\4\2\4"+
+ "\10\4\2\4\4\4\2\4\13\4\2\4\5\4\2\4\2\4\1\4"+
+ "\5\4\2\4\1\4\1\4\1\4\2\4\24\4\2\4\5\4\6\4"+
+ "\1\4\2\4\1\3\1\4\2\4\1\4\4\4\1\4\2\4\1\4"+
+ "\2\0\2\4\4\3\1\4\1\4\2\4\1\4\1\0\1\4\1\0"+
+ "\1\4\6\0\1\4\3\0\1\4\6\0\1\4\12\0\2\4\17\0"+
+ "\1\4\2\0\1\4\4\0\1\4\1\0\1\4\4\0\3\4\1\0"+
+ "\1\4\13\0\2\4\3\4\55\0\3\4\11\0\1\4\16\0\1\4"+
+ "\16\0\1\4\u0174\0\2\4\u01cf\0\3\4\23\0\2\4\63\0\1\4"+
+ "\4\0\1\4\252\0\57\15\1\0\57\15\1\0\205\15\6\0\4\15"+
+ "\3\5\2\15\14\0\46\15\1\0\1\15\5\0\1\15\2\0\70\15"+
+ "\7\0\1\15\17\0\1\5\27\15\11\0\7\15\1\0\7\15\1\0"+
+ "\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0"+
+ "\7\15\1\0\40\5\57\0\1\15\120\0\32\27\1\0\131\27\14\0"+
+ "\326\27\57\0\1\15\1\0\1\27\31\0\11\27\6\5\1\4\5\22"+
+ "\2\0\3\27\1\15\1\15\1\4\3\0\126\30\2\0\2\5\2\22"+
+ "\3\30\133\22\1\0\4\22\5\0\51\15\3\0\136\17\21\0\33\15"+
+ "\65\0\20\22\227\0\1\4\1\0\1\4\66\0\57\22\1\0\130\22"+
+ "\250\0\u19b6\27\112\0\u51d6\27\52\0\u048d\15\103\0\56\15\2\0\u010d\15"+
+ "\3\0\20\15\12\20\2\15\24\0\57\15\4\5\1\0\12\5\1\0"+
+ "\37\15\2\5\120\15\2\5\45\0\11\15\2\0\147\15\2\0\44\15"+
+ "\1\0\10\15\77\0\13\15\1\5\3\15\1\5\4\15\1\5\27\15"+
+ "\5\5\30\0\64\15\14\0\2\5\62\15\22\5\12\0\12\20\6\0"+
+ "\22\5\6\15\3\0\1\15\1\0\1\15\2\0\12\20\34\15\10\5"+
+ "\2\0\27\15\15\5\14\0\35\17\3\0\4\5\57\15\16\5\16\0"+
+ "\1\15\12\20\6\0\5\34\1\35\12\34\12\20\5\34\1\0\51\15"+
+ "\16\5\11\0\3\15\1\5\10\15\2\5\2\0\12\20\6\0\33\34"+
+ "\3\35\62\34\1\35\1\34\3\35\2\34\2\35\5\34\2\35\1\34"+
+ "\1\35\1\34\30\0\5\34\13\15\5\5\2\0\3\15\2\5\12\0"+
+ "\6\15\2\0\6\15\2\0\6\15\11\0\7\15\1\0\7\15\1\0"+
+ "\53\15\1\0\12\15\12\0\163\15\10\5\1\0\2\5\2\0\12\20"+
+ "\6\0\u2ba4\17\14\0\27\17\4\0\61\17\u2104\0\u016e\27\2\0\152\27"+
+ "\46\0\7\15\14\0\5\15\5\0\1\33\1\5\12\33\1\0\15\33"+
+ "\1\0\5\33\1\0\1\33\1\0\2\33\1\0\2\33\1\0\12\33"+
+ "\142\15\41\0\u016b\15\22\0\100\15\2\0\66\15\50\0\14\15\4\0"+
+ "\16\5\1\6\1\11\1\24\2\0\1\23\1\24\13\0\20\5\3\0"+
+ "\2\26\30\0\3\26\1\24\1\0\1\25\1\0\1\24\1\23\32\0"+
+ "\5\15\1\0\207\15\2\0\1\5\7\0\1\25\4\0\1\24\1\0"+
+ "\1\25\1\0\12\20\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
+ "\32\15\13\0\70\22\2\5\37\17\3\0\6\17\2\0\6\17\2\0"+
+ "\6\17\2\0\3\17\34\0\3\5\4\0\14\15\1\0\32\15\1\0"+
+ "\23\15\1\0\2\15\1\0\17\15\2\0\16\15\42\0\173\15\105\0"+
+ "\65\15\210\0\1\5\202\0\35\15\3\0\61\15\17\0\1\5\37\0"+
+ "\40\15\20\0\33\15\5\0\46\15\5\5\5\0\36\15\2\0\44\15"+
+ "\4\0\10\15\1\0\5\15\52\0\236\15\2\0\12\20\6\0\44\15"+
+ "\4\0\44\15\4\0\50\15\10\0\64\15\234\0\u0137\15\11\0\26\15"+
+ "\12\0\10\15\230\0\6\15\2\0\1\15\1\0\54\15\1\0\2\15"+
+ "\3\0\1\15\2\0\27\15\12\0\27\15\11\0\37\15\101\0\23\15"+
+ "\1\0\2\15\12\0\26\15\12\0\32\15\106\0\70\15\6\0\2\15"+
+ "\100\0\1\15\3\5\1\0\2\5\5\0\4\5\4\15\1\0\3\15"+
+ "\1\0\33\15\4\0\3\5\4\0\1\5\40\0\35\15\3\0\35\15"+
+ "\43\0\10\15\1\0\34\15\2\5\31\0\66\15\12\0\26\15\12\0"+
+ "\23\15\15\0\22\15\156\0\111\15\67\0\63\15\15\0\63\15\u030d\0"+
+ "\3\5\65\15\17\5\37\0\12\20\17\0\4\5\55\15\13\5\2\0"+
+ "\1\5\22\0\31\15\7\0\12\20\6\0\3\5\44\15\16\5\1\0"+
+ "\12\20\20\0\43\15\1\5\2\0\1\15\11\0\3\5\60\15\16\5"+
+ "\4\15\5\0\3\5\3\0\12\20\1\15\1\0\1\15\43\0\22\15"+
+ "\1\0\31\15\14\5\6\0\1\5\101\0\7\15\1\0\1\15\1\0"+
+ "\4\15\1\0\17\15\1\0\12\15\7\0\57\15\14\5\5\0\12\20"+
+ "\6\0\4\5\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15"+
+ "\1\0\2\15\1\0\5\15\2\0\1\5\1\15\7\5\2\0\2\5"+
+ "\2\0\3\5\2\0\1\15\6\0\1\5\5\0\5\15\2\5\2\0"+
+ "\7\5\3\0\5\5\213\0\65\15\22\5\4\15\5\0\12\20\46\0"+
+ "\60\15\24\5\2\15\1\0\1\15\10\0\12\20\246\0\57\15\7\5"+
+ "\2\0\11\5\27\0\4\15\2\5\42\0\60\15\21\5\3\0\1\15"+
+ "\13\0\12\20\46\0\53\15\15\5\10\0\12\20\66\0\32\34\3\0"+
+ "\17\35\4\0\12\20\2\34\3\0\1\34\u0160\0\100\15\12\20\25\0"+
+ "\1\15\u01c0\0\71\15\u0107\0\11\15\1\0\45\15\10\5\1\0\10\5"+
+ "\1\15\17\0\12\20\30\0\36\15\2\0\26\5\1\0\16\5\u0349\0"+
+ "\u039a\15\146\0\157\15\21\0\304\15\u0abc\0\u042f\15\u0fd1\0\u0247\15\u21b9\0"+
+ "\u0239\15\7\0\37\15\1\0\12\20\146\0\36\15\2\0\5\5\13\0"+
+ "\60\15\7\5\11\0\4\15\14\0\12\20\11\0\25\15\5\0\23\15"+
+ "\u0370\0\105\15\13\0\1\15\56\5\20\0\4\5\15\15\100\0\1\15"+
+ "\u401f\0\1\22\1\30\u0bfe\0\153\15\5\0\15\15\3\0\11\15\7\0"+
+ "\12\15\3\0\2\5\1\0\4\5\u14c1\0\5\5\3\0\26\5\2\0"+
+ "\7\5\36\0\4\5\224\0\3\5\u01bb\0\125\15\1\0\107\15\1\0"+
+ "\2\15\2\0\1\15\2\0\2\15\2\0\4\15\1\0\14\15\1\0"+
+ "\1\15\1\0\7\15\1\0\101\15\1\0\4\15\2\0\10\15\1\0"+
+ "\7\15\1\0\34\15\1\0\4\15\1\0\5\15\1\0\1\15\3\0"+
+ "\7\15\1\0\u0154\15\2\0\31\15\1\0\31\15\1\0\37\15\1\0"+
+ "\31\15\1\0\37\15\1\0\31\15\1\0\37\15\1\0\31\15\1\0"+
+ "\37\15\1\0\31\15\1\0\10\15\2\0\62\20\u0200\0\67\5\4\0"+
+ "\62\5\10\0\1\5\16\0\1\5\26\0\5\5\1\0\17\5\u0550\0"+
+ "\7\5\1\0\21\5\2\0\7\5\1\0\2\5\1\0\5\5\u07d5\0"+
+ "\305\15\13\0\7\5\51\0\104\15\7\5\5\0\12\20\u04a6\0\4\15"+
+ "\1\0\33\15\1\0\2\15\1\0\1\15\2\0\1\15\1\0\12\15"+
+ "\1\0\4\15\1\0\1\15\1\0\1\15\6\0\1\15\4\0\1\15"+
+ "\1\0\1\15\1\0\1\15\1\0\3\15\1\0\2\15\1\0\1\15"+
+ "\2\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15"+
+ "\1\0\2\15\1\0\1\15\2\0\4\15\1\0\7\15\1\0\4\15"+
+ "\1\0\4\15\1\0\1\15\1\0\12\15\1\0\21\15\5\0\3\15"+
+ "\1\0\5\15\1\0\21\15\u0144\0\4\4\1\4\312\4\1\4\60\4"+
+ "\15\0\3\4\37\0\1\4\32\15\6\0\32\15\2\0\4\4\2\16"+
+ "\14\15\2\16\12\15\4\0\1\4\2\0\12\4\22\0\71\4\32\1"+
+ "\1\30\2\4\15\4\12\0\1\4\24\0\1\4\2\0\11\4\1\0"+
+ "\4\4\11\0\7\4\2\4\256\4\42\4\2\4\141\4\1\3\16\4"+
+ "\2\4\2\4\1\4\3\4\2\4\44\4\3\3\2\4\1\3\2\4"+
+ "\3\3\44\4\2\4\3\4\1\4\4\4\5\2\102\4\2\3\2\4"+
+ "\13\3\25\4\4\3\4\4\1\3\1\4\11\3\3\4\1\3\4\4"+
+ "\3\3\1\4\3\3\42\4\1\3\123\4\1\4\77\4\10\0\3\4"+
+ "\6\4\1\4\30\4\7\4\2\4\2\4\1\4\2\3\4\4\1\3"+
+ "\14\4\1\4\2\4\4\4\2\4\1\3\4\4\2\3\15\4\2\4"+
+ "\2\4\1\4\10\4\2\4\11\4\1\4\5\4\3\4\14\4\3\4"+
+ "\10\4\3\4\2\4\1\4\1\4\1\4\4\4\1\4\6\4\1\4"+
+ "\3\4\1\4\6\4\113\4\3\3\3\4\5\3\60\0\43\4\1\3"+
+ "\20\4\3\3\11\4\1\3\5\4\5\4\1\4\1\3\6\4\15\4"+
+ "\6\4\3\4\1\4\1\4\2\4\3\4\1\4\2\4\7\4\6\4"+
+ "\164\0\14\4\125\0\53\4\14\0\4\4\70\0\10\4\12\0\6\4"+
+ "\50\0\10\4\36\0\122\4\14\0\4\4\10\4\5\3\1\4\2\3"+
+ "\6\4\1\3\11\4\12\3\1\4\1\0\1\4\2\3\1\4\6\4"+
+ "\1\0\52\4\2\4\4\4\3\4\1\4\1\4\47\4\15\4\5\4"+
+ "\2\3\1\4\2\3\6\4\3\4\15\4\1\4\15\3\42\4\u05fe\4"+
+ "\2\0\ua6d7\27\51\0\u1035\27\13\0\336\27\2\0\u1682\27\u295e\0\u021e\27"+
+ "\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
+ "\1\5\36\0\137\13\1\14\200\0\360\5\uffff\0\uffff\0\ufe12\0";
/**
* Translates characters to character classes
@@ -218,12 +284,15 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
- "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
- "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
- "\1\4\1\0\2\2\2\0\1\1\1\0";
+ "\1\0\2\1\3\2\2\1\1\3\1\2\1\4\2\5"+
+ "\1\6\1\1\1\7\1\10\1\3\1\11\1\2\1\0"+
+ "\4\2\1\0\1\2\2\0\1\3\1\0\1\3\2\2"+
+ "\1\0\1\5\1\2\1\5\1\0\2\3\1\0\2\2"+
+ "\2\0\1\2\1\0\2\3\5\2\1\0\1\2\1\3"+
+ "\3\2";
private static int [] zzUnpackAction() {
- int [] result = new int[24];
+ int [] result = new int[61];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@@ -248,12 +317,17 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
- "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
- "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
- "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
+ "\0\0\0\36\0\74\0\132\0\170\0\226\0\264\0\322"+
+ "\0\360\0\u010e\0\u012c\0\u014a\0\u0168\0\u0186\0\u01a4\0\u01c2"+
+ "\0\u01e0\0\u01fe\0\u021c\0\u023a\0\74\0\u0258\0\u0276\0\u0294"+
+ "\0\u02b2\0\264\0\u02d0\0\u02ee\0\322\0\u030c\0\u032a\0\u0348"+
+ "\0\u0366\0\u0384\0\u03a2\0\u03c0\0\u03de\0\u03fc\0\u01a4\0\u041a"+
+ "\0\u0438\0\u0456\0\u0474\0\u0492\0\u04b0\0\u04ce\0\u04ec\0\u050a"+
+ "\0\u0528\0\u0546\0\u0564\0\u0582\0\u05a0\0\u05be\0\u05dc\0\u05fa"+
+ "\0\36\0\u0618\0\360\0\u0636\0\u0654";
private static int [] zzUnpackRowMap() {
- int [] result = new int[24];
+ int [] result = new int[61];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@@ -276,33 +350,94 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
- "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
- "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
- "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
- "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
- "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
- "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
- "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
- "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
- "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
- "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
- "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
- "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
- "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
- "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
- "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
- "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
- "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
- "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
- "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
- "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
- "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
- "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
- "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
- "\1\30\1\15\14\0\1\30";
+ "\1\2\1\3\1\4\1\5\1\6\2\2\1\7\2\2"+
+ "\1\10\2\2\1\11\1\12\1\13\1\14\1\15\1\16"+
+ "\3\2\1\17\1\20\1\21\2\2\1\22\2\23\37\0"+
+ "\1\24\3\0\2\25\1\0\5\25\20\0\1\25\5\0"+
+ "\1\4\2\0\1\4\1\0\1\26\2\4\20\0\1\4"+
+ "\2\0\1\4\2\0\1\5\2\0\1\5\1\27\1\30"+
+ "\2\5\20\0\1\5\5\0\1\6\2\0\1\6\1\27"+
+ "\1\31\2\6\20\0\1\6\5\0\1\32\2\0\1\33"+
+ "\1\34\3\32\20\0\1\32\3\0\1\5\1\6\5\0"+
+ "\1\35\3\0\1\6\24\0\2\11\1\0\10\11\2\36"+
+ "\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
+ "\1\22\1\0\1\11\5\0\1\12\1\11\1\0\1\12"+
+ "\1\41\1\42\2\12\3\11\2\36\1\0\1\37\1\0"+
+ "\1\37\1\40\2\0\1\37\1\0\1\22\1\0\1\12"+
+ "\5\0\2\13\1\0\5\13\2\11\1\13\2\36\1\0"+
+ "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+ "\1\0\1\13\5\0\2\14\1\0\5\14\3\11\2\14"+
+ "\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
+ "\1\14\5\0\1\15\1\14\1\0\1\45\1\46\3\15"+
+ "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+ "\1\22\1\0\1\15\5\0\2\16\1\0\5\16\5\0"+
+ "\1\16\3\0\1\40\6\0\1\16\5\0\2\47\1\0"+
+ "\5\47\3\11\2\14\1\50\3\0\1\47\4\0\1\22"+
+ "\1\0\1\47\5\0\2\20\1\0\5\20\20\0\1\20"+
+ "\5\0\2\21\1\0\5\21\20\0\1\21\5\0\2\22"+
+ "\1\0\5\22\3\11\2\36\1\0\1\37\1\0\1\37"+
+ "\1\40\2\0\1\51\1\52\1\22\1\0\1\22\5\0"+
+ "\2\23\1\0\5\23\17\0\2\23\5\0\2\24\1\0"+
+ "\5\24\20\0\1\24\2\0\1\4\1\53\1\54\1\4"+
+ "\2\0\1\4\1\0\1\26\2\4\1\0\1\54\16\0"+
+ "\1\4\12\0\1\55\1\56\24\0\1\4\1\53\1\54"+
+ "\1\5\2\0\1\5\1\27\1\30\2\5\1\0\1\54"+
+ "\16\0\1\5\2\0\1\4\1\53\1\54\1\6\2\0"+
+ "\1\6\1\27\1\31\2\6\1\0\1\54\16\0\1\6"+
+ "\5\0\1\33\2\0\1\33\1\34\3\33\20\0\1\33"+
+ "\10\0\1\57\32\0\2\36\1\0\5\36\3\11\2\36"+
+ "\2\0\2\60\1\40\2\0\1\60\1\0\1\22\1\0"+
+ "\1\36\5\0\2\37\1\0\5\37\3\11\13\0\1\11"+
+ "\1\0\1\37\5\0\2\40\1\0\5\40\3\11\2\36"+
+ "\1\50\3\0\1\40\4\0\1\22\1\0\1\40\5\0"+
+ "\2\11\1\0\2\11\1\61\1\62\4\11\2\36\1\0"+
+ "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+ "\1\0\1\11\2\0\1\4\1\53\1\54\1\12\1\11"+
+ "\1\0\1\12\1\41\1\42\2\12\1\11\1\63\1\11"+
+ "\2\36\1\0\1\37\1\0\1\37\1\40\2\0\1\37"+
+ "\1\0\1\22\1\0\1\12\5\0\2\43\1\0\5\43"+
+ "\3\0\2\14\13\0\1\43\5\0\2\44\1\0\5\44"+
+ "\3\11\2\14\1\50\3\0\1\44\4\0\1\22\1\0"+
+ "\1\44\5\0\1\45\1\14\1\0\1\45\1\46\3\45"+
+ "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+ "\1\22\1\0\1\45\5\0\2\14\1\0\1\64\4\14"+
+ "\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
+ "\1\22\1\0\1\14\5\0\2\50\1\0\5\50\5\0"+
+ "\1\50\3\0\1\40\6\0\1\50\5\0\2\51\1\0"+
+ "\5\51\3\11\2\36\4\0\1\40\4\0\1\22\1\0"+
+ "\1\51\5\0\2\52\1\0\5\52\16\0\1\51\1\0"+
+ "\1\52\2\0\1\4\2\0\1\53\2\0\1\53\1\65"+
+ "\1\66\2\53\20\0\1\53\5\0\1\54\2\0\1\54"+
+ "\1\65\1\67\2\54\20\0\1\54\2\0\1\4\1\53"+
+ "\1\54\5\0\1\70\3\0\1\54\32\0\1\56\1\71"+
+ "\26\0\1\57\2\0\1\57\1\0\3\57\20\0\1\57"+
+ "\5\0\2\60\1\0\5\60\3\0\2\36\13\0\1\60"+
+ "\2\0\1\4\1\53\1\54\2\11\1\0\2\11\1\72"+
+ "\3\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
+ "\1\40\2\0\1\37\1\0\1\22\1\0\1\11\5\0"+
+ "\2\11\1\0\3\11\1\62\1\73\3\11\2\36\1\0"+
+ "\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
+ "\1\0\1\11\5\0\1\63\1\11\1\0\1\63\1\74"+
+ "\1\75\2\63\3\11\2\36\1\0\1\37\1\0\1\37"+
+ "\1\40\2\0\1\37\1\0\1\22\1\0\1\63\5\0"+
+ "\1\64\1\14\1\0\1\64\1\14\3\64\3\11\2\14"+
+ "\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
+ "\1\64\12\0\1\55\25\0\1\4\1\53\1\54\1\53"+
+ "\2\0\1\53\1\65\1\66\2\53\1\0\1\54\16\0"+
+ "\1\53\2\0\1\4\1\53\2\54\2\0\1\54\1\65"+
+ "\1\67\2\54\1\0\1\54\16\0\1\54\3\0\1\53"+
+ "\1\54\5\0\1\70\3\0\1\54\22\0\1\53\1\54"+
+ "\2\11\1\0\2\11\1\72\3\11\1\63\1\11\2\36"+
+ "\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
+ "\1\22\1\0\1\11\5\0\2\11\1\0\2\11\1\61"+
+ "\5\11\2\36\1\0\1\37\1\0\1\37\1\40\2\0"+
+ "\1\37\1\0\1\22\1\0\1\11\2\0\1\4\1\53"+
+ "\1\54\1\63\1\11\1\0\1\63\1\74\1\75\2\63"+
+ "\1\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
+ "\1\40\2\0\1\37\1\0\1\22\1\0\1\63";
private static int [] zzUnpackTrans() {
- int [] result = new int[396];
+ int [] result = new int[1650];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@@ -329,7 +464,7 @@ public final class StandardTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
- "Unkown internal scanner error",
+ "Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@@ -340,11 +475,12 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
- "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
- "\2\1\2\0\1\1\1\0";
+ "\1\0\1\11\22\1\1\0\4\1\1\0\1\1\2\0"+
+ "\1\1\1\0\3\1\1\0\3\1\1\0\2\1\1\0"+
+ "\2\1\2\0\1\1\1\0\7\1\1\0\1\11\4\1";
private static int [] zzUnpackAttribute() {
- int [] result = new int[24];
+ int [] result = new int[61];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@@ -401,11 +537,11 @@ public final class StandardTokenizerImpl {
private int yycolumn;
/**
- * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ * zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
- /** zzAtEOF == true <=> the scanner is at the EOF */
+ /** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@@ -447,6 +583,9 @@ public final class StandardTokenizerImpl {
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Emoji token type */
+ public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@@ -492,7 +631,7 @@ public final class StandardTokenizerImpl {
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
- while (i < 2836) {
+ while (i < 4122) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -500,6 +639,8 @@ public final class StandardTokenizerImpl {
return map;
}
+/* -------------------------------------------------------------------------------- */
+/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/**
* Refills the input buffer.
@@ -527,32 +668,45 @@ public final class StandardTokenizerImpl {
/* fill the buffer with new input */
- int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
- int totalRead = 0;
- while (totalRead < requested) {
- int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
- if (numRead == -1) {
- break;
- }
- totalRead += numRead;
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
+ if (requested == 0) {
+ return true;
}
+ int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
- if (totalRead > 0) {
- zzEndRead += totalRead;
- if (totalRead == requested) { /* possibly more input available */
- if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ /* not supposed to occur according to specification of java.io.Reader */
+ if (numRead == 0) {
+ throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
+ }
+ if (numRead > 0) {
+ zzEndRead += numRead;
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
--zzEndRead;
zzFinalHighSurrogate = 1;
- if (totalRead == 1) { return true; }
+ if (numRead == 1) {
+ return true;
+ }
+ } else { // There is room in the buffer for at least one more char
+ int c = zzReader.read(); // Expecting to read a low surrogate char
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char)c;
+ return false;
+ }
}
}
+ /* potentially more input available */
return false;
}
- // totalRead = 0: End of stream
+ /* numRead < 0 ==> end of stream */
return true;
}
+/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
+/* ------------------------------------------------------------------------------ */
/**
* Closes the input stream.
@@ -773,49 +927,62 @@ public final class StandardTokenizerImpl {
// store back cached position
zzMarkedPos = zzMarkedPosL;
- switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 1:
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
- }
- case 9: break;
- case 2:
- { return WORD_TYPE;
- }
- case 10: break;
- case 3:
- { return HANGUL_TYPE;
- }
- case 11: break;
- case 4:
- { return NUMERIC_TYPE;
- }
- case 12: break;
- case 5:
- { return KATAKANA_TYPE;
- }
- case 13: break;
- case 6:
- { return IDEOGRAPHIC_TYPE;
- }
- case 14: break;
- case 7:
- { return HIRAGANA_TYPE;
- }
- case 15: break;
- case 8:
- { return SOUTH_EAST_ASIAN_TYPE;
- }
- case 16: break;
- default:
- if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
- zzAtEOF = true;
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
{
return YYEOF;
}
- }
- else {
+ }
+ else {
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */
+ }
+ // fall through
+ case 10: break;
+ case 2:
+ { return EMOJI_TYPE;
+ }
+ // fall through
+ case 11: break;
+ case 3:
+ { return WORD_TYPE;
+ }
+ // fall through
+ case 12: break;
+ case 4:
+ { return HANGUL_TYPE;
+ }
+ // fall through
+ case 13: break;
+ case 5:
+ { return NUMERIC_TYPE;
+ }
+ // fall through
+ case 14: break;
+ case 6:
+ { return KATAKANA_TYPE;
+ }
+ // fall through
+ case 15: break;
+ case 7:
+ { return IDEOGRAPHIC_TYPE;
+ }
+ // fall through
+ case 16: break;
+ case 8:
+ { return HIRAGANA_TYPE;
+ }
+ // fall through
+ case 17: break;
+ case 9:
+ { return SOUTH_EAST_ASIAN_TYPE;
+ }
+ // fall through
+ case 18: break;
+ default:
zzScanError(ZZ_NO_MATCH);
- }
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
index a1e7b17..e95a9b4 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -34,12 +34,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li><HIRAGANA>: A single hiragana character</li>
* <li><KATAKANA>: A sequence of katakana characters</li>
* <li><HANGUL>: A sequence of Hangul characters</li>
+ * <li><EMOJI>: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
-%unicode 6.3
+%unicode 9.0
%integer
%final
%public
@@ -48,22 +49,67 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char
%buffer 255
-// UAX#29 WB4. X (Extend | Format)* --> X
-//
-HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
-HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
-NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
-HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
-HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
-SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
-DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
-HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
-RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
-ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
+
+//////////////////////////////////////////////////////////////////////////
+// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
+
+// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
+%include ../../../../../../data/jflex/UnicodeEmojiProperties.jflex
+
+// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
+//
+// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
+// - are explicitly excluded here so that we can properly handle Emoji sequences.
+//
+ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
+
+KeyCapBaseChar = [0-9#*]
+KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
+KeyCap = \u20E3
+KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
+
+// \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
+AccidentalEmoji = [©®™\u3030\u303D]
+EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
+
+// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
+// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
+// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
+EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
+
+EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
+
+EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
+EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
+EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
+
+EmojiPresentationSelector = \uFE0F
+EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
+TagSpec = [\u{E0020}-\u{E007E}]
+TagTerm = \u{E007F}
+
+// End Emoji Macros
+//////////////////////////////////////////////////////////////////////////
+
+
+// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
+//
+ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
+
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
+AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
+NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
+KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
+MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
+MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
+ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
+HanEx = \p{Script:Han} {ExtFmtZwj}
+HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
+SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
+DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
+HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
+RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
+ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
%{
/** Alphanumeric sequences */
@@ -93,6 +139,9 @@ ComplexContextEx = \p{LB:Complex_Context}
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Emoji token type */
+ public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@@ -120,18 +169,64 @@ ComplexContextEx = \p{LB:Complex_Context}
%%
-// UAX#29 WB1. sot ÷
-// WB2. ÷ eot
+// UAX#29 WB1. sot ÷ Any
+// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
-// UAX#29 WB8. Numeric × Numeric
-// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
-// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
+// WB14. (E_Base | EBG) × E_Modifier
+// WB15. ^ (RI RI)* RI × RI
+// WB16. [^RI] (RI RI)* RI × RI
+//
+// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
+// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
+//
+// emoji_sequence :=
+// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
+// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
+// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
+// | emoji_presentation_sequence | \p{Emoji} \uFE0F
+// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
+// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
+// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
+//
+// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
+// | emoji_presentation_sequence | \p{Emoji} \uFE0F
+// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
+//
+// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
+// | emoji_presentation_sequence | \p{Emoji} \uFE0F
+// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
+// tag_spec [\u{E0020}-\u{E007E}]+
+// tag_term \u{E007F}
+//
+// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
+// WITHOUT \uFE0F (Emoji Presentation Selector), annotating them as "non-fully-qualified";
+// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
+// choose whether to support them for segmentation. This implementation will
+// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
+//
+// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
+// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
+//
+// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
+//
+// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+//
+ {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
+| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
+| {RegionalIndicatorEx}{2}
+ { return EMOJI_TYPE; }
+
+// UAX#29 WB8. Numeric × Numeric
+// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
+// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
+// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
-{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@@ -141,28 +236,28 @@ ComplexContextEx = \p{LB:Complex_Context}
{KatakanaEx}+
{ return KATAKANA_TYPE; }
-// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
-// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
-// WB7a. Hebrew_Letter × Single_Quote
-// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
-// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
-// WB9. (ALetter | Hebrew_Letter) × Numeric
-// WB10. Numeric × (ALetter | Hebrew_Letter)
-// WB13. Katakana × Katakana
-// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+// UAX#29 WB5. AHLetter × AHLetter
+// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
+// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
+// WB7a. Hebrew_Letter × Single_Quote
+// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
+// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
+// WB9. AHLetter × Numeric
+// WB10. Numeric × AHLetter
+// WB13. Katakana × Katakana
+// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
-({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
- | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
- | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
- | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
@@ -172,13 +267,13 @@ ComplexContextEx = \p{LB:Complex_Context}
// From UAX #29:
//
-// [C]haracters with the Line_Break property values of Contingent_Break (CB),
-// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
+// [C]haracters with the Line_Break property values of Contingent_Break (CB),
+// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
-// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -191,17 +286,14 @@ ComplexContextEx = \p{LB:Complex_Context}
//
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
-// UAX#29 WB14. Any ÷ Any
+// UAX#29 WB999. Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
-
-// UAX#29 WB3. CR × LF
-// WB3a. (Newline | CR | LF) ÷
-// WB3b. ÷ (Newline | CR | LF)
-// WB13c. Regional_Indicator × Regional_Indicator
-// WB14. Any ÷ Any
+// UAX#29 WB3. CR × LF
+// WB3a. (Newline | CR | LF) ÷
+// WB3b. ÷ (Newline | CR | LF)
+// WB999. Any ÷ Any
//
-{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
- { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/283b19a8/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 6abbc2b..615b565 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -18,8 +18,11 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -27,6 +30,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
@@ -282,7 +286,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
+ WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@@ -358,8 +362,80 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
}
-
-
+ /** simple emoji */
+ public void testEmoji() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
+ new String[] { "💩", "💩", "💩" },
+ new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+ }
+
+ /** emoji zwj sequence */
+ public void testEmojiSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
+ new String[] { "👩❤️👩" },
+ new String[] { "<EMOJI>" });
+ }
+
+ /** emoji zwj sequence with fitzpatrick modifier */
+ public void testEmojiSequenceWithModifier() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
+ new String[] { "👨🏼⚕️" },
+ new String[] { "<EMOJI>" });
+ }
+
+ /** regional indicator */
+ public void testEmojiRegionalIndicator() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
+ new String[] { "🇺🇸", "🇺🇸" },
+ new String[] { "<EMOJI>", "<EMOJI>" });
+ }
+
+ /** variation sequence */
+ public void testEmojiVariationSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+ new String[] { "#️⃣" },
+ new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+ new String[] { "3️⃣",},
+ new String[] { "<EMOJI>" });
+
+ // text presentation sequences
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
+ new String[] { },
+ new String[] { });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
+ new String[] { "3\uFE0E",},
+ new String[] { "<NUM>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
+ new String[] { "\u2B55",},
+ new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
+ new String[] { "\u2B55", "\u200D\u2B55"},
+ new String[] { "<EMOJI>", "<EMOJI>" });
+ }
+
+ public void testEmojiTagSequence() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
+ new String[] { "🏴" },
+ new String[] { "<EMOJI>" });
+ }
+
+ public void testEmojiTokenization() throws Exception {
+ // simple emoji around latin
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
+ new String[] { "poo", "💩", "poo" },
+ new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+ // simple emoji around non-latin
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
+ new String[] { "💩", "中", "國", "💩" },
+ new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+ }
+
+ public void testUnicodeEmojiTests() throws Exception {
+ EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
+ emojiTest.test(a);
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
@@ -416,4 +492,53 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
a.close();
}
+
+ public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
+ String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)
+
+ // Collect tokens with normal reader
+ StandardAnalyzer a = new StandardAnalyzer();
+ TokenStream ts = a.tokenStream("dummy", text);
+ List<String> tokens = new ArrayList<>();
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ tokens.add(termAtt.toString());
+ }
+ ts.end();
+ ts.close();
+
+ // Tokens from a spoon-feed reader should be the same as from a normal reader
+ // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
+ Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
+ ts = a.tokenStream("dummy", reader);
+ termAtt = ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
+ for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
+ assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
+ }
+ ts.end();
+ ts.close();
+ }
+}
+
+class SpoonFeedMaxCharsReaderWrapper extends Reader {
+ private final Reader in;
+ private final int maxChars;
+
+ public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
+ this.in = in;
+ this.maxChars = maxChars;
+ }
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+
+ /** Reads at most {@code maxChars} chars per call (fewer if {@code len} is smaller), delegating to the wrapped reader. */
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return in.read(cbuf, off, Math.min(maxChars, len));
+ }
}