You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/14 22:51:16 UTC
[04/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
new file mode 100644
index 0000000..5d7b240
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -0,0 +1,823 @@
+/* The following code was generated by JFlex 1.6.0 */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation
+ * algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
+ * <li><NUM>: A number</li>
+ * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
+ * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
+ * <li><HIRAGANA>: A single hiragana character</li>
+ * <li><KATAKANA>: A sequence of katakana characters</li>
+ * <li><HANGUL>: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+
+public final class StandardTokenizerImpl {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private int ZZ_BUFFERSIZE = 255;
+
+ /** lexical states */
+ public static final int YYINITIAL = 0;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
+ "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
+ "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
+ "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
+ "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
+ "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
+ "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
+ "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
+ "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
+ "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
+ "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
+ "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
+ "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
+ "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
+ "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
+ "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
+ "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
+ "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
+ "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
+ "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
+ "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
+ "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
+ "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
+ "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
+ "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
+ "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
+ "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
+ "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
+ "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
+ "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
+ "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
+ "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
+ "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
+ "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
+ "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
+ "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
+ "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
+ "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
+ "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
+ "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
+ "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
+ "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
+ "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
+ "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
+ "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
+ "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
+ "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
+ "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
+ "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
+ "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
+ "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
+ "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
+ "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
+ "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
+ "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
+ "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
+ "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
+ "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
+ "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
+ "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
+ "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
+ "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
+ "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
+ "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
+ "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
+ "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
+ "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
+ "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
+ "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
+ "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
+ "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
+ "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
+ "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
+ "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
+ "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
+ "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
+ "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
+ "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
+ "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
+ "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
+ "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
+ "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
+ "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
+ "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
+ "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
+ "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
+ "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
+ "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
+ "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
+ "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
+ "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
+ "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
+ "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
+ "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
+ "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
+ "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
+ "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
+ "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
+ "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
+ "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
+ "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
+ "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
+ "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
+ "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
+ "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
+ "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
+ "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
+ "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
+ "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
+ "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
+ "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
+ "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
+ "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
+ "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
+ "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
+ "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
+ "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
+ "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
+ "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
+ "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
+ "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
+ "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
+ "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
+ "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
+ "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
+ "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
+ "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
+ "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
+ "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
+ "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
+ "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
+ "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
+ "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
+ "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
+ "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
+ "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
+ "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
+ "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
+ "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
+ "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
+ "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
+ "\1\4\1\0\2\2\2\0\1\1\1\0";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
+ "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
+ "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
+ "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
+ "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
+ "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
+ "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
+ "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
+ "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
+ "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
+ "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
+ "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
+ "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
+ "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
+ "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
+ "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
+ "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
+ "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
+ "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
+ "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
+ "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
+ "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
+ "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
+ "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
+ "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
+ "\1\30\1\15\14\0\1\30";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[396];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
+ "\2\1\2\0\1\1\1\0";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[24];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead.
+ * When a lead/high surrogate has been read from the input stream
+ * into the final zzBuffer position, this will have a value of 1;
+ * otherwise, it will have a value of 0.
+ */
+ private int zzFinalHighSurrogate = 0;
+
+ /* user code: */
+ /** Alphanumeric sequences */
+ public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+
+ /** Numbers */
+ public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+
+ /**
+ * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+ * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
+ * together as a single token rather than broken up, because the logic
+ * required to break them at word boundaries is too complex for UAX#29.
+ * <p>
+ * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+ */
+ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+
+ /** Hiragana token type */
+ public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+
+ /** Katakana token type */
+ public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+ /** Hangul token type */
+ public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Character count processed so far */
+ public final int yychar()
+ {
+ return yychar;
+ }
+
+ /**
+ * Fills CharTermAttribute with the current token text.
+ */
+ public final void getText(CharTermAttribute t) {
+ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+ }
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
+
+
+ /**
+ * Creates a new scanner
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public StandardTokenizerImpl(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x110000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 2836) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+
+ /* fill the buffer with new input */
+ int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
+ int totalRead = 0;
+ while (totalRead < requested) {
+ int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
+ if (numRead == -1) {
+ break;
+ }
+ totalRead += numRead;
+ }
+
+ if (totalRead > 0) {
+ zzEndRead += totalRead;
+ if (totalRead == requested) { /* possibly more input available */
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ --zzEndRead;
+ zzFinalHighSurrogate = 1;
+ if (totalRead == 1) { return true; }
+ }
+ }
+ return false;
+ }
+
+ // totalRead = 0: End of stream
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * <b>cannot</b> be reused (internal buffer is discarded and lost).
+ * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ *
+ * Internal scan buffer is resized down to its initial length, if it has grown.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ zzFinalHighSurrogate = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ if (zzBuffer.length > ZZ_BUFFERSIZE)
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position <tt>pos</tt> from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public int getNextToken() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar+= zzMarkedPosL-zzStartRead;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL) {
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
+ }
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 1:
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+ }
+ case 9: break;
+ case 2:
+ { return WORD_TYPE;
+ }
+ case 10: break;
+ case 3:
+ { return HANGUL_TYPE;
+ }
+ case 11: break;
+ case 4:
+ { return NUMERIC_TYPE;
+ }
+ case 12: break;
+ case 5:
+ { return KATAKANA_TYPE;
+ }
+ case 13: break;
+ case 6:
+ { return IDEOGRAPHIC_TYPE;
+ }
+ case 14: break;
+ case 7:
+ { return HIRAGANA_TYPE;
+ }
+ case 15: break;
+ case 8:
+ { return SOUTH_EAST_ASIAN_TYPE;
+ }
+ case 16: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ {
+ return YYEOF;
+ }
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
new file mode 100644
index 0000000..24c401d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation
+ * algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ * <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
+ * <li><NUM>: A number</li>
+ * <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
+ * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ * <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
+ * <li><HIRAGANA>: A single hiragana character</li>
+ * <li><KATAKANA>: A sequence of katakana characters</li>
+ * <li><HANGUL>: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+%%
+
+%unicode 6.3
+%integer
+%final
+%public
+%class StandardTokenizerImpl
+%function getNextToken
+%char
+%buffer 255
+
+// UAX#29 WB4. X (Extend | Format)* --> X
+//
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
+HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
+MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
+HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
+SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
+DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
+HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
+RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
+ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
+
+%{
+ /** Alphanumeric sequences */
+ public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+
+ /** Numbers */
+ public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+
+ /**
+ * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+ * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
+ * together as a single token rather than broken up, because the logic
+ * required to break them at word boundaries is too complex for UAX#29.
+ * <p>
+ * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+ */
+ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+
+ /** Hiragana token type */
+ public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+
+ /** Katakana token type */
+ public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+ /** Hangul token type */
+ public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+ /** Character count processed so far */
+ public final int yychar()
+ {
+ return yychar;
+ }
+
+ /**
+ * Fills CharTermAttribute with the current token text.
+ */
+ public final void getText(CharTermAttribute t) {
+ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+ }
+
+ /**
+ * Sets the scanner buffer size in chars
+ */
+ public final void setBufferSize(int numChars) {
+ ZZ_BUFFERSIZE = numChars;
+ char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+ System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+ zzBuffer = newZzBuffer;
+ }
+%}
+
+%%
+
+// UAX#29 WB1. sot ÷
+// WB2. ÷ eot
+//
+<<EOF>> { return YYEOF; }
+
+// UAX#29 WB8. Numeric × Numeric
+// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
+ { return NUMERIC_TYPE; }
+
+// subset of the below for typing purposes only!
+{HangulEx}+
+ { return HANGUL_TYPE; }
+
+{KatakanaEx}+
+ { return KATAKANA_TYPE; }
+
+// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+// WB7a. Hebrew_Letter × Single_Quote
+// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
+// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
+// WB9. (ALetter | Hebrew_Letter) × Numeric
+// WB10. Numeric × (ALetter | Hebrew_Letter)
+// WB13. Katakana × Katakana
+// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ )+
+ )
+({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+ | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+ | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
+ | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
+ )+
+ )
+)*
+{ExtendNumLetEx}*
+ { return WORD_TYPE; }
+
+
+// From UAX #29:
+//
+// [C]haracters with the Line_Break property values of Contingent_Break (CB),
+// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
+// boundary property values based on criteria outside of the scope of this
+// annex. That means that satisfactory treatment of languages like Chinese
+// or Thai requires special handling.
+//
+// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+// property: U+FFFC ( \ufffc ) OBJECT REPLACEMENT CHARACTER.
+//
+// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
+// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
+// Lao, etc.) are kept together. This grammar does the same below.
+//
+// See also the Unicode Line Breaking Algorithm:
+//
+// http://www.unicode.org/reports/tr14/#SA
+//
+{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
+
+// UAX#29 WB14. Any ÷ Any
+//
+{HanEx} { return IDEOGRAPHIC_TYPE; }
+{HiraganaEx} { return HIRAGANA_TYPE; }
+
+
+// UAX#29 WB3. CR × LF
+// WB3a. (Newline | CR | LF) ÷
+// WB3b. ÷ (Newline | CR | LF)
+// WB13c. Regional_Indicator × Regional_Indicator
+// WB14. Any ÷ Any
+//
+{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
+ { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
new file mode 100644
index 0000000..39ce8f9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Fast, general-purpose grammar-based tokenizer {@link org.apache.lucene.analysis.standard.StandardTokenizer}
+ * implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * Unlike <code>UAX29URLEmailTokenizer</code> from the analysis module, URLs and email addresses are
+ * <b>not</b> tokenized as single tokens, but are instead split up into
+ * tokens according to the UAX#29 word break rules.
+ * <br>
+ * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes
+ * {@link org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer},
+ * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter},
+ * {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
+ * and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ */
+
+package org.apache.lucene.analysis.standard;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 50d2482..368259a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -21,6 +21,7 @@ import java.io.PrintStream;
import java.util.EnumSet;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.Sort;
@@ -121,7 +122,21 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
}
/**
- * Creates a new config that with the default {@link
+ * Creates a new config, using {@link StandardAnalyzer} as the
+ * analyzer. By default, {@link TieredMergePolicy} is used
+ * for merging;
+ * Note that {@link TieredMergePolicy} is free to select
+ * non-contiguous merges, which means docIDs may not
+ * remain monotonic over time. If this is a problem you
+ * should switch to {@link LogByteSizeMergePolicy} or
+ * {@link LogDocMergePolicy}.
+ */
+ public IndexWriterConfig() {
+ this(new StandardAnalyzer());
+ }
+
+ /**
+ * Creates a new config with the provided {@link
* Analyzer}. By default, {@link TieredMergePolicy} is used
* for merging;
* Note that {@link TieredMergePolicy} is free to select
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
index 3fda7c3..82281a9e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
@@ -62,7 +62,7 @@ final class Direct16 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
index aec9eaf..502aa3f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
@@ -62,7 +62,7 @@ final class Direct32 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
index b8e06b6..106f641 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
@@ -57,7 +57,7 @@ final class Direct64 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
index 81fc5a9..27986c0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
@@ -60,7 +60,7 @@ final class Direct8 extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
+ RamUsageEstimator.sizeOf(values);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
index 02f4e41..8e8e94d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
@@ -112,7 +112,7 @@ final class Packed16ThreeBlocks extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
index 85e7ea8..a7262b3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
@@ -61,7 +61,7 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
index 3ec6df0..5a85735 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
@@ -110,7 +110,7 @@ final class Packed8ThreeBlocks extends PackedInts.MutableImpl {
public long ramBytesUsed() {
return RamUsageEstimator.alignObjectSize(
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
- + 2 * Integer.BYTES // valueCount,bitsPerValue
+ + 2 * RamUsageEstimator.NUM_BYTES_INT // valueCount,bitsPerValue
+ RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
+ RamUsageEstimator.sizeOf(blocks);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/overview.html b/lucene/core/src/java/overview.html
index 9086cf9..b7112ac 100644
--- a/lucene/core/src/java/overview.html
+++ b/lucene/core/src/java/overview.html
@@ -78,7 +78,7 @@ to the output of a {@link org.apache.lucene.analysis.Tokenizer Tokenizer}.
Tokenizers and TokenFilters are strung together and applied with an {@link org.apache.lucene.analysis.Analyzer Analyzer}.
<a href="../analyzers-common/overview-summary.html">analyzers-common</a> provides a number of Analyzer implementations, including
<a href="../analyzers-common/org/apache/lucene/analysis/core/StopAnalyzer.html">StopAnalyzer</a>
-and the grammar-based <a href="../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
+and the grammar-based <a href="org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
<li>
<b>{@link org.apache.lucene.codecs}</b>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
new file mode 100644
index 0000000..2d63b66
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.util.*;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayMap extends LuceneTestCase {
+ public void doRandom(int iter, boolean ignoreCase) {
+ CharArrayMap<Integer> map = new CharArrayMap<>(1, ignoreCase);
+ HashMap<String,Integer> hmap = new HashMap<>();
+
+ char[] key;
+ for (int i=0; i<iter; i++) {
+ int len = random().nextInt(5);
+ key = new char[len];
+ for (int j=0; j<key.length; j++) {
+ key[j] = (char)random().nextInt(127);
+ }
+ String keyStr = new String(key);
+ String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr;
+
+ int val = random().nextInt();
+
+ Object o1 = map.put(key, val);
+ Object o2 = hmap.put(hmapKey,val);
+ assertEquals(o1,o2);
+
+ // add it again with the string method
+ assertEquals(val, map.put(keyStr,val).intValue());
+
+ assertEquals(val, map.get(key,0,key.length).intValue());
+ assertEquals(val, map.get(key).intValue());
+ assertEquals(val, map.get(keyStr).intValue());
+
+ assertEquals(hmap.size(), map.size());
+ }
+ }
+
+ public void testCharArrayMap() {
+ int num = 5 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < num; i++) { // pump this up for more random testing
+ doRandom(1000,false);
+ doRandom(1000,true);
+ }
+ }
+
+ public void testMethods() {
+ CharArrayMap<Integer> cm = new CharArrayMap<>(2, false);
+ HashMap<String,Integer> hm = new HashMap<>();
+ hm.put("foo",1);
+ hm.put("bar",2);
+ cm.putAll(hm);
+ assertEquals(hm.size(), cm.size());
+ hm.put("baz", 3);
+ cm.putAll(hm);
+ assertEquals(hm.size(), cm.size());
+
+ CharArraySet cs = cm.keySet();
+ int n=0;
+ for (Object o : cs) {
+ assertTrue(cm.containsKey(o));
+ char[] co = (char[]) o;
+ assertTrue(cm.containsKey(co, 0, co.length));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+ assertEquals(hm.size(), cs.size());
+ assertEquals(cm.size(), cs.size());
+ cs.clear();
+ assertEquals(0, cs.size());
+ assertEquals(0, cm.size());
+ // keySet() should not allow adding new keys
+ expectThrows(UnsupportedOperationException.class, () -> {
+ cs.add("test");
+ });
+
+ cm.putAll(hm);
+ assertEquals(hm.size(), cs.size());
+ assertEquals(cm.size(), cs.size());
+
+ Iterator<Map.Entry<Object,Integer>> iter1 = cm.entrySet().iterator();
+ n=0;
+ while (iter1.hasNext()) {
+ Map.Entry<Object,Integer> entry = iter1.next();
+ Object key = entry.getKey();
+ Integer val = entry.getValue();
+ assertEquals(cm.get(key), val);
+ entry.setValue(val*100);
+ assertEquals(val*100, (int)cm.get(key));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+ cm.clear();
+ cm.putAll(hm);
+ assertEquals(cm.size(), n);
+
+ CharArrayMap<Integer>.EntryIterator iter2 = cm.entrySet().iterator();
+ n=0;
+ while (iter2.hasNext()) {
+ char[] keyc = iter2.nextKey();
+ Integer val = iter2.currentValue();
+ assertEquals(hm.get(new String(keyc)), val);
+ iter2.setValue(val*100);
+ assertEquals(val*100, (int)cm.get(keyc));
+ n++;
+ }
+ assertEquals(hm.size(), n);
+
+ cm.entrySet().clear();
+ assertEquals(0, cm.size());
+ assertEquals(0, cm.entrySet().size());
+ assertTrue(cm.isEmpty());
+ }
+
+ // TODO: break this up into simpler test methods vs. "telling a story"
+ public void testModifyOnUnmodifiable(){
+ CharArrayMap<Integer> map = new CharArrayMap<>(2, false);
+ map.put("foo",1);
+ map.put("bar",2);
+ final int size = map.size();
+ assertEquals(2, size);
+ assertTrue(map.containsKey("foo"));
+ assertEquals(1, map.get("foo").intValue());
+ assertTrue(map.containsKey("bar"));
+ assertEquals(2, map.get("bar").intValue());
+
+ map = CharArrayMap.unmodifiableMap(map);
+ assertEquals("Map size changed due to unmodifiableMap call" , size, map.size());
+ String NOT_IN_MAP = "SirGallahad";
+ assertFalse("Test String already exists in map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String already exists in map", map.get(NOT_IN_MAP));
+
+ try{
+ map.put(NOT_IN_MAP.toCharArray(), 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put(NOT_IN_MAP, 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put(new StringBuilder(NOT_IN_MAP), 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.entrySet().clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.keySet().clear();
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.put((Object) NOT_IN_MAP, 3);
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ try{
+ map.putAll(Collections.singletonMap(NOT_IN_MAP, 3));
+ fail("Modified unmodifiable map");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+ assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+ assertEquals("Size of unmodifiable map has changed", size, map.size());
+ }
+
+ assertTrue(map.containsKey("foo"));
+ assertEquals(1, map.get("foo").intValue());
+ assertTrue(map.containsKey("bar"));
+ assertEquals(2, map.get("bar").intValue());
+ }
+
+ public void testToString() {
+ CharArrayMap<Integer> cm = new CharArrayMap<>(Collections.singletonMap("test",1), false);
+ assertEquals("[test]",cm.keySet().toString());
+ assertEquals("[1]",cm.values().toString());
+ assertEquals("[test=1]",cm.entrySet().toString());
+ assertEquals("{test=1}",cm.toString());
+ cm.put("test2", 2);
+ assertTrue(cm.keySet().toString().contains(", "));
+ assertTrue(cm.values().toString().contains(", "));
+ assertTrue(cm.entrySet().toString().contains(", "));
+ assertTrue(cm.toString().contains(", "));
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
new file mode 100644
index 0000000..465f512
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.*;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestCharArraySet extends LuceneTestCase {
+
+ static final String[] TEST_STOP_WORDS = {
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+
+
+ public void testRehash() throws Exception {
+ CharArraySet cas = new CharArraySet(0, true);
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ cas.add(TEST_STOP_WORDS[i]);
+ assertEquals(TEST_STOP_WORDS.length, cas.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertTrue(cas.contains(TEST_STOP_WORDS[i]));
+ }
+
+ public void testNonZeroOffset() {
+ String[] words={"Hello","World","this","is","a","test"};
+ char[] findme="xthisy".toCharArray();
+ CharArraySet set= new CharArraySet(10, true);
+ set.addAll(Arrays.asList(words));
+ assertTrue(set.contains(findme, 1, 4));
+ assertTrue(set.contains(new String(findme,1,4)));
+
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(findme, 1, 4));
+ assertTrue(set.contains(new String(findme,1,4)));
+ }
+
+ public void testObjectContains() {
+ CharArraySet set = new CharArraySet(10, true);
+ Integer val = Integer.valueOf(1);
+ set.add(val);
+ assertTrue(set.contains(val));
+ assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+ // test unmodifiable
+ set = CharArraySet.unmodifiableSet(set);
+ assertTrue(set.contains(val));
+ assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+ }
+
+ public void testClear(){
+ CharArraySet set=new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ set.clear();
+ assertEquals("not empty", 0, set.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertFalse(set.contains(TEST_STOP_WORDS[i]));
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+ for(int i=0;i<TEST_STOP_WORDS.length;i++)
+ assertTrue(set.contains(TEST_STOP_WORDS[i]));
+ }
+
+ // TODO: break this up into simpler test methods, vs "telling a story"
+ public void testModifyOnUnmodifiable(){
+ CharArraySet set=new CharArraySet(10, true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+ String NOT_IN_SET = "SirGallahad";
+ assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
+
+ try{
+ set.add(NOT_IN_SET.toCharArray());
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.add(new StringBuilder(NOT_IN_SET));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.clear();
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+ try{
+ set.add((Object) NOT_IN_SET);
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ // This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
+ // current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
+ // remove() on the iterator
+ try{
+ set.removeAll(new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.retainAll(new CharArraySet(Arrays.asList(NOT_IN_SET), true));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertEquals("Size of unmodifiable set has changed", size, set.size());
+ }
+
+ try{
+ set.addAll(Arrays.asList(NOT_IN_SET));
+ fail("Modified unmodifiable set");
+ }catch (UnsupportedOperationException e) {
+ // expected
+ assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+ }
+
+ for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
+ assertTrue(set.contains(TEST_STOP_WORDS[i]));
+ }
+ }
+
+ public void testUnmodifiableSet(){
+ CharArraySet set = new CharArraySet(10,true);
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+ set.add(Integer.valueOf(1));
+ final int size = set.size();
+ set = CharArraySet.unmodifiableSet(set);
+ assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+ for (String stopword : TEST_STOP_WORDS) {
+ assertTrue(set.contains(stopword));
+ }
+ assertTrue(set.contains(Integer.valueOf(1)));
+ assertTrue(set.contains("1"));
+ assertTrue(set.contains(new char[]{'1'}));
+
+ expectThrows(NullPointerException.class, () -> {
+ CharArraySet.unmodifiableSet(null);
+ });
+ }
+
+ public void testSupplementaryChars() {
+ String missing = "Term %s is missing in the set";
+ String falsePos = "Term %s is in the set but shouldn't";
+ // for reference see
+ // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
+ String[] upperArr = new String[] {"Abc\ud801\udc1c",
+ "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
+ String[] lowerArr = new String[] {"abc\ud801\udc44",
+ "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
+ CharArraySet set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), false);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ }
+
+ public void testSingleHighSurrogate() {
+ String missing = "Term %s is missing in the set";
+ String falsePos = "Term %s is in the set but shouldn't";
+ String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
+ "\uD800EfG", "\uD800\ud801\udc1cB" };
+
+ String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
+ "\uD800efg", "\uD800\ud801\udc44b" };
+ CharArraySet set = new CharArraySet(Arrays
+ .asList(TEST_STOP_WORDS), true);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+ }
+ set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS),
+ false);
+ for (String upper : upperArr) {
+ set.add(upper);
+ }
+ for (int i = 0; i < upperArr.length; i++) {
+ assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+ assertFalse(String.format(Locale.ROOT, falsePos, upperArr[i]), set
+ .contains(lowerArr[i]));
+ }
+ }
+
+ @SuppressWarnings("deprecated")
+ public void testCopyCharArraySetBWCompat() {
+ CharArraySet setIngoreCase = new CharArraySet(10, true);
+ CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setIngoreCase.add(Integer.valueOf(1));
+ setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setCaseSensitive.add(Integer.valueOf(1));
+
+ CharArraySet copy = CharArraySet.copy(setIngoreCase);
+ CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+ assertEquals(setIngoreCase.size(), copy.size());
+ assertEquals(setCaseSensitive.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copyCaseSens.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copyCaseSens.contains(string));
+ }
+ // test adding terms to the copy
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(setIngoreCase.contains(string));
+ assertFalse(setCaseSensitive.contains(string));
+
+ }
+ }
+
+ /**
+ * Test the static #copy() function with a CharArraySet as a source
+ */
+ public void testCopyCharArraySet() {
+ CharArraySet setIngoreCase = new CharArraySet(10, true);
+ CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setIngoreCase.add(Integer.valueOf(1));
+ setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+ setCaseSensitive.add(Integer.valueOf(1));
+
+ CharArraySet copy = CharArraySet.copy(setIngoreCase);
+ CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+ assertEquals(setIngoreCase.size(), copy.size());
+ assertEquals(setCaseSensitive.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copyCaseSens.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copyCaseSens.contains(string));
+ }
+ // test adding terms to the copy
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(stopwordsUpper));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(setIngoreCase.contains(string));
+ assertFalse(setCaseSensitive.contains(string));
+
+ }
+ }
+
+ /**
+ * Test the static #copy() function with a JDK {@link Set} as a source
+ */
+ public void testCopyJDKSet() {
+ Set<String> set = new HashSet<>();
+
+ List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+ List<String> stopwordsUpper = new ArrayList<>();
+ for (String string : stopwords) {
+ stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+ }
+ set.addAll(Arrays.asList(TEST_STOP_WORDS));
+
+ CharArraySet copy = CharArraySet.copy(set);
+
+ assertEquals(set.size(), copy.size());
+ assertEquals(set.size(), copy.size());
+
+ assertTrue(copy.containsAll(stopwords));
+ for (String string : stopwordsUpper) {
+ assertFalse(copy.contains(string));
+ }
+
+ List<String> newWords = new ArrayList<>();
+ for (String string : stopwords) {
+ newWords.add(string+"_1");
+ }
+ copy.addAll(newWords);
+
+ assertTrue(copy.containsAll(stopwords));
+ assertTrue(copy.containsAll(newWords));
+ // new added terms are not in the source set
+ for (String string : newWords) {
+ assertFalse(set.contains(string));
+ }
+ }
+
+ /**
+ * Tests a special case of {@link CharArraySet#copy(Set)} where the
+ * set to copy is the {@link CharArraySet#EMPTY_SET}
+ */
+ public void testCopyEmptySet() {
+ assertSame(CharArraySet.EMPTY_SET,
+ CharArraySet.copy(CharArraySet.EMPTY_SET));
+ }
+
+ /**
+ * Smoketests the static empty set
+ */
+ public void testEmptySet() {
+ assertEquals(0, CharArraySet.EMPTY_SET.size());
+
+ assertTrue(CharArraySet.EMPTY_SET.isEmpty());
+ for (String stopword : TEST_STOP_WORDS) {
+ assertFalse(CharArraySet.EMPTY_SET.contains(stopword));
+ }
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo"));
+ assertFalse(CharArraySet.EMPTY_SET.contains((Object) "foo"));
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray()));
+ assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray(),0,3));
+ }
+
+ /**
+ * Test for NPE
+ */
+ public void testContainsWithNull() {
+ CharArraySet set = new CharArraySet(1, true);
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((char[]) null, 0, 10);
+ });
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((CharSequence) null);
+ });
+
+ expectThrows(NullPointerException.class, () -> {
+ set.contains((Object) null);
+ });
+ }
+
+ public void testToString() {
+ CharArraySet set = CharArraySet.copy(Collections.singleton("test"));
+ assertEquals("[test]", set.toString());
+ set.add("test2");
+ assertTrue(set.toString().contains(", "));
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
new file mode 100644
index 0000000..53b3f56
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Test;
+
+/**
+ * TestCase for the {@link CharacterUtils} class.
+ */
+public class TestCharacterUtils extends LuceneTestCase {
+
+ // Round-trips a random unicode string: chars -> code points -> chars,
+ // using independent random offsets (o1 source offset, o2 code-point
+ // offset, o3 restore offset) to exercise non-zero-offset handling.
+ public void testConversions() {
+ final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
+ final int[] buf = new int[orig.length];
+ final char[] restored = new char[buf.length];
+ final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
+ final int o2 = TestUtil.nextInt(random(), 0, o1);
+ final int o3 = TestUtil.nextInt(random(), 0, o1);
+ final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
+ final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
+ // the restored region must match the original region char-for-char
+ assertEquals(orig.length - o1, charCount);
+ assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
+ }
+
+ @Test
+ public void testNewCharacterBuffer() {
+ CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024);
+ assertEquals(1024, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ newCharacterBuffer = CharacterUtils.newCharacterBuffer(2);
+ assertEquals(2, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ // length must be >= 2 (presumably so a full surrogate pair always
+ // fits in the buffer -- TODO confirm against CharacterUtils javadoc)
+ expectThrows(IllegalArgumentException.class, () -> {
+ CharacterUtils.newCharacterBuffer(1);
+ });
+ }
+
+ // fill() on BMP-only input: the buffer is filled to capacity, and the
+ // remainder of the reader is delivered on the next (final) fill.
+ @Test
+ public void testFillNoHighSurrogate() throws IOException {
+ Reader reader = new StringReader("helloworld");
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
+ assertTrue(CharacterUtils.fill(buffer,reader));
+ assertEquals(0, buffer.getOffset());
+ assertEquals(6, buffer.getLength());
+ assertEquals("hellow", new String(buffer.getBuffer()));
+ // fill() returns false once the reader is exhausted, but the buffer
+ // still carries the trailing "orld" (length 4)
+ assertFalse(CharacterUtils.fill(buffer,reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals(0, buffer.getOffset());
+
+ assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer,reader));
+ }
+
+ // fill() with supplementary characters: \ud801\udc1c is a surrogate
+ // pair. The assertions show fill() never splits a pair across two
+ // fills -- a 5-char buffer yields only "1234" (length 4) when the pair
+ // would not fit whole. A lone/unpaired \ud801 is passed through as-is.
+ @Test
+ public void testFill() throws IOException {
+ String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
+ Reader reader = new StringReader(input);
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(5, buffer.getLength());
+ assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
+ assertTrue(CharacterUtils.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("123\ud801", new String(buffer.getBuffer(),
+ buffer.getOffset(), buffer.getLength()));
+ // final fill: reader exhausted (returns false) but the tail
+ // "\ud801\udc1c\ud801" is still delivered
+ assertFalse(CharacterUtils.fill(buffer, reader));
+ assertEquals(3, buffer.getLength());
+ assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer, reader));
+ assertEquals(0, buffer.getLength());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
new file mode 100644
index 0000000..c224682
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.English;
+
+public class TestStopFilter extends BaseTokenStreamTestCase {
+
+ // other StopFilter functionality is already tested by TestStopAnalyzer
+
+ // With ignoreCase == false, only exact-case matches are removed:
+ // "is" and "the" are dropped, but "The" and "Time" survive because the
+ // stop set holds "the" and "Time" respectively.
+ public void testExactCase() throws IOException {
+ StringReader reader = new StringReader("Now is The Time");
+ CharArraySet stopWords = new CharArraySet(asSet("is", "the", "Time"), false);
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ TokenStream stream = new StopFilter(in, stopWords);
+ assertTokenStreamContents(stream, new String[] { "Now", "The" });
+ }
+
+ // Same input via StopFilter.makeStopSet(String...); makeStopSet is
+ // case-sensitive here too ("The" and "Time" are kept).
+ public void testStopFilt() throws IOException {
+ StringReader reader = new StringReader("Now is The Time");
+ String[] stopWords = new String[] { "is", "the", "Time" };
+ CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ TokenStream stream = new StopFilter(in, stopSet);
+ assertTokenStreamContents(stream, new String[] { "Now", "The" });
+ }
+
+ /**
+ * Test Position increments applied by StopFilter with and without enabling this option.
+ * NOTE(review): method name carries a historical typo ("Positons");
+ * left as-is since this is a committed patch.
+ */
+ public void testStopPositons() throws IOException {
+ // Build "zero one two ... nineteen"; every word whose index is NOT a
+ // multiple of 3 becomes a stopword, so only words 0, 3, 6, ... survive.
+ StringBuilder sb = new StringBuilder();
+ ArrayList<String> a = new ArrayList<>();
+ for (int i=0; i<20; i++) {
+ String w = English.intToEnglish(i).trim();
+ sb.append(w).append(" ");
+ if (i%3 != 0) a.add(w);
+ }
+ log(sb.toString());
+ String stopWords[] = a.toArray(new String[0]);
+ for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
+ CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+ // with increments
+ StringReader reader = new StringReader(sb.toString());
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(reader);
+ StopFilter stpf = new StopFilter(in, stopSet);
+ doTestStopPositons(stpf);
+ // with increments, concatenating two stop filters
+ // Splitting the stop set across two chained StopFilters must yield
+ // the same position increments as a single filter with the full set.
+ ArrayList<String> a0 = new ArrayList<>();
+ ArrayList<String> a1 = new ArrayList<>();
+ for (int i=0; i<a.size(); i++) {
+ if (i%2==0) {
+ a0.add(a.get(i));
+ } else {
+ a1.add(a.get(i));
+ }
+ }
+ String stopWords0[] = a0.toArray(new String[0]);
+ for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
+ String stopWords1[] = a1.toArray(new String[0]);
+ for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
+ CharArraySet stopSet0 = StopFilter.makeStopSet(stopWords0);
+ CharArraySet stopSet1 = StopFilter.makeStopSet(stopWords1);
+ reader = new StringReader(sb.toString());
+ final MockTokenizer in1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in1.setReader(reader);
+ StopFilter stpf0 = new StopFilter(in1, stopSet0); // first part of the set
+ StopFilter stpf01 = new StopFilter(stpf0, stopSet1); // two stop filters concatenated!
+ doTestStopPositons(stpf01);
+ }
+
+ // LUCENE-3849: make sure after .end() we see the "ending" posInc
+ public void testEndStopword() throws Exception {
+ CharArraySet stopSet = StopFilter.makeStopSet("of");
+ final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ in.setReader(new StringReader("test of"));
+ StopFilter stpf = new StopFilter(in, stopSet);
+ // final offset 7 covers the removed trailing "of"; finalPosInc 1
+ // accounts for the stopped ending token
+ assertTokenStreamContents(stpf, new String[] { "test" },
+ new int[] {0},
+ new int[] {4},
+ null,
+ new int[] {1},
+ null,
+ 7,
+ 1,
+ null,
+ true);
+ }
+
+ // Drains the filter, expecting words 0, 3, 6, ... 18 with a position
+ // increment of 3 for each surviving token after the first (the two
+ // removed stopwords in between must be reflected in the increment).
+ private void doTestStopPositons(StopFilter stpf) throws IOException {
+ CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
+ stpf.reset();
+ for (int i=0; i<20; i+=3) {
+ assertTrue(stpf.incrementToken());
+ log("Token "+i+": "+stpf);
+ String w = English.intToEnglish(i).trim();
+ assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
+ assertEquals("all but first token must have position increment of 3",i==0?1:3,posIncrAtt.getPositionIncrement());
+ }
+ assertFalse(stpf.incrementToken());
+ stpf.end();
+ stpf.close();
+ }
+
+ // print debug info depending on VERBOSE
+ private static void log(String s) {
+ if (VERBOSE) {
+ System.out.println(s);
+ }
+ }
+
+ // stupid filter that inserts synonym of 'hte' for 'the'
+ // NOTE(review): not referenced by any test visible in this chunk, and
+ // it is a non-static inner class (holds a hidden reference to the
+ // enclosing test) -- confirm it is still needed before keeping it.
+ private class MockSynonymFilter extends TokenFilter {
+ State bufferedState; // captured state of the "the" token awaiting synonym emission
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ MockSynonymFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (bufferedState != null) {
+ // emit the buffered synonym at the same position (posInc 0)
+ restoreState(bufferedState);
+ posIncAtt.setPositionIncrement(0);
+ termAtt.setEmpty().append("hte");
+ bufferedState = null;
+ return true;
+ } else if (input.incrementToken()) {
+ if (termAtt.toString().equals("the")) {
+ bufferedState = captureState();
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ bufferedState = null;
+ }
+ }
+
+}