Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/14 20:38:23 UTC

[04/12] lucene-solr:master: LUCENE-7318: graduate StandardAnalyzer and make it the default for IndexWriterConfig

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
new file mode 100644
index 0000000..5d7b240
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -0,0 +1,823 @@
+/* The following code was generated by JFlex 1.6.0 */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation 
+ * algorithm, as specified in 
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. 
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
+ *   <li>&lt;NUM&gt;: A number</li>
+ *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
+ *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
+ *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
+ *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
+ *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+
+public final class StandardTokenizerImpl {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private int ZZ_BUFFERSIZE = 255;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non-negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = { 
+     0, 0
+  };
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED = 
+    "\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
+    "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
+    "\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
+    "\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
+    "\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
+    "\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
+    "\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
+    "\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
+    "\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
+    "\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
+    "\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
+    "\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
+    "\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
+    "\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
+    "\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
+    "\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
+    "\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
+    "\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
+    "\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
+    "\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
+    "\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
+    "\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
+    "\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
+    "\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
+    "\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
+    "\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
+    "\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
+    "\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
+    "\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
+    "\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
+    "\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
+    "\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
+    "\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
+    "\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
+    "\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
+    "\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
+    "\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
+    "\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
+    "\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
+    "\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
+    "\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
+    "\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
+    "\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
+    "\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
+    "\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
+    "\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
+    "\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
+    "\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
+    "\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
+    "\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
+    "\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
+    "\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
+    "\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
+    "\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
+    "\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
+    "\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
+    "\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
+    "\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
+    "\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
+    "\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
+    "\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
+    "\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
+    "\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
+    "\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
+    "\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
+    "\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
+    "\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
+    "\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
+    "\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
+    "\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
+    "\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
+    "\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
+    "\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
+    "\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
+    "\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
+    "\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
+    "\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
+    "\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
+    "\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
+    "\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
+    "\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
+    "\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
+    "\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
+    "\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
+    "\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
+    "\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
+    "\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
+    "\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
+    "\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
+    "\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
+    "\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
+    "\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
+    "\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
+    "\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
+    "\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
+    "\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
+    "\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
+    "\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
+    "\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
+    "\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
+    "\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
+    "\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
+    "\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
+    "\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
+    "\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
+    "\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
+    "\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
+    "\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
+    "\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
+    "\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
+    "\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
+    "\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
+    "\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
+    "\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
+    "\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
+    "\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
+    "\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
+    "\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
+    "\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
+    "\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
+    "\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
+    "\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
+    "\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
+    "\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
+    "\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
+    "\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
+    "\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
+    "\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
+    "\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
+    "\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
+    "\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
+    "\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
+    "\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
+    "\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
+    "\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
+    "\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
+    "\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
+    "\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
+    "\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
+    "\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
+    "\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /** 
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
+    "\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
+    "\1\4\1\0\2\2\2\0\1\1\1\0";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[24];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
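+      /* run-length decoding: each packed pair is (count, value); the value
+         is written into the next `count` slots of the unpacked array */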
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /** 
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
+    "\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
+    "\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[24];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /** 
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
+    "\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
+    "\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
+    "\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
+    "\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
+    "\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
+    "\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
+    "\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
+    "\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
+    "\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
+    "\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
+    "\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
+    "\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
+    "\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
+    "\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
+    "\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
+    "\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
+    "\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
+    "\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
+    "\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
+    "\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
+    "\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
+    "\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
+    "\1\30\1\15\14\0\1\30";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[396];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
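+      /* transition values are stored off by one so that packed char \0
+         can represent -1 ("no transition") in the unpacked table */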
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
+    "\2\1\2\0\1\1\1\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[24];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the text position at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+  
+  /** 
+   * The number of occupied positions in zzBuffer beyond zzEndRead.
+   * When a lead/high surrogate has been read from the input stream
+   * into the final zzBuffer position, this will have a value of 1;
+   * otherwise, it will have a value of 0.
+   */
+  private int zzFinalHighSurrogate = 0;
+
+  /* user code: */
+  /** Alphanumeric sequences */
+  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+  
+  /** Numbers */
+  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+  
+  /**
+   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
+   * together as a single token rather than broken up, because the logic
+   * required to break them at word boundaries is too complex for UAX#29.
+   * <p>
+   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+   */
+  public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+  
+  /** Ideographic token type */
+  public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+  
+  /** Hiragana token type */
+  public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+  
+  /** Katakana token type */
+  public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+  /** Hangul token type */
+  public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+  /** Character count processed so far */
+  public final int yychar()
+  {
+    return yychar;
+  }
+
+  /**
+   * Fills CharTermAttribute with the current token text.
+   */
+  public final void getText(CharTermAttribute t) {
+    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+  }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
+
+
+  /**
+   * Creates a new scanner
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public StandardTokenizerImpl(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+
+  /** 
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
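+    /* 0x110000 == Character.MAX_CODE_POINT + 1: one map entry per Unicode code point */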
+    char [] map = new char[0x110000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 2836) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+
+    /* fill the buffer with new input */
+    int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;           
+    int totalRead = 0;
+    while (totalRead < requested) {
+      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
+      if (numRead == -1) {
+        break;
+      }
+      totalRead += numRead;
+    }
+
+    if (totalRead > 0) {
+      zzEndRead += totalRead;
+      if (totalRead == requested) { /* possibly more input available */
+        if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+          --zzEndRead;
+          zzFinalHighSurrogate = 1;
+          if (totalRead == 1) { return true; }
+        }
+      }
+      return false;
+    }
+
+    // totalRead = 0: End of stream
+    return true;
+  }
+
+    
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * Internal scan buffer is resized down to its initial length, if it has grown.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    zzFinalHighSurrogate = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+    if (zzBuffer.length > ZZ_BUFFERSIZE)
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a well-formed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public int getNextToken() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      yychar+= zzMarkedPosL-zzStartRead;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+
+
+      zzForAction: {
+        while (true) {
+    
+          if (zzCurrentPosL < zzEndReadL) {
+            zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+            zzCurrentPosL += Character.charCount(zzInput);
+          }
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+              zzCurrentPosL += Character.charCount(zzInput);
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 1: 
+          { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
+          }
+        case 9: break;
+        case 2: 
+          { return WORD_TYPE;
+          }
+        case 10: break;
+        case 3: 
+          { return HANGUL_TYPE;
+          }
+        case 11: break;
+        case 4: 
+          { return NUMERIC_TYPE;
+          }
+        case 12: break;
+        case 5: 
+          { return KATAKANA_TYPE;
+          }
+        case 13: break;
+        case 6: 
+          { return IDEOGRAPHIC_TYPE;
+          }
+        case 14: break;
+        case 7: 
+          { return HIRAGANA_TYPE;
+          }
+        case 15: break;
+        case 8: 
+          { return SOUTH_EAST_ASIAN_TYPE;
+          }
+        case 16: break;
+        default: 
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+              {
+                return YYEOF;
+              }
+          } 
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
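
For reference, a minimal sketch of how a caller can drive this generated scanner
(StandardTokenizer does essentially this internally). The class name ScannerDemo and
the sample input are illustrative only; CharTermAttributeImpl is the stock
implementation of the CharTermAttribute interface that getText() fills:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;

    public class ScannerDemo {
      public static void main(String[] args) throws Exception {
        StandardTokenizerImpl scanner =
            new StandardTokenizerImpl(new StringReader("Lucene 42"));
        CharTermAttribute termAtt = new CharTermAttributeImpl();
        int tokenType;
        // getNextToken() returns a token-type constant (WORD_TYPE, NUMERIC_TYPE,
        // ...) per match and YYEOF once the input is exhausted; ignored chars
        // such as whitespace are skipped inside the scanning loop.
        while ((tokenType = scanner.getNextToken()) != StandardTokenizerImpl.YYEOF) {
          scanner.getText(termAtt);   // copy the matched chars into the attribute
          System.out.println(tokenType + " -> " + termAtt);
        }
      }
    }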

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
new file mode 100644
index 0000000..24c401d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * This class implements Word Break rules from the Unicode Text Segmentation 
+ * algorithm, as specified in 
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. 
+ * <p>
+ * Tokens produced are of the following types:
+ * <ul>
+ *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
+ *   <li>&lt;NUM&gt;: A number</li>
+ *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
+ *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
+ *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
+ *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
+ *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
+ *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
+ * </ul>
+ */
+@SuppressWarnings("fallthrough")
+%%
+
+%unicode 6.3
+%integer
+%final
+%public
+%class StandardTokenizerImpl
+%function getNextToken
+%char
+%buffer 255
+
+// UAX#29 WB4. X (Extend | Format)* --> X
+//
+HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
+HebrewOrALetterEx   = [\p{WB:HebrewLetter}\p{WB:ALetter}]                       [\p{WB:Format}\p{WB:Extend}]*
+NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx          = \p{WB:Katakana}                                           [\p{WB:Format}\p{WB:Extend}]* 
+MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      [\p{WB:Format}\p{WB:Extend}]* 
+MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         [\p{WB:Format}\p{WB:Extend}]*
+ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       [\p{WB:Format}\p{WB:Extend}]*
+HanEx               = \p{Script:Han}                                            [\p{WB:Format}\p{WB:Extend}]*
+HiraganaEx          = \p{Script:Hiragana}                                       [\p{WB:Format}\p{WB:Extend}]*
+SingleQuoteEx       = \p{WB:Single_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
+DoubleQuoteEx       = \p{WB:Double_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
+HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      [\p{WB:Format}\p{WB:Extend}]*
+RegionalIndicatorEx = \p{WB:RegionalIndicator}                                  [\p{WB:Format}\p{WB:Extend}]*
+ComplexContextEx    = \p{LB:Complex_Context}                                    [\p{WB:Format}\p{WB:Extend}]*
+
+%{
+  /** Alphanumeric sequences */
+  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
+  
+  /** Numbers */
+  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
+  
+  /**
+   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
+   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
+   * together as a single token rather than broken up, because the logic
+   * required to break them at word boundaries is too complex for UAX#29.
+   * <p>
+   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
+   */
+  public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
+  
+  /** Ideographic token type */
+  public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
+  
+  /** Hiragana token type */
+  public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
+  
+  /** Katakana token type */
+  public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
+
+  /** Hangul token type */
+  public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
+
+  /** Character count processed so far */
+  public final int yychar()
+  {
+    return yychar;
+  }
+
+  /**
+   * Fills CharTermAttribute with the current token text.
+   */
+  public final void getText(CharTermAttribute t) {
+    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+  }
+  
+  /**
+   * Sets the scanner buffer size in chars
+   */
+   public final void setBufferSize(int numChars) {
+     ZZ_BUFFERSIZE = numChars;
+     char[] newZzBuffer = new char[ZZ_BUFFERSIZE];
+     System.arraycopy(zzBuffer, 0, newZzBuffer, 0, Math.min(zzBuffer.length, ZZ_BUFFERSIZE));
+     zzBuffer = newZzBuffer;
+   }
+%}
+
+%%
+
+// UAX#29 WB1.   sot   ÷
+//        WB2.     ÷   eot
+//
+<<EOF>> { return YYEOF; }
+
+// UAX#29 WB8.   Numeric × Numeric
+//        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}* 
+  { return NUMERIC_TYPE; }
+
+// subset of the below for typing purposes only!
+{HangulEx}+
+  { return HANGUL_TYPE; }
+  
+{KatakanaEx}+
+  { return KATAKANA_TYPE; }
+
+// UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+//        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+//        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+//        WB7a.  Hebrew_Letter × Single_Quote
+//        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
+//        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
+//        WB9.   (ALetter | Hebrew_Letter) × Numeric
+//        WB10.  Numeric × (ALetter | Hebrew_Letter)
+//        WB13.  Katakana × Katakana
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+//
+{ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
+                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+                     )+
+                   )
+({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                           )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}  {HebrewLetterEx}    )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx}         )*
+                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  ) {HebrewOrALetterEx} )*
+                     )+
+                   )
+)*
+{ExtendNumLetEx}* 
+  { return WORD_TYPE; }
+
+
+// From UAX #29:
+//
+//    [C]haracters with the Line_Break property values of Contingent_Break (CB), 
+//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word 
+//    boundary property values based on criteria outside of the scope of this
+//    annex.  That means that satisfactory treatment of languages like Chinese
+//    or Thai requires special handling.
+// 
+// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
+// property: U+FFFC ( \ufffc ) OBJECT REPLACEMENT CHARACTER.
+//
+// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
+// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
+// Lao, etc.) are kept together.  This grammar does the same below.
+//
+// See also the Unicode Line Breaking Algorithm:
+//
+//    http://www.unicode.org/reports/tr14/#SA
+//
+{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
+
+// UAX#29 WB14.  Any ÷ Any
+//
+{HanEx} { return IDEOGRAPHIC_TYPE; }
+{HiraganaEx} { return HIRAGANA_TYPE; }
+
+
+// UAX#29 WB3.   CR × LF
+//        WB3a.  (Newline | CR | LF) ÷
+//        WB3b.  ÷ (Newline | CR | LF)
+//        WB13c. Regional_Indicator × Regional_Indicator
+//        WB14.  Any ÷ Any
+//
+{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
+  { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
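
To see the grammar's effect end to end, a hedged sketch using the StandardTokenizer
generated from it (the class name WordBreakDemo and the sample input are
illustrative): per WB6/WB7 a word-internal apostrophe does not split a word, and per
WB11/WB12 internal separators do not split a number:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class WordBreakDemo {
      public static void main(String[] args) throws Exception {
        StandardTokenizer ts = new StandardTokenizer();
        ts.setReader(new StringReader("can't 1,234.56"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // expected: "can't" -> <ALPHANUM>, "1,234.56" -> <NUM>
          System.out.println(term + " -> " + type.type());
        }
        ts.end();
        ts.close();
      }
    }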

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
new file mode 100644
index 0000000..39ce8f9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/package-info.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Fast, general-purpose grammar-based tokenizer {@link org.apache.lucene.analysis.standard.StandardTokenizer}
+ * implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in 
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * Unlike <code>UAX29URLEmailTokenizer</code> in the analysis module, this tokenizer does
+ * <b>not</b> keep URLs and email addresses as single tokens; they are instead split into
+ * tokens according to the UAX#29 word break rules.
+ * <br>
+ * {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} includes
+ * {@link org.apache.lucene.analysis.standard.StandardTokenizer StandardTokenizer},
+ * {@link org.apache.lucene.analysis.standard.StandardFilter StandardFilter}, 
+ * {@link org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter}
+ * and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
+ */
+
+package org.apache.lucene.analysis.standard;
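
A hedged usage sketch of the chain this javadoc describes (the class name
AnalyzerDemo and the sample text are illustrative; whether stopwords are dropped
depends on the stop set the analyzer is constructed with):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class AnalyzerDemo {
      public static void main(String[] args) throws Exception {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "The Quick BROWN Fox")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            // StandardTokenizer splits, LowerCaseFilter lower-cases; with an
            // English stop set configured, StopFilter would also drop "the".
            System.out.println(term);
          }
          ts.end();
        }
      }
    }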

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 50d2482..368259a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -21,6 +21,7 @@ import java.io.PrintStream;
 import java.util.EnumSet;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
 import org.apache.lucene.search.Sort;
@@ -121,7 +122,21 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
   }
   
   /**
-   * Creates a new config that with the default {@link
+   * Creates a new config, using {@link StandardAnalyzer} as the
+   * analyzer.  By default, {@link TieredMergePolicy} is used
+   * for merging.
+   * Note that {@link TieredMergePolicy} is free to select
+   * non-contiguous merges, which means docIDs may not
+   * remain monotonic over time.  If this is a problem you
+   * should switch to {@link LogByteSizeMergePolicy} or
+   * {@link LogDocMergePolicy}.
+   */
+  public IndexWriterConfig() {
+    this(new StandardAnalyzer());
+  }
+  
+  /**
+   * Creates a new config with the provided {@link
    * Analyzer}. By default, {@link TieredMergePolicy} is used
   * for merging.
    * Note that {@link TieredMergePolicy} is free to select
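
The practical effect of the new constructor, as a short sketch (the class name,
directory choice, and field name are illustrative): an IndexWriter can now be opened
without naming an analyzer, and indexing goes through StandardAnalyzer:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class DefaultConfigDemo {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        // Previously an Analyzer had to be passed explicitly; the new no-arg
        // config supplies new StandardAnalyzer() itself.
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
          Document doc = new Document();
          doc.add(new TextField("body", "graduate StandardAnalyzer", Store.NO));
          writer.addDocument(doc);
        }
      }
    }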

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
index 3fda7c3..82281a9e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct16.java
@@ -62,7 +62,7 @@ final class Direct16 extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
         + RamUsageEstimator.sizeOf(values);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
index aec9eaf..502aa3f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct32.java
@@ -62,7 +62,7 @@ final class Direct32 extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
         + RamUsageEstimator.sizeOf(values);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
index b8e06b6..106f641 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct64.java
@@ -57,7 +57,7 @@ final class Direct64 extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
         + RamUsageEstimator.sizeOf(values);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
index 81fc5a9..27986c0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Direct8.java
@@ -60,7 +60,7 @@ final class Direct8 extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // values ref
         + RamUsageEstimator.sizeOf(values);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
index 02f4e41..8e8e94d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed16ThreeBlocks.java
@@ -112,7 +112,7 @@ final class Packed16ThreeBlocks extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
         + RamUsageEstimator.sizeOf(blocks);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
index 85e7ea8..a7262b3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
@@ -61,7 +61,7 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
         + RamUsageEstimator.sizeOf(blocks);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
index 3ec6df0..5a85735 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed8ThreeBlocks.java
@@ -110,7 +110,7 @@ final class Packed8ThreeBlocks extends PackedInts.MutableImpl {
   public long ramBytesUsed() {
     return RamUsageEstimator.alignObjectSize(
         RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
-        + 2 * Integer.BYTES     // valueCount,bitsPerValue
+        + 2 * RamUsageEstimator.NUM_BYTES_INT     // valueCount,bitsPerValue
         + RamUsageEstimator.NUM_BYTES_OBJECT_REF) // blocks ref
         + RamUsageEstimator.sizeOf(blocks);
   }
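
For reference, the accounting these hunks switch to is plain arithmetic; a hedged
walk-through for a hypothetical Direct16 holding 1024 values, using common
64-bit-JVM figures (the real constants come from RamUsageEstimator and vary by JVM):

    // object header (~16) + valueCount,bitsPerValue (2 * NUM_BYTES_INT = 8)
    // + values ref (~8) = 32 bytes; alignObjectSize rounds up to a multiple
    // of 8, so it stays 32 here.
    long shallow = 16 + 2 * 4 + 8;
    // sizeOf(values) for a short[1024]: ~16-byte array header + 2 bytes per
    // element = 2064 bytes (also already 8-byte aligned).
    long values = 16 + 1024 * 2;
    long total = shallow + values;   // ~2096 bytes reported by ramBytesUsed()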

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/overview.html b/lucene/core/src/java/overview.html
index 9086cf9..b7112ac 100644
--- a/lucene/core/src/java/overview.html
+++ b/lucene/core/src/java/overview.html
@@ -78,7 +78,7 @@ to the output of a {@link org.apache.lucene.analysis.Tokenizer Tokenizer}.&nbsp;
 Tokenizers and TokenFilters are strung together and applied with an {@link org.apache.lucene.analysis.Analyzer Analyzer}.&nbsp;
 <a href="../analyzers-common/overview-summary.html">analyzers-common</a> provides a number of Analyzer implementations, including 
 <a href="../analyzers-common/org/apache/lucene/analysis/core/StopAnalyzer.html">StopAnalyzer</a>
-and the grammar-based <a href="../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
+and the grammar-based <a href="org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.</li>
 
 <li>
 <b>{@link org.apache.lucene.codecs}</b>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
new file mode 100644
index 0000000..2d63b66
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArrayMap.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.util.*;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayMap extends LuceneTestCase {
+  public void doRandom(int iter, boolean ignoreCase) {
+    CharArrayMap<Integer> map = new CharArrayMap<>(1, ignoreCase);
+    HashMap<String,Integer> hmap = new HashMap<>();
+
+    char[] key;
+    for (int i=0; i<iter; i++) {
+      int len = random().nextInt(5);
+      key = new char[len];
+      for (int j=0; j<key.length; j++) {
+        key[j] = (char)random().nextInt(127);
+      }
+      String keyStr = new String(key);
+      String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr; 
+
+      int val = random().nextInt();
+
+      Object o1 = map.put(key, val);
+      Object o2 = hmap.put(hmapKey,val);
+      assertEquals(o1,o2);
+
+      // add it again with the string method
+      assertEquals(val, map.put(keyStr,val).intValue());
+
+      assertEquals(val, map.get(key,0,key.length).intValue());
+      assertEquals(val, map.get(key).intValue());
+      assertEquals(val, map.get(keyStr).intValue());
+
+      assertEquals(hmap.size(), map.size());
+    }
+  }
+
+  public void testCharArrayMap() {
+    int num = 5 * RANDOM_MULTIPLIER;
+    for (int i = 0; i < num; i++) { // pump this up for more random testing
+      doRandom(1000,false);
+      doRandom(1000,true);      
+    }
+  }
+
+  public void testMethods() {
+    CharArrayMap<Integer> cm = new CharArrayMap<>(2, false);
+    HashMap<String,Integer> hm = new HashMap<>();
+    hm.put("foo",1);
+    hm.put("bar",2);
+    cm.putAll(hm);
+    assertEquals(hm.size(), cm.size());
+    hm.put("baz", 3);
+    cm.putAll(hm);
+    assertEquals(hm.size(), cm.size());
+
+    CharArraySet cs = cm.keySet();
+    int n=0;
+    for (Object o : cs) {
+      assertTrue(cm.containsKey(o));
+      char[] co = (char[]) o;
+      assertTrue(cm.containsKey(co, 0, co.length));
+      n++;
+    }
+    assertEquals(hm.size(), n);
+    assertEquals(hm.size(), cs.size());
+    assertEquals(cm.size(), cs.size());
+    cs.clear();
+    assertEquals(0, cs.size());
+    assertEquals(0, cm.size());
+    // keySet() should not allow adding new keys
+    expectThrows(UnsupportedOperationException.class, () -> {
+      cs.add("test");
+    });
+
+    cm.putAll(hm);
+    assertEquals(hm.size(), cs.size());
+    assertEquals(cm.size(), cs.size());
+
+    Iterator<Map.Entry<Object,Integer>> iter1 = cm.entrySet().iterator();
+    n=0;
+    while (iter1.hasNext()) {
+      Map.Entry<Object,Integer> entry = iter1.next();
+      Object key = entry.getKey();
+      Integer val = entry.getValue();
+      assertEquals(cm.get(key), val);
+      entry.setValue(val*100);
+      assertEquals(val*100, (int)cm.get(key));
+      n++;
+    }
+    assertEquals(hm.size(), n);
+    cm.clear();
+    cm.putAll(hm);
+    assertEquals(cm.size(), n);
+
+    CharArrayMap<Integer>.EntryIterator iter2 = cm.entrySet().iterator();
+    n=0;
+    while (iter2.hasNext()) {
+      char[] keyc = iter2.nextKey();
+      Integer val = iter2.currentValue();
+      assertEquals(hm.get(new String(keyc)), val);
+      iter2.setValue(val*100);
+      assertEquals(val*100, (int)cm.get(keyc));
+      n++;
+    }
+    assertEquals(hm.size(), n);
+
+    cm.entrySet().clear();
+    assertEquals(0, cm.size());
+    assertEquals(0, cm.entrySet().size());
+    assertTrue(cm.isEmpty());
+  }
+
+  // TODO: break this up into simpler test methods vs. "telling a story"
+  public void testModifyOnUnmodifiable(){
+    CharArrayMap<Integer> map = new CharArrayMap<>(2, false);
+    map.put("foo",1);
+    map.put("bar",2);
+    final int size = map.size();
+    assertEquals(2, size);
+    assertTrue(map.containsKey("foo"));  
+    assertEquals(1, map.get("foo").intValue());  
+    assertTrue(map.containsKey("bar"));  
+    assertEquals(2, map.get("bar").intValue());  
+
+    map = CharArrayMap.unmodifiableMap(map);
+    assertEquals("Map size changed due to unmodifiableMap call" , size, map.size());
+    String NOT_IN_MAP = "SirGallahad";
+    assertFalse("Test String already exists in map", map.containsKey(NOT_IN_MAP));
+    assertNull("Test String already exists in map", map.get(NOT_IN_MAP));
+    
+    try{
+      map.put(NOT_IN_MAP.toCharArray(), 3);  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+      assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.put(NOT_IN_MAP, 3);  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+      assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.put(new StringBuilder(NOT_IN_MAP), 3);  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+      assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.clear();  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.entrySet().clear();  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.keySet().clear();  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.put((Object) NOT_IN_MAP, 3);  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+      assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    try{
+      map.putAll(Collections.singletonMap(NOT_IN_MAP, 3));  
+      fail("Modified unmodifiable map");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable map", map.containsKey(NOT_IN_MAP));
+      assertNull("Test String has been added to unmodifiable map", map.get(NOT_IN_MAP));
+      assertEquals("Size of unmodifiable map has changed", size, map.size());
+    }
+    
+    assertTrue(map.containsKey("foo"));  
+    assertEquals(1, map.get("foo").intValue());  
+    assertTrue(map.containsKey("bar"));  
+    assertEquals(2, map.get("bar").intValue());  
+  }
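+
+  /*
+   * A possible condensation of the repeated try/catch pattern above (a
+   * hypothetical sketch, not part of this commit): expectThrows, already used
+   * for keySet().add() in testMethods(), can express each case in one line:
+   *
+   *   expectThrows(UnsupportedOperationException.class, () -> map.put(NOT_IN_MAP, 3));
+   *   assertFalse(map.containsKey(NOT_IN_MAP));
+   *   assertEquals(size, map.size());
+   */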
+  
+  public void testToString() {
+    CharArrayMap<Integer> cm = new CharArrayMap<>(Collections.singletonMap("test",1), false);
+    assertEquals("[test]",cm.keySet().toString());
+    assertEquals("[1]",cm.values().toString());
+    assertEquals("[test=1]",cm.entrySet().toString());
+    assertEquals("{test=1}",cm.toString());
+    cm.put("test2", 2);
+    assertTrue(cm.keySet().toString().contains(", "));
+    assertTrue(cm.values().toString().contains(", "));
+    assertTrue(cm.entrySet().toString().contains(", "));
+    assertTrue(cm.toString().contains(", "));
+  }
+}
+

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
new file mode 100644
index 0000000..465f512
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestCharArraySet extends LuceneTestCase {
+  
+  static final String[] TEST_STOP_WORDS = {
+    "a", "an", "and", "are", "as", "at", "be", "but", "by",
+    "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with"
+  };
+  
+  
+  public void testRehash() throws Exception {
+    CharArraySet cas = new CharArraySet(0, true);
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      cas.add(TEST_STOP_WORDS[i]);
+    assertEquals(TEST_STOP_WORDS.length, cas.size());
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      assertTrue(cas.contains(TEST_STOP_WORDS[i]));
+  }
+
+  public void testNonZeroOffset() {
+    String[] words={"Hello","World","this","is","a","test"};
+    char[] findme="xthisy".toCharArray();   
+    CharArraySet set= new CharArraySet(10, true);
+    set.addAll(Arrays.asList(words));
+    assertTrue(set.contains(findme, 1, 4));
+    assertTrue(set.contains(new String(findme,1,4)));
+    
+    // test unmodifiable
+    set = CharArraySet.unmodifiableSet(set);
+    assertTrue(set.contains(findme, 1, 4));
+    assertTrue(set.contains(new String(findme,1,4)));
+  }
+  
+  public void testObjectContains() {
+    CharArraySet set = new CharArraySet(10, true);
+    Integer val = Integer.valueOf(1);
+    set.add(val);
+    assertTrue(set.contains(val));
+    assertTrue(set.contains(new Integer(1))); // another integer
+    assertTrue(set.contains("1"));
+    assertTrue(set.contains(new char[]{'1'}));
+    // test unmodifiable
+    set = CharArraySet.unmodifiableSet(set);
+    assertTrue(set.contains(val));
+    assertTrue(set.contains(new Integer(1))); // another integer
+    assertTrue(set.contains("1"));
+    assertTrue(set.contains(new char[]{'1'}));
+  }
+  
+  public void testClear(){
+    CharArraySet set=new CharArraySet(10,true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+    set.clear();
+    assertEquals("not empty", 0, set.size());
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      assertFalse(set.contains(TEST_STOP_WORDS[i]));
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      assertTrue(set.contains(TEST_STOP_WORDS[i]));
+  }
+  
+  // TODO: break this up into simpler test methods, vs "telling a story"
+  public void testModifyOnUnmodifiable(){
+    CharArraySet set=new CharArraySet(10, true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    final int size = set.size();
+    set = CharArraySet.unmodifiableSet(set);
+    assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+    String NOT_IN_SET = "SirGallahad";
+    assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
+    
+    try{
+      set.add(NOT_IN_SET.toCharArray());  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.add(NOT_IN_SET);  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.add(new StringBuilder(NOT_IN_SET));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.clear();  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    try{
+      set.add((Object) NOT_IN_SET);  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    // This test was changed in 3.1: a contains() call on the given Collection, using the iterator's
+    // current key (now a char[]) against a Set<String>, would never match an element of the
+    // CharArraySet and therefore remove() would never be called on the iterator
+    try{
+      set.removeAll(new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.retainAll(new CharArraySet(Arrays.asList(NOT_IN_SET), true));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.addAll(Arrays.asList(NOT_IN_SET));
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+    }
+    
+    for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
+      assertTrue(set.contains(TEST_STOP_WORDS[i]));  
+    }
+  }
+  
+  public void testUnmodifiableSet(){
+    CharArraySet set = new CharArraySet(10,true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    set.add(Integer.valueOf(1));
+    final int size = set.size();
+    set = CharArraySet.unmodifiableSet(set);
+    assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+    for (String stopword : TEST_STOP_WORDS) {
+      assertTrue(set.contains(stopword));
+    }
+    assertTrue(set.contains(Integer.valueOf(1)));
+    assertTrue(set.contains("1"));
+    assertTrue(set.contains(new char[]{'1'}));
+    
+    expectThrows(NullPointerException.class, () -> {
+      CharArraySet.unmodifiableSet(null);
+    });
+  }
+  
+  public void testSupplementaryChars() {
+    String missing = "Term %s is missing in the set";
+    String falsePos = "Term %s is in the set but shouldn't be";
+    // for reference see
+    // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
+    String[] upperArr = new String[] {"Abc\ud801\udc1c",
+        "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
+    String[] lowerArr = new String[] {"abc\ud801\udc44",
+        "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
+    CharArraySet set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true);
+    for (String upper : upperArr) {
+      set.add(upper);
+    }
+    for (int i = 0; i < upperArr.length; i++) {
+      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+      assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+    }
+    set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), false);
+    for (String upper : upperArr) {
+      set.add(upper);
+    }
+    for (int i = 0; i < upperArr.length; i++) {
+      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+      assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
+    }
+  }
+  
+  public void testSingleHighSurrogate() {
+    String missing = "Term %s is missing in the set";
+    String falsePos = "Term %s is in the set but shouldn't be";
+    String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
+        "\uD800EfG", "\uD800\ud801\udc1cB" };
+
+    String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
+        "\uD800efg", "\uD800\ud801\udc44b" };
+    CharArraySet set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), true);
+    for (String upper : upperArr) {
+      set.add(upper);
+    }
+    for (int i = 0; i < upperArr.length; i++) {
+      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+      assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
+    }
+    set = new CharArraySet(Arrays.asList(TEST_STOP_WORDS), false);
+    for (String upper : upperArr) {
+      set.add(upper);
+    }
+    for (int i = 0; i < upperArr.length; i++) {
+      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
+      assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
+    }
+  }
+  
+  @SuppressWarnings("deprecated")
+  public void testCopyCharArraySetBWCompat() {
+    CharArraySet setIgnoreCase = new CharArraySet(10, true);
+    CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+    List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+    List<String> stopwordsUpper = new ArrayList<>();
+    for (String string : stopwords) {
+      stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+    }
+    setIgnoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+    setIgnoreCase.add(Integer.valueOf(1));
+    setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+    setCaseSensitive.add(Integer.valueOf(1));
+
+    CharArraySet copy = CharArraySet.copy(setIgnoreCase);
+    CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+    assertEquals(setIgnoreCase.size(), copy.size());
+    assertEquals(setCaseSensitive.size(), copyCaseSens.size());
+
+    assertTrue(copy.containsAll(stopwords));
+    assertTrue(copy.containsAll(stopwordsUpper));
+    assertTrue(copyCaseSens.containsAll(stopwords));
+    for (String string : stopwordsUpper) {
+      assertFalse(copyCaseSens.contains(string));
+    }
+    // test adding terms to the copy
+    List<String> newWords = new ArrayList<>();
+    for (String string : stopwords) {
+      newWords.add(string+"_1");
+    }
+    copy.addAll(newWords);
+    
+    assertTrue(copy.containsAll(stopwords));
+    assertTrue(copy.containsAll(stopwordsUpper));
+    assertTrue(copy.containsAll(newWords));
+    // newly added terms must not appear in the source sets
+    for (String string : newWords) {
+      assertFalse(setIgnoreCase.contains(string));
+      assertFalse(setCaseSensitive.contains(string));
+    }
+  }
+  
+  /**
+   * Test the static #copy() function with a CharArraySet as a source
+   */
+  public void testCopyCharArraySet() {
+    CharArraySet setIgnoreCase = new CharArraySet(10, true);
+    CharArraySet setCaseSensitive = new CharArraySet(10, false);
+
+    List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+    List<String> stopwordsUpper = new ArrayList<>();
+    for (String string : stopwords) {
+      stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+    }
+    setIgnoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
+    setIgnoreCase.add(Integer.valueOf(1));
+    setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
+    setCaseSensitive.add(Integer.valueOf(1));
+
+    CharArraySet copy = CharArraySet.copy(setIgnoreCase);
+    CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
+
+    assertEquals(setIgnoreCase.size(), copy.size());
+    assertEquals(setCaseSensitive.size(), copyCaseSens.size());
+
+    assertTrue(copy.containsAll(stopwords));
+    assertTrue(copy.containsAll(stopwordsUpper));
+    assertTrue(copyCaseSens.containsAll(stopwords));
+    for (String string : stopwordsUpper) {
+      assertFalse(copyCaseSens.contains(string));
+    }
+    // test adding terms to the copy
+    List<String> newWords = new ArrayList<>();
+    for (String string : stopwords) {
+      newWords.add(string+"_1");
+    }
+    copy.addAll(newWords);
+    
+    assertTrue(copy.containsAll(stopwords));
+    assertTrue(copy.containsAll(stopwordsUpper));
+    assertTrue(copy.containsAll(newWords));
+    // newly added terms must not appear in the source sets
+    for (String string : newWords) {
+      assertFalse(setIgnoreCase.contains(string));
+      assertFalse(setCaseSensitive.contains(string));
+    }
+  }
+  
+  /**
+   * Test the static #copy() function with a JDK {@link Set} as a source
+   */
+  public void testCopyJDKSet() {
+    Set<String> set = new HashSet<>();
+
+    List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
+    List<String> stopwordsUpper = new ArrayList<>();
+    for (String string : stopwords) {
+      stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
+    }
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+
+    CharArraySet copy = CharArraySet.copy(set);
+
+    assertEquals(set.size(), copy.size());
+
+    assertTrue(copy.containsAll(stopwords));
+    for (String string : stopwordsUpper) {
+      assertFalse(copy.contains(string));
+    }
+    
+    List<String> newWords = new ArrayList<>();
+    for (String string : stopwords) {
+      newWords.add(string+"_1");
+    }
+    copy.addAll(newWords);
+    
+    assertTrue(copy.containsAll(stopwords));
+    assertTrue(copy.containsAll(newWords));
+    // newly added terms must not appear in the source set
+    for (String string : newWords) {
+      assertFalse(set.contains(string));  
+    }
+  }
+  
+  /**
+   * Tests a special case of {@link CharArraySet#copy(Set)} where the
+   * set to copy is the {@link CharArraySet#EMPTY_SET}
+   */
+  public void testCopyEmptySet() {
+    assertSame(CharArraySet.EMPTY_SET, 
+        CharArraySet.copy(CharArraySet.EMPTY_SET));
+  }
+
+  /**
+   * Smoketests the static empty set
+   */
+  public void testEmptySet() {
+    assertEquals(0, CharArraySet.EMPTY_SET.size());
+    
+    assertTrue(CharArraySet.EMPTY_SET.isEmpty());
+    for (String stopword : TEST_STOP_WORDS) {
+      assertFalse(CharArraySet.EMPTY_SET.contains(stopword));
+    }
+    assertFalse(CharArraySet.EMPTY_SET.contains("foo"));
+    assertFalse(CharArraySet.EMPTY_SET.contains((Object) "foo"));
+    assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray()));
+    assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray(),0,3));
+  }
+  
+  /**
+   * Test for NPE
+   */
+  public void testContainsWithNull() {
+    CharArraySet set = new CharArraySet(1, true);
+
+    expectThrows(NullPointerException.class, () -> {
+      set.contains((char[]) null, 0, 10);
+    });
+
+    expectThrows(NullPointerException.class, () -> {
+      set.contains((CharSequence) null);
+    });
+
+    expectThrows(NullPointerException.class, () -> {
+      set.contains((Object) null);
+    });
+  }
+  
+  public void testToString() {
+    CharArraySet set = CharArraySet.copy(Collections.singleton("test"));
+    assertEquals("[test]", set.toString());
+    set.add("test2");
+    assertTrue(set.toString().contains(", "));
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
new file mode 100644
index 0000000..53b3f56
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharacterUtils.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Test;
+
+/**
+ * TestCase for the {@link CharacterUtils} class.
+ */
+public class TestCharacterUtils extends LuceneTestCase {
+
+  public void testConversions() {
+    final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
+    final int[] buf = new int[orig.length];
+    final char[] restored = new char[buf.length];
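+    // o1 is a start offset into orig, o2 a write offset into buf, o3 a write
+    // offset into restored; the code point round trip must succeed for any
+    // such offsets.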
+    final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
+    final int o2 = TestUtil.nextInt(random(), 0, o1);
+    final int o3 = TestUtil.nextInt(random(), 0, o1);
+    final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
+    final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
+    assertEquals(orig.length - o1, charCount);
+    assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
+  }
+
+  @Test
+  public void testNewCharacterBuffer() {
+    CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024);
+    assertEquals(1024, newCharacterBuffer.getBuffer().length);
+    assertEquals(0, newCharacterBuffer.getOffset());
+    assertEquals(0, newCharacterBuffer.getLength());
+
+    newCharacterBuffer = CharacterUtils.newCharacterBuffer(2);
+    assertEquals(2, newCharacterBuffer.getBuffer().length);
+    assertEquals(0, newCharacterBuffer.getOffset());
+    assertEquals(0, newCharacterBuffer.getLength());
+
+    // length must be >= 2
+    expectThrows(IllegalArgumentException.class, () -> {
+      CharacterUtils.newCharacterBuffer(1);
+    });
+  }
+
+  @Test
+  public void testFillNoHighSurrogate() throws IOException {
+    Reader reader = new StringReader("helloworld");
+    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
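+    // fill() returns false once EOF is hit, even if characters were read: the
+    // first call fills all 6 slots ("hellow"), the second reads only the
+    // remaining "orld" and therefore returns false.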
+    assertTrue(CharacterUtils.fill(buffer,reader));
+    assertEquals(0, buffer.getOffset());
+    assertEquals(6, buffer.getLength());
+    assertEquals("hellow", new String(buffer.getBuffer()));
+    assertFalse(CharacterUtils.fill(buffer,reader));
+    assertEquals(4, buffer.getLength());
+    assertEquals(0, buffer.getOffset());
+
+    assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+        buffer.getLength()));
+    assertFalse(CharacterUtils.fill(buffer,reader));
+  }
+
+  @Test
+  public void testFill() throws IOException {
+    String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
+    Reader reader = new StringReader(input);
+    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
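+    // fill() keeps surrogate pairs intact: if the last free slot would split a
+    // pair, the high surrogate is held back for the next call, so the 5-slot
+    // buffer legitimately comes back with only 4 chars ("1234"). Lone
+    // surrogates that are not part of a pair pass through unchanged.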
+    assertTrue(CharacterUtils.fill(buffer, reader));
+    assertEquals(4, buffer.getLength());
+    assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
+        buffer.getLength()));
+    assertTrue(CharacterUtils.fill(buffer, reader));
+    assertEquals(5, buffer.getLength());
+    assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
+    assertTrue(CharacterUtils.fill(buffer, reader));
+    assertEquals(4, buffer.getLength());
+    assertEquals("123\ud801", new String(buffer.getBuffer(),
+        buffer.getOffset(), buffer.getLength()));
+    assertFalse(CharacterUtils.fill(buffer, reader));
+    assertEquals(3, buffer.getLength());
+    assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
+        .getOffset(), buffer.getLength()));
+    assertFalse(CharacterUtils.fill(buffer, reader));
+    assertEquals(0, buffer.getLength());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/87016b5f/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
new file mode 100644
index 0000000..c224682
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.English;
+
+public class TestStopFilter extends BaseTokenStreamTestCase {
+  
+  // other StopFilter functionality is already tested by TestStopAnalyzer
+
+  public void testExactCase() throws IOException {
+    StringReader reader = new StringReader("Now is The Time");
+    CharArraySet stopWords = new CharArraySet(asSet("is", "the", "Time"), false);
+    final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    in.setReader(reader);
+    TokenStream stream = new StopFilter(in, stopWords);
+    assertTokenStreamContents(stream, new String[] { "Now", "The" });
+  }
+
+  public void testStopFilt() throws IOException {
+    StringReader reader = new StringReader("Now is The Time");
+    String[] stopWords = new String[] { "is", "the", "Time" };
+    CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+    final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    in.setReader(reader);
+    TokenStream stream = new StopFilter(in, stopSet);
+    assertTokenStreamContents(stream, new String[] { "Now", "The" });
+  }
+
+  /**
+   * Test position increments applied by StopFilter, both for a single filter
+   * and for two concatenated stop filters.
+   */
+  public void testStopPositions() throws IOException {
+    StringBuilder sb = new StringBuilder();
+    ArrayList<String> a = new ArrayList<>();
+    for (int i=0; i<20; i++) {
+      String w = English.intToEnglish(i).trim();
+      sb.append(w).append(" ");
+      if (i%3 != 0) a.add(w);
+    }
+    log(sb.toString());
+    String[] stopWords = a.toArray(new String[0]);
+    for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
+    CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
+    // with increments
+    StringReader reader = new StringReader(sb.toString());
+    final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    in.setReader(reader);
+    StopFilter stpf = new StopFilter(in, stopSet);
+    doTestStopPositions(stpf);
+    // with increments, concatenating two stop filters
+    ArrayList<String> a0 = new ArrayList<>();
+    ArrayList<String> a1 = new ArrayList<>();
+    for (int i=0; i<a.size(); i++) {
+      if (i%2==0) { 
+        a0.add(a.get(i));
+      } else {
+        a1.add(a.get(i));
+      }
+    }
+    String[] stopWords0 = a0.toArray(new String[0]);
+    for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
+    String[] stopWords1 = a1.toArray(new String[0]);
+    for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
+    CharArraySet stopSet0 = StopFilter.makeStopSet(stopWords0);
+    CharArraySet stopSet1 = StopFilter.makeStopSet(stopWords1);
+    reader = new StringReader(sb.toString());
+    final MockTokenizer in1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    in1.setReader(reader);
+    StopFilter stpf0 = new StopFilter(in1, stopSet0); // first part of the set
+    StopFilter stpf01 = new StopFilter(stpf0, stopSet1); // two stop filters concatenated!
+    doTestStopPositions(stpf01);
+  }
+
+  // LUCENE-3849: make sure after .end() we see the "ending" posInc
+  public void testEndStopword() throws Exception {
+    CharArraySet stopSet = StopFilter.makeStopSet("of");
+    final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    in.setReader(new StringReader("test of"));
+    StopFilter stpf = new StopFilter(in, stopSet);
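+    // the trailing 7 and 1 assert the final offset and the position increment
+    // seen after end(): the stopped trailing "of" still contributes a trailing
+    // increment (LUCENE-3849)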
+    assertTokenStreamContents(stpf, new String[] { "test" },
+                              new int[] {0},
+                              new int[] {4},
+                              null,
+                              new int[] {1},
+                              null,
+                              7,
+                              1,
+                              null,
+                              true);    
+  }
+  
+  private void doTestStopPositions(StopFilter stpf) throws IOException {
+    CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
+    stpf.reset();
+    for (int i=0; i<20; i+=3) {
+      assertTrue(stpf.incrementToken());
+      log("Token "+i+": "+stpf);
+      String w = English.intToEnglish(i).trim();
+      assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
+      assertEquals("all but first token must have position increment of 3",i==0?1:3,posIncrAtt.getPositionIncrement());
+    }
+    assertFalse(stpf.incrementToken());
+    stpf.end();
+    stpf.close();
+  }
+  
+  // print debug info depending on VERBOSE
+  private static void log(String s) {
+    if (VERBOSE) {
+      System.out.println(s);
+    }
+  }
+  
+  // trivial filter that injects "hte" as a synonym of "the"
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
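+    // Passes tokens through unchanged; after emitting "the", captures the
+    // stream state and replays it once as "hte" with positionIncrement=0,
+    // stacking the synonym at the same position.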
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+}