You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/14 22:51:17 UTC

[05/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer and make it the default for IndexWriterConfig

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
new file mode 100644
index 0000000..e7e610a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
@@ -0,0 +1,669 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharacterUtils;
+
+/**
+ * A simple class that stores key Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class.  For example, it cannot remove items from the
+ * map, nor does it resize its hash table to be smaller,
+ * etc.  It is designed to be quick to retrieve items
+ * by char[] keys without the necessity of converting
+ * to a String first.
+ */
+public class CharArrayMap<V> extends AbstractMap<Object,V> {
+  // private only because missing generics
+  private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
+
+  private final static int INIT_SIZE = 8;
+  private boolean ignoreCase;  
+  private int count;
+  char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
+  V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
+
+  /**
+   * Create map with enough capacity to hold startSize terms
+   *
+   * @param startSize
+   *          the initial capacity
+   * @param ignoreCase
+   *          <code>false</code> if and only if the set should be case sensitive
+   *          otherwise <code>true</code>.
+   */
+  @SuppressWarnings("unchecked")
+  public CharArrayMap(int startSize, boolean ignoreCase) {
+    this.ignoreCase = ignoreCase;
+    int size = INIT_SIZE;
+    while(startSize + (startSize>>2) > size)
+      size <<= 1;
+    keys = new char[size][];
+    values = (V[]) new Object[size];
+  }
+
+  /**
+   * Creates a map from the mappings in another map. 
+   *
+   * @param c
+   *          a map whose mappings to be copied
+   * @param ignoreCase
+   *          <code>false</code> if and only if the set should be case sensitive
+   *          otherwise <code>true</code>.
+   */
+  public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) {
+    this(c.size(), ignoreCase);
+    putAll(c);
+  }
+  
+  /** Create set from the supplied map (used internally for readonly maps...) */
+  private CharArrayMap(CharArrayMap<V> toCopy){
+    this.keys = toCopy.keys;
+    this.values = toCopy.values;
+    this.ignoreCase = toCopy.ignoreCase;
+    this.count = toCopy.count;
+  }
+  
+  /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
+  @Override
+  public void clear() {
+    count = 0;
+    Arrays.fill(keys, null);
+    Arrays.fill(values, null);
+  }
+
+  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+   * are in the {@link #keySet()} */
+  public boolean containsKey(char[] text, int off, int len) {
+    return keys[getSlot(text, off, len)] != null;
+  }
+
+  /** true if the <code>CharSequence</code> is in the {@link #keySet()} */
+  public boolean containsKey(CharSequence cs) {
+    return keys[getSlot(cs)] != null;
+  }
+
+  @Override
+  public boolean containsKey(Object o) {
+    if (o instanceof char[]) {
+      final char[] text = (char[])o;
+      return containsKey(text, 0, text.length);
+    } 
+    return containsKey(o.toString());
+  }
+
+  /** returns the value of the mapping of <code>len</code> chars of <code>text</code>
+   * starting at <code>off</code> */
+  public V get(char[] text, int off, int len) {
+    return values[getSlot(text, off, len)];
+  }
+
+  /** returns the value of the mapping of the chars inside this {@code CharSequence} */
+  public V get(CharSequence cs) {
+    return values[getSlot(cs)];
+  }
+
+  @Override
+  public V get(Object o) {
+    if (o instanceof char[]) {
+      final char[] text = (char[])o;
+      return get(text, 0, text.length);
+    } 
+    return get(o.toString());
+  }
+
+  private int getSlot(char[] text, int off, int len) {
+    int code = getHashCode(text, off, len);
+    int pos = code & (keys.length-1);
+    char[] text2 = keys[pos];
+    if (text2 != null && !equals(text, off, len, text2)) {
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        pos = code & (keys.length-1);
+        text2 = keys[pos];
+      } while (text2 != null && !equals(text, off, len, text2));
+    }
+    return pos;
+  }
+
+  /** Returns true if the String is in the set */  
+  private int getSlot(CharSequence text) {
+    int code = getHashCode(text);
+    int pos = code & (keys.length-1);
+    char[] text2 = keys[pos];
+    if (text2 != null && !equals(text, text2)) {
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        pos = code & (keys.length-1);
+        text2 = keys[pos];
+      } while (text2 != null && !equals(text, text2));
+    }
+    return pos;
+  }
+
+  /** Add the given mapping. */
+  public V put(CharSequence text, V value) {
+    return put(text.toString(), value); // could be more efficient
+  }
+
+  @Override
+  public V put(Object o, V value) {
+    if (o instanceof char[]) {
+      return put((char[])o, value);
+    }
+    return put(o.toString(), value);
+  }
+  
+  /** Add the given mapping. */
+  public V put(String text, V value) {
+    return put(text.toCharArray(), value);
+  }
+
+  /** Add the given mapping.
+   * If ignoreCase is true for this Set, the text array will be directly modified.
+   * The user should never modify this text array after calling this method.
+   */
+  public V put(char[] text, V value) {
+    if (ignoreCase) {
+      CharacterUtils.toLowerCase(text, 0, text.length);
+    }
+    int slot = getSlot(text, 0, text.length);
+    if (keys[slot] != null) {
+      final V oldValue = values[slot];
+      values[slot] = value;
+      return oldValue;
+    }
+    keys[slot] = text;
+    values[slot] = value;
+    count++;
+
+    if (count + (count>>2) > keys.length) {
+      rehash();
+    }
+
+    return null;
+  }
+
+  @SuppressWarnings("unchecked")
+  private void rehash() {
+    assert keys.length == values.length;
+    final int newSize = 2*keys.length;
+    final char[][] oldkeys = keys;
+    final V[] oldvalues = values;
+    keys = new char[newSize][];
+    values = (V[]) new Object[newSize];
+
+    for(int i=0; i<oldkeys.length; i++) {
+      char[] text = oldkeys[i];
+      if (text != null) {
+        // todo: could be faster... no need to compare strings on collision
+        final int slot = getSlot(text,0,text.length);
+        keys[slot] = text;
+        values[slot] = oldvalues[i];
+      }
+    }
+  }
+  
+  private boolean equals(char[] text1, int off, int len, char[] text2) {
+    if (len != text2.length)
+      return false;
+    final int limit = off+len;
+    if (ignoreCase) {
+      for(int i=0;i<len;) {
+        final int codePointAt = Character.codePointAt(text1, off+i, limit);
+        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
+          return false;
+        i += Character.charCount(codePointAt); 
+      }
+    } else {
+      for(int i=0;i<len;i++) {
+        if (text1[off+i] != text2[i])
+          return false;
+      }
+    }
+    return true;
+  }
+
+  private boolean equals(CharSequence text1, char[] text2) {
+    int len = text1.length();
+    if (len != text2.length)
+      return false;
+    if (ignoreCase) {
+      for(int i=0;i<len;) {
+        final int codePointAt = Character.codePointAt(text1, i);
+        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
+          return false;
+        i += Character.charCount(codePointAt);
+      }
+    } else {
+      for(int i=0;i<len;i++) {
+        if (text1.charAt(i) != text2[i])
+          return false;
+      }
+    }
+    return true;
+  }
+  
+  private int getHashCode(char[] text, int offset, int len) {
+    if (text == null)
+      throw new NullPointerException();
+    int code = 0;
+    final int stop = offset + len;
+    if (ignoreCase) {
+      for (int i=offset; i<stop;) {
+        final int codePointAt = Character.codePointAt(text, i, stop);
+        code = code*31 + Character.toLowerCase(codePointAt);
+        i += Character.charCount(codePointAt);
+      }
+    } else {
+      for (int i=offset; i<stop; i++) {
+        code = code*31 + text[i];
+      }
+    }
+    return code;
+  }
+
+  private int getHashCode(CharSequence text) {
+    if (text == null)
+      throw new NullPointerException();
+    int code = 0;
+    int len = text.length();
+    if (ignoreCase) {
+      for (int i=0; i<len;) {
+        int codePointAt = Character.codePointAt(text, i);
+        code = code*31 + Character.toLowerCase(codePointAt);
+        i += Character.charCount(codePointAt);
+      }
+    } else {
+      for (int i=0; i<len; i++) {
+        code = code*31 + text.charAt(i);
+      }
+    }
+    return code;
+  }
+
+  @Override
+  public V remove(Object key) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public int size() {
+    return count;
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder("{");
+    for (Map.Entry<Object,V> entry : entrySet()) {
+      if (sb.length()>1) sb.append(", ");
+      sb.append(entry);
+    }
+    return sb.append('}').toString();
+  }
+
+  private EntrySet entrySet = null;
+  private CharArraySet keySet = null;
+  
+  EntrySet createEntrySet() {
+    return new EntrySet(true);
+  }
+  
+  @Override
+  public final EntrySet entrySet() {
+    if (entrySet == null) {
+      entrySet = createEntrySet();
+    }
+    return entrySet;
+  }
+  
+  // helper for CharArraySet to not produce endless recursion
+  final Set<Object> originalKeySet() {
+    return super.keySet();
+  }
+
+  /** Returns an {@link CharArraySet} view on the map's keys.
+   * The set will use the same {@code matchVersion} as this map. */
+  @Override @SuppressWarnings({"unchecked","rawtypes"})
+  public final CharArraySet keySet() {
+    if (keySet == null) {
+      // prevent adding of entries
+      keySet = new CharArraySet((CharArrayMap) this) {
+        @Override
+        public boolean add(Object o) {
+          throw new UnsupportedOperationException();
+        }
+        @Override
+        public boolean add(CharSequence text) {
+          throw new UnsupportedOperationException();
+        }
+        @Override
+        public boolean add(String text) {
+          throw new UnsupportedOperationException();
+        }
+        @Override
+        public boolean add(char[] text) {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+    return keySet;
+  }
+
+  /** public iterator class so efficient methods are exposed to users */
+  public class EntryIterator implements Iterator<Map.Entry<Object,V>> {
+    private int pos=-1;
+    private int lastPos;
+    private final boolean allowModify;
+    
+    private EntryIterator(boolean allowModify) {
+      this.allowModify = allowModify;
+      goNext();
+    }
+
+    private void goNext() {
+      lastPos = pos;
+      pos++;
+      while (pos < keys.length && keys[pos] == null) pos++;
+    }
+
+    @Override
+    public boolean hasNext() {
+      return pos < keys.length;
+    }
+
+    /** gets the next key... do not modify the returned char[] */
+    public char[] nextKey() {
+      goNext();
+      return keys[lastPos];
+    }
+
+    /** gets the next key as a newly created String object */
+    public String nextKeyString() {
+      return new String(nextKey());
+    }
+
+    /** returns the value associated with the last key returned */
+    public V currentValue() {
+      return values[lastPos];
+    }
+
+    /** sets the value associated with the last key returned */    
+    public V setValue(V value) {
+      if (!allowModify)
+        throw new UnsupportedOperationException();
+      V old = values[lastPos];
+      values[lastPos] = value;
+      return old;      
+    }
+
+    /** use nextCharArray() + currentValue() for better efficiency. */
+    @Override
+    public Map.Entry<Object,V> next() {
+      goNext();
+      return new MapEntry(lastPos, allowModify);
+    }
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  private final class MapEntry implements Map.Entry<Object,V> {
+    private final int pos;
+    private final boolean allowModify;
+
+    private MapEntry(int pos, boolean allowModify) {
+      this.pos = pos;
+      this.allowModify = allowModify;
+    }
+
+    @Override
+    public Object getKey() {
+      // we must clone here, as putAll to another CharArrayMap
+      // with other case sensitivity flag would corrupt the keys
+      return keys[pos].clone();
+    }
+
+    @Override
+    public V getValue() {
+      return values[pos];
+    }
+
+    @Override
+    public V setValue(V value) {
+      if (!allowModify)
+        throw new UnsupportedOperationException();
+      final V old = values[pos];
+      values[pos] = value;
+      return old;
+    }
+
+    @Override
+    public String toString() {
+      return new StringBuilder().append(keys[pos]).append('=')
+        .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos])
+        .toString();
+    }
+  }
+
+  /** public EntrySet class so efficient methods are exposed to users */
+  public final class EntrySet extends AbstractSet<Map.Entry<Object,V>> {
+    private final boolean allowModify;
+    
+    private EntrySet(boolean allowModify) {
+      this.allowModify = allowModify;
+    }
+  
+    @Override
+    public EntryIterator iterator() {
+      return new EntryIterator(allowModify);
+    }
+    
+    @Override
+    @SuppressWarnings("unchecked")
+    public boolean contains(Object o) {
+      if (!(o instanceof Map.Entry))
+        return false;
+      final Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
+      final Object key = e.getKey();
+      final Object val = e.getValue();
+      final Object v = get(key);
+      return v == null ? val == null : v.equals(val);
+    }
+    
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public int size() {
+      return count;
+    }
+    
+    @Override
+    public void clear() {
+      if (!allowModify)
+        throw new UnsupportedOperationException();
+      CharArrayMap.this.clear();
+    }
+  }
+  
+  /**
+   * Returns an unmodifiable {@link CharArrayMap}. This allows to provide
+   * unmodifiable views of internal map for "read-only" use.
+   * 
+   * @param map
+   *          a map for which the unmodifiable map is returned.
+   * @return an new unmodifiable {@link CharArrayMap}.
+   * @throws NullPointerException
+   *           if the given map is <code>null</code>.
+   */
+  public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
+    if (map == null)
+      throw new NullPointerException("Given map is null");
+    if (map == emptyMap() || map.isEmpty())
+      return emptyMap();
+    if (map instanceof UnmodifiableCharArrayMap)
+      return map;
+    return new UnmodifiableCharArrayMap<>(map);
+  }
+
+  /**
+   * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
+   * is a {@link CharArrayMap} the ignoreCase property will be preserved.
+   * 
+   * @param map
+   *          a map to copy
+   * @return a copy of the given map as a {@link CharArrayMap}. If the given map
+   *         is a {@link CharArrayMap} the ignoreCase property as well as the
+   *         matchVersion will be of the given map will be preserved.
+   */
+  @SuppressWarnings("unchecked")
+  public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) {
+    if(map == EMPTY_MAP)
+      return emptyMap();
+    if(map instanceof CharArrayMap) {
+      CharArrayMap<V> m = (CharArrayMap<V>) map;
+      // use fast path instead of iterating all values
+      // this is even on very small sets ~10 times faster than iterating
+      final char[][] keys = new char[m.keys.length][];
+      System.arraycopy(m.keys, 0, keys, 0, keys.length);
+      final V[] values = (V[]) new Object[m.values.length];
+      System.arraycopy(m.values, 0, values, 0, values.length);
+      m = new CharArrayMap<>(m);
+      m.keys = keys;
+      m.values = values;
+      return m;
+    }
+    return new CharArrayMap<>(map, false);
+  }
+  
+  /** Returns an empty, unmodifiable map. */
+  @SuppressWarnings("unchecked")
+  public static <V> CharArrayMap<V> emptyMap() {
+    return (CharArrayMap<V>) EMPTY_MAP;
+  }
+  
+  // package private CharArraySet instanceof check in CharArraySet
+  static class UnmodifiableCharArrayMap<V> extends CharArrayMap<V> {
+
+    UnmodifiableCharArrayMap(CharArrayMap<V> map) {
+      super(map);
+    }
+
+    @Override
+    public void clear() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V put(Object o, V val){
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public V put(char[] text, V val) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V put(CharSequence text, V val) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public V put(String text, V val) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public V remove(Object key) {
+      throw new UnsupportedOperationException();
+    }
+  
+    @Override
+    EntrySet createEntrySet() {
+      return new EntrySet(false);
+    }
+  }
+  
+  /**
+   * Empty {@link org.apache.lucene.analysis.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
+   * Contains checks will always return <code>false</code> or throw
+   * NPE if necessary.
+   */
+  private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
+    EmptyCharArrayMap() {
+      super(new CharArrayMap<V>(0, false));
+    }
+    
+    @Override
+    public boolean containsKey(char[] text, int off, int len) {
+      if(text == null)
+        throw new NullPointerException();
+      return false;
+    }
+
+    @Override
+    public boolean containsKey(CharSequence cs) {
+      if(cs == null)
+        throw new NullPointerException();
+      return false;
+    }
+
+    @Override
+    public boolean containsKey(Object o) {
+      if(o == null)
+        throw new NullPointerException();
+      return false;
+    }
+    
+    @Override
+    public V get(char[] text, int off, int len) {
+      if(text == null)
+        throw new NullPointerException();
+      return null;
+    }
+
+    @Override
+    public V get(CharSequence cs) {
+      if(cs == null)
+        throw new NullPointerException();
+      return null;
+    }
+
+    @Override
+    public V get(Object o) {
+      if(o == null)
+        throw new NullPointerException();
+      return null;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
new file mode 100644
index 0000000..4c8066a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A simple class that stores Strings as char[]'s in a
+ * hash table.  Note that this is not a general purpose
+ * class.  For example, it cannot remove items from the
+ * set, nor does it resize its hash table to be smaller,
+ * etc.  It is designed to be quick to test if a char[]
+ * is in the set without the necessity of converting it
+ * to a String first.
+ *
+ * <P>
+ * <em>Please note:</em> This class implements {@link java.util.Set Set} but
+ * does not behave like it should in all cases. The generic type is
+ * {@code Set<Object>}, because you can add any object to it,
+ * that has a string representation. The add methods will use
+ * {@link Object#toString} and store the result using a {@code char[]}
+ * buffer. The same behavior have the {@code contains()} methods.
+ * The {@link #iterator()} returns an {@code Iterator<char[]>}.
+ */
+public class CharArraySet extends AbstractSet<Object> {
+
+  /** An empty {@code CharArraySet}. */
+  public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
+  
+  private static final Object PLACEHOLDER = new Object();
+  
+  private final CharArrayMap<Object> map;
+  
+  /**
+   * Create set with enough capacity to hold startSize terms
+   * 
+   * @param startSize
+   *          the initial capacity
+   * @param ignoreCase
+   *          <code>false</code> if and only if the set should be case sensitive
+   *          otherwise <code>true</code>.
+   */
+  public CharArraySet(int startSize, boolean ignoreCase) {
+    this(new CharArrayMap<>(startSize, ignoreCase));
+  }
+
+  /**
+   * Creates a set from a Collection of objects. 
+   * 
+   * @param c
+   *          a collection whose elements to be placed into the set
+   * @param ignoreCase
+   *          <code>false</code> if and only if the set should be case sensitive
+   *          otherwise <code>true</code>.
+   */
+  public CharArraySet(Collection<?> c, boolean ignoreCase) {
+    this(c.size(), ignoreCase);
+    addAll(c);
+  }
+
+  /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
+  CharArraySet(final CharArrayMap<Object> map){
+    this.map = map;
+  }
+  
+  /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
+  @Override
+  public void clear() {
+    map.clear();
+  }
+
+  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+   * are in the set */
+  public boolean contains(char[] text, int off, int len) {
+    return map.containsKey(text, off, len);
+  }
+
+  /** true if the <code>CharSequence</code> is in the set */
+  public boolean contains(CharSequence cs) {
+    return map.containsKey(cs);
+  }
+
+  @Override
+  public boolean contains(Object o) {
+    return map.containsKey(o);
+  }
+
+  @Override
+  public boolean add(Object o) {
+    return map.put(o, PLACEHOLDER) == null;
+  }
+
+  /** Add this CharSequence into the set */
+  public boolean add(CharSequence text) {
+    return map.put(text, PLACEHOLDER) == null;
+  }
+  
+  /** Add this String into the set */
+  public boolean add(String text) {
+    return map.put(text, PLACEHOLDER) == null;
+  }
+
+  /** Add this char[] directly to the set.
+   * If ignoreCase is true for this Set, the text array will be directly modified.
+   * The user should never modify this text array after calling this method.
+   */
+  public boolean add(char[] text) {
+    return map.put(text, PLACEHOLDER) == null;
+  }
+
+  @Override
+  public int size() {
+    return map.size();
+  }
+  
+  /**
+   * Returns an unmodifiable {@link CharArraySet}. This allows to provide
+   * unmodifiable views of internal sets for "read-only" use.
+   * 
+   * @param set
+   *          a set for which the unmodifiable set is returned.
+   * @return an new unmodifiable {@link CharArraySet}.
+   * @throws NullPointerException
+   *           if the given set is <code>null</code>.
+   */
+  public static CharArraySet unmodifiableSet(CharArraySet set) {
+    if (set == null)
+      throw new NullPointerException("Given set is null");
+    if (set == EMPTY_SET)
+      return EMPTY_SET;
+    if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
+      return set;
+    return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
+  }
+
+  /**
+   * Returns a copy of the given set as a {@link CharArraySet}. If the given set
+   * is a {@link CharArraySet} the ignoreCase property will be preserved.
+   * 
+   * @param set
+   *          a set to copy
+   * @return a copy of the given set as a {@link CharArraySet}. If the given set
+   *         is a {@link CharArraySet} the ignoreCase property as well as the
+   *         matchVersion will be of the given set will be preserved.
+   */
+  public static CharArraySet copy(final Set<?> set) {
+    if(set == EMPTY_SET)
+      return EMPTY_SET;
+    if(set instanceof CharArraySet) {
+      final CharArraySet source = (CharArraySet) set;
+      return new CharArraySet(CharArrayMap.copy(source.map));
+    }
+    return new CharArraySet(set, false);
+  }
+  
+  /**
+   * Returns an {@link Iterator} for {@code char[]} instances in this set.
+   */
+  @Override @SuppressWarnings("unchecked")
+  public Iterator<Object> iterator() {
+    // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+    return map.originalKeySet().iterator();
+  }
+  
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder("[");
+    for (Object item : this) {
+      if (sb.length()>1) sb.append(", ");
+      if (item instanceof char[]) {
+        sb.append((char[]) item);
+      } else {
+        sb.append(item);
+      }
+    }
+    return sb.append(']').toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
new file mode 100644
index 0000000..e2cc47f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Utility class to write tokenizers or token filters.
+ * @lucene.internal
+ */
+public final class CharacterUtils {
+
+  private CharacterUtils() {} // no instantiation
+
+  /**
+   * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
+   * of the given bufferSize.
+   * 
+   * @param bufferSize
+   *          the internal char buffer size, must be <code>&gt;= 2</code>
+   * @return a new {@link CharacterBuffer} instance.
+   */
+  public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
+    if (bufferSize < 2) {
+      throw new IllegalArgumentException("buffersize must be >= 2");
+    }
+    return new CharacterBuffer(new char[bufferSize], 0, 0);
+  }
+  
+  
+  /**
+   * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting 
+   * at the given offset.
+   * @param buffer the char buffer to lowercase
+   * @param offset the offset to start at
+   * @param limit the max char in the buffer to lower case
+   */
+  public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
+    assert buffer.length >= limit;
+    assert offset <=0 && offset <= buffer.length;
+    for (int i = offset; i < limit;) {
+      i += Character.toChars(
+              Character.toLowerCase(
+                  Character.codePointAt(buffer, i, limit)), buffer, i);
+     }
+  }
+
+  /**
+   * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting 
+   * at the given offset.
+   * @param buffer the char buffer to UPPERCASE
+   * @param offset the offset to start at
+   * @param limit the max char in the buffer to lower case
+   */
+  public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
+    assert buffer.length >= limit;
+    assert offset <=0 && offset <= buffer.length;
+    for (int i = offset; i < limit;) {
+      i += Character.toChars(
+              Character.toUpperCase(
+                  Character.codePointAt(buffer, i, limit)), buffer, i);
+     }
+  }
+
+  /** Converts a sequence of Java characters to a sequence of unicode code points.
+   *  @return the number of code points written to the destination buffer */
+  public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+    if (srcLen < 0) {
+      throw new IllegalArgumentException("srcLen must be >= 0");
+    }
+    int codePointCount = 0;
+    for (int i = 0; i < srcLen; ) {
+      final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
+      final int charCount = Character.charCount(cp);
+      dest[destOff + codePointCount++] = cp;
+      i += charCount;
+    }
+    return codePointCount;
+  }
+
+  /** Converts a sequence of unicode code points to a sequence of Java characters.
+   *  @return the number of chars written to the destination buffer */
+  public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+    if (srcLen < 0) {
+      throw new IllegalArgumentException("srcLen must be >= 0");
+    }
+    int written = 0;
+    for (int i = 0; i < srcLen; ++i) {
+      written += Character.toChars(src[srcOff + i], dest, destOff + written);
+    }
+    return written;
+  }
+
+  /**
+   * Fills the {@link CharacterBuffer} with characters read from the given
+   * reader {@link Reader}. This method tries to read <code>numChars</code>
+   * characters into the {@link CharacterBuffer}, each call to fill will start
+   * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+   * In case code points can span across 2 java characters, this method may
+   * only fill <code>numChars - 1</code> characters in order not to split in
+   * the middle of a surrogate pair, even if there are remaining characters in
+   * the {@link Reader}.
+   * <p>
+   * This method guarantees
+   * that the given {@link CharacterBuffer} will never contain a high surrogate
+   * character as the last element in the buffer unless it is the last available
+   * character in the reader. In other words, high and low surrogate pairs will
+   * always be preserved across buffer boarders.
+   * </p>
+   * <p>
+   * A return value of <code>false</code> means that this method call exhausted
+   * the reader, but there may be some bytes which have been read, which can be
+   * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
+   * </p>
+   * 
+   * @param buffer
+   *          the buffer to fill.
+   * @param reader
+   *          the reader to read characters from.
+   * @param numChars
+   *          the number of chars to read
+   * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
+   * @throws IOException
+   *           if the reader throws an {@link IOException}.
+   */
+  public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
+    assert buffer.buffer.length >= 2;
+    if (numChars < 2 || numChars > buffer.buffer.length) {
+      throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+    }
+    final char[] charBuffer = buffer.buffer;
+    buffer.offset = 0;
+    final int offset;
+
+    // Install the previously saved ending high surrogate:
+    if (buffer.lastTrailingHighSurrogate != 0) {
+      charBuffer[0] = buffer.lastTrailingHighSurrogate;
+      buffer.lastTrailingHighSurrogate = 0;
+      offset = 1;
+    } else {
+      offset = 0;
+    }
+
+    final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+    buffer.length = offset + read;
+    final boolean result = buffer.length == numChars;
+    if (buffer.length < numChars) {
+      // We failed to fill the buffer. Even if the last char is a high
+      // surrogate, there is nothing we can do
+      return result;
+    }
+
+    if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+      buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+    }
+    return result;
+  }
+
+  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+  public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+    return fill(buffer, reader, buffer.buffer.length);
+  }
+
+  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
+    int read = 0;
+    while (read < len) {
+      final int r = reader.read(dest, offset + read, len - read);
+      if (r == -1) {
+        break;
+      }
+      read += r;
+    }
+    return read;
+  }
+
+  /**
+   * A simple IO buffer to use with
+   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
+   */
+  public static final class CharacterBuffer {
+    
+    private final char[] buffer;
+    private int offset;
+    private int length;
+    // NOTE: not private so outer class can access without
+    // $access methods:
+    char lastTrailingHighSurrogate;
+    
+    CharacterBuffer(char[] buffer, int offset, int length) {
+      this.buffer = buffer;
+      this.offset = offset;
+      this.length = length;
+    }
+    
+    /**
+     * Returns the internal buffer
+     * 
+     * @return the buffer
+     */
+    public char[] getBuffer() {
+      return buffer;
+    }
+    
+    /**
+     * Returns the data offset in the internal buffer.
+     * 
+     * @return the offset
+     */
+    public int getOffset() {
+      return offset;
+    }
+    
+    /**
+     * Return the length of the data in the internal buffer starting at
+     * {@link #getOffset()}
+     * 
+     * @return the length
+     */
+    public int getLength() {
+      return length;
+    }
+    
+    /**
+     * Resets the CharacterBuffer. All internals are reset to its default
+     * values.
+     */
+    public void reset() {
+      offset = 0;
+      length = 0;
+      lastTrailingHighSurrogate = 0;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
new file mode 100644
index 0000000..cecad10
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return a boolean if the current
+ * token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private int skippedPositions;
+
+  /**
+   * Create a new {@link FilteringTokenFilter}.
+   * @param in      the {@link TokenStream} to consume
+   */
+  public FilteringTokenFilter(TokenStream in) {
+    super(in);
+  }
+
+  /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
+  protected abstract boolean accept() throws IOException;
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    skippedPositions = 0;
+    while (input.incrementToken()) {
+      if (accept()) {
+        if (skippedPositions != 0) {
+          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+        }
+        return true;
+      }
+      skippedPositions += posIncrAtt.getPositionIncrement();
+    }
+
+    // reached EOS -- return false
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    skippedPositions = 0;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
new file mode 100644
index 0000000..b86684d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
+
+/**
+ * Normalizes token text to lower case.
+ */
+public final class LowerCaseFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  
+  /**
+   * Create a new LowerCaseFilter, that normalizes token text to lower case.
+   * 
+   * @param in TokenStream to filter
+   */
+  public LowerCaseFilter(TokenStream in) {
+    super(in);
+  }
+  
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+      return true;
+    } else
+      return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
new file mode 100644
index 0000000..79707bc
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Removes stop words from a token stream.
+ */
+public final class StopFilter extends FilteringTokenFilter {
+
+  private final CharArraySet stopWords;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  
+  /**
+   * Constructs a filter which removes words from the input TokenStream that are
+   * named in the Set.
+   * 
+   * @param in
+   *          Input stream
+   * @param stopWords
+   *          A {@link CharArraySet} representing the stopwords.
+   * @see #makeStopSet(java.lang.String...)
+   */
+  public StopFilter(TokenStream in, CharArraySet stopWords) {
+    super(in);
+    this.stopWords = stopWords;
+  }
+
+  /**
+   * Builds a Set from an array of stop words,
+   * appropriate for passing into the StopFilter constructor.
+   * This permits this stopWords construction to be cached once when
+   * an Analyzer is constructed.
+   * 
+   * @param stopWords An array of stopwords
+   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+   */
+  public static CharArraySet makeStopSet(String... stopWords) {
+    return makeStopSet(stopWords, false);
+  }
+  
+  /**
+   * Builds a Set from an array of stop words,
+   * appropriate for passing into the StopFilter constructor.
+   * This permits this stopWords construction to be cached once when
+   * an Analyzer is constructed.
+   * 
+   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+   * @return A Set ({@link CharArraySet}) containing the words
+   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+   */
+  public static CharArraySet makeStopSet(List<?> stopWords) {
+    return makeStopSet(stopWords, false);
+  }
+    
+  /**
+   * Creates a stopword set from the given stopword array.
+   * 
+   * @param stopWords An array of stopwords
+   * @param ignoreCase If true, all words are lower cased first.  
+   * @return a Set containing the words
+   */    
+  public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
+    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+    stopSet.addAll(Arrays.asList(stopWords));
+    return stopSet;
+  }
+  
+  /**
+   * Creates a stopword set from the given stopword list.
+   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+   * @param ignoreCase if true, all words are lower cased first
+   * @return A Set ({@link CharArraySet}) containing the words
+   */
+  public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
+    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
+    stopSet.addAll(stopWords);
+    return stopSet;
+  }
+  
+  /**
+   * Returns the next input Token whose term() is not a stop word.
+   */
+  @Override
+  protected boolean accept() {
+    return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
new file mode 100644
index 0000000..c35e715
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets. 
+ * 
+ */
+public abstract class StopwordAnalyzerBase extends Analyzer {
+
+  /**
+   * An immutable stopword set
+   */
+  protected final CharArraySet stopwords;
+
+  /**
+   * Returns the analyzer's stopword set or an empty set if the analyzer has no
+   * stopwords
+   * 
+   * @return the analyzer's stopword set or an empty set if the analyzer has no
+   *         stopwords
+   */
+  public CharArraySet getStopwordSet() {
+    return stopwords;
+  }
+
+  /**
+   * Creates a new instance initialized with the given stopword set
+   * 
+   * @param stopwords
+   *          the analyzer's stopword set
+   */
+  protected StopwordAnalyzerBase(final CharArraySet stopwords) {
+    // analyzers should use char array set for stopwords!
+    this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+        .unmodifiableSet(CharArraySet.copy(stopwords));
+  }
+
+  /**
+   * Creates a new Analyzer with an empty stopword set
+   */
+  protected StopwordAnalyzerBase() {
+    this(null);
+  }
+
+  /**
+   * Creates a CharArraySet from a file resource associated with a class. (See
+   * {@link Class#getResourceAsStream(String)}).
+   * 
+   * @param ignoreCase
+   *          <code>true</code> if the set should ignore the case of the
+   *          stopwords, otherwise <code>false</code>
+   * @param aClass
+   *          a class that is associated with the given stopwordResource
+   * @param resource
+   *          name of the resource file associated with the given class
+   * @param comment
+   *          comment string to ignore in the stopword file
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         file
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+      final Class<? extends Analyzer> aClass, final String resource,
+      final String comment) throws IOException {
+    Reader reader = null;
+    try {
+      reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
+      return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
+    } finally {
+      IOUtils.close(reader);
+    }
+    
+  }
+  
+  /**
+   * Creates a CharArraySet from a path.
+   * 
+   * @param stopwords
+   *          the stopwords file to load
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         file
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(Path stopwords) throws IOException {
+    Reader reader = null;
+    try {
+      reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8);
+      return WordlistLoader.getWordSet(reader);
+    } finally {
+      IOUtils.close(reader);
+    }
+  }
+  
+  /**
+   * Creates a CharArraySet from a file.
+   * 
+   * @param stopwords
+   *          the stopwords reader to load
+   * 
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         reader
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(Reader stopwords) throws IOException {
+    try {
+      return WordlistLoader.getWordSet(stopwords);
+    } finally {
+      IOUtils.close(stopwords);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
new file mode 100644
index 0000000..2397e66
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Loader for text files that represent a list of stopwords.
+ * 
+ * @see IOUtils to obtain {@link Reader} instances
+ * @lucene.internal
+ */
+public class WordlistLoader {
+  
+  private static final int INITIAL_CAPACITY = 16;
+  
+  /** no instance */
+  private WordlistLoader() {}
+  
+  /**
+   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param result the {@link CharArraySet} to fill with the readers words
+   * @return the given {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
+    BufferedReader br = null;
+    try {
+      br = getBufferedReader(reader);
+      String word = null;
+      while ((word = br.readLine()) != null) {
+        result.add(word.trim());
+      }
+    }
+    finally {
+      IOUtils.close(br);
+    }
+    return result;
+  }
+  
+  /**
+   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @return A {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader) throws IOException {
+    return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+  }
+
+  /**
+   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param comment The string representing a comment.
+   * @return A CharArraySet with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
+    return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
+  }
+
+  /**
+   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param comment The string representing a comment.
+   * @param result the {@link CharArraySet} to fill with the readers words
+   * @return the given {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
+    BufferedReader br = null;
+    try {
+      br = getBufferedReader(reader);
+      String word = null;
+      while ((word = br.readLine()) != null) {
+        if (word.startsWith(comment) == false){
+          result.add(word.trim());
+        }
+      }
+    }
+    finally {
+      IOUtils.close(br);
+    }
+    return result;
+  }
+
+  
+  /**
+   * Reads stopwords from a stopword list in Snowball format.
+   * <p>
+   * The snowball format is the following:
+   * <ul>
+   * <li>Lines may contain multiple words separated by whitespace.
+   * <li>The comment character is the vertical line (&#124;).
+   * <li>Lines may contain trailing comments.
+   * </ul>
+   * 
+   * @param reader Reader containing a Snowball stopword list
+   * @param result the {@link CharArraySet} to fill with the readers words
+   * @return the given {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
+      throws IOException {
+    BufferedReader br = null;
+    try {
+      br = getBufferedReader(reader);
+      String line = null;
+      while ((line = br.readLine()) != null) {
+        int comment = line.indexOf('|');
+        if (comment >= 0) line = line.substring(0, comment);
+        String words[] = line.split("\\s+");
+        for (int i = 0; i < words.length; i++)
+          if (words[i].length() > 0) result.add(words[i]);
+      }
+    } finally {
+      IOUtils.close(br);
+    }
+    return result;
+  }
+  
+  /**
+   * Reads stopwords from a stopword list in Snowball format.
+   * <p>
+   * The snowball format is the following:
+   * <ul>
+   * <li>Lines may contain multiple words separated by whitespace.
+   * <li>The comment character is the vertical line (&#124;).
+   * <li>Lines may contain trailing comments.
+   * </ul>
+   * 
+   * @param reader Reader containing a Snowball stopword list
+   * @return A {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
+    return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+  }
+
+
+  /**
+   * Reads a stem dictionary. Each line contains:
+   * <pre>word<b>\t</b>stem</pre>
+   * (i.e. two tab separated words)
+   *
+   * @return stem dictionary that overrules the stemming algorithm
+   * @throws IOException If there is a low-level I/O error.
+   */
+  public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
+    BufferedReader br = null;
+    try {
+      br = getBufferedReader(reader);
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] wordstem = line.split("\t", 2);
+        result.put(wordstem[0], wordstem[1]);
+      }
+    } finally {
+      IOUtils.close(br);
+    }
+    return result;
+  }
+  
+  /**
+   * Accesses a resource by name and returns the (non comment) lines containing
+   * data using the given character encoding.
+   *
+   * <p>
+   * A comment line is any line that starts with the character "#"
+   * </p>
+   *
+   * @return a list of non-blank non-comment lines with whitespace trimmed
+   * @throws IOException If there is a low-level I/O error.
+   */
+  public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
+    BufferedReader input = null;
+    ArrayList<String> lines;
+    boolean success = false;
+    try {
+      input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
+
+      lines = new ArrayList<>();
+      for (String word=null; (word=input.readLine())!=null;) {
+        // skip initial bom marker
+        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
+          word = word.substring(1);
+        // skip comments
+        if (word.startsWith("#")) continue;
+        word=word.trim();
+        // skip blank lines
+        if (word.length()==0) continue;
+        lines.add(word);
+      }
+      success = true;
+      return lines;
+    } finally {
+      if (success) {
+        IOUtils.close(input);
+      } else {
+        IOUtils.closeWhileHandlingException(input);
+      }
+    }
+  }
+  
+  private static BufferedReader getBufferedReader(Reader reader) {
+    return (reader instanceof BufferedReader) ? (BufferedReader) reader
+        : new BufferedReader(reader);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
index 511f268..81858df 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
@@ -156,7 +156,7 @@
  *   over and over in many places, you can make a subclass of
  *   {@link org.apache.lucene.analysis.Analyzer}. In fact, Apache Lucene
  *   supplies a large family of <code>Analyzer</code> classes that deliver useful
- *   analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
+ *   analysis chains. The most common of these is the <a href="{@docRoot}/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
  *   Many applications will have a long and industrious life with nothing more
  *   than the <code>StandardAnalyzer</code>. The <a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a>
  *   library provides many pre-existing analyzers for various languages.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
new file mode 100644
index 0000000..251017d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+
+/**
+ * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
+ * LowerCaseFilter} and {@link StopFilter}, using a list of
+ * English stop words.
+ */
+public final class StandardAnalyzer extends StopwordAnalyzerBase {
+
+  /** An unmodifiable set containing some common English words that are not usually useful
+  for searching.*/
+  public static final CharArraySet ENGLISH_STOP_WORDS_SET;
+  
+  static {
+    final List<String> stopWords = Arrays.asList(
+      "a", "an", "and", "are", "as", "at", "be", "but", "by",
+      "for", "if", "in", "into", "is", "it",
+      "no", "not", "of", "on", "or", "such",
+      "that", "the", "their", "then", "there", "these",
+      "they", "this", "to", "was", "will", "with"
+    );
+    final CharArraySet stopSet = new CharArraySet(stopWords, false);
+    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); 
+  }
+  
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** An unmodifiable set containing some common English words that are usually not
+  useful for searching. */
+  public static final CharArraySet STOP_WORDS_SET = ENGLISH_STOP_WORDS_SET;
+
+  /** Builds an analyzer with the given stop words.
+   * @param stopWords stop words */
+  public StandardAnalyzer(CharArraySet stopWords) {
+    super(stopWords);
+  }
+
+  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
+   */
+  public StandardAnalyzer() {
+    this(STOP_WORDS_SET);
+  }
+
+  /** Builds an analyzer with the stop words from the given reader.
+   * @see WordlistLoader#getWordSet(Reader)
+   * @param stopwords Reader to read stop words from */
+  public StandardAnalyzer(Reader stopwords) throws IOException {
+    this(loadStopwordSet(stopwords));
+  }
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * tokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+    
+  /** Returns the current maximum token length
+   * 
+   *  @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  @Override
+  protected TokenStreamComponents createComponents(final String fieldName) {
+    final StandardTokenizer src = new StandardTokenizer();
+    src.setMaxTokenLength(maxTokenLength);
+    TokenStream tok = new StandardFilter(src);
+    tok = new LowerCaseFilter(tok);
+    tok = new StopFilter(tok, stopwords);
+    return new TokenStreamComponents(src, tok) {
+      @Override
+      protected void setReader(final Reader reader) {
+        src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
+        super.setReader(reader);
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
new file mode 100644
index 0000000..202db37
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Normalizes tokens extracted with {@link StandardTokenizer}.
+ */
+public class StandardFilter extends TokenFilter {
+
+  /** Sole constructor */
+  public StandardFilter(TokenStream in) {
+    super(in);
+  }
+  
+  @Override
+  public final boolean incrementToken() throws IOException {
+    return input.incrementToken(); // TODO: add some niceties for the new grammar
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
new file mode 100644
index 0000000..5c5169a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeFactory;
+
+/** A grammar-based tokenizer constructed with JFlex.
+ * <p>
+ * This class implements the Word Break rules from the
+ * Unicode Text Segmentation algorithm, as specified in 
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>Many applications have specific tokenizer needs.  If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+
+public final class StandardTokenizer extends Tokenizer {
+  /** A private instance of the JFlex-constructed scanner */
+  private StandardTokenizerImpl scanner;
+
+  // TODO: how can we remove these old types?!
+  /** Alpha/numeric token type */
+  public static final int ALPHANUM          = 0;
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int APOSTROPHE        = 1;
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int ACRONYM           = 2;
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int COMPANY           = 3;
+  /** Email token type */
+  public static final int EMAIL             = 4;
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int HOST              = 5;
+  /** Numeric token type */
+  public static final int NUM               = 6;
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int CJ                = 7;
+
+  /** @deprecated (3.1) */
+  @Deprecated
+  public static final int ACRONYM_DEP       = 8;
+
+  /** Southeast Asian token type */
+  public static final int SOUTHEAST_ASIAN = 9;
+  /** Idiographic token type */
+  public static final int IDEOGRAPHIC = 10;
+  /** Hiragana token type */
+  public static final int HIRAGANA = 11;
+  /** Katakana token type */
+  public static final int KATAKANA = 12;
+
+  /** Hangul token type */
+  public static final int HANGUL = 13;
+  
+  /** String token types that correspond to token type int constants */
+  public static final String [] TOKEN_TYPES = new String [] {
+    "<ALPHANUM>",
+    "<APOSTROPHE>",
+    "<ACRONYM>",
+    "<COMPANY>",
+    "<EMAIL>",
+    "<HOST>",
+    "<NUM>",
+    "<CJ>",
+    "<ACRONYM_DEP>",
+    "<SOUTHEAST_ASIAN>",
+    "<IDEOGRAPHIC>",
+    "<HIRAGANA>",
+    "<KATAKANA>",
+    "<HANGUL>"
+  };
+  
+  /** Absolute maximum sized token */
+  public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+  
+  private int skippedPositions;
+
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set the max allowed token length.  No tokens longer than this are emitted.
+   * 
+   * @throws IllegalArgumentException if the given length is outside of the
+   *  range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+   */ 
+  public void setMaxTokenLength(int length) {
+    if (length < 1) {
+      throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+    } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+    }
+    if (length != maxTokenLength) {
+      maxTokenLength = length;
+      scanner.setBufferSize(length);
+    }
+  }
+
+  /** Returns the current maximum token length
+   * 
+   *  @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}.  Attaches
+   * the <code>input</code> to the newly created JFlex scanner.
+
+   * See http://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  public StandardTokenizer() {
+    init();
+  }
+
+  /**
+   * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory} 
+   */
+  public StandardTokenizer(AttributeFactory factory) {
+    super(factory);
+    init();
+  }
+
+  private void init() {
+    this.scanner = new StandardTokenizerImpl(input);
+  }
+
+  // this tokenizer generates three attributes:
+  // term offset, positionIncrement and type
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    skippedPositions = 0;
+
+    while(true) {
+      int tokenType = scanner.getNextToken();
+
+      if (tokenType == StandardTokenizerImpl.YYEOF) {
+        return false;
+      }
+
+      if (scanner.yylength() <= maxTokenLength) {
+        posIncrAtt.setPositionIncrement(skippedPositions+1);
+        scanner.getText(termAtt);
+        final int start = scanner.yychar();
+        offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
+        return true;
+      } else
+        // When we skip a too-long term, we still increment the
+        // position increment
+        skippedPositions++;
+    }
+  }
+  
+  @Override
+  public final void end() throws IOException {
+    super.end();
+    // set final offset
+    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+    // adjust any skipped tokens
+    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
+  }
+
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    scanner.yyreset(input);
+    skippedPositions = 0;
+  }
+}