You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/14 22:51:17 UTC
[05/12] lucene-solr:branch_6x: LUCENE-7318: graduate StandardAnalyzer
and make it the default for IndexWriterConfig
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
new file mode 100644
index 0000000..e7e610a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArrayMap.java
@@ -0,0 +1,669 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharacterUtils;
+
/**
 * A simple class that stores key Strings as char[]'s in a
 * hash table. Note that this is not a general purpose
 * class. For example, it cannot remove items from the
 * map, nor does it resize its hash table to be smaller,
 * etc. It is designed to be quick to retrieve items
 * by char[] keys without the necessity of converting
 * to a String first.
 */
public class CharArrayMap<V> extends AbstractMap<Object,V> {
  // private only because missing generics
  private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();

  private final static int INIT_SIZE = 8;
  private boolean ignoreCase;
  private int count; // number of occupied slots
  char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
  V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator

  /**
   * Create map with enough capacity to hold startSize terms
   *
   * @param startSize
   *          the initial capacity
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  @SuppressWarnings("unchecked")
  public CharArrayMap(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    // round the capacity up to a power of two large enough that inserting
    // startSize entries never triggers a rehash (mirrors the load check in put())
    while(startSize + (startSize>>2) > size)
      size <<= 1;
    keys = new char[size][];
    values = (V[]) new Object[size];
  }

  /**
   * Creates a map from the mappings in another map.
   *
   * @param c
   *          a map whose mappings to be copied
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) {
    this(c.size(), ignoreCase);
    putAll(c);
  }

  /** Create set from the supplied map (used internally for readonly maps...) */
  private CharArrayMap(CharArrayMap<V> toCopy){
    this.keys = toCopy.keys;
    this.values = toCopy.values;
    this.ignoreCase = toCopy.ignoreCase;
    this.count = toCopy.count;
  }

  /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
  @Override
  public void clear() {
    count = 0;
    Arrays.fill(keys, null);
    Arrays.fill(values, null);
  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   * are in the {@link #keySet()} */
  public boolean containsKey(char[] text, int off, int len) {
    return keys[getSlot(text, off, len)] != null;
  }

  /** true if the <code>CharSequence</code> is in the {@link #keySet()} */
  public boolean containsKey(CharSequence cs) {
    return keys[getSlot(cs)] != null;
  }

  @Override
  public boolean containsKey(Object o) {
    if (o instanceof char[]) {
      final char[] text = (char[])o;
      return containsKey(text, 0, text.length);
    }
    return containsKey(o.toString());
  }

  /** returns the value of the mapping of <code>len</code> chars of <code>text</code>
   * starting at <code>off</code> */
  public V get(char[] text, int off, int len) {
    return values[getSlot(text, off, len)];
  }

  /** returns the value of the mapping of the chars inside this {@code CharSequence} */
  public V get(CharSequence cs) {
    return values[getSlot(cs)];
  }

  @Override
  public V get(Object o) {
    if (o instanceof char[]) {
      final char[] text = (char[])o;
      return get(text, 0, text.length);
    }
    return get(o.toString());
  }

  /** Returns the slot holding an equal key, or an empty slot if the key is
   * absent (open addressing; probing never fails because the table is never full). */
  private int getSlot(char[] text, int off, int len) {
    int code = getHashCode(text, off, len);
    int pos = code & (keys.length-1);
    char[] text2 = keys[pos];
    if (text2 != null && !equals(text, off, len, text2)) {
      // probe step derived from the hash, forced odd so it is co-prime with
      // the power-of-two table size and the probe sequence visits every slot
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (keys.length-1);
        text2 = keys[pos];
      } while (text2 != null && !equals(text, off, len, text2));
    }
    return pos;
  }

  /** Returns the slot holding an equal key, or an empty slot if the key is
   * absent (open addressing, same probing as {@code getSlot(char[],int,int)}). */
  private int getSlot(CharSequence text) {
    int code = getHashCode(text);
    int pos = code & (keys.length-1);
    char[] text2 = keys[pos];
    if (text2 != null && !equals(text, text2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (keys.length-1);
        text2 = keys[pos];
      } while (text2 != null && !equals(text, text2));
    }
    return pos;
  }

  /** Add the given mapping. */
  public V put(CharSequence text, V value) {
    return put(text.toString(), value); // could be more efficient
  }

  @Override
  public V put(Object o, V value) {
    if (o instanceof char[]) {
      return put((char[])o, value);
    }
    return put(o.toString(), value);
  }

  /** Add the given mapping. */
  public V put(String text, V value) {
    return put(text.toCharArray(), value);
  }

  /** Add the given mapping.
   * If ignoreCase is true for this Set, the text array will be directly modified.
   * The user should never modify this text array after calling this method.
   */
  public V put(char[] text, V value) {
    if (ignoreCase) {
      CharacterUtils.toLowerCase(text, 0, text.length);
    }
    int slot = getSlot(text, 0, text.length);
    if (keys[slot] != null) {
      final V oldValue = values[slot];
      values[slot] = value;
      return oldValue;
    }
    keys[slot] = text;
    values[slot] = value;
    count++;

    // grow when occupancy exceeds ~80% (count * 1.25 > capacity)
    if (count + (count>>2) > keys.length) {
      rehash();
    }

    return null;
  }

  /** Doubles the table size and re-inserts all existing entries. */
  @SuppressWarnings("unchecked")
  private void rehash() {
    assert keys.length == values.length;
    final int newSize = 2*keys.length;
    final char[][] oldkeys = keys;
    final V[] oldvalues = values;
    keys = new char[newSize][];
    values = (V[]) new Object[newSize];

    for(int i=0; i<oldkeys.length; i++) {
      char[] text = oldkeys[i];
      if (text != null) {
        // todo: could be faster... no need to compare strings on collision
        final int slot = getSlot(text,0,text.length);
        keys[slot] = text;
        values[slot] = oldvalues[i];
      }
    }
  }

  /** Compares a char range against a stored key; when ignoreCase is set, text1
   * is lowercased per code point (stored keys are already lowercased by put). */
  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length)
      return false;
    final int limit = off+len;
    if (ignoreCase) {
      for(int i=0;i<len;) {
        final int codePointAt = Character.codePointAt(text1, off+i, limit);
        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
          return false;
        i += Character.charCount(codePointAt);
      }
    } else {
      for(int i=0;i<len;i++) {
        if (text1[off+i] != text2[i])
          return false;
      }
    }
    return true;
  }

  /** Same comparison as above for a CharSequence key. */
  private boolean equals(CharSequence text1, char[] text2) {
    int len = text1.length();
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;) {
        final int codePointAt = Character.codePointAt(text1, i);
        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
          return false;
        i += Character.charCount(codePointAt);
      }
    } else {
      for(int i=0;i<len;i++) {
        if (text1.charAt(i) != text2[i])
          return false;
      }
    }
    return true;
  }

  /** Hash of the char range; per-code-point and lowercased when ignoreCase is
   * set, so it is consistent with {@code equals(char[],int,int,char[])}. */
  private int getHashCode(char[] text, int offset, int len) {
    if (text == null)
      throw new NullPointerException();
    int code = 0;
    final int stop = offset + len;
    if (ignoreCase) {
      for (int i=offset; i<stop;) {
        final int codePointAt = Character.codePointAt(text, i, stop);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
    } else {
      for (int i=offset; i<stop; i++) {
        code = code*31 + text[i];
      }
    }
    return code;
  }

  /** Same hash as above for a CharSequence key. */
  private int getHashCode(CharSequence text) {
    if (text == null)
      throw new NullPointerException();
    int code = 0;
    int len = text.length();
    if (ignoreCase) {
      for (int i=0; i<len;) {
        int codePointAt = Character.codePointAt(text, i);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
    } else {
      for (int i=0; i<len; i++) {
        code = code*31 + text.charAt(i);
      }
    }
    return code;
  }

  @Override
  public V remove(Object key) {
    throw new UnsupportedOperationException();
  }

  @Override
  public int size() {
    return count;
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder("{");
    for (Map.Entry<Object,V> entry : entrySet()) {
      if (sb.length()>1) sb.append(", ");
      sb.append(entry);
    }
    return sb.append('}').toString();
  }

  // lazily created views, cached after first request
  private EntrySet entrySet = null;
  private CharArraySet keySet = null;

  // overridden by UnmodifiableCharArrayMap to return a read-only entry set
  EntrySet createEntrySet() {
    return new EntrySet(true);
  }

  @Override
  public final EntrySet entrySet() {
    if (entrySet == null) {
      entrySet = createEntrySet();
    }
    return entrySet;
  }

  // helper for CharArraySet to not produce endless recursion
  final Set<Object> originalKeySet() {
    return super.keySet();
  }

  /** Returns an {@link CharArraySet} view on the map's keys.
   * The set shares this map's {@code ignoreCase} behavior. */
  @Override @SuppressWarnings({"unchecked","rawtypes"})
  public final CharArraySet keySet() {
    if (keySet == null) {
      // prevent adding of entries
      keySet = new CharArraySet((CharArrayMap) this) {
        @Override
        public boolean add(Object o) {
          throw new UnsupportedOperationException();
        }
        @Override
        public boolean add(CharSequence text) {
          throw new UnsupportedOperationException();
        }
        @Override
        public boolean add(String text) {
          throw new UnsupportedOperationException();
        }
        @Override
        public boolean add(char[] text) {
          throw new UnsupportedOperationException();
        }
      };
    }
    return keySet;
  }

  /** public iterator class so efficient methods are exposed to users */
  public class EntryIterator implements Iterator<Map.Entry<Object,V>> {
    private int pos=-1;      // index of the next occupied slot (or keys.length when exhausted)
    private int lastPos;     // index of the slot most recently returned
    private final boolean allowModify;

    private EntryIterator(boolean allowModify) {
      this.allowModify = allowModify;
      goNext();
    }

    // advance pos to the next non-null key, remembering the previous position
    private void goNext() {
      lastPos = pos;
      pos++;
      while (pos < keys.length && keys[pos] == null) pos++;
    }

    @Override
    public boolean hasNext() {
      return pos < keys.length;
    }

    /** gets the next key... do not modify the returned char[] */
    public char[] nextKey() {
      goNext();
      return keys[lastPos];
    }

    /** gets the next key as a newly created String object */
    public String nextKeyString() {
      return new String(nextKey());
    }

    /** returns the value associated with the last key returned */
    public V currentValue() {
      return values[lastPos];
    }

    /** sets the value associated with the last key returned */
    public V setValue(V value) {
      if (!allowModify)
        throw new UnsupportedOperationException();
      V old = values[lastPos];
      values[lastPos] = value;
      return old;
    }

    /** use nextCharArray() + currentValue() for better efficiency. */
    @Override
    public Map.Entry<Object,V> next() {
      goNext();
      return new MapEntry(lastPos, allowModify);
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /** Entry view over a single slot of the backing arrays. */
  private final class MapEntry implements Map.Entry<Object,V> {
    private final int pos;
    private final boolean allowModify;

    private MapEntry(int pos, boolean allowModify) {
      this.pos = pos;
      this.allowModify = allowModify;
    }

    @Override
    public Object getKey() {
      // we must clone here, as putAll to another CharArrayMap
      // with other case sensitivity flag would corrupt the keys
      return keys[pos].clone();
    }

    @Override
    public V getValue() {
      return values[pos];
    }

    @Override
    public V setValue(V value) {
      if (!allowModify)
        throw new UnsupportedOperationException();
      final V old = values[pos];
      values[pos] = value;
      return old;
    }

    @Override
    public String toString() {
      return new StringBuilder().append(keys[pos]).append('=')
        .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos])
        .toString();
    }
  }

  /** public EntrySet class so efficient methods are exposed to users */
  public final class EntrySet extends AbstractSet<Map.Entry<Object,V>> {
    private final boolean allowModify;

    private EntrySet(boolean allowModify) {
      this.allowModify = allowModify;
    }

    @Override
    public EntryIterator iterator() {
      return new EntryIterator(allowModify);
    }

    @Override
    @SuppressWarnings("unchecked")
    public boolean contains(Object o) {
      if (!(o instanceof Map.Entry))
        return false;
      final Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
      final Object key = e.getKey();
      final Object val = e.getValue();
      final Object v = get(key);
      return v == null ? val == null : v.equals(val);
    }

    @Override
    public boolean remove(Object o) {
      throw new UnsupportedOperationException();
    }

    @Override
    public int size() {
      return count;
    }

    @Override
    public void clear() {
      if (!allowModify)
        throw new UnsupportedOperationException();
      CharArrayMap.this.clear();
    }
  }

  /**
   * Returns an unmodifiable {@link CharArrayMap}. This allows providing
   * unmodifiable views of internal map for "read-only" use.
   *
   * @param map
   *          a map for which the unmodifiable map is returned.
   * @return a new unmodifiable {@link CharArrayMap}.
   * @throws NullPointerException
   *           if the given map is <code>null</code>.
   */
  public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
    if (map == null)
      throw new NullPointerException("Given map is null");
    if (map == emptyMap() || map.isEmpty())
      return emptyMap();
    if (map instanceof UnmodifiableCharArrayMap)
      return map;
    return new UnmodifiableCharArrayMap<>(map);
  }

  /**
   * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
   * is a {@link CharArrayMap} the ignoreCase property will be preserved.
   *
   * @param map
   *          a map to copy
   * @return a copy of the given map as a {@link CharArrayMap}. If the given map
   *         is a {@link CharArrayMap} the ignoreCase property of the given map
   *         will be preserved.
   */
  @SuppressWarnings("unchecked")
  public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) {
    if(map == EMPTY_MAP)
      return emptyMap();
    if(map instanceof CharArrayMap) {
      CharArrayMap<V> m = (CharArrayMap<V>) map;
      // use fast path instead of iterating all values
      // this is even on very small sets ~10 times faster than iterating
      final char[][] keys = new char[m.keys.length][];
      System.arraycopy(m.keys, 0, keys, 0, keys.length);
      final V[] values = (V[]) new Object[m.values.length];
      System.arraycopy(m.values, 0, values, 0, values.length);
      m = new CharArrayMap<>(m);
      m.keys = keys;
      m.values = values;
      return m;
    }
    return new CharArrayMap<>(map, false);
  }

  /** Returns an empty, unmodifiable map. */
  @SuppressWarnings("unchecked")
  public static <V> CharArrayMap<V> emptyMap() {
    return (CharArrayMap<V>) EMPTY_MAP;
  }

  // package private CharArraySet instanceof check in CharArraySet
  static class UnmodifiableCharArrayMap<V> extends CharArrayMap<V> {

    UnmodifiableCharArrayMap(CharArrayMap<V> map) {
      super(map);
    }

    @Override
    public void clear() {
      throw new UnsupportedOperationException();
    }

    @Override
    public V put(Object o, V val){
      throw new UnsupportedOperationException();
    }

    @Override
    public V put(char[] text, V val) {
      throw new UnsupportedOperationException();
    }

    @Override
    public V put(CharSequence text, V val) {
      throw new UnsupportedOperationException();
    }

    @Override
    public V put(String text, V val) {
      throw new UnsupportedOperationException();
    }

    @Override
    public V remove(Object key) {
      throw new UnsupportedOperationException();
    }

    @Override
    EntrySet createEntrySet() {
      return new EntrySet(false);
    }
  }

  /**
   * Empty {@link org.apache.lucene.analysis.CharArrayMap.UnmodifiableCharArrayMap} optimized for speed.
   * Contains checks will always return <code>false</code> or throw
   * NPE if necessary.
   */
  private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
    EmptyCharArrayMap() {
      super(new CharArrayMap<V>(0, false));
    }

    @Override
    public boolean containsKey(char[] text, int off, int len) {
      if(text == null)
        throw new NullPointerException();
      return false;
    }

    @Override
    public boolean containsKey(CharSequence cs) {
      if(cs == null)
        throw new NullPointerException();
      return false;
    }

    @Override
    public boolean containsKey(Object o) {
      if(o == null)
        throw new NullPointerException();
      return false;
    }

    @Override
    public V get(char[] text, int off, int len) {
      if(text == null)
        throw new NullPointerException();
      return null;
    }

    @Override
    public V get(CharSequence cs) {
      if(cs == null)
        throw new NullPointerException();
      return null;
    }

    @Override
    public V get(Object o) {
      if(o == null)
        throw new NullPointerException();
      return null;
    }
  }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
new file mode 100644
index 0000000..4c8066a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharArraySet.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
/**
 * A simple class that stores Strings as char[]'s in a
 * hash table. Note that this is not a general purpose
 * class. For example, it cannot remove items from the
 * set, nor does it resize its hash table to be smaller,
 * etc. It is designed to be quick to test if a char[]
 * is in the set without the necessity of converting it
 * to a String first.
 *
 * <P>
 * <em>Please note:</em> This class implements {@link java.util.Set Set} but
 * does not behave like it should in all cases. The generic type is
 * {@code Set<Object>}, because you can add any object to it,
 * that has a string representation. The add methods will use
 * {@link Object#toString} and store the result using a {@code char[]}
 * buffer. The {@code contains()} methods behave the same way.
 * The {@link #iterator()} returns an {@code Iterator<char[]>}.
 */
public class CharArraySet extends AbstractSet<Object> {

  /** An empty {@code CharArraySet}. */
  public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());

  // singleton value stored for every key in the backing map; add() detects
  // insertion by checking whether put() returned a previous value
  private static final Object PLACEHOLDER = new Object();

  // all storage and lookup is delegated to this map (keys only, values are PLACEHOLDER)
  private final CharArrayMap<Object> map;

  /**
   * Create set with enough capacity to hold startSize terms
   *
   * @param startSize
   *          the initial capacity
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this(new CharArrayMap<>(startSize, ignoreCase));
  }

  /**
   * Creates a set from a Collection of objects.
   *
   * @param c
   *          a collection whose elements to be placed into the set
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  public CharArraySet(Collection<?> c, boolean ignoreCase) {
    this(c.size(), ignoreCase);
    addAll(c);
  }

  /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
  CharArraySet(final CharArrayMap<Object> map){
    this.map = map;
  }

  /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
  @Override
  public void clear() {
    map.clear();
  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   * are in the set */
  public boolean contains(char[] text, int off, int len) {
    return map.containsKey(text, off, len);
  }

  /** true if the <code>CharSequence</code> is in the set */
  public boolean contains(CharSequence cs) {
    return map.containsKey(cs);
  }

  @Override
  public boolean contains(Object o) {
    return map.containsKey(o);
  }

  @Override
  public boolean add(Object o) {
    return map.put(o, PLACEHOLDER) == null;
  }

  /** Add this CharSequence into the set */
  public boolean add(CharSequence text) {
    return map.put(text, PLACEHOLDER) == null;
  }

  /** Add this String into the set */
  public boolean add(String text) {
    return map.put(text, PLACEHOLDER) == null;
  }

  /** Add this char[] directly to the set.
   * If ignoreCase is true for this Set, the text array will be directly modified.
   * The user should never modify this text array after calling this method.
   */
  public boolean add(char[] text) {
    return map.put(text, PLACEHOLDER) == null;
  }

  @Override
  public int size() {
    return map.size();
  }

  /**
   * Returns an unmodifiable {@link CharArraySet}. This allows providing
   * unmodifiable views of internal sets for "read-only" use.
   *
   * @param set
   *          a set for which the unmodifiable set is returned.
   * @return a new unmodifiable {@link CharArraySet}.
   * @throws NullPointerException
   *           if the given set is <code>null</code>.
   */
  public static CharArraySet unmodifiableSet(CharArraySet set) {
    if (set == null)
      throw new NullPointerException("Given set is null");
    if (set == EMPTY_SET)
      return EMPTY_SET;
    if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
      return set;
    return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
  }

  /**
   * Returns a copy of the given set as a {@link CharArraySet}. If the given set
   * is a {@link CharArraySet} the ignoreCase property will be preserved.
   *
   * @param set
   *          a set to copy
   * @return a copy of the given set as a {@link CharArraySet}. If the given set
   *         is a {@link CharArraySet} the ignoreCase property of the given set
   *         will be preserved.
   */
  public static CharArraySet copy(final Set<?> set) {
    if(set == EMPTY_SET)
      return EMPTY_SET;
    if(set instanceof CharArraySet) {
      final CharArraySet source = (CharArraySet) set;
      return new CharArraySet(CharArrayMap.copy(source.map));
    }
    return new CharArraySet(set, false);
  }

  /**
   * Returns an {@link Iterator} for {@code char[]} instances in this set.
   */
  @Override @SuppressWarnings("unchecked")
  public Iterator<Object> iterator() {
    // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
    return map.originalKeySet().iterator();
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder("[");
    for (Object item : this) {
      if (sb.length()>1) sb.append(", ");
      if (item instanceof char[]) {
        sb.append((char[]) item);
      } else {
        sb.append(item);
      }
    }
    return sb.append(']').toString();
  }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
new file mode 100644
index 0000000..e2cc47f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
/**
 * Utility class to write tokenizers or token filters.
 * @lucene.internal
 */
public final class CharacterUtils {

  private CharacterUtils() {} // no instantiation

  /**
   * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
   * of the given bufferSize.
   *
   * @param bufferSize
   *          the internal char buffer size, must be <code>&gt;= 2</code>
   * @return a new {@link CharacterBuffer} instance.
   */
  public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
    if (bufferSize < 2) {
      throw new IllegalArgumentException("buffersize must be >= 2");
    }
    return new CharacterBuffer(new char[bufferSize], 0, 0);
  }


  /**
   * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
   * at the given offset.
   * @param buffer the char buffer to lowercase
   * @param offset the offset to start at
   * @param limit the max char in the buffer to lower case
   */
  public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    // bugfix: was "offset <= 0", which (with -ea) rejected every legal
    // non-zero offset; the intended precondition is a valid start index
    assert 0 <= offset && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      // toChars writes in place and returns the number of chars consumed,
      // so surrogate pairs advance i by 2
      i += Character.toChars(
              Character.toLowerCase(
                  Character.codePointAt(buffer, i, limit)), buffer, i);
    }
  }

  /**
   * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
   * at the given offset.
   * @param buffer the char buffer to UPPERCASE
   * @param offset the offset to start at
   * @param limit the max char in the buffer to upper case
   */
  public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    // bugfix: was "offset <= 0" — same wrong precondition as toLowerCase
    assert 0 <= offset && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
              Character.toUpperCase(
                  Character.codePointAt(buffer, i, limit)), buffer, i);
    }
  }

  /** Converts a sequence of Java characters to a sequence of unicode code points.
   * @return the number of code points written to the destination buffer */
  public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int codePointCount = 0;
    for (int i = 0; i < srcLen; ) {
      final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
      final int charCount = Character.charCount(cp);
      dest[destOff + codePointCount++] = cp;
      i += charCount;
    }
    return codePointCount;
  }

  /** Converts a sequence of unicode code points to a sequence of Java characters.
   * @return the number of chars written to the destination buffer */
  public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int written = 0;
    for (int i = 0; i < srcLen; ++i) {
      written += Character.toChars(src[srcOff + i], dest, destOff + written);
    }
    return written;
  }

  /**
   * Fills the {@link CharacterBuffer} with characters read from the given
   * reader {@link Reader}. This method tries to read <code>numChars</code>
   * characters into the {@link CharacterBuffer}, each call to fill will start
   * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
   * In case code points can span across 2 java characters, this method may
   * only fill <code>numChars - 1</code> characters in order not to split in
   * the middle of a surrogate pair, even if there are remaining characters in
   * the {@link Reader}.
   * <p>
   * This method guarantees
   * that the given {@link CharacterBuffer} will never contain a high surrogate
   * character as the last element in the buffer unless it is the last available
   * character in the reader. In other words, high and low surrogate pairs will
   * always be preserved across buffer borders.
   * </p>
   * <p>
   * A return value of <code>false</code> means that this method call exhausted
   * the reader, but there may be some bytes which have been read, which can be
   * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
   * </p>
   *
   * @param buffer
   *          the buffer to fill.
   * @param reader
   *          the reader to read characters from.
   * @param numChars
   *          the number of chars to read
   * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
   * @throws IOException
   *           if the reader throws an {@link IOException}.
   */
  public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
    assert buffer.buffer.length >= 2;
    if (numChars < 2 || numChars > buffer.buffer.length) {
      throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
    }
    final char[] charBuffer = buffer.buffer;
    buffer.offset = 0;
    final int offset;

    // Install the previously saved ending high surrogate:
    if (buffer.lastTrailingHighSurrogate != 0) {
      charBuffer[0] = buffer.lastTrailingHighSurrogate;
      buffer.lastTrailingHighSurrogate = 0;
      offset = 1;
    } else {
      offset = 0;
    }

    final int read = readFully(reader, charBuffer, offset, numChars - offset);

    buffer.length = offset + read;
    final boolean result = buffer.length == numChars;
    if (buffer.length < numChars) {
      // We failed to fill the buffer. Even if the last char is a high
      // surrogate, there is nothing we can do
      return result;
    }

    // never end the buffer on a dangling high surrogate: hold it back for
    // the next fill() so surrogate pairs are preserved across buffer borders
    if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
      buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
    }
    return result;
  }

  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
  public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
    return fill(buffer, reader, buffer.buffer.length);
  }

  /** Reads up to len chars, looping until the reader is exhausted or len is reached;
   * returns the number of chars actually read (may be less than len at end of stream). */
  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
    int read = 0;
    while (read < len) {
      final int r = reader.read(dest, offset + read, len - read);
      if (r == -1) {
        break;
      }
      read += r;
    }
    return read;
  }

  /**
   * A simple IO buffer to use with
   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
   */
  public static final class CharacterBuffer {

    private final char[] buffer;
    private int offset;
    private int length;
    // NOTE: not private so outer class can access without
    // $access methods:
    char lastTrailingHighSurrogate;

    CharacterBuffer(char[] buffer, int offset, int length) {
      this.buffer = buffer;
      this.offset = offset;
      this.length = length;
    }

    /**
     * Returns the internal buffer
     *
     * @return the buffer
     */
    public char[] getBuffer() {
      return buffer;
    }

    /**
     * Returns the data offset in the internal buffer.
     *
     * @return the offset
     */
    public int getOffset() {
      return offset;
    }

    /**
     * Return the length of the data in the internal buffer starting at
     * {@link #getOffset()}
     *
     * @return the length
     */
    public int getLength() {
      return length;
    }

    /**
     * Resets the CharacterBuffer. All internals are reset to its default
     * values.
     */
    public void reset() {
      offset = 0;
      length = 0;
      lastTrailingHighSurrogate = 0;
    }
  }

}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
new file mode 100644
index 0000000..cecad10
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/FilteringTokenFilter.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return a boolean indicating
+ * whether the current token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private int skippedPositions;
+
+ /**
+ * Create a new {@link FilteringTokenFilter}.
+ * @param in the {@link TokenStream} to consume
+ */
+ public FilteringTokenFilter(TokenStream in) {
+ super(in);
+ }
+
+ /** Override this method and return true if the current input token should be returned by {@link #incrementToken}. */
+ protected abstract boolean accept() throws IOException;
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.getPositionIncrement();
+ }
+
+ // reached EOS -- return false
+ return false;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ skippedPositions = 0;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
new file mode 100644
index 0000000..b86684d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
+
+/**
+ * Normalizes token text to lower case.
+ */
+public final class LowerCaseFilter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Create a new LowerCaseFilter, that normalizes token text to lower case.
+ *
+ * @param in TokenStream to filter
+ */
+ public LowerCaseFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+ return true;
+ } else
+ return false;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
new file mode 100644
index 0000000..79707bc
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Removes stop words from a token stream.
+ */
+public final class StopFilter extends FilteringTokenFilter {
+
+ private final CharArraySet stopWords;
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Constructs a filter which removes words from the input TokenStream that are
+ * named in the Set.
+ *
+ * @param in
+ * Input stream
+ * @param stopWords
+ * A {@link CharArraySet} representing the stopwords.
+ * @see #makeStopSet(java.lang.String...)
+ */
+ public StopFilter(TokenStream in, CharArraySet stopWords) {
+ super(in);
+ this.stopWords = stopWords;
+ }
+
+ /**
+ * Builds a Set from an array of stop words,
+ * appropriate for passing into the StopFilter constructor.
+ * This permits this stopWords construction to be cached once when
+ * an Analyzer is constructed.
+ *
+ * @param stopWords An array of stopwords
+ * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+ */
+ public static CharArraySet makeStopSet(String... stopWords) {
+ return makeStopSet(stopWords, false);
+ }
+
+ /**
+ * Builds a Set from an array of stop words,
+ * appropriate for passing into the StopFilter constructor.
+ * This permits this stopWords construction to be cached once when
+ * an Analyzer is constructed.
+ *
+ * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+ * @return A Set ({@link CharArraySet}) containing the words
+ * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+ */
+ public static CharArraySet makeStopSet(List<?> stopWords) {
+ return makeStopSet(stopWords, false);
+ }
+
+ /**
+ * Creates a stopword set from the given stopword array.
+ *
+ * @param stopWords An array of stopwords
+ * @param ignoreCase If true, all words are lower cased first.
+ * @return a Set containing the words
+ */
+ public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
+ CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+ stopSet.addAll(Arrays.asList(stopWords));
+ return stopSet;
+ }
+
+ /**
+ * Creates a stopword set from the given stopword list.
+ * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
+ * @param ignoreCase if true, all words are lower cased first
+ * @return A Set ({@link CharArraySet}) containing the words
+ */
+ public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
+ CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
+ stopSet.addAll(stopWords);
+ return stopSet;
+ }
+
+ /**
+ * Returns the next input Token whose term() is not a stop word.
+ */
+ @Override
+ protected boolean accept() {
+ return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
new file mode 100644
index 0000000..c35e715
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets.
+ *
+ */
+public abstract class StopwordAnalyzerBase extends Analyzer {
+
+ /**
+ * An immutable stopword set
+ */
+ protected final CharArraySet stopwords;
+
+ /**
+ * Returns the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ *
+ * @return the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ */
+ public CharArraySet getStopwordSet() {
+ return stopwords;
+ }
+
+ /**
+ * Creates a new instance initialized with the given stopword set
+ *
+ * @param stopwords
+ * the analyzer's stopword set
+ */
+ protected StopwordAnalyzerBase(final CharArraySet stopwords) {
+ // analyzers should use char array set for stopwords!
+ this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+ .unmodifiableSet(CharArraySet.copy(stopwords));
+ }
+
+ /**
+ * Creates a new Analyzer with an empty stopword set
+ */
+ protected StopwordAnalyzerBase() {
+ this(null);
+ }
+
+ /**
+ * Creates a CharArraySet from a file resource associated with a class. (See
+ * {@link Class#getResourceAsStream(String)}).
+ *
+ * @param ignoreCase
+ * <code>true</code> if the set should ignore the case of the
+ * stopwords, otherwise <code>false</code>
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param resource
+ * name of the resource file associated with the given class
+ * @param comment
+ * comment string to ignore in the stopword file
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+ final Class<? extends Analyzer> aClass, final String resource,
+ final String comment) throws IOException {
+ Reader reader = null;
+ try {
+ reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
+ return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
+ } finally {
+ IOUtils.close(reader);
+ }
+
+ }
+
+ /**
+ * Creates a CharArraySet from a path.
+ *
+ * @param stopwords
+ * the stopwords file to load
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(Path stopwords) throws IOException {
+ Reader reader = null;
+ try {
+ reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8);
+ return WordlistLoader.getWordSet(reader);
+ } finally {
+ IOUtils.close(reader);
+ }
+ }
+
+ /**
+ * Creates a CharArraySet from a file.
+ *
+ * @param stopwords
+ * the stopwords reader to load
+ *
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * reader
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(Reader stopwords) throws IOException {
+ try {
+ return WordlistLoader.getWordSet(stopwords);
+ } finally {
+ IOUtils.close(stopwords);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
new file mode 100644
index 0000000..2397e66
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Loader for text files that represent a list of stopwords.
+ *
+ * @see IOUtils to obtain {@link Reader} instances
+ * @lucene.internal
+ */
+public class WordlistLoader {
+
+ private static final int INITIAL_CAPACITY = 16;
+
+ /** no instance */
+ private WordlistLoader() {}
+
+ /**
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ result.add(word.trim());
+ }
+ }
+ finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @return A {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader) throws IOException {
+ return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+ /**
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param comment The string representing a comment.
+ * @return A CharArraySet with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
+ return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+ /**
+ * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param comment The string representing a comment.
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ if (word.startsWith(comment) == false){
+ result.add(word.trim());
+ }
+ }
+ }
+ finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @param result the {@link CharArraySet} to fill with the readers words
+ * @return the given {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
+ throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ int comment = line.indexOf('|');
+ if (comment >= 0) line = line.substring(0, comment);
+ String words[] = line.split("\\s+");
+ for (int i = 0; i < words.length; i++)
+ if (words[i].length() > 0) result.add(words[i]);
+ }
+ } finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @return A {@link CharArraySet} with the reader's words
+ */
+ public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
+ return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
+ }
+
+
+ /**
+ * Reads a stem dictionary. Each line contains:
+ * <pre>word<b>\t</b>stem</pre>
+ * (i.e. two tab separated words)
+ *
+ * @return stem dictionary that overrules the stemming algorithm
+ * @throws IOException If there is a low-level I/O error.
+ */
+ public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
+ BufferedReader br = null;
+ try {
+ br = getBufferedReader(reader);
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] wordstem = line.split("\t", 2);
+ result.put(wordstem[0], wordstem[1]);
+ }
+ } finally {
+ IOUtils.close(br);
+ }
+ return result;
+ }
+
+ /**
+ * Accesses a resource by name and returns the (non comment) lines containing
+ * data using the given character encoding.
+ *
+ * <p>
+ * A comment line is any line that starts with the character "#"
+ * </p>
+ *
+ * @return a list of non-blank non-comment lines with whitespace trimmed
+ * @throws IOException If there is a low-level I/O error.
+ */
+ public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
+ BufferedReader input = null;
+ ArrayList<String> lines;
+ boolean success = false;
+ try {
+ input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
+
+ lines = new ArrayList<>();
+ for (String word=null; (word=input.readLine())!=null;) {
+ // skip initial bom marker
+ if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
+ word = word.substring(1);
+ // skip comments
+ if (word.startsWith("#")) continue;
+ word=word.trim();
+ // skip blank lines
+ if (word.length()==0) continue;
+ lines.add(word);
+ }
+ success = true;
+ return lines;
+ } finally {
+ if (success) {
+ IOUtils.close(input);
+ } else {
+ IOUtils.closeWhileHandlingException(input);
+ }
+ }
+ }
+
+ private static BufferedReader getBufferedReader(Reader reader) {
+ return (reader instanceof BufferedReader) ? (BufferedReader) reader
+ : new BufferedReader(reader);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
index 511f268..81858df 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/package-info.java
@@ -156,7 +156,7 @@
* over and over in many places, you can make a subclass of
* {@link org.apache.lucene.analysis.Analyzer}. In fact, Apache Lucene
* supplies a large family of <code>Analyzer</code> classes that deliver useful
- * analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
+ * analysis chains. The most common of these is the <a href="{@docRoot}/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
* Many applications will have a long and industrious life with nothing more
* than the <code>StandardAnalyzer</code>. The <a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a>
* library provides many pre-existing analyzers for various languages.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
new file mode 100644
index 0000000..251017d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+
+/**
+ * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
+ * LowerCaseFilter} and {@link StopFilter}, using a list of
+ * English stop words.
+ */
+public final class StandardAnalyzer extends StopwordAnalyzerBase {
+
+ /** An unmodifiable set containing some common English words that are not usually useful
+ for searching.*/
+ public static final CharArraySet ENGLISH_STOP_WORDS_SET;
+
+ static {
+ final List<String> stopWords = Arrays.asList(
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ );
+ final CharArraySet stopSet = new CharArraySet(stopWords, false);
+ ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+ }
+
+ /** Default maximum allowed token length */
+ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /** An unmodifiable set containing some common English words that are usually not
+ useful for searching. */
+ public static final CharArraySet STOP_WORDS_SET = ENGLISH_STOP_WORDS_SET;
+
+ /** Builds an analyzer with the given stop words.
+ * @param stopWords stop words */
+ public StandardAnalyzer(CharArraySet stopWords) {
+ super(stopWords);
+ }
+
+ /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
+ */
+ public StandardAnalyzer() {
+ this(STOP_WORDS_SET);
+ }
+
+ /** Builds an analyzer with the stop words from the given reader.
+ * @see WordlistLoader#getWordSet(Reader)
+ * @param stopwords Reader to read stop words from */
+ public StandardAnalyzer(Reader stopwords) throws IOException {
+ this(loadStopwordSet(stopwords));
+ }
+
+ /**
+ * Set maximum allowed token length. If a token is seen
+ * that exceeds this length then it is discarded. This
+ * setting only takes effect the next time tokenStream
+ * is called.
+ */
+ public void setMaxTokenLength(int length) {
+ maxTokenLength = length;
+ }
+
+ /** Returns the current maximum token length
+ *
+ * @see #setMaxTokenLength */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(final String fieldName) {
+ final StandardTokenizer src = new StandardTokenizer();
+ src.setMaxTokenLength(maxTokenLength);
+ TokenStream tok = new StandardFilter(src);
+ tok = new LowerCaseFilter(tok);
+ tok = new StopFilter(tok, stopwords);
+ return new TokenStreamComponents(src, tok) {
+ @Override
+ protected void setReader(final Reader reader) {
+ src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength);
+ super.setReader(reader);
+ }
+ };
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
new file mode 100644
index 0000000..202db37
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Normalizes tokens extracted with {@link StandardTokenizer}.
+ */
+public class StandardFilter extends TokenFilter {
+
+ /** Sole constructor */
+ public StandardFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ return input.incrementToken(); // TODO: add some niceties for the new grammar
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ba922148/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
new file mode 100644
index 0000000..5c5169a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.standard;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeFactory;
+
+/** A grammar-based tokenizer constructed with JFlex.
+ * <p>
+ * This class implements the Word Break rules from the
+ * Unicode Text Segmentation algorithm, as specified in
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+
+public final class StandardTokenizer extends Tokenizer {
+ /** A private instance of the JFlex-constructed scanner */
+ private StandardTokenizerImpl scanner;
+
+ // TODO: how can we remove these old types?!
+ // NOTE: these int constants double as indices into TOKEN_TYPES below;
+ // the constant values and the array order must stay in sync.
+ /** Alpha/numeric token type */
+ public static final int ALPHANUM = 0;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int APOSTROPHE = 1;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int ACRONYM = 2;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int COMPANY = 3;
+ /** Email token type */
+ public static final int EMAIL = 4;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int HOST = 5;
+ /** Numeric token type */
+ public static final int NUM = 6;
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int CJ = 7;
+
+ /** @deprecated (3.1) */
+ @Deprecated
+ public static final int ACRONYM_DEP = 8;
+
+ /** Southeast Asian token type */
+ public static final int SOUTHEAST_ASIAN = 9;
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC = 10;
+ /** Hiragana token type */
+ public static final int HIRAGANA = 11;
+ /** Katakana token type */
+ public static final int KATAKANA = 12;
+
+ /** Hangul token type */
+ public static final int HANGUL = 13;
+
+ /** String token types that correspond to token type int constants.
+ * Indexed directly by the constants above (see incrementToken). */
+ public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>",
+ "<SOUTHEAST_ASIAN>",
+ "<IDEOGRAPHIC>",
+ "<HIRAGANA>",
+ "<KATAKANA>",
+ "<HANGUL>"
+ };
+
+ /** Absolute maximum sized token (upper bound for setMaxTokenLength) */
+ public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
+
+ // Number of over-length tokens skipped since the last emitted token;
+ // folded into the next token's position increment (and into end()).
+ private int skippedPositions;
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /**
+ * Set the max allowed token length. No tokens longer than this are emitted.
+ *
+ * @throws IllegalArgumentException if the given length is outside of the
+ * range [1, {@value #MAX_TOKEN_LENGTH_LIMIT}].
+ */
+ public void setMaxTokenLength(int length) {
+ if (length < 1) {
+ throw new IllegalArgumentException("maxTokenLength must be greater than zero");
+ } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+ throw new IllegalArgumentException("maxTokenLength may not exceed " + MAX_TOKEN_LENGTH_LIMIT);
+ }
+ // only touch the scanner buffer when the value actually changes
+ if (length != maxTokenLength) {
+ maxTokenLength = length;
+ scanner.setBufferSize(length);
+ }
+ }
+
+ /** Returns the current maximum token length
+ *
+ * @see #setMaxTokenLength */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ /**
+ * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
+ * the <code>input</code> to the newly created JFlex scanner.
+ *
+ * See http://issues.apache.org/jira/browse/LUCENE-1068
+ */
+ public StandardTokenizer() {
+ init();
+ }
+
+ /**
+ * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
+ */
+ public StandardTokenizer(AttributeFactory factory) {
+ super(factory);
+ init();
+ }
+
+ // Builds the JFlex scanner against the current input; reset() re-points
+ // the same scanner instance at the (possibly new) reader via yyreset.
+ private void init() {
+ this.scanner = new StandardTokenizerImpl(input);
+ }
+
+ // this tokenizer generates three attributes:
+ // term offset, positionIncrement and type
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ /**
+ * Advances to the next token that fits within {@link #getMaxTokenLength()},
+ * skipping (but position-counting) any longer tokens, and populates the
+ * term, offset, positionIncrement and type attributes from the scanner.
+ */
+ @Override
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ skippedPositions = 0;
+
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) {
+ // each skipped over-long token still counts as one position
+ posIncrAtt.setPositionIncrement(skippedPositions+1);
+ scanner.getText(termAtt);
+ final int start = scanner.yychar();
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+ typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
+ return true;
+ } else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ skippedPositions++;
+ }
+ }
+
+ @Override
+ public final void end() throws IOException {
+ super.end();
+ // set final offset (one past the last character the scanner consumed)
+ int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ // adjust any skipped tokens: over-long terms skipped after the last
+ // emitted token are accounted for in the final position increment
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ // NOTE(review): presumably re-points the scanner at whatever reader
+ // super.close() leaves in `input`, dropping the reference to the old
+ // reader — confirm against Tokenizer.close() semantics
+ scanner.yyreset(input);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input);
+ skippedPositions = 0;
+ }
+}