You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/26 14:10:08 UTC
svn commit: r916666 [8/16] - in /lucene/java/branches/flex_1458: ./ contrib/
contrib/analyzers/common/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/c...
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java Fri Feb 26 13:09:54 2010
@@ -1,15 +1,5 @@
package org.apache.lucene.analysis;
-import java.util.Arrays;
-import java.util.AbstractSet;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.lucene.util.CharacterUtils;
-import org.apache.lucene.util.Version;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -27,6 +17,13 @@
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
/**
* A simple class that stores Strings as char[]'s in a
@@ -58,16 +55,11 @@
* For type safety also {@link #stringIterator()} is provided.
*/
public class CharArraySet extends AbstractSet<Object> {
- private final static int INIT_SIZE = 8;
- private char[][] entries;
- private int count;
- private final boolean ignoreCase;
- public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(
- new CharArraySet(Version.LUCENE_CURRENT, 0, false));
+ public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
+ private static final Object PLACEHOLDER = new Object();
+
+ private final CharArrayMap<Object> map;
- private final CharacterUtils charUtils;
- private final Version matchVersion;
-
/**
* Create set with enough capacity to hold startSize terms
*
@@ -81,13 +73,7 @@
* otherwise <code>true</code>.
*/
public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) {
- this.ignoreCase = ignoreCase;
- int size = INIT_SIZE;
- while(startSize + (startSize>>2) > size)
- size <<= 1;
- entries = new char[size][];
- this.charUtils = CharacterUtils.getInstance(matchVersion);
- this.matchVersion = matchVersion;
+ this(new CharArrayMap<Object>(matchVersion, startSize, ignoreCase));
}
/**
@@ -102,7 +88,7 @@
* <code>false</code> if and only if the set should be case sensitive
* otherwise <code>true</code>.
*/
- public CharArraySet(Version matchVersion, Collection<? extends Object> c, boolean ignoreCase) {
+ public CharArraySet(Version matchVersion, Collection<?> c, boolean ignoreCase) {
this(matchVersion, c.size(), ignoreCase);
addAll(c);
}
@@ -133,77 +119,51 @@
* @deprecated use {@link #CharArraySet(Version, Collection, boolean)} instead
*/
@Deprecated
- public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
+ public CharArraySet(Collection<?> c, boolean ignoreCase) {
this(Version.LUCENE_30, c.size(), ignoreCase);
addAll(c);
}
- /** Create set from entries */
- private CharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count){
- this.entries = entries;
- this.ignoreCase = ignoreCase;
- this.count = count;
- this.charUtils = CharacterUtils.getInstance(matchVersion);
- this.matchVersion = matchVersion;
+ /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
+ CharArraySet(final CharArrayMap<Object> map){
+ this.map = map;
}
/** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
@Override
public void clear() {
- count = 0;
- Arrays.fill(entries, null);
+ map.clear();
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
public boolean contains(char[] text, int off, int len) {
- return entries[getSlot(text, off, len)] != null;
+ return map.containsKey(text, off, len);
}
/** true if the <code>CharSequence</code> is in the set */
public boolean contains(CharSequence cs) {
- return entries[getSlot(cs)] != null;
+ return map.containsKey(cs);
}
- private int getSlot(char[] text, int off, int len) {
- int code = getHashCode(text, off, len);
- int pos = code & (entries.length-1);
- char[] text2 = entries[pos];
- if (text2 != null && !equals(text, off, len, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (entries.length-1);
- text2 = entries[pos];
- } while (text2 != null && !equals(text, off, len, text2));
- }
- return pos;
+ @Override
+ public boolean contains(Object o) {
+ return map.containsKey(o);
}
- /** Returns true if the String is in the set */
- private int getSlot(CharSequence text) {
- int code = getHashCode(text);
- int pos = code & (entries.length-1);
- char[] text2 = entries[pos];
- if (text2 != null && !equals(text, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (entries.length-1);
- text2 = entries[pos];
- } while (text2 != null && !equals(text, text2));
- }
- return pos;
+ @Override
+ public boolean add(Object o) {
+ return map.put(o, PLACEHOLDER) == null;
}
/** Add this CharSequence into the set */
public boolean add(CharSequence text) {
- return add(text.toString()); // could be more efficient
+ return map.put(text, PLACEHOLDER) == null;
}
/** Add this String into the set */
public boolean add(String text) {
- return add(text.toCharArray());
+ return map.put(text, PLACEHOLDER) == null;
}
/** Add this char[] directly to the set.
@@ -211,140 +171,12 @@
* The user should never modify this text array after calling this method.
*/
public boolean add(char[] text) {
- if (ignoreCase)
- for(int i=0;i<text.length;){
- i += Character.toChars(
- Character.toLowerCase(
- charUtils.codePointAt(text, i)), text, i);
- }
- int slot = getSlot(text, 0, text.length);
- if (entries[slot] != null) return false;
- entries[slot] = text;
- count++;
-
- if (count + (count>>2) > entries.length) {
- rehash();
- }
-
- return true;
- }
-
- private boolean equals(char[] text1, int off, int len, char[] text2) {
- if (len != text2.length)
- return false;
- final int limit = off+len;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1[off+i] != text2[i])
- return false;
- }
- }
- return true;
- }
-
- private boolean equals(CharSequence text1, char[] text2) {
- int len = text1.length();
- if (len != text2.length)
- return false;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, i);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1.charAt(i) != text2[i])
- return false;
- }
- }
- return true;
- }
-
-
-
- private void rehash() {
- final int newSize = 2*entries.length;
- char[][] oldEntries = entries;
- entries = new char[newSize][];
-
- for(int i=0;i<oldEntries.length;i++) {
- char[] text = oldEntries[i];
- if (text != null) {
- // todo: could be faster... no need to compare strings on collision
- entries[getSlot(text,0,text.length)] = text;
- }
- }
- }
-
- private int getHashCode(char[] text, int offset, int len) {
- int code = 0;
- final int stop = offset + len;
- if (ignoreCase) {
- for (int i=offset; i<stop;) {
- final int codePointAt = charUtils.codePointAt(text, i, stop);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=offset; i<stop; i++) {
- code = code*31 + text[i];
- }
- }
- return code;
+ return map.put(text, PLACEHOLDER) == null;
}
- private int getHashCode(CharSequence text) {
- int code = 0;
- int len = text.length();
- if (ignoreCase) {
- for (int i=0; i<len;) {
- int codePointAt = charUtils.codePointAt(text, i);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=0; i<len; i++) {
- code = code*31 + text.charAt(i);
- }
- }
- return code;
- }
-
-
@Override
public int size() {
- return count;
- }
-
- @Override
- public boolean isEmpty() {
- return count==0;
- }
-
- @Override
- public boolean contains(Object o) {
- if (o instanceof char[]) {
- final char[] text = (char[])o;
- return contains(text, 0, text.length);
- }
- return contains(o.toString());
- }
-
- @Override
- public boolean add(Object o) {
- if (o instanceof char[]) {
- return add((char[])o);
- }
- return add(o.toString());
+ return map.size();
}
/**
@@ -362,14 +194,9 @@
throw new NullPointerException("Given set is null");
if (set == EMPTY_SET)
return EMPTY_SET;
- if (set instanceof UnmodifiableCharArraySet)
+ if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
return set;
-
- /*
- * Instead of delegating calls to the given set copy the low-level values to
- * the unmodifiable Subclass
- */
- return new UnmodifiableCharArraySet(set.matchVersion, set.entries, set.ignoreCase, set.count);
+ return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
}
/**
@@ -387,7 +214,7 @@
public static CharArraySet copy(final Set<?> set) {
if(set == EMPTY_SET)
return EMPTY_SET;
- return (set instanceof CharArraySet) ? copy((CharArraySet) set) : copy(Version.LUCENE_30, set);
+ return copy(Version.LUCENE_30, set);
}
/**
@@ -416,29 +243,27 @@
return EMPTY_SET;
if(set instanceof CharArraySet) {
final CharArraySet source = (CharArraySet) set;
- // use fast path instead of iterating all values
- // this is even on very small sets ~10 times faster than iterating
- final char[][] entries = new char[source.entries.length][];
- System.arraycopy(source.entries, 0, entries, 0, entries.length);
- return new CharArraySet(source.matchVersion, entries, source.ignoreCase, source.count);
+ return new CharArraySet(CharArrayMap.copy(source.map.matchVersion, source.map));
}
return new CharArraySet(matchVersion, set, false);
}
-
/** The Iterator<String> for this set. Strings are constructed on the fly, so
- * use <code>nextCharArray</code> for more efficient access. */
+ * use <code>nextCharArray</code> for more efficient access.
+ * @deprecated Use the standard iterator, which returns {@code char[]} instances.
+ */
+ @Deprecated
public class CharArraySetIterator implements Iterator<String> {
int pos=-1;
char[] next;
- CharArraySetIterator() {
+ private CharArraySetIterator() {
goNext();
}
private void goNext() {
next = null;
pos++;
- while (pos < entries.length && (next=entries[pos]) == null) pos++;
+ while (pos < map.keys.length && (next=map.keys[pos]) == null) pos++;
}
public boolean hasNext() {
@@ -463,61 +288,41 @@
}
}
- /** returns an iterator of new allocated Strings */
+ /** returns an iterator of new allocated Strings (an instance of {@link CharArraySetIterator}).
+ * @deprecated Use {@link #iterator}, which returns {@code char[]} instances.
+ */
+ @Deprecated
public Iterator<String> stringIterator() {
return new CharArraySetIterator();
}
- /** returns an iterator of new allocated Strings, this method violates the Set interface */
- @Override
- @SuppressWarnings("unchecked")
+ /** Returns an {@link Iterator} depending on the version used:
+ * <ul>
+ * <li>if {@code matchVersion} ≥ 3.1, it returns {@code char[]} instances in this set.</li>
+ * <li>if {@code matchVersion} is 3.0 or older, it returns new
+ * allocated Strings, so this method violates the Set interface.
+ * It is kept this way for backwards compatibility, normally it should
+ * return {@code char[]} on {@code next()}</li>
+ * </ul>
+ */
+ @Override @SuppressWarnings("unchecked")
public Iterator<Object> iterator() {
- return (Iterator) stringIterator();
+ // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+ return map.matchVersion.onOrAfter(Version.LUCENE_31) ?
+ map.originalKeySet().iterator() : (Iterator) stringIterator();
}
- /**
- * Efficient unmodifiable {@link CharArraySet}. This implementation does not
- * delegate calls to a give {@link CharArraySet} like
- * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes
- * the internal representation of a {@link CharArraySet} to a super
- * constructor and overrides all mutators.
- */
- private static final class UnmodifiableCharArraySet extends CharArraySet {
-
- private UnmodifiableCharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase,
- int count) {
- super(matchVersion, entries, ignoreCase, count);
- }
-
- @Override
- public void clear() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(Object o){
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean addAll(Collection<? extends Object> coll) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(char[] text) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(CharSequence text) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(String text) {
- throw new UnsupportedOperationException();
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("[");
+ for (Object item : this) {
+ if (sb.length()>1) sb.append(", ");
+ if (item instanceof char[]) {
+ sb.append((char[]) item);
+ } else {
+ sb.append(item);
+ }
}
+ return sb.append(']').toString();
}
-
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java Fri Feb 26 13:09:54 2010
@@ -24,9 +24,6 @@
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter/CharStream subclass is used.
- *
- * @version $Id$
- *
*/
public abstract class CharFilter extends CharStream {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Feb 26 13:09:54 2010
@@ -23,59 +23,310 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
-
-/** An abstract base class for simple, character-oriented tokenizers.*/
+import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
+import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+
+/**
+ * An abstract base class for simple, character-oriented tokenizers.
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link #isTokenChar(int)} and
+ * {@link #normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ * A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API
+ * moved from UTF-16 code units to UTF-32 codepoints to eventually add support
+ * for <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. The old <i>char</i> based API has been
+ * deprecated and should be replaced with the <i>int</i> based methods
+ * {@link #isTokenChar(int)} and {@link #normalize(int)}.
+ * </p>
+ * <p>
+ * As of Lucene 3.1 each {@link CharTokenizer} - constructor expects a
+ * {@link Version} argument. Based on the given {@link Version} either the new
+ * API or a backwards compatibility layer is used at runtime. For
+ * {@link Version} < 3.1 the backwards compatibility layer ensures correct
+ * behavior even for indexes build with previous versions of Lucene. If a
+ * {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to
+ * be implemented by the instantiated class. Yet, the old <i>char</i> based API
+ * is not required anymore even if backwards compatibility must be preserved.
+ * {@link CharTokenizer} subclasses implementing the new API are fully backwards
+ * compatible if instantiated with {@link Version} < 3.1.
+ * </p>
+ * <p>
+ * <strong>Note:</strong> If you use a subclass of {@link CharTokenizer} with {@link Version} >=
+ * 3.1 on an index build with a version < 3.1, created tokens might not be
+ * compatible with the terms in your index.
+ * </p>
+ **/
public abstract class CharTokenizer extends Tokenizer {
- public CharTokenizer(Reader input) {
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, Reader input) {
super(input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
- }
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
- public CharTokenizer(AttributeSource source, Reader input) {
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, AttributeSource source,
+ Reader input) {
super(source, input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
}
-
- public CharTokenizer(AttributeFactory factory, Reader input) {
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, AttributeFactory factory,
+ Reader input) {
super(factory, input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @deprecated use {@link #CharTokenizer(Version, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(Reader input) {
+ this(Version.LUCENE_30, input);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @param source the attribute source to use for this {@link Tokenizer}
+ * @deprecated use {@link #CharTokenizer(Version, AttributeSource, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(AttributeSource source, Reader input) {
+ this(Version.LUCENE_30, source, input);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @deprecated use {@link #CharTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(AttributeFactory factory, Reader input) {
+ this(Version.LUCENE_30, factory, input);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
- private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
- private TermAttribute termAtt;
- private OffsetAttribute offsetAtt;
+ private final TermAttribute termAtt;
+ private final OffsetAttribute offsetAtt;
+
+ private final CharacterUtils charUtils;
+ private final CharacterBuffer ioBuffer;
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private final boolean useOldAPI;
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private static final VirtualMethod<CharTokenizer> isTokenCharMethod =
+ new VirtualMethod<CharTokenizer>(CharTokenizer.class, "isTokenChar", char.class);
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private static final VirtualMethod<CharTokenizer> normalizeMethod =
+ new VirtualMethod<CharTokenizer>(CharTokenizer.class, "normalize", char.class);
- /** Returns true iff a character should be included in a token. This
+ /**
+ * Returns true iff a UTF-16 code unit should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
- * satisfy this predicate. Characters for which this is false are used to
- * define token boundaries and are not included in tokens. */
- protected abstract boolean isTokenChar(char c);
-
- /** Called on each token character to normalize it before it is added to the
- * token. The default implementation does nothing. Subclasses may use this
- * to, e.g., lowercase tokens. */
+ * satisfy this predicate. Characters for which this is <code>false</code> are
+ * used to define token boundaries and are not included in tokens.
+ * <p>
+ * Note: This method cannot handle <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. To support all Unicode characters, including
+ * supplementary characters, use the {@link #isTokenChar(int)} method.
+ * </p>
+ *
+ * @deprecated use {@link #isTokenChar(int)} instead. This method will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ protected boolean isTokenChar(char c) {
+ return isTokenChar((int)c);
+ }
+
+ /**
+ * Called on each token UTF-16 code unit to normalize it before it is added to the
+ * token. The default implementation does nothing. Subclasses may use this to,
+ * e.g., lowercase tokens.
+ * <p>
+ * Note: This method cannot handle <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. To support all Unicode characters, including
+ * supplementary characters, use the {@link #normalize(int)} method.
+ * </p>
+ *
+ * @deprecated use {@link #normalize(int)} instead. This method will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
protected char normalize(char c) {
+ return (char) normalize((int) c);
+ }
+
+ /**
+ * Returns true iff a codepoint should be included in a token. This tokenizer
+ * generates as tokens adjacent sequences of codepoints which satisfy this
+ * predicate. Codepoints for which this is false are used to define token
+ * boundaries and are not included in tokens.
+ * <p>
+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of UTF-16 code
+ * units. Subclasses of {@link CharTokenizer} must not override the char based
+ * methods if a {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * <p>
+ * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
+ * </p>
+ */
+ protected boolean isTokenChar(int c) {
+ throw new UnsupportedOperationException("since LUCENE_3_1 subclasses of CharTokenizer must implement isTokenChar(int)");
+ }
+
+ /**
+ * Called on each token character to normalize it before it is added to the
+ * token. The default implementation does nothing. Subclasses may use this to,
+ * e.g., lowercase tokens.
+ * <p>
+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of UTF-16 code
+ * units. Subclasses of {@link CharTokenizer} must not override the char based
+ * methods if a {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * <p>
+ * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
+ * </p>
+ */
+ protected int normalize(int c) {
return c;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
+ if(useOldAPI) // TODO remove this in LUCENE 4.0
+ return incrementTokenOld();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+ if (bufferIndex >= dataLen) {
+ offset += dataLen;
+ if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
+ dataLen = 0; // so next offset += dataLen won't decrement offset
+ if (length > 0)
+ break;
+ else
+ return false;
+ }
+ dataLen = ioBuffer.getLength();
+ bufferIndex = 0;
+ }
+ // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
+ final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
+ bufferIndex += Character.charCount(c);
+
+ if (isTokenChar(c)) { // if it's a token char
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length >= buffer.length-1) // check if a supplementary could run out of bounds
+ buffer = termAtt.resizeTermBuffer(2+length); // make sure a supplementary fits in the buffer
+ length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
+ if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
+ break;
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.setTermLength(length);
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+ return true;
+
+ }
+
+ /**
+ * The <= 3.0 version of incrementToken. This is a backwards compat implementation used
+ * if a version <= 3.0 is provided to the ctor.
+ * @deprecated remove in 4.0
+ */
+ @Deprecated
+ private boolean incrementTokenOld() throws IOException {
int length = 0;
int start = bufferIndex;
char[] buffer = termAtt.termBuffer();
+ final char[] oldIoBuffer = ioBuffer.getBuffer();
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
- dataLen = input.read(ioBuffer);
+ dataLen = input.read(oldIoBuffer);
if (dataLen == -1) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0)
@@ -86,7 +337,7 @@
bufferIndex = 0;
}
- final char c = ioBuffer[bufferIndex++];
+ final char c = oldIoBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
@@ -107,12 +358,14 @@
termAtt.setTermLength(length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
- }
+ }
+
+
@Override
public final void end() {
// set final offset
- int finalOffset = correctOffset(offset);
+ final int finalOffset = correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@@ -122,5 +375,19 @@
bufferIndex = 0;
offset = 0;
dataLen = 0;
+ ioBuffer.reset(); // make sure to reset the IO buffer!!
}
-}
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private boolean useOldAPI(Version matchVersion) {
+ final Class<? extends CharTokenizer> clazz = this.getClass();
+ if (matchVersion.onOrAfter(Version.LUCENE_31)
+ && (isTokenCharMethod.isOverriddenAsOf(clazz) || normalizeMethod
+ .isOverriddenAsOf(clazz))) throw new IllegalArgumentException(
+ "For matchVersion >= LUCENE_31, CharTokenizer subclasses must not override isTokenChar(char) or normalize(char).");
+ return !matchVersion.onOrAfter(Version.LUCENE_31);
+ }
+}
\ No newline at end of file
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,34 +20,106 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
-/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
- to say, it defines tokens as maximal strings of adjacent letters, as defined
- by java.lang.Character.isLetter() predicate.
-
- Note: this does a decent job for most European languages, but does a terrible
- job for some Asian languages, where words are not separated by spaces. */
+/**
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
+ * say, it defines tokens as maximal strings of adjacent letters, as defined by
+ * java.lang.Character.isLetter() predicate.
+ * <p>
+ * Note: this does a decent job for most European languages, but does a terrible
+ * job for some Asian languages, where words are not separated by spaces.
+ * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
+ */
public class LetterTokenizer extends CharTokenizer {
- /** Construct a new LetterTokenizer. */
+
+ /**
+ * Construct a new LetterTokenizer.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer.
+ *
+ * @deprecated use {@link #LetterTokenizer(Version, Reader)} instead. This
+ * will be removed in Lucene 4.0.
+ */
public LetterTokenizer(Reader in) {
- super(in);
+ super(Version.LUCENE_30, in);
}
- /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new LetterTokenizer using a given {@link AttributeSource}.
+ * @deprecated
+ * use {@link #LetterTokenizer(Version, AttributeSource, Reader)} instead.
+ * This will be removed in Lucene 4.0.
+ */
public LetterTokenizer(AttributeSource source, Reader in) {
- super(source, in);
+ super(Version.LUCENE_30, source, in);
}
- /** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new LetterTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @deprecated use {@link #LetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
public LetterTokenizer(AttributeFactory factory, Reader in) {
- super(factory, in);
+ super(Version.LUCENE_30, factory, in);
}
-
+
/** Collects only characters which satisfy
- * {@link Character#isLetter(char)}.*/
+ * {@link Character#isLetter(int)}.*/
@Override
- protected boolean isTokenChar(char c) {
+ protected boolean isTokenChar(int c) {
return Character.isLetter(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,6 +20,7 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
@@ -30,27 +31,98 @@
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
+ * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LowerCaseTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
*/
public final class LowerCaseTokenizer extends LetterTokenizer {
- /** Construct a new LowerCaseTokenizer. */
+
+ /**
+ * Construct a new LowerCaseTokenizer.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ *
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, Reader)} instead.
+   *             This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public LowerCaseTokenizer(Reader in) {
- super(in);
+ super(Version.LUCENE_30, in);
}
- /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, AttributeSource, Reader)}
+   *             instead. This will be removed in Lucene 4.0.
+ */
public LowerCaseTokenizer(AttributeSource source, Reader in) {
- super(source, in);
+ super(Version.LUCENE_30, source, in);
}
- /** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new LowerCaseTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+   *             instead. This will be removed in Lucene 4.0.
+ */
public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
- super(factory, in);
+ super(Version.LUCENE_30, factory, in);
}
/** Converts char to lower case
- * {@link Character#toLowerCase(char)}.*/
+ * {@link Character#toLowerCase(int)}.*/
@Override
- protected char normalize(char c) {
+ protected int normalize(int c) {
return Character.toLowerCase(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Feb 26 13:09:54 2010
@@ -82,8 +82,7 @@
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
* parameter as well as how numeric fields work under the hood.</p>
*
- * <p><font color="red"><b>NOTE:</b> This API is experimental and
- * might change in incompatible ways in the next release.</font>
+ * @lucene.experimental
*
* @since 2.9
*/
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java Fri Feb 26 13:09:54 2010
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import org.apache.lucene.document.Fieldable;
+
import java.io.Reader;
import java.io.IOException;
import java.util.Map;
@@ -118,6 +120,15 @@
analyzer = defaultAnalyzer;
return analyzer.getPositionIncrementGap(fieldName);
}
+
+ /** Return the offsetGap from the analyzer assigned to field */
+ @Override
+ public int getOffsetGap(Fieldable field) {
+ Analyzer analyzer = analyzerMap.get(field.name());
+ if (analyzer == null)
+ analyzer = defaultAnalyzer;
+ return analyzer.getOffsetGap(field);
+ }
@Override
public String toString() {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java Fri Feb 26 13:09:54 2010
@@ -19,6 +19,7 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** Transforms the token stream as per the Porter stemming algorithm.
@@ -38,15 +39,23 @@
}
}
</PRE>
+ <p>
+ Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+ certain terms from being passed to the stemmer
+ {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+ in a previous {@link TokenStream}.
+ </p>
*/
public final class PorterStemFilter extends TokenFilter {
- private PorterStemmer stemmer;
- private TermAttribute termAtt;
+ private final PorterStemmer stemmer;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
@@ -54,7 +63,7 @@
if (!input.incrementToken())
return false;
- if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+ if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return true;
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java Fri Feb 26 13:09:54 2010
@@ -44,7 +44,12 @@
*/
-import java.io.*;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.FileInputStream;
+
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
+import org.apache.lucene.util.ArrayUtil;
/**
*
@@ -61,11 +66,10 @@
private int i, /* offset into b */
j, k, k0;
private boolean dirty = false;
- private static final int INC = 50; /* unit of size whereby b is increased */
- private static final int EXTRA = 1;
+ private static final int INITIAL_SIZE = 50;
public PorterStemmer() {
- b = new char[INC];
+ b = new char[INITIAL_SIZE];
i = 0;
}
@@ -81,10 +85,8 @@
* adding characters, you can call stem(void) to process the word.
*/
public void add(char ch) {
- if (b.length <= i + EXTRA) {
- char[] new_b = new char[b.length+INC];
- System.arraycopy(b, 0, new_b, 0, b.length);
- b = new_b;
+ if (b.length <= i) {
+ b = ArrayUtil.grow(b, i+1);
}
b[i++] = ch;
}
@@ -451,8 +453,7 @@
public boolean stem(char[] wordBuffer, int offset, int wordLen) {
reset();
if (b.length < wordLen) {
- char[] new_b = new char[wordLen + EXTRA];
- b = new_b;
+ b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
}
System.arraycopy(wordBuffer, offset, b, 0, wordLen);
i = wordLen;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -19,14 +19,42 @@
import java.io.Reader;
-/** An {@link Analyzer} that filters {@link LetterTokenizer}
- * with {@link LowerCaseFilter} */
+import org.apache.lucene.util.Version;
+/** An {@link Analyzer} that filters {@link LetterTokenizer}
+ * with {@link LowerCaseFilter}
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link LowerCaseTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ **/
public final class SimpleAnalyzer extends ReusableAnalyzerBase {
+ private final Version matchVersion;
+
+ /**
+ * Creates a new {@link SimpleAnalyzer}
+ * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ */
+ public SimpleAnalyzer(Version matchVersion) {
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Creates a new {@link SimpleAnalyzer}
+ * @deprecated use {@link #SimpleAnalyzer(Version)} instead
+ */
+ @Deprecated public SimpleAnalyzer() {
+ this(Version.LUCENE_30);
+ }
@Override
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
- return new TokenStreamComponents(new LowerCaseTokenizer(reader));
+ return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion, reader));
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -99,7 +99,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new LowerCaseTokenizer(reader);
+ final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
source, stopwords));
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java Fri Feb 26 13:09:54 2010
@@ -29,6 +29,7 @@
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.RamUsageEstimator;
/**
A Token is an occurrence of a term from the text of a field. It consists of
@@ -347,12 +348,12 @@
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
- final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@@ -367,19 +368,19 @@
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
- termBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
- termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
+ termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}
@@ -453,14 +454,14 @@
}
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
*
- *
+ *
* @return The bits
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public int getFlags() {
return flags;
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Tokenizer.java
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Fri Feb 26 13:09:54 2010
@@ -0,0 +1 @@
+/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/analysis/Tokenizer.java:909334
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -19,13 +19,44 @@
import java.io.Reader;
-/** An Analyzer that uses {@link WhitespaceTokenizer}. */
+import org.apache.lucene.util.Version;
+/**
+ * An Analyzer that uses {@link WhitespaceTokenizer}.
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link WhitespaceAnalyzer}:
+ * <ul>
+ * <li>As of 3.1, {@link WhitespaceTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ **/
public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
-
+
+ private final Version matchVersion;
+
+ /**
+ * Creates a new {@link WhitespaceAnalyzer}
+ * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ */
+ public WhitespaceAnalyzer(Version matchVersion) {
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Creates a new {@link WhitespaceAnalyzer}
+ * @deprecated use {@link #WhitespaceAnalyzer(Version)} instead
+ */
+ @Deprecated
+ public WhitespaceAnalyzer() {
+ this(Version.LUCENE_30);
+ }
+
@Override
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
- return new TokenStreamComponents(new WhitespaceTokenizer(reader));
+ return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion, reader));
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,30 +20,102 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
-/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
- * Adjacent sequences of non-Whitespace characters form tokens. */
-
+/**
+ * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ * Adjacent sequences of non-Whitespace characters form tokens. <a
+ * name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link WhitespaceTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ */
public class WhitespaceTokenizer extends CharTokenizer {
- /** Construct a new WhitespaceTokenizer. */
+
+ /**
+   * Construct a new WhitespaceTokenizer.
+   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ *
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param
+ * matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, Reader)} instead. This will
+ * be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(Reader in) {
super(in);
}
- /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
- /** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new WhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which do not satisfy
- * {@link Character#isWhitespace(char)}.*/
+ * {@link Character#isWhitespace(int)}.*/
@Override
- protected boolean isTokenChar(char c) {
+ protected boolean isTokenChar(int c) {
return !Character.isWhitespace(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java Fri Feb 26 13:09:54 2010
@@ -191,6 +191,66 @@
return result;
}
+ /**
+ * Loads a text file in Snowball format associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
+ * a {@link Set}. The words need to be in lower-case if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param aClass a class that is associated with the given stopwordResource
+ * @param stopwordResource name of the resource file associated with the given
+ * class
+ * @return a {@link Set} with the file's words
+ * @see #getSnowballWordSet(Reader)
+ */
+ public static Set<String> getSnowballWordSet(Class<?> aClass,
+ String stopwordResource) throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getSnowballWordSet(reader);
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ * </p>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @return A Set with the reader's words
+ */
+ public static Set<String> getSnowballWordSet(Reader reader)
+ throws IOException {
+ final Set<String> result = new HashSet<String>();
+ BufferedReader br = null;
+ try {
+ if (reader instanceof BufferedReader) {
+ br = (BufferedReader) reader;
+ } else {
+ br = new BufferedReader(reader);
+ }
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ int comment = line.indexOf('|');
+ if (comment >= 0) line = line.substring(0, comment);
+ String words[] = line.split("\\s+");
+ for (int i = 0; i < words.length; i++)
+ if (words[i].length() > 0) result.add(words[i]);
+ }
+ } finally {
+ if (br != null) br.close();
+ }
+ return result;
+ }
/**
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java Fri Feb 26 13:09:54 2010
@@ -23,10 +23,10 @@
/**
* This attribute can be used to pass different flags down the {@link Tokenizer} chain,
* eg from one TokenFilter to another one.
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public interface FlagsAttribute extends Attribute {
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java Fri Feb 26 13:09:54 2010
@@ -24,12 +24,12 @@
/**
* This attribute can be used to pass different flags down the tokenizer chain,
* eg from one TokenFilter to another one.
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
private int flags = 0;
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java Fri Feb 26 13:09:54 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* The term text of a Token.
@@ -106,12 +107,12 @@
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
- final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@@ -127,19 +128,19 @@
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
- termBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
- termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
+ termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java Fri Feb 26 13:09:54 2010
@@ -134,9 +134,7 @@
* values are returned as {@link String}s (according to
* <code>toString(value)</code> of the used data type).
*
- * <p><font color="red"><b>NOTE:</b> This API is
- * experimental and might change in incompatible ways in the
- * next release.</font>
+ * @lucene.experimental
*
* @since 2.9
*/
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java Fri Feb 26 13:09:54 2010
@@ -35,6 +35,8 @@
import java.util.Arrays;
import org.apache.lucene.util.BytesRef;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+import org.apache.lucene.util.ArrayUtil;
final class ByteBlockPool {
@@ -84,7 +86,8 @@
public void nextBuffer() {
if (1+bufferUpto == buffers.length) {
- byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][];
+ byte[][] newBuffers = new byte[ArrayUtil.oversize(buffers.length+1,
+ NUM_BYTES_OBJECT_REF)][];
System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
buffers = newBuffers;
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java Fri Feb 26 13:09:54 2010
@@ -44,9 +44,7 @@
* <p>As this tool checks every byte in the index, on a large
* index it can take quite a long time to run.
*
- * <p><b>WARNING</b>: this tool and API is new and
- * experimental and is subject to suddenly change in the
- * next release. Please make a complete backup of your
+ * @lucene.experimental Please make a complete backup of your
* index before using this to fix your index!
*/
public class CheckIndex {
@@ -57,8 +55,7 @@
/**
* Returned from {@link #checkIndex()} detailing the health and status of the index.
*
- * <p><b>WARNING</b>: this API is new and experimental and is
- * subject to suddenly change in the next release.
+ * @lucene.experimental
**/
public static class Status {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java Fri Feb 26 13:09:54 2010
@@ -985,6 +985,12 @@
// not a good idea):
FieldCache.DEFAULT.purge(this);
+ if (writer != null) {
+ // Since we just closed, writer may now be able to
+ // delete unused files:
+ writer.deleteUnusedFiles();
+ }
+
// throw the first exception
if (ioe != null) throw ioe;
}
@@ -1032,7 +1038,7 @@
/**
* Expert: return the IndexCommit that this reader has opened.
* <p/>
- * <p><b>WARNING</b>: this API is new and experimental and may suddenly change.</p>
+ * @lucene.experimental
*/
@Override
public IndexCommit getIndexCommit() throws IOException {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java Fri Feb 26 13:09:54 2010
@@ -25,6 +25,7 @@
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
/** This is just a "splitter" class: it lets you wrap two
* DocFieldConsumer instances as a single consumer. */
@@ -117,7 +118,7 @@
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
- docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* Gathers all Fieldables for a document under the same
@@ -340,7 +341,7 @@
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
- docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java Fri Feb 26 13:09:54 2010
@@ -42,6 +42,7 @@
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* This class accepts multiple added documents and directly
@@ -999,57 +1000,58 @@
assert checkDeleteTerm(null);
// Delete by term
- try {
- Fields fields = reader.fields();
- TermsEnum termsEnum = null;
-
- String currentField = null;
- BytesRef termRef = new BytesRef();
- DocsEnum docs = null;
-
- for (Entry<Term, BufferedDeletes.Num> entry: deletesFlushed.terms.entrySet()) {
- Term term = entry.getKey();
- // Since we visit terms sorted, we gain performance
- // by re-using the same TermsEnum and seeking only
- // forwards
- if (term.field() != currentField) {
- // nocommit -- once we sync up branch again, add
- // assert that this field is always > last one
- currentField = term.field();
- Terms terms = fields.terms(currentField);
- if (terms != null) {
- termsEnum = terms.iterator();
- } else {
- termsEnum = null;
+ if (deletesFlushed.terms.size() > 0) {
+ try {
+ Fields fields = reader.fields();
+ TermsEnum termsEnum = null;
+
+ String currentField = null;
+ BytesRef termRef = new BytesRef();
+ DocsEnum docs = null;
+
+ for (Entry<Term, BufferedDeletes.Num> entry: deletesFlushed.terms.entrySet()) {
+ Term term = entry.getKey();
+ // Since we visit terms sorted, we gain performance
+ // by re-using the same TermsEnum and seeking only
+ // forwards
+ if (term.field() != currentField) {
+ // nocommit -- once we sync up branch again, add
+ // assert that this field is always > last one
+ currentField = term.field();
+ Terms terms = fields.terms(currentField);
+ if (terms != null) {
+ termsEnum = terms.iterator();
+ } else {
+ termsEnum = null;
+ }
}
- }
-
- if (termsEnum == null) {
- continue;
- }
-
- termRef.copy(term.text());
- if (termsEnum.seek(termRef) == TermsEnum.SeekStatus.FOUND) {
- DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
-
- if (docsEnum != null) {
- docs = docsEnum;
- int limit = entry.getValue().getNum();
- while (true) {
- final int docID = docs.nextDoc();
- if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) {
- break;
+
+ if (termsEnum == null) {
+ continue;
+ }
+
+ termRef.copy(term.text());
+ if (termsEnum.seek(termRef) == TermsEnum.SeekStatus.FOUND) {
+ DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
+
+ if (docsEnum != null) {
+ docs = docsEnum;
+ int limit = entry.getValue().getNum();
+ while (true) {
+ final int docID = docs.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) {
+ break;
+ }
+ reader.deleteDocument(docID);
+ any = true;
}
- reader.deleteDocument(docID);
- any = true;
}
}
}
+ } finally {
+ //docs.close();
}
- } finally {
- //docs.close();
}
-
// Delete by docID
for (Integer docIdInt : deletesFlushed.docIDs) {
int docID = docIdInt.intValue();
@@ -1060,23 +1062,28 @@
}
// Delete by query
- IndexSearcher searcher = new IndexSearcher(reader);
- for (Entry<Query, Integer> entry : deletesFlushed.queries.entrySet()) {
- Query query = entry.getKey();
- int limit = entry.getValue().intValue();
- Weight weight = query.weight(searcher);
- Scorer scorer = weight.scorer(reader, true, false);
- if (scorer != null) {
- while(true) {
- int doc = scorer.nextDoc();
- if (((long) docIDStart) + doc >= limit)
- break;
- reader.deleteDocument(doc);
- any = true;
+ if (deletesFlushed.queries.size() > 0) {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ try {
+ for (Entry<Query, Integer> entry : deletesFlushed.queries.entrySet()) {
+ Query query = entry.getKey();
+ int limit = entry.getValue().intValue();
+ Weight weight = query.weight(searcher);
+ Scorer scorer = weight.scorer(reader, true, false);
+ if (scorer != null) {
+ while(true) {
+ int doc = scorer.nextDoc();
+ if (((long) docIDStart) + doc >= limit)
+ break;
+ reader.deleteDocument(doc);
+ any = true;
+ }
+ }
}
+ } finally {
+ searcher.close();
}
}
- searcher.close();
return any;
}
@@ -1507,7 +1514,7 @@
int gap = doc.docID - nextWriteDocID;
if (gap >= waiting.length) {
// Grow queue
- DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)];
+ DocWriter[] newArray = new DocWriter[ArrayUtil.oversize(gap, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert nextWriteLoc >= 0;
System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc);
System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc);
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java Fri Feb 26 13:09:54 2010
@@ -23,8 +23,7 @@
* being added to the index. The information collected in this class is
* also used to calculate the normalization factor for a field.
*
- * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
- * change.</p>
+ * @lucene.experimental
*/
public final class FieldInvertState {
int position;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java Fri Feb 26 13:09:54 2010
@@ -37,8 +37,7 @@
* associated with it. The segments file associated with a
* later index commit point would have a larger N.</p>
*
- * <p><b>WARNING</b>: This API is a new and experimental and
- * may suddenly change. </p>
+ * @lucene.experimental
*/
public abstract class IndexCommit {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java Fri Feb 26 13:09:54 2010
@@ -343,7 +343,7 @@
deletePendingFiles();
}
- private void deletePendingFiles() throws IOException {
+ public void deletePendingFiles() throws IOException {
if (deletable != null) {
List<String> oldDeletable = deletable;
deletable = null;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java Fri Feb 26 13:09:54 2010
@@ -1266,8 +1266,7 @@
* readers that correspond to a Directory with its own
* segments_N file.
*
- * <p><b>WARNING</b>: this API is new and experimental and
- * may suddenly change.</p>
+ * @lucene.experimental
*/
public IndexCommit getIndexCommit() throws IOException {
throw new UnsupportedOperationException("This reader does not support this method.");
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java Fri Feb 26 13:09:54 2010
@@ -362,8 +362,7 @@
* if you attempt to reopen any of those readers, you'll
* hit an {@link AlreadyClosedException}.</p>
*
- * <p><b>NOTE:</b> This API is experimental and might
- * change in incompatible ways in the next release.</p>
+ * @lucene.experimental
*
* @return IndexReader that covers entire index plus all
* changes made so far by this IndexWriter instance
@@ -3327,12 +3326,18 @@
}
}
- // This is called after pending added and deleted
- // documents have been flushed to the Directory but before
- // the change is committed (new segments_N file written).
- void doAfterFlush()
- throws IOException {
- }
+ /**
+ * A hook for extending classes to execute operations after pending added and
+ * deleted documents have been flushed to the Directory but before the change
+ * is committed (new segments_N file written).
+ */
+ protected void doAfterFlush() throws IOException {}
+
+ /**
+ * A hook for extending classes to execute operations before pending added and
+ * deleted documents are flushed to the Directory.
+ */
+ protected void doBeforeFlush() throws IOException {}
/** Expert: prepare for commit.
*
@@ -3540,6 +3545,8 @@
assert testPoint("startDoFlush");
+ doBeforeFlush();
+
flushCount++;
// If we are flushing because too many deletes
@@ -4857,8 +4864,7 @@
* search, but will reduce search latency on opening a
* new near real-time reader after a merge completes.
*
- * <p><b>NOTE:</b> This API is experimental and might
- * change in incompatible ways in the next release.</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: warm is called before any deletes have
* been carried over to the merged segment. */
@@ -4917,4 +4923,25 @@
synchronized boolean isClosed() {
return closed;
}
+
+ /** Expert: remove any index files that are no longer
+ * used.
+ *
+ * <p> IndexWriter normally deletes unused files itself,
+ * during indexing. However, on Windows, which disallows
+ * deletion of open files, if there is a reader open on
+ * the index then those files cannot be deleted. This is
+ * fine, because IndexWriter will periodically retry
+ * the deletion.</p>
+ *
+ * <p> However, IndexWriter doesn't try that often: only
+ * on open, close, flushing a new segment, and finishing
+ * a merge. If you don't do any of these actions with your
+ * IndexWriter, you'll see the unused files linger. If
+ * that's a problem, call this method to delete them
+ * (once you've closed the open readers that were
+ * preventing their deletion). */
+ public synchronized void deleteUnusedFiles() throws IOException {
+ deleter.deletePendingFiles();
+ }
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java Fri Feb 26 13:09:54 2010
@@ -51,8 +51,7 @@
* <p>The default MergePolicy is {@link
* LogByteSizeMergePolicy}.</p>
*
- * <p><b>NOTE:</b> This API is new and still experimental
- * (subject to change suddenly in the next release)</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: This class typically requires access to
* package-private APIs (e.g. <code>SegmentInfos</code>) to do its job;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java Fri Feb 26 13:09:54 2010
@@ -24,8 +24,7 @@
* selected by a {@link MergePolicy}. The default
* MergeScheduler is {@link ConcurrentMergeScheduler}.</p>
*
- * <p><b>NOTE:</b> This API is new and still experimental
- * (subject to change suddenly in the next release)</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: This class typically requires access to
* package-private APIs (eg, SegmentInfos) to do its job;