You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/26 14:10:08 UTC
svn commit: r916666 [8/16] - in /lucene/java/branches/flex_1458: ./ contrib/
contrib/analyzers/common/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/c...
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharArraySet.java Fri Feb 26 13:09:54 2010
@@ -1,15 +1,5 @@
package org.apache.lucene.analysis;
-import java.util.Arrays;
-import java.util.AbstractSet;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.lucene.util.CharacterUtils;
-import org.apache.lucene.util.Version;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -27,6 +17,13 @@
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
/**
* A simple class that stores Strings as char[]'s in a
@@ -58,16 +55,11 @@
* For type safety also {@link #stringIterator()} is provided.
*/
public class CharArraySet extends AbstractSet<Object> {
- private final static int INIT_SIZE = 8;
- private char[][] entries;
- private int count;
- private final boolean ignoreCase;
- public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(
- new CharArraySet(Version.LUCENE_CURRENT, 0, false));
+ public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
+ private static final Object PLACEHOLDER = new Object();
+
+ private final CharArrayMap<Object> map;
- private final CharacterUtils charUtils;
- private final Version matchVersion;
-
/**
* Create set with enough capacity to hold startSize terms
*
@@ -81,13 +73,7 @@
* otherwise <code>true</code>.
*/
public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) {
- this.ignoreCase = ignoreCase;
- int size = INIT_SIZE;
- while(startSize + (startSize>>2) > size)
- size <<= 1;
- entries = new char[size][];
- this.charUtils = CharacterUtils.getInstance(matchVersion);
- this.matchVersion = matchVersion;
+ this(new CharArrayMap<Object>(matchVersion, startSize, ignoreCase));
}
/**
@@ -102,7 +88,7 @@
* <code>false</code> if and only if the set should be case sensitive
* otherwise <code>true</code>.
*/
- public CharArraySet(Version matchVersion, Collection<? extends Object> c, boolean ignoreCase) {
+ public CharArraySet(Version matchVersion, Collection<?> c, boolean ignoreCase) {
this(matchVersion, c.size(), ignoreCase);
addAll(c);
}
@@ -133,77 +119,51 @@
* @deprecated use {@link #CharArraySet(Version, Collection, boolean)} instead
*/
@Deprecated
- public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
+ public CharArraySet(Collection<?> c, boolean ignoreCase) {
this(Version.LUCENE_30, c.size(), ignoreCase);
addAll(c);
}
- /** Create set from entries */
- private CharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count){
- this.entries = entries;
- this.ignoreCase = ignoreCase;
- this.count = count;
- this.charUtils = CharacterUtils.getInstance(matchVersion);
- this.matchVersion = matchVersion;
+ /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
+ CharArraySet(final CharArrayMap<Object> map){
+ this.map = map;
}
/** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
@Override
public void clear() {
- count = 0;
- Arrays.fill(entries, null);
+ map.clear();
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
public boolean contains(char[] text, int off, int len) {
- return entries[getSlot(text, off, len)] != null;
+ return map.containsKey(text, off, len);
}
/** true if the <code>CharSequence</code> is in the set */
public boolean contains(CharSequence cs) {
- return entries[getSlot(cs)] != null;
+ return map.containsKey(cs);
}
- private int getSlot(char[] text, int off, int len) {
- int code = getHashCode(text, off, len);
- int pos = code & (entries.length-1);
- char[] text2 = entries[pos];
- if (text2 != null && !equals(text, off, len, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (entries.length-1);
- text2 = entries[pos];
- } while (text2 != null && !equals(text, off, len, text2));
- }
- return pos;
+ @Override
+ public boolean contains(Object o) {
+ return map.containsKey(o);
}
- /** Returns true if the String is in the set */
- private int getSlot(CharSequence text) {
- int code = getHashCode(text);
- int pos = code & (entries.length-1);
- char[] text2 = entries[pos];
- if (text2 != null && !equals(text, text2)) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- pos = code & (entries.length-1);
- text2 = entries[pos];
- } while (text2 != null && !equals(text, text2));
- }
- return pos;
+ @Override
+ public boolean add(Object o) {
+ return map.put(o, PLACEHOLDER) == null;
}
/** Add this CharSequence into the set */
public boolean add(CharSequence text) {
- return add(text.toString()); // could be more efficient
+ return map.put(text, PLACEHOLDER) == null;
}
/** Add this String into the set */
public boolean add(String text) {
- return add(text.toCharArray());
+ return map.put(text, PLACEHOLDER) == null;
}
/** Add this char[] directly to the set.
@@ -211,140 +171,12 @@
* The user should never modify this text array after calling this method.
*/
public boolean add(char[] text) {
- if (ignoreCase)
- for(int i=0;i<text.length;){
- i += Character.toChars(
- Character.toLowerCase(
- charUtils.codePointAt(text, i)), text, i);
- }
- int slot = getSlot(text, 0, text.length);
- if (entries[slot] != null) return false;
- entries[slot] = text;
- count++;
-
- if (count + (count>>2) > entries.length) {
- rehash();
- }
-
- return true;
- }
-
- private boolean equals(char[] text1, int off, int len, char[] text2) {
- if (len != text2.length)
- return false;
- final int limit = off+len;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1[off+i] != text2[i])
- return false;
- }
- }
- return true;
- }
-
- private boolean equals(CharSequence text1, char[] text2) {
- int len = text1.length();
- if (len != text2.length)
- return false;
- if (ignoreCase) {
- for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, i);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
- return false;
- i += Character.charCount(codePointAt);
- }
- } else {
- for(int i=0;i<len;i++) {
- if (text1.charAt(i) != text2[i])
- return false;
- }
- }
- return true;
- }
-
-
-
- private void rehash() {
- final int newSize = 2*entries.length;
- char[][] oldEntries = entries;
- entries = new char[newSize][];
-
- for(int i=0;i<oldEntries.length;i++) {
- char[] text = oldEntries[i];
- if (text != null) {
- // todo: could be faster... no need to compare strings on collision
- entries[getSlot(text,0,text.length)] = text;
- }
- }
- }
-
- private int getHashCode(char[] text, int offset, int len) {
- int code = 0;
- final int stop = offset + len;
- if (ignoreCase) {
- for (int i=offset; i<stop;) {
- final int codePointAt = charUtils.codePointAt(text, i, stop);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=offset; i<stop; i++) {
- code = code*31 + text[i];
- }
- }
- return code;
+ return map.put(text, PLACEHOLDER) == null;
}
- private int getHashCode(CharSequence text) {
- int code = 0;
- int len = text.length();
- if (ignoreCase) {
- for (int i=0; i<len;) {
- int codePointAt = charUtils.codePointAt(text, i);
- code = code*31 + Character.toLowerCase(codePointAt);
- i += Character.charCount(codePointAt);
- }
- } else {
- for (int i=0; i<len; i++) {
- code = code*31 + text.charAt(i);
- }
- }
- return code;
- }
-
-
@Override
public int size() {
- return count;
- }
-
- @Override
- public boolean isEmpty() {
- return count==0;
- }
-
- @Override
- public boolean contains(Object o) {
- if (o instanceof char[]) {
- final char[] text = (char[])o;
- return contains(text, 0, text.length);
- }
- return contains(o.toString());
- }
-
- @Override
- public boolean add(Object o) {
- if (o instanceof char[]) {
- return add((char[])o);
- }
- return add(o.toString());
+ return map.size();
}
/**
@@ -362,14 +194,9 @@
throw new NullPointerException("Given set is null");
if (set == EMPTY_SET)
return EMPTY_SET;
- if (set instanceof UnmodifiableCharArraySet)
+ if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
return set;
-
- /*
- * Instead of delegating calls to the given set copy the low-level values to
- * the unmodifiable Subclass
- */
- return new UnmodifiableCharArraySet(set.matchVersion, set.entries, set.ignoreCase, set.count);
+ return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
}
/**
@@ -387,7 +214,7 @@
public static CharArraySet copy(final Set<?> set) {
if(set == EMPTY_SET)
return EMPTY_SET;
- return (set instanceof CharArraySet) ? copy((CharArraySet) set) : copy(Version.LUCENE_30, set);
+ return copy(Version.LUCENE_30, set);
}
/**
@@ -416,29 +243,27 @@
return EMPTY_SET;
if(set instanceof CharArraySet) {
final CharArraySet source = (CharArraySet) set;
- // use fast path instead of iterating all values
- // this is even on very small sets ~10 times faster than iterating
- final char[][] entries = new char[source.entries.length][];
- System.arraycopy(source.entries, 0, entries, 0, entries.length);
- return new CharArraySet(source.matchVersion, entries, source.ignoreCase, source.count);
+ return new CharArraySet(CharArrayMap.copy(source.map.matchVersion, source.map));
}
return new CharArraySet(matchVersion, set, false);
}
-
/** The Iterator<String> for this set. Strings are constructed on the fly, so
- * use <code>nextCharArray</code> for more efficient access. */
+ * use <code>nextCharArray</code> for more efficient access.
+ * @deprecated Use the standard iterator, which returns {@code char[]} instances.
+ */
+ @Deprecated
public class CharArraySetIterator implements Iterator<String> {
int pos=-1;
char[] next;
- CharArraySetIterator() {
+ private CharArraySetIterator() {
goNext();
}
private void goNext() {
next = null;
pos++;
- while (pos < entries.length && (next=entries[pos]) == null) pos++;
+ while (pos < map.keys.length && (next=map.keys[pos]) == null) pos++;
}
public boolean hasNext() {
@@ -463,61 +288,41 @@
}
}
- /** returns an iterator of new allocated Strings */
+ /** returns an iterator of new allocated Strings (an instance of {@link CharArraySetIterator}).
+ * @deprecated Use {@link #iterator}, which returns {@code char[]} instances.
+ */
+ @Deprecated
public Iterator<String> stringIterator() {
return new CharArraySetIterator();
}
- /** returns an iterator of new allocated Strings, this method violates the Set interface */
- @Override
- @SuppressWarnings("unchecked")
+ /** Returns an {@link Iterator} depending on the version used:
+ * <ul>
+ * <li>if {@code matchVersion} ≥ 3.1, it returns {@code char[]} instances in this set.</li>
+ * <li>if {@code matchVersion} is 3.0 or older, it returns new
+ * allocated Strings, so this method violates the Set interface.
+ * It is kept this way for backwards compatibility, normally it should
+ * return {@code char[]} on {@code next()}</li>
+ * </ul>
+ */
+ @Override @SuppressWarnings("unchecked")
public Iterator<Object> iterator() {
- return (Iterator) stringIterator();
+ // use the AbstractSet#keySet()'s iterator (to not produce endless recursion)
+ return map.matchVersion.onOrAfter(Version.LUCENE_31) ?
+ map.originalKeySet().iterator() : (Iterator) stringIterator();
}
- /**
- * Efficient unmodifiable {@link CharArraySet}. This implementation does not
- * delegate calls to a give {@link CharArraySet} like
- * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes
- * the internal representation of a {@link CharArraySet} to a super
- * constructor and overrides all mutators.
- */
- private static final class UnmodifiableCharArraySet extends CharArraySet {
-
- private UnmodifiableCharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase,
- int count) {
- super(matchVersion, entries, ignoreCase, count);
- }
-
- @Override
- public void clear() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(Object o){
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean addAll(Collection<? extends Object> coll) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(char[] text) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(CharSequence text) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean add(String text) {
- throw new UnsupportedOperationException();
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("[");
+ for (Object item : this) {
+ if (sb.length()>1) sb.append(", ");
+ if (item instanceof char[]) {
+ sb.append((char[]) item);
+ } else {
+ sb.append(item);
+ }
}
+ return sb.append(']').toString();
}
-
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharFilter.java Fri Feb 26 13:09:54 2010
@@ -24,9 +24,6 @@
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter/CharStream subclass is used.
- *
- * @version $Id$
- *
*/
public abstract class CharFilter extends CharStream {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Feb 26 13:09:54 2010
@@ -23,59 +23,310 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
-
-/** An abstract base class for simple, character-oriented tokenizers.*/
+import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
+import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+
+/**
+ * An abstract base class for simple, character-oriented tokenizers.
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link #isTokenChar(int)} and
+ * {@link #normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ * A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API
+ * moved from UTF-16 code units to UTF-32 codepoints to eventually add support
+ * for <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. The old <i>char</i> based API has been
+ * deprecated and should be replaced with the <i>int</i> based methods
+ * {@link #isTokenChar(int)} and {@link #normalize(int)}.
+ * </p>
+ * <p>
+ * As of Lucene 3.1 each {@link CharTokenizer} - constructor expects a
+ * {@link Version} argument. Based on the given {@link Version} either the new
+ * API or a backwards compatibility layer is used at runtime. For
+ * {@link Version} < 3.1 the backwards compatibility layer ensures correct
+ * behavior even for indexes build with previous versions of Lucene. If a
+ * {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to
+ * be implemented by the instantiated class. Yet, the old <i>char</i> based API
+ * is not required anymore even if backwards compatibility must be preserved.
+ * {@link CharTokenizer} subclasses implementing the new API are fully backwards
+ * compatible if instantiated with {@link Version} < 3.1.
+ * </p>
+ * <p>
+ * <strong>Note:</strong> If you use a subclass of {@link CharTokenizer} with {@link Version} >=
+ * 3.1 on an index build with a version < 3.1, created tokens might not be
+ * compatible with the terms in your index.
+ * </p>
+ **/
public abstract class CharTokenizer extends Tokenizer {
- public CharTokenizer(Reader input) {
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, Reader input) {
super(input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
- }
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
- public CharTokenizer(AttributeSource source, Reader input) {
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, AttributeSource source,
+ Reader input) {
super(source, input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
}
-
- public CharTokenizer(AttributeFactory factory, Reader input) {
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param input
+ * the input to split up into tokens
+ */
+ public CharTokenizer(Version matchVersion, AttributeFactory factory,
+ Reader input) {
super(factory, input);
+ charUtils = CharacterUtils.getInstance(matchVersion);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(TermAttribute.class);
+ useOldAPI = useOldAPI(matchVersion);
+ ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @deprecated use {@link #CharTokenizer(Version, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(Reader input) {
+ this(Version.LUCENE_30, input);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @param source the attribute source to use for this {@link Tokenizer}
+ * @deprecated use {@link #CharTokenizer(Version, AttributeSource, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(AttributeSource source, Reader input) {
+ this(Version.LUCENE_30, source, input);
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ * @param input the input to split up into tokens
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @deprecated use {@link #CharTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. This will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ public CharTokenizer(AttributeFactory factory, Reader input) {
+ this(Version.LUCENE_30, factory, input);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
- private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
- private TermAttribute termAtt;
- private OffsetAttribute offsetAtt;
+ private final TermAttribute termAtt;
+ private final OffsetAttribute offsetAtt;
+
+ private final CharacterUtils charUtils;
+ private final CharacterBuffer ioBuffer;
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private final boolean useOldAPI;
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private static final VirtualMethod<CharTokenizer> isTokenCharMethod =
+ new VirtualMethod<CharTokenizer>(CharTokenizer.class, "isTokenChar", char.class);
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private static final VirtualMethod<CharTokenizer> normalizeMethod =
+ new VirtualMethod<CharTokenizer>(CharTokenizer.class, "normalize", char.class);
- /** Returns true iff a character should be included in a token. This
+ /**
+ * Returns true iff a UTF-16 code unit should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
- * satisfy this predicate. Characters for which this is false are used to
- * define token boundaries and are not included in tokens. */
- protected abstract boolean isTokenChar(char c);
-
- /** Called on each token character to normalize it before it is added to the
- * token. The default implementation does nothing. Subclasses may use this
- * to, e.g., lowercase tokens. */
+ * satisfy this predicate. Characters for which this is <code>false</code> are
+ * used to define token boundaries and are not included in tokens.
+ * <p>
+ * Note: This method cannot handle <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. To support all Unicode characters, including
+ * supplementary characters, use the {@link #isTokenChar(int)} method.
+ * </p>
+ *
+ * @deprecated use {@link #isTokenChar(int)} instead. This method will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
+ protected boolean isTokenChar(char c) {
+ return isTokenChar((int)c);
+ }
+
+ /**
+ * Called on each token UTF-16 code unit to normalize it before it is added to the
+ * token. The default implementation does nothing. Subclasses may use this to,
+ * e.g., lowercase tokens.
+ * <p>
+ * Note: This method cannot handle <a href=
+ * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
+ * >supplementary characters</a>. To support all Unicode characters, including
+ * supplementary characters, use the {@link #normalize(int)} method.
+ * </p>
+ *
+ * @deprecated use {@link #normalize(int)} instead. This method will be
+ * removed in Lucene 4.0.
+ */
+ @Deprecated
protected char normalize(char c) {
+ return (char) normalize((int) c);
+ }
+
+ /**
+ * Returns true iff a codepoint should be included in a token. This tokenizer
+ * generates as tokens adjacent sequences of codepoints which satisfy this
+ * predicate. Codepoints for which this is false are used to define token
+ * boundaries and are not included in tokens.
+ * <p>
+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of UTF-16 code
+ * units. Subclasses of {@link CharTokenizer} must not override the char based
+ * methods if a {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * <p>
+ * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
+ * </p>
+ */
+ protected boolean isTokenChar(int c) {
+ throw new UnsupportedOperationException("since LUCENE_3_1 subclasses of CharTokenizer must implement isTokenChar(int)");
+ }
+
+ /**
+ * Called on each token character to normalize it before it is added to the
+ * token. The default implementation does nothing. Subclasses may use this to,
+ * e.g., lowercase tokens.
+ * <p>
+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of UTF-16 code
+ * units. Subclasses of {@link CharTokenizer} must not override the char based
+ * methods if a {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * <p>
+ * NOTE: This method will be marked <i>abstract</i> in Lucene 4.0.
+ * </p>
+ */
+ protected int normalize(int c) {
return c;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
+ if(useOldAPI) // TODO remove this in LUCENE 4.0
+ return incrementTokenOld();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+ if (bufferIndex >= dataLen) {
+ offset += dataLen;
+ if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
+ dataLen = 0; // so next offset += dataLen won't decrement offset
+ if (length > 0)
+ break;
+ else
+ return false;
+ }
+ dataLen = ioBuffer.getLength();
+ bufferIndex = 0;
+ }
+ // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
+ final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
+ bufferIndex += Character.charCount(c);
+
+ if (isTokenChar(c)) { // if it's a token char
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length >= buffer.length-1) // check if a supplementary could run out of bounds
+ buffer = termAtt.resizeTermBuffer(2+length); // make sure a supplementary fits in the buffer
+ length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
+ if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
+ break;
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.setTermLength(length);
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+ return true;
+
+ }
+
+ /**
+ * The <= 3.0 version of incrementToken. This is a backwards compat implementation used
+ * if a version <= 3.0 is provided to the ctor.
+ * @deprecated remove in 4.0
+ */
+ @Deprecated
+ private boolean incrementTokenOld() throws IOException {
int length = 0;
int start = bufferIndex;
char[] buffer = termAtt.termBuffer();
+ final char[] oldIoBuffer = ioBuffer.getBuffer();
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
- dataLen = input.read(ioBuffer);
+ dataLen = input.read(oldIoBuffer);
if (dataLen == -1) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0)
@@ -86,7 +337,7 @@
bufferIndex = 0;
}
- final char c = ioBuffer[bufferIndex++];
+ final char c = oldIoBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
@@ -107,12 +358,14 @@
termAtt.setTermLength(length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
- }
+ }
+
+
@Override
public final void end() {
// set final offset
- int finalOffset = correctOffset(offset);
+ final int finalOffset = correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@@ -122,5 +375,19 @@
bufferIndex = 0;
offset = 0;
dataLen = 0;
+ ioBuffer.reset(); // make sure to reset the IO buffer!!
}
-}
+
+ /**
+ * @deprecated this will be removed in lucene 4.0
+ */
+ @Deprecated
+ private boolean useOldAPI(Version matchVersion) {
+ final Class<? extends CharTokenizer> clazz = this.getClass();
+ if (matchVersion.onOrAfter(Version.LUCENE_31)
+ && (isTokenCharMethod.isOverriddenAsOf(clazz) || normalizeMethod
+ .isOverriddenAsOf(clazz))) throw new IllegalArgumentException(
+ "For matchVersion >= LUCENE_31, CharTokenizer subclasses must not override isTokenChar(char) or normalize(char).");
+ return !matchVersion.onOrAfter(Version.LUCENE_31);
+ }
+}
\ No newline at end of file
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LetterTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,34 +20,106 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
-/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
- to say, it defines tokens as maximal strings of adjacent letters, as defined
- by java.lang.Character.isLetter() predicate.
-
- Note: this does a decent job for most European languages, but does a terrible
- job for some Asian languages, where words are not separated by spaces. */
+/**
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
+ * say, it defines tokens as maximal strings of adjacent letters, as defined by
+ * java.lang.Character.isLetter() predicate.
+ * <p>
+ * Note: this does a decent job for most European languages, but does a terrible
+ * job for some Asian languages, where words are not separated by spaces.
+ * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
+ */
public class LetterTokenizer extends CharTokenizer {
- /** Construct a new LetterTokenizer. */
+
+ /**
+ * Construct a new LetterTokenizer.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new LetterTokenizer.
+ *
+ * @deprecated use {@link #LetterTokenizer(Version, Reader)} instead. This
+ * will be removed in Lucene 4.0.
+ */
public LetterTokenizer(Reader in) {
- super(in);
+ super(Version.LUCENE_30, in);
}
- /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new LetterTokenizer using a given {@link AttributeSource}.
+ * @deprecated
+ * use {@link #LetterTokenizer(Version, AttributeSource, Reader)} instead.
+ * This will be removed in Lucene 4.0.
+ */
public LetterTokenizer(AttributeSource source, Reader in) {
- super(source, in);
+ super(Version.LUCENE_30, source, in);
}
- /** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new LetterTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @deprecated use {@link #LetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
public LetterTokenizer(AttributeFactory factory, Reader in) {
- super(factory, in);
+ super(Version.LUCENE_30, factory, in);
}
-
+
/** Collects only characters which satisfy
- * {@link Character#isLetter(char)}.*/
+ * {@link Character#isLetter(int)}.*/
@Override
- protected boolean isTokenChar(char c) {
+ protected boolean isTokenChar(int c) {
return Character.isLetter(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,6 +20,7 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
/**
* LowerCaseTokenizer performs the function of LetterTokenizer
@@ -30,27 +31,98 @@
* <P>
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
+ * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LowerCaseTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
*/
public final class LowerCaseTokenizer extends LetterTokenizer {
- /** Construct a new LowerCaseTokenizer. */
+
+ /**
+ * Construct a new LowerCaseTokenizer.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ *
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new LowerCaseTokenizer.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, Reader)} instead.
+   *             This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public LowerCaseTokenizer(Reader in) {
- super(in);
+ super(Version.LUCENE_30, in);
}
- /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, AttributeSource, Reader)}
+   *             instead. This will be removed in Lucene 4.0.
+ */
public LowerCaseTokenizer(AttributeSource source, Reader in) {
- super(source, in);
+ super(Version.LUCENE_30, source, in);
}
- /** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new LowerCaseTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+   * @deprecated use {@link #LowerCaseTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+   *             instead. This will be removed in Lucene 4.0.
+ */
public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
- super(factory, in);
+ super(Version.LUCENE_30, factory, in);
}
/** Converts char to lower case
- * {@link Character#toLowerCase(char)}.*/
+ * {@link Character#toLowerCase(int)}.*/
@Override
- protected char normalize(char c) {
+ protected int normalize(int c) {
return Character.toLowerCase(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Feb 26 13:09:54 2010
@@ -82,8 +82,7 @@
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
* parameter as well as how numeric fields work under the hood.</p>
*
- * <p><font color="red"><b>NOTE:</b> This API is experimental and
- * might change in incompatible ways in the next release.</font>
+ * @lucene.experimental
*
* @since 2.9
*/
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java Fri Feb 26 13:09:54 2010
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import org.apache.lucene.document.Fieldable;
+
import java.io.Reader;
import java.io.IOException;
import java.util.Map;
@@ -118,6 +120,15 @@
analyzer = defaultAnalyzer;
return analyzer.getPositionIncrementGap(fieldName);
}
+
+ /** Return the offsetGap from the analyzer assigned to field */
+ @Override
+ public int getOffsetGap(Fieldable field) {
+ Analyzer analyzer = analyzerMap.get(field.name());
+ if (analyzer == null)
+ analyzer = defaultAnalyzer;
+ return analyzer.getOffsetGap(field);
+ }
@Override
public String toString() {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemFilter.java Fri Feb 26 13:09:54 2010
@@ -19,6 +19,7 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** Transforms the token stream as per the Porter stemming algorithm.
@@ -38,15 +39,23 @@
}
}
</PRE>
+ <p>
+ Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+ certain terms from being passed to the stemmer
+ {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+ in a previous {@link TokenStream}.
+ </p>
*/
public final class PorterStemFilter extends TokenFilter {
- private PorterStemmer stemmer;
- private TermAttribute termAtt;
+ private final PorterStemmer stemmer;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
@@ -54,7 +63,7 @@
if (!input.incrementToken())
return false;
- if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+ if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return true;
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/PorterStemmer.java Fri Feb 26 13:09:54 2010
@@ -44,7 +44,12 @@
*/
-import java.io.*;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.FileInputStream;
+
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
+import org.apache.lucene.util.ArrayUtil;
/**
*
@@ -61,11 +66,10 @@
private int i, /* offset into b */
j, k, k0;
private boolean dirty = false;
- private static final int INC = 50; /* unit of size whereby b is increased */
- private static final int EXTRA = 1;
+ private static final int INITIAL_SIZE = 50;
public PorterStemmer() {
- b = new char[INC];
+ b = new char[INITIAL_SIZE];
i = 0;
}
@@ -81,10 +85,8 @@
* adding characters, you can call stem(void) to process the word.
*/
public void add(char ch) {
- if (b.length <= i + EXTRA) {
- char[] new_b = new char[b.length+INC];
- System.arraycopy(b, 0, new_b, 0, b.length);
- b = new_b;
+ if (b.length <= i) {
+ b = ArrayUtil.grow(b, i+1);
}
b[i++] = ch;
}
@@ -451,8 +453,7 @@
public boolean stem(char[] wordBuffer, int offset, int wordLen) {
reset();
if (b.length < wordLen) {
- char[] new_b = new char[wordLen + EXTRA];
- b = new_b;
+ b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
}
System.arraycopy(wordBuffer, offset, b, 0, wordLen);
i = wordLen;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -19,14 +19,42 @@
import java.io.Reader;
-/** An {@link Analyzer} that filters {@link LetterTokenizer}
- * with {@link LowerCaseFilter} */
+import org.apache.lucene.util.Version;
+/** An {@link Analyzer} that filters {@link LetterTokenizer}
+ * with {@link LowerCaseFilter}
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link LowerCaseTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ **/
public final class SimpleAnalyzer extends ReusableAnalyzerBase {
+ private final Version matchVersion;
+
+ /**
+ * Creates a new {@link SimpleAnalyzer}
+ * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ */
+ public SimpleAnalyzer(Version matchVersion) {
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Creates a new {@link SimpleAnalyzer}
+ * @deprecated use {@link #SimpleAnalyzer(Version)} instead
+ */
+ @Deprecated public SimpleAnalyzer() {
+ this(Version.LUCENE_30);
+ }
@Override
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
- return new TokenStreamComponents(new LowerCaseTokenizer(reader));
+ return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion, reader));
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/StopAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -99,7 +99,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new LowerCaseTokenizer(reader);
+ final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
source, stopwords));
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Token.java Fri Feb 26 13:09:54 2010
@@ -29,6 +29,7 @@
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.RamUsageEstimator;
/**
A Token is an occurrence of a term from the text of a field. It consists of
@@ -347,12 +348,12 @@
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
- final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@@ -367,19 +368,19 @@
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
- termBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
- termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
+ termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}
@@ -453,14 +454,14 @@
}
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
*
- *
+ *
* @return The bits
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public int getFlags() {
return flags;
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/Tokenizer.java
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Fri Feb 26 13:09:54 2010
@@ -0,0 +1 @@
+/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/analysis/Tokenizer.java:909334
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -19,13 +19,44 @@
import java.io.Reader;
-/** An Analyzer that uses {@link WhitespaceTokenizer}. */
+import org.apache.lucene.util.Version;
+/**
+ * An Analyzer that uses {@link WhitespaceTokenizer}.
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link WhitespaceAnalyzer}:
+ * <ul>
+ * <li>As of 3.1, {@link WhitespaceTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
+ **/
public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
-
+
+ private final Version matchVersion;
+
+ /**
+ * Creates a new {@link WhitespaceAnalyzer}
+ * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ */
+ public WhitespaceAnalyzer(Version matchVersion) {
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Creates a new {@link WhitespaceAnalyzer}
+ * @deprecated use {@link #WhitespaceAnalyzer(Version)} instead
+ */
+ @Deprecated
+ public WhitespaceAnalyzer() {
+ this(Version.LUCENE_30);
+ }
+
@Override
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
- return new TokenStreamComponents(new WhitespaceTokenizer(reader));
+ return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion, reader));
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java Fri Feb 26 13:09:54 2010
@@ -20,30 +20,102 @@
import java.io.Reader;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
-/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
- * Adjacent sequences of non-Whitespace characters form tokens. */
-
+/**
+ * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ * Adjacent sequences of non-Whitespace characters form tokens. <a
+ * name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link WhitespaceTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ */
public class WhitespaceTokenizer extends CharTokenizer {
- /** Construct a new WhitespaceTokenizer. */
+
+ /**
+   * Construct a new WhitespaceTokenizer.
+   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
+ *
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, Reader in) {
+ super(matchVersion, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}.
+ *
+ * @param matchVersion
+ * Lucene version to match See {@link <a href="#version">above</a>}
+ * @param source
+ * the attribute source to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, AttributeSource source, Reader in) {
+ super(matchVersion, source, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @param
+ * matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param factory
+ * the attribute factory to use for this {@link Tokenizer}
+ * @param in
+ * the input to split up into tokens
+ */
+ public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
+ super(matchVersion, factory, in);
+ }
+
+ /**
+ * Construct a new WhitespaceTokenizer.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, Reader)} instead. This will
+ * be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(Reader in) {
super(in);
}
- /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
+ /**
+ * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
- /** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */
+ /**
+ * Construct a new WhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
+ *
+ * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
+ * instead. This will be removed in Lucene 4.0.
+ */
+ @Deprecated
public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
/** Collects only characters which do not satisfy
- * {@link Character#isWhitespace(char)}.*/
+ * {@link Character#isWhitespace(int)}.*/
@Override
- protected boolean isTokenChar(char c) {
+ protected boolean isTokenChar(int c) {
return !Character.isWhitespace(c);
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/WordlistLoader.java Fri Feb 26 13:09:54 2010
@@ -191,6 +191,66 @@
return result;
}
+ /**
+ * Loads a text file in Snowball format associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
+ * a {@link Set}. The words need to be in lower-case if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param aClass a class that is associated with the given stopwordResource
+ * @param stopwordResource name of the resource file associated with the given
+ * class
+ * @return a {@link Set} with the file's words
+ * @see #getSnowballWordSet(Reader)
+ */
+ public static Set<String> getSnowballWordSet(Class<?> aClass,
+ String stopwordResource) throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getSnowballWordSet(reader);
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
+ * Reads stopwords from a stopword list in Snowball format.
+ * <p>
+ * The snowball format is the following:
+ * <ul>
+ * <li>Lines may contain multiple words separated by whitespace.
+ * <li>The comment character is the vertical line (|).
+ * <li>Lines may contain trailing comments.
+ * </ul>
+ * </p>
+ *
+ * @param reader Reader containing a Snowball stopword list
+ * @return A Set with the reader's words
+ */
+ public static Set<String> getSnowballWordSet(Reader reader)
+ throws IOException {
+ final Set<String> result = new HashSet<String>();
+ BufferedReader br = null;
+ try {
+ if (reader instanceof BufferedReader) {
+ br = (BufferedReader) reader;
+ } else {
+ br = new BufferedReader(reader);
+ }
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ int comment = line.indexOf('|');
+ if (comment >= 0) line = line.substring(0, comment);
+ String words[] = line.split("\\s+");
+ for (int i = 0; i < words.length; i++)
+ if (words[i].length() > 0) result.add(words[i]);
+ }
+ } finally {
+ if (br != null) br.close();
+ }
+ return result;
+ }
/**
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java Fri Feb 26 13:09:54 2010
@@ -23,10 +23,10 @@
/**
* This attribute can be used to pass different flags down the {@link Tokenizer} chain,
* eg from one TokenFilter to another one.
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public interface FlagsAttribute extends Attribute {
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java Fri Feb 26 13:09:54 2010
@@ -24,12 +24,12 @@
/**
* This attribute can be used to pass different flags down the tokenizer chain,
* eg from one TokenFilter to another one.
+ * @lucene.experimental While we think this is here to stay, we may want to change it to be a long.
*/
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
private int flags = 0;
/**
- * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
*
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java Fri Feb 26 13:09:54 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* The term text of a Token.
@@ -106,12 +107,12 @@
public char[] resizeTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation and preserve content
- final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
termBuffer = newCharBuffer;
}
@@ -127,19 +128,19 @@
private void growTermBuffer(int newSize) {
if (termBuffer == null) {
// The buffer is always at least MIN_BUFFER_SIZE
- termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
} else {
if(termBuffer.length < newSize){
// Not big enough; create a new array with slight
// over allocation:
- termBuffer = new char[ArrayUtil.getNextSize(newSize)];
+ termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)];
}
}
}
private void initTermBuffer() {
if (termBuffer == null) {
- termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)];
+ termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)];
termLength = 0;
}
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/document/NumericField.java Fri Feb 26 13:09:54 2010
@@ -134,9 +134,7 @@
* values are returned as {@link String}s (according to
* <code>toString(value)</code> of the used data type).
*
- * <p><font color="red"><b>NOTE:</b> This API is
- * experimental and might change in incompatible ways in the
- * next release.</font>
+ * @lucene.experimental
*
* @since 2.9
*/
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/ByteBlockPool.java Fri Feb 26 13:09:54 2010
@@ -35,6 +35,8 @@
import java.util.Arrays;
import org.apache.lucene.util.BytesRef;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+import org.apache.lucene.util.ArrayUtil;
final class ByteBlockPool {
@@ -84,7 +86,8 @@
public void nextBuffer() {
if (1+bufferUpto == buffers.length) {
- byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][];
+ byte[][] newBuffers = new byte[ArrayUtil.oversize(buffers.length+1,
+ NUM_BYTES_OBJECT_REF)][];
System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
buffers = newBuffers;
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java Fri Feb 26 13:09:54 2010
@@ -44,9 +44,7 @@
* <p>As this tool checks every byte in the index, on a large
* index it can take quite a long time to run.
*
- * <p><b>WARNING</b>: this tool and API is new and
- * experimental and is subject to suddenly change in the
- * next release. Please make a complete backup of your
+ * @lucene.experimental Please make a complete backup of your
* index before using this to fix your index!
*/
public class CheckIndex {
@@ -57,8 +55,7 @@
/**
* Returned from {@link #checkIndex()} detailing the health and status of the index.
*
- * <p><b>WARNING</b>: this API is new and experimental and is
- * subject to suddenly change in the next release.
+ * @lucene.experimental
**/
public static class Status {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DirectoryReader.java Fri Feb 26 13:09:54 2010
@@ -985,6 +985,12 @@
// not a good idea):
FieldCache.DEFAULT.purge(this);
+ if (writer != null) {
+ // Since we just closed, writer may now be able to
+ // delete unused files:
+ writer.deleteUnusedFiles();
+ }
+
// throw the first exception
if (ioe != null) throw ioe;
}
@@ -1032,7 +1038,7 @@
/**
* Expert: return the IndexCommit that this reader has opened.
* <p/>
- * <p><b>WARNING</b>: this API is new and experimental and may suddenly change.</p>
+ * @lucene.experimental
*/
@Override
public IndexCommit getIndexCommit() throws IOException {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldConsumers.java Fri Feb 26 13:09:54 2010
@@ -25,6 +25,7 @@
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
/** This is just a "splitter" class: it lets you wrap two
* DocFieldConsumer instances as a single consumer. */
@@ -117,7 +118,7 @@
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
- docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* Gathers all Fieldables for a document under the same
@@ -340,7 +341,7 @@
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
- docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java Fri Feb 26 13:09:54 2010
@@ -42,6 +42,7 @@
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
/**
* This class accepts multiple added documents and directly
@@ -999,57 +1000,58 @@
assert checkDeleteTerm(null);
// Delete by term
- try {
- Fields fields = reader.fields();
- TermsEnum termsEnum = null;
-
- String currentField = null;
- BytesRef termRef = new BytesRef();
- DocsEnum docs = null;
-
- for (Entry<Term, BufferedDeletes.Num> entry: deletesFlushed.terms.entrySet()) {
- Term term = entry.getKey();
- // Since we visit terms sorted, we gain performance
- // by re-using the same TermsEnum and seeking only
- // forwards
- if (term.field() != currentField) {
- // nocommit -- once we sync up branch again, add
- // assert that this field is always > last one
- currentField = term.field();
- Terms terms = fields.terms(currentField);
- if (terms != null) {
- termsEnum = terms.iterator();
- } else {
- termsEnum = null;
+ if (deletesFlushed.terms.size() > 0) {
+ try {
+ Fields fields = reader.fields();
+ TermsEnum termsEnum = null;
+
+ String currentField = null;
+ BytesRef termRef = new BytesRef();
+ DocsEnum docs = null;
+
+ for (Entry<Term, BufferedDeletes.Num> entry: deletesFlushed.terms.entrySet()) {
+ Term term = entry.getKey();
+ // Since we visit terms sorted, we gain performance
+ // by re-using the same TermsEnum and seeking only
+ // forwards
+ if (term.field() != currentField) {
+ // nocommit -- once we sync up branch again, add
+ // assert that this field is always > last one
+ currentField = term.field();
+ Terms terms = fields.terms(currentField);
+ if (terms != null) {
+ termsEnum = terms.iterator();
+ } else {
+ termsEnum = null;
+ }
}
- }
-
- if (termsEnum == null) {
- continue;
- }
-
- termRef.copy(term.text());
- if (termsEnum.seek(termRef) == TermsEnum.SeekStatus.FOUND) {
- DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
-
- if (docsEnum != null) {
- docs = docsEnum;
- int limit = entry.getValue().getNum();
- while (true) {
- final int docID = docs.nextDoc();
- if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) {
- break;
+
+ if (termsEnum == null) {
+ continue;
+ }
+
+ termRef.copy(term.text());
+ if (termsEnum.seek(termRef) == TermsEnum.SeekStatus.FOUND) {
+ DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
+
+ if (docsEnum != null) {
+ docs = docsEnum;
+ int limit = entry.getValue().getNum();
+ while (true) {
+ final int docID = docs.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS || docIDStart+docID >= limit) {
+ break;
+ }
+ reader.deleteDocument(docID);
+ any = true;
}
- reader.deleteDocument(docID);
- any = true;
}
}
}
+ } finally {
+ //docs.close();
}
- } finally {
- //docs.close();
}
-
// Delete by docID
for (Integer docIdInt : deletesFlushed.docIDs) {
int docID = docIdInt.intValue();
@@ -1060,23 +1062,28 @@
}
// Delete by query
- IndexSearcher searcher = new IndexSearcher(reader);
- for (Entry<Query, Integer> entry : deletesFlushed.queries.entrySet()) {
- Query query = entry.getKey();
- int limit = entry.getValue().intValue();
- Weight weight = query.weight(searcher);
- Scorer scorer = weight.scorer(reader, true, false);
- if (scorer != null) {
- while(true) {
- int doc = scorer.nextDoc();
- if (((long) docIDStart) + doc >= limit)
- break;
- reader.deleteDocument(doc);
- any = true;
+ if (deletesFlushed.queries.size() > 0) {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ try {
+ for (Entry<Query, Integer> entry : deletesFlushed.queries.entrySet()) {
+ Query query = entry.getKey();
+ int limit = entry.getValue().intValue();
+ Weight weight = query.weight(searcher);
+ Scorer scorer = weight.scorer(reader, true, false);
+ if (scorer != null) {
+ while(true) {
+ int doc = scorer.nextDoc();
+ if (((long) docIDStart) + doc >= limit)
+ break;
+ reader.deleteDocument(doc);
+ any = true;
+ }
+ }
}
+ } finally {
+ searcher.close();
}
}
- searcher.close();
return any;
}
@@ -1507,7 +1514,7 @@
int gap = doc.docID - nextWriteDocID;
if (gap >= waiting.length) {
// Grow queue
- DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)];
+ DocWriter[] newArray = new DocWriter[ArrayUtil.oversize(gap, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert nextWriteLoc >= 0;
System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc);
System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc);
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FieldInvertState.java Fri Feb 26 13:09:54 2010
@@ -23,8 +23,7 @@
* being added to the index. The information collected in this class is
* also used to calculate the normalization factor for a field.
*
- * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
- * change.</p>
+ * @lucene.experimental
*/
public final class FieldInvertState {
int position;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexCommit.java Fri Feb 26 13:09:54 2010
@@ -37,8 +37,7 @@
* associated with it. The segments file associated with a
* later index commit point would have a larger N.</p>
*
- * <p><b>WARNING</b>: This API is a new and experimental and
- * may suddenly change. </p>
+ * @lucene.experimental
*/
public abstract class IndexCommit {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexFileDeleter.java Fri Feb 26 13:09:54 2010
@@ -343,7 +343,7 @@
deletePendingFiles();
}
- private void deletePendingFiles() throws IOException {
+ public void deletePendingFiles() throws IOException {
if (deletable != null) {
List<String> oldDeletable = deletable;
deletable = null;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexReader.java Fri Feb 26 13:09:54 2010
@@ -1266,8 +1266,7 @@
* readers that correspond to a Directory with its own
* segments_N file.
*
- * <p><b>WARNING</b>: this API is new and experimental and
- * may suddenly change.</p>
+ * @lucene.experimental
*/
public IndexCommit getIndexCommit() throws IOException {
throw new UnsupportedOperationException("This reader does not support this method.");
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java Fri Feb 26 13:09:54 2010
@@ -362,8 +362,7 @@
* if you attempt to reopen any of those readers, you'll
* hit an {@link AlreadyClosedException}.</p>
*
- * <p><b>NOTE:</b> This API is experimental and might
- * change in incompatible ways in the next release.</p>
+ * @lucene.experimental
*
* @return IndexReader that covers entire index plus all
* changes made so far by this IndexWriter instance
@@ -3327,12 +3326,18 @@
}
}
- // This is called after pending added and deleted
- // documents have been flushed to the Directory but before
- // the change is committed (new segments_N file written).
- void doAfterFlush()
- throws IOException {
- }
+ /**
+ * A hook for extending classes to execute operations after pending added and
+ * deleted documents have been flushed to the Directory but before the change
+ * is committed (new segments_N file written).
+ */
+ protected void doAfterFlush() throws IOException {}
+
+ /**
+ * A hook for extending classes to execute operations before pending added and
+ * deleted documents are flushed to the Directory.
+ */
+ protected void doBeforeFlush() throws IOException {}
/** Expert: prepare for commit.
*
@@ -3540,6 +3545,8 @@
assert testPoint("startDoFlush");
+ doBeforeFlush();
+
flushCount++;
// If we are flushing because too many deletes
@@ -4857,8 +4864,7 @@
* search, but will reduce search latency on opening a
* new near real-time reader after a merge completes.
*
- * <p><b>NOTE:</b> This API is experimental and might
- * change in incompatible ways in the next release.</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: warm is called before any deletes have
* been carried over to the merged segment. */
@@ -4917,4 +4923,25 @@
synchronized boolean isClosed() {
return closed;
}
+
+ /** Expert: remove any index files that are no longer
+ * used.
+ *
+ * <p> IndexWriter normally deletes unused files itself,
+ * during indexing. However, on Windows, which disallows
+ * deletion of open files, if there is a reader open on
+ * the index then those files cannot be deleted. This is
+ * fine, because IndexWriter will periodically retry
+ * the deletion.</p>
+ *
+ * <p> However, IndexWriter doesn't try that often: only
+ * on open, close, flushing a new segment, and finishing
+ * a merge. If you don't do any of these actions with your
+ * IndexWriter, you'll see the unused files linger. If
+ * that's a problem, call this method to delete them
+ * (once you've closed the open readers that were
+ * preventing their deletion). */
+ public synchronized void deleteUnusedFiles() throws IOException {
+ deleter.deletePendingFiles();
+ }
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergePolicy.java Fri Feb 26 13:09:54 2010
@@ -51,8 +51,7 @@
* <p>The default MergePolicy is {@link
* LogByteSizeMergePolicy}.</p>
*
- * <p><b>NOTE:</b> This API is new and still experimental
- * (subject to change suddenly in the next release)</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: This class typically requires access to
* package-private APIs (e.g. <code>SegmentInfos</code>) to do its job;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/MergeScheduler.java Fri Feb 26 13:09:54 2010
@@ -24,8 +24,7 @@
* selected by a {@link MergePolicy}. The default
* MergeScheduler is {@link ConcurrentMergeScheduler}.</p>
*
- * <p><b>NOTE:</b> This API is new and still experimental
- * (subject to change suddenly in the next release)</p>
+ * @lucene.experimental
*
* <p><b>NOTE</b>: This class typically requires access to
* package-private APIs (eg, SegmentInfos) to do its job;