You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC

svn commit: r1512909 [10/38] - in /lucene/dev/branches/lucene4956: ./ dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/ dev-tools/idea/solr/core/src/test/ dev-too...

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java Sun Aug 11 12:19:13 2013
@@ -19,10 +19,40 @@ package org.apache.lucene.search.similar
 
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
 
-/** Expert: Default scoring implementation. */
+/**
+ * Expert: Default scoring implementation which {@link #encodeNormValue(float)
+ * encodes} norm values as a single byte before being stored. At search time,
+ * the norm byte value is read from the index
+ * {@link org.apache.lucene.store.Directory directory} and
+ * {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
+ * This encoding/decoding, while reducing index size, comes with the price of
+ * precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
+ * instance, <i>decode(encode(0.89)) = 0.75</i>.
+ * <p>
+ * Compression of norm values to a single byte saves memory at search time,
+ * because once a field is referenced at search time, its norms - for all
+ * documents - are maintained in memory.
+ * <p>
+ * The rationale supporting such lossy compression of norm values is that given
+ * the difficulty (and inaccuracy) of users to express their true information
+ * need by a query, only big differences matter. <br>
+ * &nbsp;<br>
+ * Last, note that search time is too late to modify this <i>norm</i> part of
+ * scoring, e.g. by using a different {@link Similarity} for search.
+ */
 public class DefaultSimilarity extends TFIDFSimilarity {
   
+  /** Cache of decoded bytes. */
+  private static final float[] NORM_TABLE = new float[256];
+
+  static {
+    for (int i = 0; i < 256; i++) {
+      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+    }
+  }
+
   /** Sole constructor: parameter-free */
   public DefaultSimilarity() {}
   
@@ -38,6 +68,35 @@ public class DefaultSimilarity extends T
     return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
   }
   
+  /**
+   * Encodes a normalization factor for storage in an index.
+   * <p>
+   * The encoding uses a three-bit mantissa, a five-bit exponent, and the
+   * zero-exponent point at 15, thus representing values from around 7x10^9 to
+   * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
+   * represented. Negative numbers are rounded up to zero. Values too large to
+   * represent are rounded down to the largest representable value. Positive
+   * values too small to represent are rounded up to the smallest positive
+   * representable value.
+   * 
+   * @see org.apache.lucene.document.Field#setBoost(float)
+   * @see org.apache.lucene.util.SmallFloat
+   */
+  @Override
+  public final long encodeNormValue(float f) {
+    return SmallFloat.floatToByte315(f);
+  }
+
+  /**
+   * Decodes the norm value, assuming it is a single byte.
+   * 
+   * @see #encodeNormValue(float)
+   */
+  @Override
+  public final float decodeNormValue(long norm) {
+    return NORM_TABLE[(int) (norm & 0xFF)];  // & 0xFF maps negative bytes to positive above 127
+  }
+
   /** Implemented as
    *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
    *  <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java Sun Aug 11 12:19:13 2013
@@ -57,60 +57,25 @@ public class MultiSimilarity extends Sim
   }
 
   @Override
-  public ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
-    ExactSimScorer subScorers[] = new ExactSimScorer[sims.length];
+  public SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+    SimScorer subScorers[] = new SimScorer[sims.length];
     for (int i = 0; i < subScorers.length; i++) {
-      subScorers[i] = sims[i].exactSimScorer(((MultiStats)stats).subStats[i], context);
-    }
-    return new MultiExactDocScorer(subScorers);
-  }
-
-  @Override
-  public SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
-    SloppySimScorer subScorers[] = new SloppySimScorer[sims.length];
-    for (int i = 0; i < subScorers.length; i++) {
-      subScorers[i] = sims[i].sloppySimScorer(((MultiStats)stats).subStats[i], context);
-    }
-    return new MultiSloppyDocScorer(subScorers);
-  }
-  
-  static class MultiExactDocScorer extends ExactSimScorer {
-    private final ExactSimScorer subScorers[];
-    
-    MultiExactDocScorer(ExactSimScorer subScorers[]) {
-      this.subScorers = subScorers;
-    }
-    
-    @Override
-    public float score(int doc, int freq) {
-      float sum = 0.0f;
-      for (ExactSimScorer subScorer : subScorers) {
-        sum += subScorer.score(doc, freq);
-      }
-      return sum;
-    }
-
-    @Override
-    public Explanation explain(int doc, Explanation freq) {
-      Explanation expl = new Explanation(score(doc, (int)freq.getValue()), "sum of:");
-      for (ExactSimScorer subScorer : subScorers) {
-        expl.addDetail(subScorer.explain(doc, freq));
-      }
-      return expl;
+      subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context);
     }
+    return new MultiSimScorer(subScorers);
   }
   
-  static class MultiSloppyDocScorer extends SloppySimScorer {
-    private final SloppySimScorer subScorers[];
+  static class MultiSimScorer extends SimScorer {
+    private final SimScorer subScorers[];
     
-    MultiSloppyDocScorer(SloppySimScorer subScorers[]) {
+    MultiSimScorer(SimScorer subScorers[]) {
       this.subScorers = subScorers;
     }
     
     @Override
     public float score(int doc, float freq) {
       float sum = 0.0f;
-      for (SloppySimScorer subScorer : subScorers) {
+      for (SimScorer subScorer : subScorers) {
         sum += subScorer.score(doc, freq);
       }
       return sum;
@@ -119,7 +84,7 @@ public class MultiSimilarity extends Sim
     @Override
     public Explanation explain(int doc, Explanation freq) {
       Explanation expl = new Explanation(score(doc, freq.getValue()), "sum of:");
-      for (SloppySimScorer subScorer : subScorers) {
+      for (SimScorer subScorer : subScorers) {
         expl.addDetail(subScorer.explain(doc, freq));
       }
       return expl;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java Sun Aug 11 12:19:13 2013
@@ -54,15 +54,9 @@ public abstract class PerFieldSimilarity
   }
 
   @Override
-  public final ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
+  public final SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
     PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
-    return perFieldWeight.delegate.exactSimScorer(perFieldWeight.delegateWeight, context);
-  }
-
-  @Override
-  public final SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
-    PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
-    return perFieldWeight.delegate.sloppySimScorer(perFieldWeight.delegateWeight, context);
+    return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context);
   }
   
   /** 

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java Sun Aug 11 12:19:13 2013
@@ -88,10 +88,8 @@ import org.apache.lucene.util.SmallFloat
  *       is called for each query leaf node, {@link Similarity#queryNorm(float)} is called for the top-level
  *       query, and finally {@link Similarity.SimWeight#normalize(float, float)} passes down the normalization value
  *       and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
- *   <li>For each segment in the index, the Query creates a {@link #exactSimScorer(SimWeight, AtomicReaderContext)}
- *       (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a 
- *       {@link #sloppySimScorer(SimWeight, AtomicReaderContext)} (for queries with sloppy frequencies such as
- *       SpanQuerys and sloppy PhraseQueries). The score() method is called for each matching document.
+ *   <li>For each segment in the index, the Query creates a {@link #simScorer(SimWeight, AtomicReaderContext)}
+ *       The score() method is called for each matching document.
  * </ol>
  * <p>
  * <a name="explaintime"/>
@@ -166,76 +164,31 @@ public abstract class Similarity {
    * @return SimWeight object with the information this Similarity needs to score a query.
    */
   public abstract SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats);
-  
-  /**
-   * Creates a new {@link Similarity.ExactSimScorer} to score matching documents from a segment of the inverted index.
-   * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
-   * @param context segment of the inverted index to be scored.
-   * @return ExactSimScorer for scoring documents across <code>context</code>
-   * @throws IOException if there is a low-level I/O error
-   */
-  public abstract ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
-  
+
   /**
-   * Creates a new {@link Similarity.SloppySimScorer} to score matching documents from a segment of the inverted index.
+   * Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index.
    * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
    * @param context segment of the inverted index to be scored.
    * @return SloppySimScorer for scoring documents across <code>context</code>
    * @throws IOException if there is a low-level I/O error
    */
-  public abstract SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
-  
-  /**
-   * API for scoring exact queries such as {@link TermQuery} and 
-   * exact {@link PhraseQuery}.
-   * <p>
-   * Frequencies are integers (the term or phrase frequency within the document)
-   */
-  public static abstract class ExactSimScorer {
-    
-    /**
-     * Sole constructor. (For invocation by subclass 
-     * constructors, typically implicit.)
-     */
-    public ExactSimScorer() {}
-
-    /**
-     * Score a single document
-     * @param doc document id
-     * @param freq term frequency
-     * @return document's score
-     */
-    public abstract float score(int doc, int freq);
-    
-    /**
-     * Explain the score for a single document
-     * @param doc document id
-     * @param freq Explanation of how the term frequency was computed
-     * @return document's score
-     */
-    public Explanation explain(int doc, Explanation freq) {
-      Explanation result = new Explanation(score(doc, (int)freq.getValue()), 
-          "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
-      result.addDetail(freq);
-      return result;
-    }
-  }
+  public abstract SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
   
   /**
-   * API for scoring "sloppy" queries such as {@link SpanQuery} and 
-   * sloppy {@link PhraseQuery}.
+   * API for scoring "sloppy" queries such as {@link TermQuery},
+   * {@link SpanQuery}, and {@link PhraseQuery}.
    * <p>
    * Frequencies are floating-point values: an approximate 
    * within-document frequency adjusted for "sloppiness" by 
-   * {@link SloppySimScorer#computeSlopFactor(int)}.
+   * {@link SimScorer#computeSlopFactor(int)}.
    */
-  public static abstract class SloppySimScorer {
+  public static abstract class SimScorer {
     
     /**
      * Sole constructor. (For invocation by subclass 
      * constructors, typically implicit.)
      */
-    public SloppySimScorer() {}
+    public SimScorer() {}
 
     /**
      * Score a single document

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java Sun Aug 11 12:19:13 2013
@@ -190,38 +190,20 @@ public abstract class SimilarityBase ext
   }
   
   @Override
-  public ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+  public SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
     if (stats instanceof MultiSimilarity.MultiStats) {
       // a multi term query (e.g. phrase). return the summation, 
       // scoring almost as if it were boolean query
       SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
-      ExactSimScorer subScorers[] = new ExactSimScorer[subStats.length];
+      SimScorer subScorers[] = new SimScorer[subStats.length];
       for (int i = 0; i < subScorers.length; i++) {
         BasicStats basicstats = (BasicStats) subStats[i];
-        subScorers[i] = new BasicExactDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
+        subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
       }
-      return new MultiSimilarity.MultiExactDocScorer(subScorers);
+      return new MultiSimilarity.MultiSimScorer(subScorers);
     } else {
       BasicStats basicstats = (BasicStats) stats;
-      return new BasicExactDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
-    }
-  }
-  
-  @Override
-  public SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
-    if (stats instanceof MultiSimilarity.MultiStats) {
-      // a multi term query (e.g. phrase). return the summation, 
-      // scoring almost as if it were boolean query
-      SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
-      SloppySimScorer subScorers[] = new SloppySimScorer[subStats.length];
-      for (int i = 0; i < subScorers.length; i++) {
-        BasicStats basicstats = (BasicStats) subStats[i];
-        subScorers[i] = new BasicSloppyDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
-      }
-      return new MultiSimilarity.MultiSloppyDocScorer(subScorers);
-    } else {
-      BasicStats basicstats = (BasicStats) stats;
-      return new BasicSloppyDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
+      return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
     }
   }
   
@@ -277,46 +259,17 @@ public abstract class SimilarityBase ext
   
   // --------------------------------- Classes ---------------------------------
   
-  /** Delegates the {@link #score(int, int)} and
-   * {@link #explain(int, Explanation)} methods to
-   * {@link SimilarityBase#score(BasicStats, float, float)} and
-   * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
-   * respectively.
-   */
-  private class BasicExactDocScorer extends ExactSimScorer {
-    private final BasicStats stats;
-    private final NumericDocValues norms;
-    
-    BasicExactDocScorer(BasicStats stats, NumericDocValues norms) throws IOException {
-      this.stats = stats;
-      this.norms = norms;
-    }
-    
-    @Override
-    public float score(int doc, int freq) {
-      // We have to supply something in case norms are omitted
-      return SimilarityBase.this.score(stats, freq,
-          norms == null ? 1F : decodeNormValue((byte)norms.get(doc)));
-    }
-    
-    @Override
-    public Explanation explain(int doc, Explanation freq) {
-      return SimilarityBase.this.explain(stats, doc, freq,
-          norms == null ? 1F : decodeNormValue((byte)norms.get(doc)));
-    }
-  }
-  
   /** Delegates the {@link #score(int, float)} and
    * {@link #explain(int, Explanation)} methods to
    * {@link SimilarityBase#score(BasicStats, float, float)} and
    * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
    * respectively.
    */
-  private class BasicSloppyDocScorer extends SloppySimScorer {
+  private class BasicSimScorer extends SimScorer {
     private final BasicStats stats;
     private final NumericDocValues norms;
     
-    BasicSloppyDocScorer(BasicStats stats, NumericDocValues norms) throws IOException {
+    BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
       this.stats = stats;
       this.norms = norms;
     }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Sun Aug 11 12:19:13 2013
@@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSea
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.SmallFloat;
 
 
 /**
@@ -496,27 +495,8 @@ import org.apache.lucene.util.SmallFloat
  *          <td></td>
  *        </tr>
  *      </table>
- *      <br>&nbsp;<br>
- *      However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
- *      before being stored.
- *      At search time, the norm byte value is read from the index
- *      {@link org.apache.lucene.store.Directory directory} and
- *      {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
- *      This encoding/decoding, while reducing index size, comes with the price of
- *      precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
- *      For instance, <i>decode(encode(0.89)) = 0.75</i>.
- *      <br>&nbsp;<br>
- *      Compression of norm values to a single byte saves memory at search time, 
- *      because once a field is referenced at search time, its norms - for 
- *      all documents - are maintained in memory.
- *      <br>&nbsp;<br>
- *      The rationale supporting such lossy compression of norm values is that
- *      given the difficulty (and inaccuracy) of users to express their true information
- *      need by a query, only big differences matter.
- *      <br>&nbsp;<br>
- *      Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
- *      using a different {@link Similarity} for search.
- *      <br>&nbsp;<br>
+ *      Note that search time is too late to modify this <i>norm</i> part of scoring, 
+ *      e.g. by using a different {@link Similarity} for search.
  *    </li>
  * </ol>
  *
@@ -572,25 +552,6 @@ public abstract class TFIDFSimilarity ex
    * when <code>freq</code> is large, and smaller values when <code>freq</code>
    * is small.
    *
-   * <p>The default implementation calls {@link #tf(float)}.
-   *
-   * @param freq the frequency of a term within a document
-   * @return a score factor based on a term's within-document frequency
-   */
-  public float tf(int freq) {
-    return tf((float)freq);
-  }
-
-  /** Computes a score factor based on a term or phrase's frequency in a
-   * document.  This value is multiplied by the {@link #idf(long, long)}
-   * factor for each term in the query and these products are then summed to
-   * form the initial score for a document.
-   *
-   * <p>Terms and phrases repeated in a document indicate the topic of the
-   * document, so implementations of this method usually return larger values
-   * when <code>freq</code> is large, and smaller values when <code>freq</code>
-   * is small.
-   *
    * @param freq the frequency of a term within a document
    * @return a score factor based on a term's within-document frequency
    */
@@ -655,7 +616,7 @@ public abstract class TFIDFSimilarity ex
 
   /** Computes a score factor based on a term's document frequency (the number
    * of documents which contain the term).  This value is multiplied by the
-   * {@link #tf(int)} factor for each term in the query and these products are
+   * {@link #tf(float)} factor for each term in the query and these products are
    * then summed to form the initial score for a document.
    *
    * <p>Terms that occur in fewer documents are better indicators of topic, so
@@ -685,38 +646,15 @@ public abstract class TFIDFSimilarity ex
     return encodeNormValue(normValue);
   }
   
-  /** Cache of decoded bytes. */
-  private static final float[] NORM_TABLE = new float[256];
-
-  static {
-    for (int i = 0; i < 256; i++) {
-      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
-    }
-  }
-
-  /** Decodes a normalization factor stored in an index.
+  /**
+   * Decodes a normalization factor stored in an index.
+   * 
    * @see #encodeNormValue(float)
    */
-  public float decodeNormValue(byte b) {
-    return NORM_TABLE[b & 0xFF];  // & 0xFF maps negative bytes to positive above 127
-  }
+  public abstract float decodeNormValue(long norm);
 
-  /** Encodes a normalization factor for storage in an index.
-  *
-  * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
-  * the zero-exponent point at 15, thus
-  * representing values from around 7x10^9 to 2x10^-9 with about one
-  * significant decimal digit of accuracy.  Zero is also represented.
-  * Negative numbers are rounded up to zero.  Values too large to represent
-  * are rounded down to the largest representable value.  Positive values too
-  * small to represent are rounded up to the smallest positive representable
-  * value.
-  * @see org.apache.lucene.document.Field#setBoost(float)
-  * @see org.apache.lucene.util.SmallFloat
-  */
-  public byte encodeNormValue(float f) {
-    return SmallFloat.floatToByte315(f);
-  }
+  /** Encodes a normalization factor for storage in an index. */
+  public abstract long encodeNormValue(float f);
  
   /** Computes the amount of a sloppy phrase match, based on an edit distance.
    * This value is summed for each sloppy phrase match in a document to form
@@ -755,49 +693,17 @@ public abstract class TFIDFSimilarity ex
   }
 
   @Override
-  public final ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+  public final SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
     IDFStats idfstats = (IDFStats) stats;
-    return new ExactTFIDFDocScorer(idfstats, context.reader().getNormValues(idfstats.field));
-  }
-
-  @Override
-  public final SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
-    IDFStats idfstats = (IDFStats) stats;
-    return new SloppyTFIDFDocScorer(idfstats, context.reader().getNormValues(idfstats.field));
-  }
-  
-  // TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot.
-
-  private final class ExactTFIDFDocScorer extends ExactSimScorer {
-    private final IDFStats stats;
-    private final float weightValue;
-    private final NumericDocValues norms;
-    
-    ExactTFIDFDocScorer(IDFStats stats, NumericDocValues norms) throws IOException {
-      this.stats = stats;
-      this.weightValue = stats.value;
-      this.norms = norms; 
-    }
-    
-    @Override
-    public float score(int doc, int freq) {
-      final float raw = tf(freq)*weightValue;  // compute tf(f)*weight
-
-      return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
-    }
-
-    @Override
-    public Explanation explain(int doc, Explanation freq) {
-      return explainScore(doc, freq, stats, norms);
-    }
+    return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
   }
   
-  private final class SloppyTFIDFDocScorer extends SloppySimScorer {
+  private final class TFIDFSimScorer extends SimScorer {
     private final IDFStats stats;
     private final float weightValue;
     private final NumericDocValues norms;
     
-    SloppyTFIDFDocScorer(IDFStats stats, NumericDocValues norms) throws IOException {
+    TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
       this.stats = stats;
       this.weightValue = stats.value;
       this.norms = norms;
@@ -807,7 +713,7 @@ public abstract class TFIDFSimilarity ex
     public float score(int doc, float freq) {
       final float raw = tf(freq) * weightValue; // compute tf(f)*weight
       
-      return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc));  // normalize for field
+      return norms == null ? raw : raw * decodeNormValue(norms.get(doc));  // normalize for field
     }
     
     @Override
@@ -894,8 +800,7 @@ public abstract class TFIDFSimilarity ex
     fieldExpl.addDetail(stats.idf);
 
     Explanation fieldNormExpl = new Explanation();
-    float fieldNorm =
-      norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f;
+    float fieldNorm = norms != null ? decodeNormValue(norms.get(doc)) : 1.0f;
     fieldNormExpl.setValue(fieldNorm);
     fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
     fieldExpl.addDetail(fieldNormExpl);

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java Sun Aug 11 12:19:13 2013
@@ -22,6 +22,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.InPlaceMergeSorter;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -72,13 +73,19 @@ public class NearSpansOrdered extends Sp
   private List<byte[]> matchPayload;
 
   private final Spans[] subSpansByDoc;
-  private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() {
+  // Even though the array is probably almost sorted, InPlaceMergeSorter will likely
+  // perform better since it has a lower overhead than TimSorter for small arrays
+  private final InPlaceMergeSorter sorter = new InPlaceMergeSorter() {
     @Override
-    public int compare(Spans o1, Spans o2) {
-      return o1.doc() - o2.doc();
+    protected void swap(int i, int j) {
+      ArrayUtil.swap(subSpansByDoc, i, j);
+    }
+    @Override
+    protected int compare(int i, int j) {
+      return subSpansByDoc[i].doc() - subSpansByDoc[j].doc();
     }
   };
-  
+
   private SpanNearQuery query;
   private boolean collectPayloads = true;
   
@@ -204,7 +211,7 @@ public class NearSpansOrdered extends Sp
 
   /** Advance the subSpans to the same document */
   private boolean toSameDoc() throws IOException {
-    ArrayUtil.timSort(subSpansByDoc, spanDocComparator);
+    sorter.sort(0, subSpansByDoc.length);
     int firstIndex = 0;
     int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
     while (subSpansByDoc[firstIndex].doc() != maxDoc) {

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java Sun Aug 11 12:19:13 2013
@@ -31,16 +31,36 @@ import java.util.Collection;
 import java.util.Map;
 import java.util.Set;
 
-/** Removes matches which overlap with another SpanQuery. */
+/** Removes matches which overlap with another SpanQuery or 
+ * within a x tokens before or y tokens after another SpanQuery. */
 public class SpanNotQuery extends SpanQuery implements Cloneable {
   private SpanQuery include;
   private SpanQuery exclude;
+  private final int pre;
+  private final int post;
 
   /** Construct a SpanNotQuery matching spans from <code>include</code> which
    * have no overlap with spans from <code>exclude</code>.*/
   public SpanNotQuery(SpanQuery include, SpanQuery exclude) {
+     this(include, exclude, 0, 0);
+  }
+
+  
+  /** Construct a SpanNotQuery matching spans from <code>include</code> which
+   * have no overlap with spans from <code>exclude</code> within 
+   * <code>dist</code> tokens of <code>include</code>. */
+  public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) {
+     this(include, exclude, dist, dist);
+  }
+  
+  /** Construct a SpanNotQuery matching spans from <code>include</code> which
+   * have no overlap with spans from <code>exclude</code> within 
+   * <code>pre</code> tokens before or <code>post</code> tokens of <code>include</code>. */
+  public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) {
     this.include = include;
     this.exclude = exclude;
+    this.pre = (pre >=0) ? pre : 0;
+    this.post = (post >= 0) ? post : 0;
 
     if (!include.getField().equals(exclude.getField()))
       throw new IllegalArgumentException("Clauses must have same field.");
@@ -65,6 +85,10 @@ public class SpanNotQuery extends SpanQu
     buffer.append(include.toString(field));
     buffer.append(", ");
     buffer.append(exclude.toString(field));
+    buffer.append(", ");
+    buffer.append(Integer.toString(pre));
+    buffer.append(", ");
+    buffer.append(Integer.toString(post));
     buffer.append(")");
     buffer.append(ToStringUtils.boost(getBoost()));
     return buffer.toString();
@@ -72,7 +96,8 @@ public class SpanNotQuery extends SpanQu
 
   @Override
   public SpanNotQuery clone() {
-    SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),(SpanQuery) exclude.clone());
+    SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),
+          (SpanQuery) exclude.clone(), pre, post);
     spanNotQuery.setBoost(getBoost());
     return  spanNotQuery;
   }
@@ -98,13 +123,13 @@ public class SpanNotQuery extends SpanQu
 
             while (moreExclude                    // while exclude is before
                    && includeSpans.doc() == excludeSpans.doc()
-                   && excludeSpans.end() <= includeSpans.start()) {
+                   && excludeSpans.end() <= includeSpans.start() - pre) {
               moreExclude = excludeSpans.next();  // increment exclude
             }
 
             if (!moreExclude                      // if no intersection
                 || includeSpans.doc() != excludeSpans.doc()
-                || includeSpans.end() <= excludeSpans.start())
+                || includeSpans.end()+post <= excludeSpans.start())
               break;                              // we found a match
 
             moreInclude = includeSpans.next();    // intersected: keep scanning
@@ -126,13 +151,13 @@ public class SpanNotQuery extends SpanQu
 
           while (moreExclude                      // while exclude is before
                  && includeSpans.doc() == excludeSpans.doc()
-                 && excludeSpans.end() <= includeSpans.start()) {
+                 && excludeSpans.end() <= includeSpans.start()-pre) {
             moreExclude = excludeSpans.next();    // increment exclude
           }
 
           if (!moreExclude                      // if no intersection
                 || includeSpans.doc() != excludeSpans.doc()
-                || includeSpans.end() <= excludeSpans.start())
+                || includeSpans.end()+post <= excludeSpans.start())
             return true;                          // we found a match
 
           return next();                          // scan to next match
@@ -199,23 +224,28 @@ public class SpanNotQuery extends SpanQu
     /** Returns true iff <code>o</code> is equal to this. */
   @Override
   public boolean equals(Object o) {
-    if (this == o) return true;
-    if (!(o instanceof SpanNotQuery)) return false;
+    if (!super.equals(o))
+      return false;
 
     SpanNotQuery other = (SpanNotQuery)o;
     return this.include.equals(other.include)
             && this.exclude.equals(other.exclude)
-            && this.getBoost() == other.getBoost();
+            && this.pre == other.pre 
+            && this.post == other.post;
   }
 
   @Override
   public int hashCode() {
-    int h = include.hashCode();
-    h = (h<<1) | (h >>> 31);  // rotate left
+    int h = super.hashCode();
+    h = Integer.rotateLeft(h, 1);
+    h ^= include.hashCode();
+    h = Integer.rotateLeft(h, 1);
     h ^= exclude.hashCode();
-    h = (h<<1) | (h >>> 31);  // rotate left
-    h ^= Float.floatToRawIntBits(getBoost());
+    h = Integer.rotateLeft(h, 1);
+    h ^= pre;
+    h = Integer.rotateLeft(h, 1);
+    h ^= post;
     return h;
   }
 
-}
+}
\ No newline at end of file

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java Sun Aug 11 12:19:13 2013
@@ -34,9 +34,9 @@ public class SpanScorer extends Scorer {
   protected int doc;
   protected float freq;
   protected int numMatches;
-  protected final Similarity.SloppySimScorer docScorer;
+  protected final Similarity.SimScorer docScorer;
   
-  protected SpanScorer(Spans spans, Weight weight, Similarity.SloppySimScorer docScorer)
+  protected SpanScorer(Spans spans, Weight weight, Similarity.SimScorer docScorer)
   throws IOException {
     super(weight);
     this.docScorer = docScorer;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java Sun Aug 11 12:19:13 2013
@@ -98,7 +98,7 @@ public class SpanTermQuery extends SpanQ
         final Terms terms = fields.terms(term.field());
         if (terms != null) {
           final TermsEnum termsEnum = terms.iterator(null);
-          if (termsEnum.seekExact(term.bytes(), true)) { 
+          if (termsEnum.seekExact(term.bytes())) { 
             state = termsEnum.termState();
           } else {
             state = null;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java Sun Aug 11 12:19:13 2013
@@ -23,7 +23,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SloppySimScorer;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.Bits;
 
 import java.io.IOException;
@@ -52,7 +52,7 @@ public class SpanWeight extends Weight {
     final TermStatistics termStats[] = new TermStatistics[terms.size()];
     int i = 0;
     for (Term term : terms) {
-      TermContext state = TermContext.build(context, term, true);
+      TermContext state = TermContext.build(context, term);
       termStats[i] = searcher.termStatistics(term, state);
       termContexts.put(term, state);
       i++;
@@ -86,7 +86,7 @@ public class SpanWeight extends Weight {
     if (stats == null) {
       return null;
     } else {
-      return new SpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity.sloppySimScorer(stats, context));
+      return new SpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity.simScorer(stats, context));
     }
   }
 
@@ -97,7 +97,7 @@ public class SpanWeight extends Weight {
       int newDoc = scorer.advance(doc);
       if (newDoc == doc) {
         float freq = scorer.sloppyFreq();
-        SloppySimScorer docScorer = similarity.sloppySimScorer(stats, context);
+        SimScorer docScorer = similarity.simScorer(stats, context);
         ComplexExplanation result = new ComplexExplanation();
         result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
         Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html Sun Aug 11 12:19:13 2013
@@ -38,8 +38,8 @@ and inter-phrase proximity (when constru
 number of other {@link org.apache.lucene.search.spans.SpanQuery}s.</li>
 
 <li>A {@link org.apache.lucene.search.spans.SpanNotQuery SpanNotQuery} removes spans
-matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap
-another.  This can be used, e.g., to implement within-paragraph
+matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap (or comes
+near) another.  This can be used, e.g., to implement within-paragraph
 search.</li>
 
 <li>A {@link org.apache.lucene.search.spans.SpanFirstQuery SpanFirstQuery} matches spans

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java Sun Aug 11 12:19:13 2013
@@ -21,6 +21,7 @@ import java.io.EOFException;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Closeable;
+import java.nio.file.NoSuchFileException;
 import java.util.Collection; // for javadocs
 
 import org.apache.lucene.util.IOUtils;
@@ -70,12 +71,12 @@ public abstract class Directory implemen
    * Returns the length of a file in the directory. This method follows the
    * following contract:
    * <ul>
-   * <li>Throws {@link FileNotFoundException} if the file does not exist
+   * <li>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+   * if the file does not exist.
    * <li>Returns a value &ge;0 if the file exists, which specifies its length.
    * </ul>
    * 
    * @param name the name of the file for which to return the length.
-   * @throws FileNotFoundException if the file does not exist.
    * @throws IOException if there was an IO error while retrieving the file's
    *         length.
    */
@@ -106,7 +107,9 @@ public abstract class Directory implemen
    * the only Directory implementations that respect this
    * parameter are {@link FSDirectory} and {@link
    * CompoundFileDirectory}.
-  */
+   * <p>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+   * if the file does not exist.
+   */
   public abstract IndexInput openInput(String name, IOContext context) throws IOException; 
   
   /** Construct a {@link Lock}.
@@ -223,6 +226,8 @@ public abstract class Directory implemen
    * efficiently open one or more sliced {@link IndexInput} instances from a
    * single file handle. The underlying file handle is kept open until the
    * {@link IndexInputSlicer} is closed.
+   * <p>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+   * if the file does not exist.
    *
    * @throws IOException
    *           if an {@link IOException} occurs

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java Sun Aug 11 12:19:13 2013
@@ -200,6 +200,9 @@ public class NIOFSDirectory extends FSDi
           }
           bb.limit(limit);
           int i = channel.read(bb, pos);
+          if (i < 0){//be defensive here, even though we checked before hand, something could have changed
+            throw new EOFException("read past EOF: " + this + " off: " + offset + " len: " + len + " pos: " + pos + " limit: " + limit + " end: " + end);
+          }
           pos += i;
           readOffset += i;
           readLength -= i;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java Sun Aug 11 12:19:13 2013
@@ -155,6 +155,9 @@ public class SimpleFSDirectory extends F
               readLength = chunkSize;
             }
             final int i = file.read(b, offset + total, readLength);
+            if (i < 0){//be defensive here, even though we checked before hand, something could have changed
+             throw new EOFException("read past EOF: " + this + " off: " + offset + " len: " + len + " total: " + total + " readLen: " + readLength + " end: " + end);
+            }
             total += i;
           } while (total < len);
         } catch (OutOfMemoryError e) {

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java Sun Aug 11 12:19:13 2013
@@ -22,8 +22,93 @@ package org.apache.lucene.util; // from 
  */
 public final class BitUtil {
 
+  private static final byte[] BYTE_COUNTS = {  // table of bits/byte
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+  };
+
+  // The General Idea: instead of having an array per byte that has
+  // the offsets of the next set bit, that array could be
+  // packed inside a 32 bit integer (8 4 bit numbers).  That
+  // should be faster than accessing an array for each index, and
+  // the total array size is kept smaller (256*sizeof(int))=1K
+  /***** the python code that generated bitlist
+  def bits2int(val):
+  arr=0
+  for shift in range(8,0,-1):
+    if val & 0x80:
+      arr = (arr << 4) | shift
+    val = val << 1
+  return arr
+
+  def int_table():
+    tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
+    return ','.join(tbl)
+  ******/
+  private static final int[] BIT_LISTS = {
+    0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43, 
+    0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321, 
+    0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62, 
+    0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643, 
+    0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532, 
+    0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321, 
+    0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
+    0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753, 
+    0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431, 
+    0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632, 
+    0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321, 
+    0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654, 
+    0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8, 
+    0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421, 
+    0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531, 
+    0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432, 
+    0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864, 
+    0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651, 
+    0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541, 
+    0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871, 
+    0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742, 
+    0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521, 
+    0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421, 
+    0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621, 
+    0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421, 
+    0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521, 
+    0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542, 
+    0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
+  };
+
   private BitUtil() {} // no instance
 
+  /** Return the number of bits sets in b. */
+  public static int bitCount(byte b) {
+    return BYTE_COUNTS[b & 0xFF];
+  }
+
+  /** Return the list of bits which are set in b encoded as followed:
+   * <code>(i >>> (4 * n)) & 0x0F</code> is the offset of the n-th set bit of
+   * the given byte plus one, or 0 if there are n or less bits set in the given
+   * byte. For example <code>bitList(12)</code> returns 0x43:<ul>
+   * <li><code>0x43 & 0x0F</code> is 3, meaning the the first bit set is at offset 3-1 = 2,</li>
+   * <li><code>(0x43 >>> 4) & 0x0F</code> is 4, meaning there is a second bit set at offset 4-1=3,</li>
+   * <li><code>(0x43 >>> 8) & 0x0F</code> is 0, meaning there is no more bit set in this byte.</li>
+   * </ul>*/
+  public static int bitList(byte b) {
+    return BIT_LISTS[b & 0xFF];
+  }
+
   // The pop methods used to rely on bit-manipulation tricks for speed but it
   // turns out that it is faster to use the Long.bitCount method (which is an
   // intrinsic since Java 6u18) in a naive loop, see LUCENE-2221

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java Sun Aug 11 12:19:13 2013
@@ -46,6 +46,8 @@ public final class Constants {
   public static final boolean SUN_OS = OS_NAME.startsWith("SunOS");
   /** True iff running on Mac OS X */
   public static final boolean MAC_OS_X = OS_NAME.startsWith("Mac OS X");
+  /** True iff running on FreeBSD */
+  public static final boolean FREE_BSD = OS_NAME.startsWith("FreeBSD");
 
   public static final String OS_ARCH = System.getProperty("os.arch");
   public static final String OS_VERSION = System.getProperty("os.version");

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java Sun Aug 11 12:19:13 2013
@@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSet
  */
 public class OpenBitSetIterator extends DocIdSetIterator {
 
-  // The General Idea: instead of having an array per byte that has
-  // the offsets of the next set bit, that array could be
-  // packed inside a 32 bit integer (8 4 bit numbers).  That
-  // should be faster than accessing an array for each index, and
-  // the total array size is kept smaller (256*sizeof(int))=1K
-  protected final static int[] bitlist={
-    0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43, 
-    0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321, 
-    0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62, 
-    0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643, 
-    0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532, 
-    0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321, 
-    0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
-    0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753, 
-    0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431, 
-    0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632, 
-    0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321, 
-    0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654, 
-    0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8, 
-    0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421, 
-    0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531, 
-    0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432, 
-    0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864, 
-    0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651, 
-    0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541, 
-    0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871, 
-    0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742, 
-    0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521, 
-    0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421, 
-    0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621, 
-    0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421, 
-    0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521, 
-    0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542, 
-    0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
-  };
-  /***** the python code that generated bitlist
-  def bits2int(val):
-  arr=0
-  for shift in range(8,0,-1):
-    if val & 0x80:
-      arr = (arr << 4) | shift
-    val = val << 1
-  return arr
-
-  def int_table():
-    tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
-    return ','.join(tbl)
-  ******/
-
   // hmmm, what about an iterator that finds zeros though,
   // or a reverse iterator... should they be separate classes
   // for efficiency, or have a common root interface?  (or
@@ -101,7 +52,7 @@ public class OpenBitSetIterator extends 
     if ((int)word ==0) {wordShift +=32; word = word >>>32; }
     if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
     if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
-    indexArray = bitlist[(int)word & 0xff];
+    indexArray = BitUtil.bitList((byte) word);
   }
 
   /***** alternate shift implementations

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java Sun Aug 11 12:19:13 2013
@@ -18,6 +18,7 @@ package org.apache.lucene.util;
  */
 
 import java.lang.management.ManagementFactory;
+import java.lang.management.PlatformManagedObject;
 import java.lang.reflect.*;
 import java.text.DecimalFormat;
 import java.text.DecimalFormatSymbols;
@@ -212,18 +213,17 @@ public final class RamUsageEstimator {
     // regardless of the architecture).
     int objectAlignment = 8;
     try {
-      final Class<?> beanClazz = Class.forName("com.sun.management.HotSpotDiagnosticMXBean");
-      final Object hotSpotBean = ManagementFactory.newPlatformMXBeanProxy(
-        ManagementFactory.getPlatformMBeanServer(),
-        "com.sun.management:type=HotSpotDiagnostic",
-        beanClazz
-      );
-      final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class);
-      final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes");
-      objectAlignment = Integer.parseInt(
-          vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()
-      );
-      supportedFeatures.add(JvmFeature.OBJECT_ALIGNMENT);
+      final Class<? extends PlatformManagedObject> beanClazz =
+        Class.forName("com.sun.management.HotSpotDiagnosticMXBean").asSubclass(PlatformManagedObject.class);
+      final Object hotSpotBean = ManagementFactory.getPlatformMXBean(beanClazz);
+      if (hotSpotBean != null) {
+        final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class);
+        final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes");
+        objectAlignment = Integer.parseInt(
+            vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()
+        );
+        supportedFeatures.add(JvmFeature.OBJECT_ALIGNMENT);
+      }
     } catch (Exception e) {
       // Ignore.
     }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java Sun Aug 11 12:19:13 2013
@@ -17,9 +17,6 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
-// TODO: probably move this to core at some point (eg,
-// cutover kuromoji, synfilter, LookaheadTokenFilter)
-
 /** Acts like forever growing T[], but internally uses a
  *  circular buffer to reuse instances of T.
  * 

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java Sun Aug 11 12:19:13 2013
@@ -28,7 +28,7 @@ import java.util.concurrent.atomic.Atomi
  *
  * @lucene.experimental
  */
-public final class SetOnce<T> {
+public final class SetOnce<T> implements Cloneable {
 
   /** Thrown when {@link SetOnce#set(Object)} is called more than once. */
   public static final class AlreadySetException extends IllegalStateException {
@@ -74,4 +74,10 @@ public final class SetOnce<T> {
   public final T get() {
     return obj;
   }
+  
+  @Override
+  public SetOnce<T> clone() {
+    return obj == null ? new SetOnce<T>() : new SetOnce<T>(obj);
+  }
+  
 }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java Sun Aug 11 12:19:13 2013
@@ -43,4 +43,14 @@ public final class ToStringUtils {
     }
   }
 
+  private final static char [] HEX = "0123456789abcdef".toCharArray();
+
+  public static String longHex(long x) {
+    char [] asHex = new char [16];
+    for (int i = 16; --i >= 0; x >>>= 4) {
+      asHex[i] = HEX[(int) x & 0x0F];
+    }
+    return "0x" + new String(asHex);
+  }
+
 }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java Sun Aug 11 12:19:13 2013
@@ -68,6 +68,13 @@ public enum Version { 
   @Deprecated
   LUCENE_44,
 
+  /**
+   * Match settings and bugs in Lucene's 4.5 release.
+   * @deprecated (5.0) Use latest
+   */
+  @Deprecated
+  LUCENE_45,
+
   /** Match settings and bugs in Lucene's 5.0 release.
    *  <p>
    *  Use this to get the latest &amp; greatest settings, bug

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java Sun Aug 11 12:19:13 2013
@@ -219,7 +219,7 @@ final public class SpecialOperations {
   /**
    * Returns the set of accepted strings, assuming that at most
    * <code>limit</code> strings are accepted. If more than <code>limit</code> 
-   * strings are accepted, null is returned. If <code>limit</code>&lt;0, then 
+   * strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then 
    * the limit is infinite.
    */
   public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
@@ -227,11 +227,9 @@ final public class SpecialOperations {
     if (a.isSingleton()) {
       if (limit > 0) {
         strings.add(Util.toUTF32(a.singleton, new IntsRef()));
-      } else {
-        return null;
       }
     } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
-      return null;
+      return strings;
     }
     return strings;
   }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java Sun Aug 11 12:19:13 2013
@@ -117,9 +117,9 @@ public class Builder<T> {
    * 
    * @param doShareSuffix 
    *    If <code>true</code>, the shared suffixes will be compacted into unique paths.
-   *    This requires an additional hash map for lookups in memory. Setting this parameter to
-   *    <code>false</code> creates a single path for all input sequences. This will result in a larger
-   *    graph, but may require less memory and will speed up construction.  
+   *    This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
+   *    <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
+   *    FST, but requires substantially less memory and CPU during building.  
    *
    * @param doShareNonSingletonNodes
    *    Only used if doShareSuffix is true.  Set this to

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sun Aug 11 12:19:13 2013
@@ -171,6 +171,8 @@ public final class FST<T> {
   private final boolean allowArrayArcs;
 
   private Arc<T> cachedRootArcs[];
+  private Arc<T> assertingCachedRootArcs[]; // only set wit assert
+
 
   /** Represents a single arc. */
   public final static class Arc<T> {
@@ -213,7 +215,7 @@ public final class FST<T> {
       }
       return this;
     }
-
+    
     boolean flag(int flag) {
       return FST.flag(flags, flag);
     }
@@ -420,11 +422,18 @@ public final class FST<T> {
       return node;
     }
   }
-
+  
   // Caches first 128 labels
   @SuppressWarnings({"rawtypes","unchecked"})
   private void cacheRootArcs() throws IOException {
     cachedRootArcs = (Arc<T>[]) new Arc[0x80];
+    readRootArcs(cachedRootArcs);
+    
+    assert setAssertingRootArcs(cachedRootArcs);
+    assert assertRootArcs();
+  }
+  
+  public void readRootArcs(Arc<T>[] arcs) throws IOException {
     final Arc<T> arc = new Arc<T>();
     getFirstArc(arc);
     final BytesReader in = getBytesReader();
@@ -433,7 +442,7 @@ public final class FST<T> {
       while(true) {
         assert arc.label != END_LABEL;
         if (arc.label < cachedRootArcs.length) {
-          cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
+          arcs[arc.label] = new Arc<T>().copyFrom(arc);
         } else {
           break;
         }
@@ -444,6 +453,38 @@ public final class FST<T> {
       }
     }
   }
+  
+  @SuppressWarnings({"rawtypes","unchecked"})
+  private boolean setAssertingRootArcs(Arc<T>[] arcs) throws IOException {
+    assertingCachedRootArcs = (Arc<T>[]) new Arc[arcs.length];
+    readRootArcs(assertingCachedRootArcs);
+    return true;
+  }
+  
+  private boolean assertRootArcs() {
+    assert cachedRootArcs != null;
+    assert assertingCachedRootArcs != null;
+    for (int i = 0; i < cachedRootArcs.length; i++) {
+      final Arc<T> root = cachedRootArcs[i];
+      final Arc<T> asserting = assertingCachedRootArcs[i];
+      if (root != null) { 
+        assert root.arcIdx == asserting.arcIdx;
+        assert root.bytesPerArc == asserting.bytesPerArc;
+        assert root.flags == asserting.flags;
+        assert root.label == asserting.label;
+        assert root.nextArc == asserting.nextArc;
+        assert root.nextFinalOutput.equals(asserting.nextFinalOutput);
+        assert root.node == asserting.node;
+        assert root.numArcs == asserting.numArcs;
+        assert root.output.equals(asserting.output);
+        assert root.posArcsStart == asserting.posArcsStart;
+        assert root.target == asserting.target;
+      } else {
+        assert root == null && asserting == null;
+      } 
+    }
+    return true;
+  }
 
   public T getEmptyOutput() {
     return emptyOutput;
@@ -1099,7 +1140,7 @@ public final class FST<T> {
   /** Finds an arc leaving the incoming arc, replacing the arc in place.
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
-    assert cachedRootArcs != null;
+    assert assertRootArcs();
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1123,7 +1164,7 @@ public final class FST<T> {
     if (follow.target == startNode && labelToMatch < cachedRootArcs.length) {
       final Arc<T> result = cachedRootArcs[labelToMatch];
       if (result == null) {
-        return result;
+        return null;
       } else {
         arc.copyFrom(result);
         return arc;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java Sun Aug 11 12:19:13 2013
@@ -19,21 +19,21 @@ package org.apache.lucene.util.fst;
 
 import java.io.IOException;
 
-import org.apache.lucene.util.packed.GrowableWriter;
 import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PagedGrowableWriter;
 
 // Used to dedup states (lookup already-frozen states)
 final class NodeHash<T> {
 
-  private GrowableWriter table;
-  private int count;
-  private int mask;
+  private PagedGrowableWriter table;
+  private long count;
+  private long mask;
   private final FST<T> fst;
   private final FST.Arc<T> scratchArc = new FST.Arc<T>();
   private final FST.BytesReader in;
 
   public NodeHash(FST<T> fst, FST.BytesReader in) {
-    table = new GrowableWriter(8, 16, PackedInts.COMPACT);
+    table = new PagedGrowableWriter(16, 1<<30, 8, PackedInts.COMPACT);
     mask = 15;
     this.fst = fst;
     this.in = in;
@@ -69,10 +69,10 @@ final class NodeHash<T> {
 
   // hash code for an unfrozen node.  This must be identical
   // to the un-frozen case (below)!!
-  private int hash(Builder.UnCompiledNode<T> node) {
+  private long hash(Builder.UnCompiledNode<T> node) {
     final int PRIME = 31;
     //System.out.println("hash unfrozen");
-    int h = 0;
+    long h = 0;
     // TODO: maybe if number of arcs is high we can safely subsample?
     for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
       final Builder.Arc<T> arc = node.arcs[arcIdx];
@@ -87,14 +87,14 @@ final class NodeHash<T> {
       }
     }
     //System.out.println("  ret " + (h&Integer.MAX_VALUE));
-    return h & Integer.MAX_VALUE;
+    return h & Long.MAX_VALUE;
   }
 
   // hash code for a frozen node
-  private int hash(long node) throws IOException {
+  private long hash(long node) throws IOException {
     final int PRIME = 31;
     //System.out.println("hash frozen node=" + node);
-    int h = 0;
+    long h = 0;
     fst.readFirstRealTargetArc(node, scratchArc, in);
     while(true) {
       //System.out.println("  label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
@@ -111,13 +111,13 @@ final class NodeHash<T> {
       fst.readNextRealArc(scratchArc, in);
     }
     //System.out.println("  ret " + (h&Integer.MAX_VALUE));
-    return h & Integer.MAX_VALUE;
+    return h & Long.MAX_VALUE;
   }
 
   public long add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
-    // System.out.println("hash: add count=" + count + " vs " + table.size());
-    final int h = hash(nodeIn);
-    int pos = h & mask;
+    //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
+    final long h = hash(nodeIn);
+    long pos = h & mask;
     int c = 0;
     while(true) {
       final long v = table.get(pos);
@@ -128,7 +128,8 @@ final class NodeHash<T> {
         assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
         count++;
         table.set(pos, node);
-        if (table.size() < 2*count) {
+        // Rehash at 2/3 occupancy:
+        if (count > 2*table.size()/3) {
           rehash();
         }
         return node;
@@ -144,7 +145,7 @@ final class NodeHash<T> {
 
   // called only by rehash
   private void addNew(long address) throws IOException {
-    int pos = hash(address) & mask;
+    long pos = hash(address) & mask;
     int c = 0;
     while(true) {
       if (table.get(pos) == 0) {
@@ -158,23 +159,15 @@ final class NodeHash<T> {
   }
 
   private void rehash() throws IOException {
-    final GrowableWriter oldTable = table;
+    final PagedGrowableWriter oldTable = table;
 
-    if (oldTable.size() >= Integer.MAX_VALUE/2) {
-      throw new IllegalStateException("FST too large (> 2.1 GB)");
-    }
-
-    table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT);
+    table = new PagedGrowableWriter(2*oldTable.size(), 1<<30, PackedInts.bitsRequired(count), PackedInts.COMPACT);
     mask = table.size()-1;
-    for(int idx=0;idx<oldTable.size();idx++) {
+    for(long idx=0;idx<oldTable.size();idx++) {
       final long address = oldTable.get(idx);
       if (address != 0) {
         addNew(address);
       }
     }
   }
-
-  public int count() {
-    return count;
-  }
 }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java Sun Aug 11 12:19:13 2013
@@ -33,26 +33,13 @@ public final class PositiveIntOutputs ex
   
   private final static Long NO_OUTPUT = new Long(0);
 
-  private final boolean doShare;
+  private final static PositiveIntOutputs singleton = new PositiveIntOutputs();
 
-  private final static PositiveIntOutputs singletonShare = new PositiveIntOutputs(true);
-  private final static PositiveIntOutputs singletonNoShare = new PositiveIntOutputs(false);
-
-  private PositiveIntOutputs(boolean doShare) {
-    this.doShare = doShare;
+  private PositiveIntOutputs() {
   }
 
-  /** Returns the instance of PositiveIntOutputs. */
   public static PositiveIntOutputs getSingleton() {
-    return getSingleton(true);
-  }
-
-  /** Expert: pass doShare=false to disable output sharing.
-   *  In some cases this may result in a smaller FST,
-   *  however it will also break methods like {@link
-   *  Util#getByOutput} and {@link Util#shortestPaths}. */
-  public static PositiveIntOutputs getSingleton(boolean doShare) {
-    return doShare ? singletonShare : singletonNoShare;
+    return singleton;
   }
 
   @Override
@@ -61,14 +48,10 @@ public final class PositiveIntOutputs ex
     assert valid(output2);
     if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
       return NO_OUTPUT;
-    } else if (doShare) {
+    } else {
       assert output1 > 0;
       assert output2 > 0;
       return Math.min(output1, output2);
-    } else if (output1.equals(output2)) {
-      return output1;
-    } else {
-      return NO_OUTPUT;
     }
   }
 
@@ -134,6 +117,6 @@ public final class PositiveIntOutputs ex
 
   @Override
   public String toString() {
-    return "PositiveIntOutputs(doShare=" + doShare + ")";
+    return "PositiveIntOutputs";
   }
 }

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Sun Aug 11 12:19:13 2013
@@ -93,9 +93,7 @@ public final class Util {
    *
    *  <p>NOTE: this only works with {@code FST<Long>}, only
    *  works when the outputs are ascending in order with
-   *  the inputs and only works when you shared
-   *  the outputs (pass doShare=true to {@link
-   *  PositiveIntOutputs#getSingleton}).
+   *  the inputs.
    *  For example, simple ordinals (0, 1,
    *  2, ...), or file offets (when appending to a file)
    *  fit this. */
@@ -517,11 +515,7 @@ public final class Util {
   }
 
   /** Starting from node, find the top N min cost 
-   *  completions to a final node.
-   *
-   *  <p>NOTE: you must share the outputs when you build the
-   *  FST (pass doShare=true to {@link
-   *  PositiveIntOutputs#getSingleton}). */
+   *  completions to a final node. */
   public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
                                                  boolean allowEmptyString) throws IOException {
 
@@ -814,7 +808,7 @@ public final class Util {
     final int charLimit = offset + length;
     while(charIdx < charLimit) {
       scratch.grow(intIdx+1);
-      final int utf32 = Character.codePointAt(s, charIdx);
+      final int utf32 = Character.codePointAt(s, charIdx, charLimit);
       scratch.ints[intIdx] = utf32;
       charIdx += Character.charCount(utf32);
       intIdx++;

Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html Sun Aug 11 12:19:13 2013
@@ -43,7 +43,7 @@ FST Construction example:
     String inputValues[] = {"cat", "dog", "dogs"};
     long outputValues[] = {5, 7, 12};
     
-    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
     Builder&lt;Long&gt; builder = new Builder&lt;Long&gt;(INPUT_TYPE.BYTE1, outputs);
     BytesRef scratchBytes = new BytesRef();
     IntsRef scratchInts = new IntsRef();
@@ -60,8 +60,7 @@ Retrieval by key:
 </pre>
 Retrieval by value:
 <pre class="prettyprint">
-    // Only works because outputs are also in sorted order, and
-    // we passed 'true' for sharing to PositiveIntOutputs.getSingleton
+    // Only works because outputs are also in sorted order
     IntsRef key = Util.getByOutput(fst, 12);
     System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs
 </pre>
@@ -77,7 +76,6 @@ Iterate over key-value pairs in sorted o
 </pre>
 N-shortest paths by weight:
 <pre class="prettyprint">
-    // Only works because we passed 'true' for sharing to PositiveIntOutputs.getSingleton
     Comparator&lt;Long&gt; comparator = new Comparator&lt;Long&gt;() {
       public int compare(Long left, Long right) {
         return left.compareTo(right);