You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2013/08/11 14:19:39 UTC
svn commit: r1512909 [10/38] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/suggest/ dev-tools/idea/solr/contrib/dataimporthandler/
dev-tools/idea/solr/core/src/test/ dev-too...
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java Sun Aug 11 12:19:13 2013
@@ -19,10 +19,40 @@ package org.apache.lucene.search.similar
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
-/** Expert: Default scoring implementation. */
+/**
+ * Expert: Default scoring implementation which {@link #encodeNormValue(float)
+ * encodes} norm values as a single byte before being stored. At search time,
+ * the norm byte value is read from the index
+ * {@link org.apache.lucene.store.Directory directory} and
+ * {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
+ * This encoding/decoding, while reducing index size, comes with the price of
+ * precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
+ * instance, <i>decode(encode(0.89)) = 0.75</i>.
+ * <p>
+ * Compression of norm values to a single byte saves memory at search time,
+ * because once a field is referenced at search time, its norms - for all
+ * documents - are maintained in memory.
+ * <p>
+ * The rationale supporting such lossy compression of norm values is that given
+ * the difficulty (and inaccuracy) of users to express their true information
+ * need by a query, only big differences matter. <br>
+ * <br>
+ * Last, note that search time is too late to modify this <i>norm</i> part of
+ * scoring, e.g. by using a different {@link Similarity} for search.
+ */
public class DefaultSimilarity extends TFIDFSimilarity {
+ /** Cache of decoded bytes. */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++) {
+ NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ }
+ }
+
/** Sole constructor: parameter-free */
public DefaultSimilarity() {}
@@ -38,6 +68,35 @@ public class DefaultSimilarity extends T
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
+ /**
+ * Encodes a normalization factor for storage in an index.
+ * <p>
+ * The encoding uses a three-bit mantissa, a five-bit exponent, and the
+ * zero-exponent point at 15, thus representing values from around 7x10^9 to
+ * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
+ * represented. Negative numbers are rounded up to zero. Values too large to
+ * represent are rounded down to the largest representable value. Positive
+ * values too small to represent are rounded up to the smallest positive
+ * representable value.
+ *
+ * @see org.apache.lucene.document.Field#setBoost(float)
+ * @see org.apache.lucene.util.SmallFloat
+ */
+ @Override
+ public final long encodeNormValue(float f) {
+ return SmallFloat.floatToByte315(f);
+ }
+
+ /**
+ * Decodes the norm value, assuming it is a single byte.
+ *
+ * @see #encodeNormValue(float)
+ */
+ @Override
+ public final float decodeNormValue(long norm) {
+ return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
+ }
+
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java Sun Aug 11 12:19:13 2013
@@ -57,60 +57,25 @@ public class MultiSimilarity extends Sim
}
@Override
- public ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
- ExactSimScorer subScorers[] = new ExactSimScorer[sims.length];
+ public SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+ SimScorer subScorers[] = new SimScorer[sims.length];
for (int i = 0; i < subScorers.length; i++) {
- subScorers[i] = sims[i].exactSimScorer(((MultiStats)stats).subStats[i], context);
- }
- return new MultiExactDocScorer(subScorers);
- }
-
- @Override
- public SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
- SloppySimScorer subScorers[] = new SloppySimScorer[sims.length];
- for (int i = 0; i < subScorers.length; i++) {
- subScorers[i] = sims[i].sloppySimScorer(((MultiStats)stats).subStats[i], context);
- }
- return new MultiSloppyDocScorer(subScorers);
- }
-
- static class MultiExactDocScorer extends ExactSimScorer {
- private final ExactSimScorer subScorers[];
-
- MultiExactDocScorer(ExactSimScorer subScorers[]) {
- this.subScorers = subScorers;
- }
-
- @Override
- public float score(int doc, int freq) {
- float sum = 0.0f;
- for (ExactSimScorer subScorer : subScorers) {
- sum += subScorer.score(doc, freq);
- }
- return sum;
- }
-
- @Override
- public Explanation explain(int doc, Explanation freq) {
- Explanation expl = new Explanation(score(doc, (int)freq.getValue()), "sum of:");
- for (ExactSimScorer subScorer : subScorers) {
- expl.addDetail(subScorer.explain(doc, freq));
- }
- return expl;
+ subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context);
}
+ return new MultiSimScorer(subScorers);
}
- static class MultiSloppyDocScorer extends SloppySimScorer {
- private final SloppySimScorer subScorers[];
+ static class MultiSimScorer extends SimScorer {
+ private final SimScorer subScorers[];
- MultiSloppyDocScorer(SloppySimScorer subScorers[]) {
+ MultiSimScorer(SimScorer subScorers[]) {
this.subScorers = subScorers;
}
@Override
public float score(int doc, float freq) {
float sum = 0.0f;
- for (SloppySimScorer subScorer : subScorers) {
+ for (SimScorer subScorer : subScorers) {
sum += subScorer.score(doc, freq);
}
return sum;
@@ -119,7 +84,7 @@ public class MultiSimilarity extends Sim
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation expl = new Explanation(score(doc, freq.getValue()), "sum of:");
- for (SloppySimScorer subScorer : subScorers) {
+ for (SimScorer subScorer : subScorers) {
expl.addDetail(subScorer.explain(doc, freq));
}
return expl;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java Sun Aug 11 12:19:13 2013
@@ -54,15 +54,9 @@ public abstract class PerFieldSimilarity
}
@Override
- public final ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
+ public final SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
- return perFieldWeight.delegate.exactSimScorer(perFieldWeight.delegateWeight, context);
- }
-
- @Override
- public final SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
- PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
- return perFieldWeight.delegate.sloppySimScorer(perFieldWeight.delegateWeight, context);
+ return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context);
}
/**
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java Sun Aug 11 12:19:13 2013
@@ -88,10 +88,8 @@ import org.apache.lucene.util.SmallFloat
* is called for each query leaf node, {@link Similarity#queryNorm(float)} is called for the top-level
* query, and finally {@link Similarity.SimWeight#normalize(float, float)} passes down the normalization value
* and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
- * <li>For each segment in the index, the Query creates a {@link #exactSimScorer(SimWeight, AtomicReaderContext)}
- * (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a
- * {@link #sloppySimScorer(SimWeight, AtomicReaderContext)} (for queries with sloppy frequencies such as
- * SpanQuerys and sloppy PhraseQueries). The score() method is called for each matching document.
+ * <li>For each segment in the index, the Query creates a {@link #simScorer(SimWeight, AtomicReaderContext)}
+ * The score() method is called for each matching document.
* </ol>
* <p>
* <a name="explaintime"/>
@@ -166,76 +164,31 @@ public abstract class Similarity {
* @return SimWeight object with the information this Similarity needs to score a query.
*/
public abstract SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats);
-
- /**
- * Creates a new {@link Similarity.ExactSimScorer} to score matching documents from a segment of the inverted index.
- * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
- * @param context segment of the inverted index to be scored.
- * @return ExactSimScorer for scoring documents across <code>context</code>
- * @throws IOException if there is a low-level I/O error
- */
- public abstract ExactSimScorer exactSimScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
-
+
/**
- * Creates a new {@link Similarity.SloppySimScorer} to score matching documents from a segment of the inverted index.
+ * Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index.
* @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
* @param context segment of the inverted index to be scored.
* @return SloppySimScorer for scoring documents across <code>context</code>
* @throws IOException if there is a low-level I/O error
*/
- public abstract SloppySimScorer sloppySimScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
-
- /**
- * API for scoring exact queries such as {@link TermQuery} and
- * exact {@link PhraseQuery}.
- * <p>
- * Frequencies are integers (the term or phrase frequency within the document)
- */
- public static abstract class ExactSimScorer {
-
- /**
- * Sole constructor. (For invocation by subclass
- * constructors, typically implicit.)
- */
- public ExactSimScorer() {}
-
- /**
- * Score a single document
- * @param doc document id
- * @param freq term frequency
- * @return document's score
- */
- public abstract float score(int doc, int freq);
-
- /**
- * Explain the score for a single document
- * @param doc document id
- * @param freq Explanation of how the term frequency was computed
- * @return document's score
- */
- public Explanation explain(int doc, Explanation freq) {
- Explanation result = new Explanation(score(doc, (int)freq.getValue()),
- "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
- result.addDetail(freq);
- return result;
- }
- }
+ public abstract SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException;
/**
- * API for scoring "sloppy" queries such as {@link SpanQuery} and
- * sloppy {@link PhraseQuery}.
+ * API for scoring "sloppy" queries such as {@link TermQuery},
+ * {@link SpanQuery}, and {@link PhraseQuery}.
* <p>
* Frequencies are floating-point values: an approximate
* within-document frequency adjusted for "sloppiness" by
- * {@link SloppySimScorer#computeSlopFactor(int)}.
+ * {@link SimScorer#computeSlopFactor(int)}.
*/
- public static abstract class SloppySimScorer {
+ public static abstract class SimScorer {
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
- public SloppySimScorer() {}
+ public SimScorer() {}
/**
* Score a single document
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java Sun Aug 11 12:19:13 2013
@@ -190,38 +190,20 @@ public abstract class SimilarityBase ext
}
@Override
- public ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+ public SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
if (stats instanceof MultiSimilarity.MultiStats) {
// a multi term query (e.g. phrase). return the summation,
// scoring almost as if it were boolean query
SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
- ExactSimScorer subScorers[] = new ExactSimScorer[subStats.length];
+ SimScorer subScorers[] = new SimScorer[subStats.length];
for (int i = 0; i < subScorers.length; i++) {
BasicStats basicstats = (BasicStats) subStats[i];
- subScorers[i] = new BasicExactDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
+ subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
}
- return new MultiSimilarity.MultiExactDocScorer(subScorers);
+ return new MultiSimilarity.MultiSimScorer(subScorers);
} else {
BasicStats basicstats = (BasicStats) stats;
- return new BasicExactDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
- }
- }
-
- @Override
- public SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
- if (stats instanceof MultiSimilarity.MultiStats) {
- // a multi term query (e.g. phrase). return the summation,
- // scoring almost as if it were boolean query
- SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
- SloppySimScorer subScorers[] = new SloppySimScorer[subStats.length];
- for (int i = 0; i < subScorers.length; i++) {
- BasicStats basicstats = (BasicStats) subStats[i];
- subScorers[i] = new BasicSloppyDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
- }
- return new MultiSimilarity.MultiSloppyDocScorer(subScorers);
- } else {
- BasicStats basicstats = (BasicStats) stats;
- return new BasicSloppyDocScorer(basicstats, context.reader().getNormValues(basicstats.field));
+ return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
}
}
@@ -277,46 +259,17 @@ public abstract class SimilarityBase ext
// --------------------------------- Classes ---------------------------------
- /** Delegates the {@link #score(int, int)} and
- * {@link #explain(int, Explanation)} methods to
- * {@link SimilarityBase#score(BasicStats, float, float)} and
- * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
- * respectively.
- */
- private class BasicExactDocScorer extends ExactSimScorer {
- private final BasicStats stats;
- private final NumericDocValues norms;
-
- BasicExactDocScorer(BasicStats stats, NumericDocValues norms) throws IOException {
- this.stats = stats;
- this.norms = norms;
- }
-
- @Override
- public float score(int doc, int freq) {
- // We have to supply something in case norms are omitted
- return SimilarityBase.this.score(stats, freq,
- norms == null ? 1F : decodeNormValue((byte)norms.get(doc)));
- }
-
- @Override
- public Explanation explain(int doc, Explanation freq) {
- return SimilarityBase.this.explain(stats, doc, freq,
- norms == null ? 1F : decodeNormValue((byte)norms.get(doc)));
- }
- }
-
/** Delegates the {@link #score(int, float)} and
* {@link #explain(int, Explanation)} methods to
* {@link SimilarityBase#score(BasicStats, float, float)} and
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
* respectively.
*/
- private class BasicSloppyDocScorer extends SloppySimScorer {
+ private class BasicSimScorer extends SimScorer {
private final BasicStats stats;
private final NumericDocValues norms;
- BasicSloppyDocScorer(BasicStats stats, NumericDocValues norms) throws IOException {
+ BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
this.stats = stats;
this.norms = norms;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Sun Aug 11 12:19:13 2013
@@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSea
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.SmallFloat;
/**
@@ -496,27 +495,8 @@ import org.apache.lucene.util.SmallFloat
* <td></td>
* </tr>
* </table>
- * <br> <br>
- * However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
- * before being stored.
- * At search time, the norm byte value is read from the index
- * {@link org.apache.lucene.store.Directory directory} and
- * {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
- * This encoding/decoding, while reducing index size, comes with the price of
- * precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
- * For instance, <i>decode(encode(0.89)) = 0.75</i>.
- * <br> <br>
- * Compression of norm values to a single byte saves memory at search time,
- * because once a field is referenced at search time, its norms - for
- * all documents - are maintained in memory.
- * <br> <br>
- * The rationale supporting such lossy compression of norm values is that
- * given the difficulty (and inaccuracy) of users to express their true information
- * need by a query, only big differences matter.
- * <br> <br>
- * Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
- * using a different {@link Similarity} for search.
- * <br> <br>
+ * Note that search time is too late to modify this <i>norm</i> part of scoring,
+ * e.g. by using a different {@link Similarity} for search.
* </li>
* </ol>
*
@@ -572,25 +552,6 @@ public abstract class TFIDFSimilarity ex
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
- * <p>The default implementation calls {@link #tf(float)}.
- *
- * @param freq the frequency of a term within a document
- * @return a score factor based on a term's within-document frequency
- */
- public float tf(int freq) {
- return tf((float)freq);
- }
-
- /** Computes a score factor based on a term or phrase's frequency in a
- * document. This value is multiplied by the {@link #idf(long, long)}
- * factor for each term in the query and these products are then summed to
- * form the initial score for a document.
- *
- * <p>Terms and phrases repeated in a document indicate the topic of the
- * document, so implementations of this method usually return larger values
- * when <code>freq</code> is large, and smaller values when <code>freq</code>
- * is small.
- *
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
@@ -655,7 +616,7 @@ public abstract class TFIDFSimilarity ex
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
- * {@link #tf(int)} factor for each term in the query and these products are
+ * {@link #tf(float)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
@@ -685,38 +646,15 @@ public abstract class TFIDFSimilarity ex
return encodeNormValue(normValue);
}
- /** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
-
- static {
- for (int i = 0; i < 256; i++) {
- NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
- }
- }
-
- /** Decodes a normalization factor stored in an index.
+ /**
+ * Decodes a normalization factor stored in an index.
+ *
* @see #encodeNormValue(float)
*/
- public float decodeNormValue(byte b) {
- return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
- }
+ public abstract float decodeNormValue(long norm);
- /** Encodes a normalization factor for storage in an index.
- *
- * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
- * the zero-exponent point at 15, thus
- * representing values from around 7x10^9 to 2x10^-9 with about one
- * significant decimal digit of accuracy. Zero is also represented.
- * Negative numbers are rounded up to zero. Values too large to represent
- * are rounded down to the largest representable value. Positive values too
- * small to represent are rounded up to the smallest positive representable
- * value.
- * @see org.apache.lucene.document.Field#setBoost(float)
- * @see org.apache.lucene.util.SmallFloat
- */
- public byte encodeNormValue(float f) {
- return SmallFloat.floatToByte315(f);
- }
+ /** Encodes a normalization factor for storage in an index. */
+ public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@@ -755,49 +693,17 @@ public abstract class TFIDFSimilarity ex
}
@Override
- public final ExactSimScorer exactSimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
+ public final SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
IDFStats idfstats = (IDFStats) stats;
- return new ExactTFIDFDocScorer(idfstats, context.reader().getNormValues(idfstats.field));
- }
-
- @Override
- public final SloppySimScorer sloppySimScorer(SimWeight stats, AtomicReaderContext context) throws IOException {
- IDFStats idfstats = (IDFStats) stats;
- return new SloppyTFIDFDocScorer(idfstats, context.reader().getNormValues(idfstats.field));
- }
-
- // TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot.
-
- private final class ExactTFIDFDocScorer extends ExactSimScorer {
- private final IDFStats stats;
- private final float weightValue;
- private final NumericDocValues norms;
-
- ExactTFIDFDocScorer(IDFStats stats, NumericDocValues norms) throws IOException {
- this.stats = stats;
- this.weightValue = stats.value;
- this.norms = norms;
- }
-
- @Override
- public float score(int doc, int freq) {
- final float raw = tf(freq)*weightValue; // compute tf(f)*weight
-
- return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
- }
-
- @Override
- public Explanation explain(int doc, Explanation freq) {
- return explainScore(doc, freq, stats, norms);
- }
+ return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
}
- private final class SloppyTFIDFDocScorer extends SloppySimScorer {
+ private final class TFIDFSimScorer extends SimScorer {
private final IDFStats stats;
private final float weightValue;
private final NumericDocValues norms;
- SloppyTFIDFDocScorer(IDFStats stats, NumericDocValues norms) throws IOException {
+ TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
@@ -807,7 +713,7 @@ public abstract class TFIDFSimilarity ex
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
- return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
+ return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field
}
@Override
@@ -894,8 +800,7 @@ public abstract class TFIDFSimilarity ex
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
- float fieldNorm =
- norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f;
+ float fieldNorm = norms != null ? decodeNormValue(norms.get(doc)) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java Sun Aug 11 12:19:13 2013
@@ -22,6 +22,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.InPlaceMergeSorter;
import java.io.IOException;
import java.util.ArrayList;
@@ -72,13 +73,19 @@ public class NearSpansOrdered extends Sp
private List<byte[]> matchPayload;
private final Spans[] subSpansByDoc;
- private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() {
+ // Even though the array is probably almost sorted, InPlaceMergeSorter will likely
+ // perform better since it has a lower overhead than TimSorter for small arrays
+ private final InPlaceMergeSorter sorter = new InPlaceMergeSorter() {
@Override
- public int compare(Spans o1, Spans o2) {
- return o1.doc() - o2.doc();
+ protected void swap(int i, int j) {
+ ArrayUtil.swap(subSpansByDoc, i, j);
+ }
+ @Override
+ protected int compare(int i, int j) {
+ return subSpansByDoc[i].doc() - subSpansByDoc[j].doc();
}
};
-
+
private SpanNearQuery query;
private boolean collectPayloads = true;
@@ -204,7 +211,7 @@ public class NearSpansOrdered extends Sp
/** Advance the subSpans to the same document */
private boolean toSameDoc() throws IOException {
- ArrayUtil.timSort(subSpansByDoc, spanDocComparator);
+ sorter.sort(0, subSpansByDoc.length);
int firstIndex = 0;
int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
while (subSpansByDoc[firstIndex].doc() != maxDoc) {
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java Sun Aug 11 12:19:13 2013
@@ -31,16 +31,36 @@ import java.util.Collection;
import java.util.Map;
import java.util.Set;
-/** Removes matches which overlap with another SpanQuery. */
+/** Removes matches which overlap with another SpanQuery or
+ * within a x tokens before or y tokens after another SpanQuery. */
public class SpanNotQuery extends SpanQuery implements Cloneable {
private SpanQuery include;
private SpanQuery exclude;
+ private final int pre;
+ private final int post;
/** Construct a SpanNotQuery matching spans from <code>include</code> which
* have no overlap with spans from <code>exclude</code>.*/
public SpanNotQuery(SpanQuery include, SpanQuery exclude) {
+ this(include, exclude, 0, 0);
+ }
+
+
+ /** Construct a SpanNotQuery matching spans from <code>include</code> which
+ * have no overlap with spans from <code>exclude</code> within
+ * <code>dist</code> tokens of <code>include</code>. */
+ public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) {
+ this(include, exclude, dist, dist);
+ }
+
+ /** Construct a SpanNotQuery matching spans from <code>include</code> which
+ * have no overlap with spans from <code>exclude</code> within
+ * <code>pre</code> tokens before or <code>post</code> tokens of <code>include</code>. */
+ public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) {
this.include = include;
this.exclude = exclude;
+ this.pre = (pre >=0) ? pre : 0;
+ this.post = (post >= 0) ? post : 0;
if (!include.getField().equals(exclude.getField()))
throw new IllegalArgumentException("Clauses must have same field.");
@@ -65,6 +85,10 @@ public class SpanNotQuery extends SpanQu
buffer.append(include.toString(field));
buffer.append(", ");
buffer.append(exclude.toString(field));
+ buffer.append(", ");
+ buffer.append(Integer.toString(pre));
+ buffer.append(", ");
+ buffer.append(Integer.toString(post));
buffer.append(")");
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
@@ -72,7 +96,8 @@ public class SpanNotQuery extends SpanQu
@Override
public SpanNotQuery clone() {
- SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),(SpanQuery) exclude.clone());
+ SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),
+ (SpanQuery) exclude.clone(), pre, post);
spanNotQuery.setBoost(getBoost());
return spanNotQuery;
}
@@ -98,13 +123,13 @@ public class SpanNotQuery extends SpanQu
while (moreExclude // while exclude is before
&& includeSpans.doc() == excludeSpans.doc()
- && excludeSpans.end() <= includeSpans.start()) {
+ && excludeSpans.end() <= includeSpans.start() - pre) {
moreExclude = excludeSpans.next(); // increment exclude
}
if (!moreExclude // if no intersection
|| includeSpans.doc() != excludeSpans.doc()
- || includeSpans.end() <= excludeSpans.start())
+ || includeSpans.end()+post <= excludeSpans.start())
break; // we found a match
moreInclude = includeSpans.next(); // intersected: keep scanning
@@ -126,13 +151,13 @@ public class SpanNotQuery extends SpanQu
while (moreExclude // while exclude is before
&& includeSpans.doc() == excludeSpans.doc()
- && excludeSpans.end() <= includeSpans.start()) {
+ && excludeSpans.end() <= includeSpans.start()-pre) {
moreExclude = excludeSpans.next(); // increment exclude
}
if (!moreExclude // if no intersection
|| includeSpans.doc() != excludeSpans.doc()
- || includeSpans.end() <= excludeSpans.start())
+ || includeSpans.end()+post <= excludeSpans.start())
return true; // we found a match
return next(); // scan to next match
@@ -199,23 +224,28 @@ public class SpanNotQuery extends SpanQu
/** Returns true iff <code>o</code> is equal to this. */
@Override
public boolean equals(Object o) {
- if (this == o) return true;
- if (!(o instanceof SpanNotQuery)) return false;
+ if (!super.equals(o))
+ return false;
SpanNotQuery other = (SpanNotQuery)o;
return this.include.equals(other.include)
&& this.exclude.equals(other.exclude)
- && this.getBoost() == other.getBoost();
+ && this.pre == other.pre
+ && this.post == other.post;
}
@Override
public int hashCode() {
- int h = include.hashCode();
- h = (h<<1) | (h >>> 31); // rotate left
+ int h = super.hashCode();
+ h = Integer.rotateLeft(h, 1);
+ h ^= include.hashCode();
+ h = Integer.rotateLeft(h, 1);
h ^= exclude.hashCode();
- h = (h<<1) | (h >>> 31); // rotate left
- h ^= Float.floatToRawIntBits(getBoost());
+ h = Integer.rotateLeft(h, 1);
+ h ^= pre;
+ h = Integer.rotateLeft(h, 1);
+ h ^= post;
return h;
}
-}
+}
\ No newline at end of file
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java Sun Aug 11 12:19:13 2013
@@ -34,9 +34,9 @@ public class SpanScorer extends Scorer {
protected int doc;
protected float freq;
protected int numMatches;
- protected final Similarity.SloppySimScorer docScorer;
+ protected final Similarity.SimScorer docScorer;
- protected SpanScorer(Spans spans, Weight weight, Similarity.SloppySimScorer docScorer)
+ protected SpanScorer(Spans spans, Weight weight, Similarity.SimScorer docScorer)
throws IOException {
super(weight);
this.docScorer = docScorer;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java Sun Aug 11 12:19:13 2013
@@ -98,7 +98,7 @@ public class SpanTermQuery extends SpanQ
final Terms terms = fields.terms(term.field());
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
- if (termsEnum.seekExact(term.bytes(), true)) {
+ if (termsEnum.seekExact(term.bytes())) {
state = termsEnum.termState();
} else {
state = null;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java Sun Aug 11 12:19:13 2013
@@ -23,7 +23,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SloppySimScorer;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.Bits;
import java.io.IOException;
@@ -52,7 +52,7 @@ public class SpanWeight extends Weight {
final TermStatistics termStats[] = new TermStatistics[terms.size()];
int i = 0;
for (Term term : terms) {
- TermContext state = TermContext.build(context, term, true);
+ TermContext state = TermContext.build(context, term);
termStats[i] = searcher.termStatistics(term, state);
termContexts.put(term, state);
i++;
@@ -86,7 +86,7 @@ public class SpanWeight extends Weight {
if (stats == null) {
return null;
} else {
- return new SpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity.sloppySimScorer(stats, context));
+ return new SpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity.simScorer(stats, context));
}
}
@@ -97,7 +97,7 @@ public class SpanWeight extends Weight {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.sloppyFreq();
- SloppySimScorer docScorer = similarity.sloppySimScorer(stats, context);
+ SimScorer docScorer = similarity.simScorer(stats, context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/search/spans/package.html Sun Aug 11 12:19:13 2013
@@ -38,8 +38,8 @@ and inter-phrase proximity (when constru
number of other {@link org.apache.lucene.search.spans.SpanQuery}s.</li>
<li>A {@link org.apache.lucene.search.spans.SpanNotQuery SpanNotQuery} removes spans
-matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap
-another. This can be used, e.g., to implement within-paragraph
+matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap (or comes
+near) another. This can be used, e.g., to implement within-paragraph
search.</li>
<li>A {@link org.apache.lucene.search.spans.SpanFirstQuery SpanFirstQuery} matches spans
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/Directory.java Sun Aug 11 12:19:13 2013
@@ -21,6 +21,7 @@ import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Closeable;
+import java.nio.file.NoSuchFileException;
import java.util.Collection; // for javadocs
import org.apache.lucene.util.IOUtils;
@@ -70,12 +71,12 @@ public abstract class Directory implemen
* Returns the length of a file in the directory. This method follows the
* following contract:
* <ul>
- * <li>Throws {@link FileNotFoundException} if the file does not exist
+ * <li>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+ * if the file does not exist.
* <li>Returns a value ≥0 if the file exists, which specifies its length.
* </ul>
*
* @param name the name of the file for which to return the length.
- * @throws FileNotFoundException if the file does not exist.
* @throws IOException if there was an IO error while retrieving the file's
* length.
*/
@@ -106,7 +107,9 @@ public abstract class Directory implemen
* the only Directory implementations that respect this
* parameter are {@link FSDirectory} and {@link
* CompoundFileDirectory}.
- */
+ * <p>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+ * if the file does not exist.
+ */
public abstract IndexInput openInput(String name, IOContext context) throws IOException;
/** Construct a {@link Lock}.
@@ -223,6 +226,8 @@ public abstract class Directory implemen
* efficiently open one or more sliced {@link IndexInput} instances from a
* single file handle. The underlying file handle is kept open until the
* {@link IndexInputSlicer} is closed.
+ * <p>Throws {@link FileNotFoundException} or {@link NoSuchFileException}
+ * if the file does not exist.
*
* @throws IOException
* if an {@link IOException} occurs
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java Sun Aug 11 12:19:13 2013
@@ -200,6 +200,9 @@ public class NIOFSDirectory extends FSDi
}
bb.limit(limit);
int i = channel.read(bb, pos);
+ if (i < 0){//be defensive here, even though we checked before hand, something could have changed
+ throw new EOFException("read past EOF: " + this + " off: " + offset + " len: " + len + " pos: " + pos + " limit: " + limit + " end: " + end);
+ }
pos += i;
readOffset += i;
readLength -= i;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/store/SimpleFSDirectory.java Sun Aug 11 12:19:13 2013
@@ -155,6 +155,9 @@ public class SimpleFSDirectory extends F
readLength = chunkSize;
}
final int i = file.read(b, offset + total, readLength);
+ if (i < 0){//be defensive here, even though we checked before hand, something could have changed
+ throw new EOFException("read past EOF: " + this + " off: " + offset + " len: " + len + " total: " + total + " readLen: " + readLength + " end: " + end);
+ }
total += i;
} while (total < len);
} catch (OutOfMemoryError e) {
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/BitUtil.java Sun Aug 11 12:19:13 2013
@@ -22,8 +22,93 @@ package org.apache.lucene.util; // from
*/
public final class BitUtil {
+ private static final byte[] BYTE_COUNTS = { // table of bits/byte
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+ };
+
+ // The General Idea: instead of having an array per byte that has
+ // the offsets of the next set bit, that array could be
+ // packed inside a 32 bit integer (8 4 bit numbers). That
+ // should be faster than accessing an array for each index, and
+ // the total array size is kept smaller (256*sizeof(int))=1K
+ /***** the python code that generated bitlist
+ def bits2int(val):
+ arr=0
+ for shift in range(8,0,-1):
+ if val & 0x80:
+ arr = (arr << 4) | shift
+ val = val << 1
+ return arr
+
+ def int_table():
+ tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
+ return ','.join(tbl)
+ ******/
+ private static final int[] BIT_LISTS = {
+ 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
+ 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
+ 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
+ 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
+ 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
+ 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
+ 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
+ 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
+ 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
+ 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
+ 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
+ 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
+ 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
+ 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
+ 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
+ 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
+ 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
+ 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
+ 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
+ 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
+ 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
+ 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
+ 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
+ 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
+ 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
+ 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
+ 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
+ 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
+ };
+
private BitUtil() {} // no instance
+ /** Return the number of bits sets in b. */
+ public static int bitCount(byte b) {
+ return BYTE_COUNTS[b & 0xFF];
+ }
+
+ /** Return the list of bits which are set in b encoded as followed:
+ * <code>(i >>> (4 * n)) & 0x0F</code> is the offset of the n-th set bit of
+ * the given byte plus one, or 0 if there are n or less bits set in the given
+ * byte. For example <code>bitList(12)</code> returns 0x43:<ul>
+ * <li><code>0x43 & 0x0F</code> is 3, meaning the the first bit set is at offset 3-1 = 2,</li>
+ * <li><code>(0x43 >>> 4) & 0x0F</code> is 4, meaning there is a second bit set at offset 4-1=3,</li>
+ * <li><code>(0x43 >>> 8) & 0x0F</code> is 0, meaning there is no more bit set in this byte.</li>
+ * </ul>*/
+ public static int bitList(byte b) {
+ return BIT_LISTS[b & 0xFF];
+ }
+
// The pop methods used to rely on bit-manipulation tricks for speed but it
// turns out that it is faster to use the Long.bitCount method (which is an
// intrinsic since Java 6u18) in a naive loop, see LUCENE-2221
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Constants.java Sun Aug 11 12:19:13 2013
@@ -46,6 +46,8 @@ public final class Constants {
public static final boolean SUN_OS = OS_NAME.startsWith("SunOS");
/** True iff running on Mac OS X */
public static final boolean MAC_OS_X = OS_NAME.startsWith("Mac OS X");
+ /** True iff running on FreeBSD */
+ public static final boolean FREE_BSD = OS_NAME.startsWith("FreeBSD");
public static final String OS_ARCH = System.getProperty("os.arch");
public static final String OS_VERSION = System.getProperty("os.version");
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java Sun Aug 11 12:19:13 2013
@@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSet
*/
public class OpenBitSetIterator extends DocIdSetIterator {
- // The General Idea: instead of having an array per byte that has
- // the offsets of the next set bit, that array could be
- // packed inside a 32 bit integer (8 4 bit numbers). That
- // should be faster than accessing an array for each index, and
- // the total array size is kept smaller (256*sizeof(int))=1K
- protected final static int[] bitlist={
- 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
- 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
- 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
- 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
- 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
- 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
- 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
- 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
- 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
- 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
- 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
- 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
- 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
- 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
- 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
- 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
- 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
- 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
- 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
- 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
- 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
- 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
- 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
- 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
- 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
- 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
- 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
- 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
- };
- /***** the python code that generated bitlist
- def bits2int(val):
- arr=0
- for shift in range(8,0,-1):
- if val & 0x80:
- arr = (arr << 4) | shift
- val = val << 1
- return arr
-
- def int_table():
- tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
- return ','.join(tbl)
- ******/
-
// hmmm, what about an iterator that finds zeros though,
// or a reverse iterator... should they be separate classes
// for efficiency, or have a common root interface? (or
@@ -101,7 +52,7 @@ public class OpenBitSetIterator extends
if ((int)word ==0) {wordShift +=32; word = word >>>32; }
if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
- indexArray = bitlist[(int)word & 0xff];
+ indexArray = BitUtil.bitList((byte) word);
}
/***** alternate shift implementations
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java Sun Aug 11 12:19:13 2013
@@ -18,6 +18,7 @@ package org.apache.lucene.util;
*/
import java.lang.management.ManagementFactory;
+import java.lang.management.PlatformManagedObject;
import java.lang.reflect.*;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
@@ -212,18 +213,17 @@ public final class RamUsageEstimator {
// regardless of the architecture).
int objectAlignment = 8;
try {
- final Class<?> beanClazz = Class.forName("com.sun.management.HotSpotDiagnosticMXBean");
- final Object hotSpotBean = ManagementFactory.newPlatformMXBeanProxy(
- ManagementFactory.getPlatformMBeanServer(),
- "com.sun.management:type=HotSpotDiagnostic",
- beanClazz
- );
- final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class);
- final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes");
- objectAlignment = Integer.parseInt(
- vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()
- );
- supportedFeatures.add(JvmFeature.OBJECT_ALIGNMENT);
+ final Class<? extends PlatformManagedObject> beanClazz =
+ Class.forName("com.sun.management.HotSpotDiagnosticMXBean").asSubclass(PlatformManagedObject.class);
+ final Object hotSpotBean = ManagementFactory.getPlatformMXBean(beanClazz);
+ if (hotSpotBean != null) {
+ final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class);
+ final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes");
+ objectAlignment = Integer.parseInt(
+ vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()
+ );
+ supportedFeatures.add(JvmFeature.OBJECT_ALIGNMENT);
+ }
} catch (Exception e) {
// Ignore.
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/RollingBuffer.java Sun Aug 11 12:19:13 2013
@@ -17,9 +17,6 @@ package org.apache.lucene.util;
* limitations under the License.
*/
-// TODO: probably move this to core at some point (eg,
-// cutover kuromoji, synfilter, LookaheadTokenFilter)
-
/** Acts like forever growing T[], but internally uses a
* circular buffer to reuse instances of T.
*
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/SetOnce.java Sun Aug 11 12:19:13 2013
@@ -28,7 +28,7 @@ import java.util.concurrent.atomic.Atomi
*
* @lucene.experimental
*/
-public final class SetOnce<T> {
+public final class SetOnce<T> implements Cloneable {
/** Thrown when {@link SetOnce#set(Object)} is called more than once. */
public static final class AlreadySetException extends IllegalStateException {
@@ -74,4 +74,10 @@ public final class SetOnce<T> {
public final T get() {
return obj;
}
+
+ @Override
+ public SetOnce<T> clone() {
+ return obj == null ? new SetOnce<T>() : new SetOnce<T>(obj);
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/ToStringUtils.java Sun Aug 11 12:19:13 2013
@@ -43,4 +43,14 @@ public final class ToStringUtils {
}
}
+ private final static char [] HEX = "0123456789abcdef".toCharArray();
+
+ public static String longHex(long x) {
+ char [] asHex = new char [16];
+ for (int i = 16; --i >= 0; x >>>= 4) {
+ asHex[i] = HEX[(int) x & 0x0F];
+ }
+ return "0x" + new String(asHex);
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/Version.java Sun Aug 11 12:19:13 2013
@@ -68,6 +68,13 @@ public enum Version {
@Deprecated
LUCENE_44,
+ /**
+ * Match settings and bugs in Lucene's 4.5 release.
+ * @deprecated (5.0) Use latest
+ */
+ @Deprecated
+ LUCENE_45,
+
/** Match settings and bugs in Lucene's 5.0 release.
* <p>
* Use this to get the latest & greatest settings, bug
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java Sun Aug 11 12:19:13 2013
@@ -219,7 +219,7 @@ final public class SpecialOperations {
/**
* Returns the set of accepted strings, assuming that at most
* <code>limit</code> strings are accepted. If more than <code>limit</code>
- * strings are accepted, null is returned. If <code>limit</code><0, then
+ * strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then
* the limit is infinite.
*/
public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
@@ -227,11 +227,9 @@ final public class SpecialOperations {
if (a.isSingleton()) {
if (limit > 0) {
strings.add(Util.toUTF32(a.singleton, new IntsRef()));
- } else {
- return null;
}
} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
- return null;
+ return strings;
}
return strings;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java Sun Aug 11 12:19:13 2013
@@ -117,9 +117,9 @@ public class Builder<T> {
*
* @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
- * This requires an additional hash map for lookups in memory. Setting this parameter to
- * <code>false</code> creates a single path for all input sequences. This will result in a larger
- * graph, but may require less memory and will speed up construction.
+ * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
+ * <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
+ * FST, but requires substantially less memory and CPU during building.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sun Aug 11 12:19:13 2013
@@ -171,6 +171,8 @@ public final class FST<T> {
private final boolean allowArrayArcs;
private Arc<T> cachedRootArcs[];
+ private Arc<T> assertingCachedRootArcs[]; // only set wit assert
+
/** Represents a single arc. */
public final static class Arc<T> {
@@ -213,7 +215,7 @@ public final class FST<T> {
}
return this;
}
-
+
boolean flag(int flag) {
return FST.flag(flags, flag);
}
@@ -420,11 +422,18 @@ public final class FST<T> {
return node;
}
}
-
+
// Caches first 128 labels
@SuppressWarnings({"rawtypes","unchecked"})
private void cacheRootArcs() throws IOException {
cachedRootArcs = (Arc<T>[]) new Arc[0x80];
+ readRootArcs(cachedRootArcs);
+
+ assert setAssertingRootArcs(cachedRootArcs);
+ assert assertRootArcs();
+ }
+
+ public void readRootArcs(Arc<T>[] arcs) throws IOException {
final Arc<T> arc = new Arc<T>();
getFirstArc(arc);
final BytesReader in = getBytesReader();
@@ -433,7 +442,7 @@ public final class FST<T> {
while(true) {
assert arc.label != END_LABEL;
if (arc.label < cachedRootArcs.length) {
- cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
+ arcs[arc.label] = new Arc<T>().copyFrom(arc);
} else {
break;
}
@@ -444,6 +453,38 @@ public final class FST<T> {
}
}
}
+
+ @SuppressWarnings({"rawtypes","unchecked"})
+ private boolean setAssertingRootArcs(Arc<T>[] arcs) throws IOException {
+ assertingCachedRootArcs = (Arc<T>[]) new Arc[arcs.length];
+ readRootArcs(assertingCachedRootArcs);
+ return true;
+ }
+
+ private boolean assertRootArcs() {
+ assert cachedRootArcs != null;
+ assert assertingCachedRootArcs != null;
+ for (int i = 0; i < cachedRootArcs.length; i++) {
+ final Arc<T> root = cachedRootArcs[i];
+ final Arc<T> asserting = assertingCachedRootArcs[i];
+ if (root != null) {
+ assert root.arcIdx == asserting.arcIdx;
+ assert root.bytesPerArc == asserting.bytesPerArc;
+ assert root.flags == asserting.flags;
+ assert root.label == asserting.label;
+ assert root.nextArc == asserting.nextArc;
+ assert root.nextFinalOutput.equals(asserting.nextFinalOutput);
+ assert root.node == asserting.node;
+ assert root.numArcs == asserting.numArcs;
+ assert root.output.equals(asserting.output);
+ assert root.posArcsStart == asserting.posArcsStart;
+ assert root.target == asserting.target;
+ } else {
+ assert root == null && asserting == null;
+ }
+ }
+ return true;
+ }
public T getEmptyOutput() {
return emptyOutput;
@@ -1099,7 +1140,7 @@ public final class FST<T> {
/** Finds an arc leaving the incoming arc, replacing the arc in place.
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
- assert cachedRootArcs != null;
+ assert assertRootArcs();
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
@@ -1123,7 +1164,7 @@ public final class FST<T> {
if (follow.target == startNode && labelToMatch < cachedRootArcs.length) {
final Arc<T> result = cachedRootArcs[labelToMatch];
if (result == null) {
- return result;
+ return null;
} else {
arc.copyFrom(result);
return arc;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java Sun Aug 11 12:19:13 2013
@@ -19,21 +19,21 @@ package org.apache.lucene.util.fst;
import java.io.IOException;
-import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PagedGrowableWriter;
// Used to dedup states (lookup already-frozen states)
final class NodeHash<T> {
- private GrowableWriter table;
- private int count;
- private int mask;
+ private PagedGrowableWriter table;
+ private long count;
+ private long mask;
private final FST<T> fst;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
private final FST.BytesReader in;
public NodeHash(FST<T> fst, FST.BytesReader in) {
- table = new GrowableWriter(8, 16, PackedInts.COMPACT);
+ table = new PagedGrowableWriter(16, 1<<30, 8, PackedInts.COMPACT);
mask = 15;
this.fst = fst;
this.in = in;
@@ -69,10 +69,10 @@ final class NodeHash<T> {
// hash code for an unfrozen node. This must be identical
// to the un-frozen case (below)!!
- private int hash(Builder.UnCompiledNode<T> node) {
+ private long hash(Builder.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
- int h = 0;
+ long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
@@ -87,14 +87,14 @@ final class NodeHash<T> {
}
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
// hash code for a frozen node
- private int hash(long node) throws IOException {
+ private long hash(long node) throws IOException {
final int PRIME = 31;
//System.out.println("hash frozen node=" + node);
- int h = 0;
+ long h = 0;
fst.readFirstRealTargetArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
@@ -111,13 +111,13 @@ final class NodeHash<T> {
fst.readNextRealArc(scratchArc, in);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
public long add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
- // System.out.println("hash: add count=" + count + " vs " + table.size());
- final int h = hash(nodeIn);
- int pos = h & mask;
+ //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
+ final long h = hash(nodeIn);
+ long pos = h & mask;
int c = 0;
while(true) {
final long v = table.get(pos);
@@ -128,7 +128,8 @@ final class NodeHash<T> {
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
table.set(pos, node);
- if (table.size() < 2*count) {
+ // Rehash at 2/3 occupancy:
+ if (count > 2*table.size()/3) {
rehash();
}
return node;
@@ -144,7 +145,7 @@ final class NodeHash<T> {
// called only by rehash
private void addNew(long address) throws IOException {
- int pos = hash(address) & mask;
+ long pos = hash(address) & mask;
int c = 0;
while(true) {
if (table.get(pos) == 0) {
@@ -158,23 +159,15 @@ final class NodeHash<T> {
}
private void rehash() throws IOException {
- final GrowableWriter oldTable = table;
+ final PagedGrowableWriter oldTable = table;
- if (oldTable.size() >= Integer.MAX_VALUE/2) {
- throw new IllegalStateException("FST too large (> 2.1 GB)");
- }
-
- table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT);
+ table = new PagedGrowableWriter(2*oldTable.size(), 1<<30, PackedInts.bitsRequired(count), PackedInts.COMPACT);
mask = table.size()-1;
- for(int idx=0;idx<oldTable.size();idx++) {
+ for(long idx=0;idx<oldTable.size();idx++) {
final long address = oldTable.get(idx);
if (address != 0) {
addNew(address);
}
}
}
-
- public int count() {
- return count;
- }
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java Sun Aug 11 12:19:13 2013
@@ -33,26 +33,13 @@ public final class PositiveIntOutputs ex
private final static Long NO_OUTPUT = new Long(0);
- private final boolean doShare;
+ private final static PositiveIntOutputs singleton = new PositiveIntOutputs();
- private final static PositiveIntOutputs singletonShare = new PositiveIntOutputs(true);
- private final static PositiveIntOutputs singletonNoShare = new PositiveIntOutputs(false);
-
- private PositiveIntOutputs(boolean doShare) {
- this.doShare = doShare;
+ private PositiveIntOutputs() {
}
- /** Returns the instance of PositiveIntOutputs. */
public static PositiveIntOutputs getSingleton() {
- return getSingleton(true);
- }
-
- /** Expert: pass doShare=false to disable output sharing.
- * In some cases this may result in a smaller FST,
- * however it will also break methods like {@link
- * Util#getByOutput} and {@link Util#shortestPaths}. */
- public static PositiveIntOutputs getSingleton(boolean doShare) {
- return doShare ? singletonShare : singletonNoShare;
+ return singleton;
}
@Override
@@ -61,14 +48,10 @@ public final class PositiveIntOutputs ex
assert valid(output2);
if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
return NO_OUTPUT;
- } else if (doShare) {
+ } else {
assert output1 > 0;
assert output2 > 0;
return Math.min(output1, output2);
- } else if (output1.equals(output2)) {
- return output1;
- } else {
- return NO_OUTPUT;
}
}
@@ -134,6 +117,6 @@ public final class PositiveIntOutputs ex
@Override
public String toString() {
- return "PositiveIntOutputs(doShare=" + doShare + ")";
+ return "PositiveIntOutputs";
}
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Sun Aug 11 12:19:13 2013
@@ -93,9 +93,7 @@ public final class Util {
*
* <p>NOTE: this only works with {@code FST<Long>}, only
* works when the outputs are ascending in order with
- * the inputs and only works when you shared
- * the outputs (pass doShare=true to {@link
- * PositiveIntOutputs#getSingleton}).
+ * the inputs.
* For example, simple ordinals (0, 1,
* 2, ...), or file offets (when appending to a file)
* fit this. */
@@ -517,11 +515,7 @@ public final class Util {
}
/** Starting from node, find the top N min cost
- * completions to a final node.
- *
- * <p>NOTE: you must share the outputs when you build the
- * FST (pass doShare=true to {@link
- * PositiveIntOutputs#getSingleton}). */
+ * completions to a final node. */
public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
@@ -814,7 +808,7 @@ public final class Util {
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
- final int utf32 = Character.codePointAt(s, charIdx);
+ final int utf32 = Character.codePointAt(s, charIdx, charLimit);
scratch.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html?rev=1512909&r1=1512908&r2=1512909&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/util/fst/package.html Sun Aug 11 12:19:13 2013
@@ -43,7 +43,7 @@ FST Construction example:
String inputValues[] = {"cat", "dog", "dogs"};
long outputValues[] = {5, 7, 12};
- PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
BytesRef scratchBytes = new BytesRef();
IntsRef scratchInts = new IntsRef();
@@ -60,8 +60,7 @@ Retrieval by key:
</pre>
Retrieval by value:
<pre class="prettyprint">
- // Only works because outputs are also in sorted order, and
- // we passed 'true' for sharing to PositiveIntOutputs.getSingleton
+ // Only works because outputs are also in sorted order
IntsRef key = Util.getByOutput(fst, 12);
System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs
</pre>
@@ -77,7 +76,6 @@ Iterate over key-value pairs in sorted o
</pre>
N-shortest paths by weight:
<pre class="prettyprint">
- // Only works because we passed 'true' for sharing to PositiveIntOutputs.getSingleton
Comparator<Long> comparator = new Comparator<Long>() {
public int compare(Long left, Long right) {
return left.compareTo(right);