You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/08/13 19:37:16 UTC

svn commit: r1695744 - in /lucene/dev/trunk: lucene/ lucene/core/src/java/org/apache/lucene/search/similarities/ lucene/core/src/test/org/apache/lucene/index/ lucene/core/src/test/org/apache/lucene/search/ lucene/core/src/test/org/apache/lucene/search/...

Author: rmuir
Date: Thu Aug 13 17:37:15 2015
New Revision: 1695744

URL: http://svn.apache.org/r1695744
Log:
LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/MIGRATE.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
    lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Aug 13 17:37:15 2015
@@ -19,6 +19,10 @@ New Features
   for counting ranges that align with the underlying terms as defined by the
   NumberRangePrefixTree (e.g. familiar date units like days).  (David Smiley)
 
+* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
+  length computations, to avoid skew from documents that don't have the field.
+  (Ahmet Arslan via Robert Muir)
+
 API Changes
 
 * LUCENE-3312: The API of oal.document was restructured to

Modified: lucene/dev/trunk/lucene/MIGRATE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/MIGRATE.txt?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/MIGRATE.txt (original)
+++ lucene/dev/trunk/lucene/MIGRATE.txt Thu Aug 13 17:37:15 2015
@@ -1,5 +1,22 @@
 # Apache Lucene Migration Guide
 
+## The way the number of documents is calculated has changed (LUCENE-6711)
+The number of documents (numDocs) is used to calculate term specificity (idf) and average document length (avdl).
+Prior to LUCENE-6711, collectionStats.maxDoc() was used for the statistics.
+Now, collectionStats.docCount() is used whenever it is available; otherwise, collectionStats.maxDoc() is used as a fallback.
+
+Assume that a collection contains 100 documents, and 50 of them have a "keywords" field.
+In this example, maxDoc is 100 while docCount is 50 for the "keywords" field.
+The total number of tokens for the "keywords" field is divided by docCount to obtain avdl.
+Therefore, docCount, which is the total number of documents that have at least one term for the field, is a more precise metric for optional fields.
+
+DefaultSimilarity does not leverage avdl, so this change should have a relatively minor effect on its result lists,
+because the relative idf values of terms will remain the same.
+However, when combined with other factors such as term frequency, the relative ranking of documents could change.
+Some Similarity implementations (such as BM25 and the ones instantiated with NormalizationH2) take avdl into account and would show a notable change in the ranked list,
+especially if you have a collection of documents with varying lengths,
+because NormalizationH2 tends to punish documents longer than avdl.
+
 ## Separation of IndexDocument and StoredDocument (LUCENE-3312)
 
 The API of oal.document was restructured to differentiate between stored 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java Thu Aug 13 17:37:15 2015
@@ -63,9 +63,9 @@ public class BM25Similarity extends Simi
     this.b  = 0.75f;
   }
   
-  /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
-  protected float idf(long docFreq, long numDocs) {
-    return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+  /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
+  protected float idf(long docFreq, long docCount) {
+    return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
   }
   
   /** Implemented as <code>1 / (distance + 1)</code>. */
@@ -78,7 +78,7 @@ public class BM25Similarity extends Simi
     return 1;
   }
   
-  /** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
+  /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
    * or returns <code>1</code> if the index does not store sumTotalTermFreq:
    * any field that omits frequency information). */
   protected float avgFieldLength(CollectionStatistics collectionStats) {
@@ -86,7 +86,8 @@ public class BM25Similarity extends Simi
     if (sumTotalTermFreq <= 0) {
       return 1f;       // field does not exist, or stat is unsupported
     } else {
-      return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
+      final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+      return (float) (sumTotalTermFreq / (double) docCount);
     }
   }
   
@@ -150,14 +151,14 @@ public class BM25Similarity extends Simi
    * The default implementation uses:
    * 
    * <pre class="prettyprint">
-   * idf(docFreq, searcher.maxDoc());
+   * idf(docFreq, docCount);
    * </pre>
    * 
-   * Note that {@link CollectionStatistics#maxDoc()} is used instead of
+   * Note that {@link CollectionStatistics#docCount()} is used instead of
    * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
    * {@link TermStatistics#docFreq()} is used, and when the latter 
-   * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
-   * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
+   * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
+   * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
    *   
    * @param collectionStats collection-level statistics
    * @param termStats term-level statistics for the term
@@ -166,9 +167,9 @@ public class BM25Similarity extends Simi
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
     final long df = termStats.docFreq();
-    final long max = collectionStats.maxDoc();
-    final float idf = idf(df, max);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
   }
 
   /**
@@ -185,13 +186,13 @@ public class BM25Similarity extends Simi
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long max = collectionStats.maxDoc();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
     float idf = 0.0f;
     List<Explanation> details = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
       final long df = stat.docFreq();
-      final float termIdf = idf(df, max);
-      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
+      final float termIdf = idf(df, docCount);
+      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
       idf += termIdf;
     }
     return Explanation.match(idf, "idf(), sum of:", details);

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java Thu Aug 13 17:37:15 2015
@@ -133,10 +133,10 @@ public class DefaultSimilarity extends T
     return 1;
   }
 
-  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
   @Override
-  public float idf(long docFreq, long numDocs) {
-    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  public float idf(long docFreq, long docCount) {
+    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
   }
     
   /** 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java Thu Aug 13 17:37:15 2015
@@ -102,7 +102,7 @@ public abstract class SimilarityBase ext
   protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
     // #positions(field) must be >= #positions(term)
     assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
-    long numberOfDocuments = collectionStats.maxDoc();
+    long numberOfDocuments = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
     
     long docFreq = termStats.docFreq();
     long totalTermFreq = termStats.totalTermFreq();

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Thu Aug 13 17:37:15 2015
@@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
  *          </td>
  *          <td valign="middle" align="center">
  *            <table summary="inverse document frequency computation">
- *               <tr><td align="center" style="text-align: center"><small>numDocs</small></td></tr>
+ *               <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
  *               <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
  *               <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
  *            </table>
@@ -566,14 +566,14 @@ public abstract class TFIDFSimilarity ex
    * The default implementation uses:
    * 
    * <pre class="prettyprint">
-   * idf(docFreq, searcher.maxDoc());
+   * idf(docFreq, docCount);
    * </pre>
    * 
-   * Note that {@link CollectionStatistics#maxDoc()} is used instead of
+   * Note that {@link CollectionStatistics#docCount()} is used instead of
    * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
    * {@link TermStatistics#docFreq()} is used, and when the latter 
-   * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
-   * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
+   * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
+   * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
    *   
    * @param collectionStats collection-level statistics
    * @param termStats term-level statistics for the term
@@ -582,9 +582,9 @@ public abstract class TFIDFSimilarity ex
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
     final long df = termStats.docFreq();
-    final long max = collectionStats.maxDoc();
-    final float idf = idf(df, max);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
   }
 
   /**
@@ -601,13 +601,13 @@ public abstract class TFIDFSimilarity ex
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long max = collectionStats.maxDoc();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
     float idf = 0.0f;
     List<Explanation> subs = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
       final long df = stat.docFreq();
-      final float termIdf = idf(df, max);
-      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
+      final float termIdf = idf(df, docCount);
+      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
       idf += termIdf;
     }
     return Explanation.match(idf, "idf(), sum of:", subs);
@@ -623,10 +623,10 @@ public abstract class TFIDFSimilarity ex
    * and smaller values for common terms.
    *
    * @param docFreq the number of documents which contain the term
-   * @param numDocs the total number of documents in the collection
+   * @param docCount the number of documents that have at least one term for the field (or maxDoc when that statistic is unavailable)
    * @return a score factor based on the term's document frequency
    */
-  public abstract float idf(long docFreq, long numDocs);
+  public abstract float idf(long docFreq, long docCount);
 
   /**
    * Compute an index-time normalization value for this field instance.

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java Thu Aug 13 17:37:15 2015
@@ -118,7 +118,7 @@ public class TestMaxTermFrequency extend
     @Override public float coord(int overlap, int maxOverlap) { return 0; }
     @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
     @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long numDocs) { return 0; }
+    @Override public float idf(long docFreq, long docCount) { return 0; }
     @Override public float sloppyFreq(int distance) { return 0; }
     @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
   }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNorms.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNorms.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestNorms.java Thu Aug 13 17:37:15 2015
@@ -67,7 +67,7 @@ public class TestNorms extends LuceneTes
     @Override public float coord(int overlap, int maxOverlap) { return 0; }
     @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
     @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long numDocs) { return 0; }
+    @Override public float idf(long docFreq, long docCount) { return 0; }
     @Override public float sloppyFreq(int distance) { return 0; }
     @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
   }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java Thu Aug 13 17:37:15 2015
@@ -54,7 +54,7 @@ public class TestOmitTf extends LuceneTe
     @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long docCount) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
       return Explanation.match(1.0f, "Inexplicable");
     }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java Thu Aug 13 17:37:15 2015
@@ -80,7 +80,7 @@ public class TestDisjunctionMaxQuery ext
     }
     
     @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
       return 1.0f;
     }
   }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java Thu Aug 13 17:37:15 2015
@@ -46,7 +46,7 @@ public class TestSimilarity extends Luce
     @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long docCount) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
       return Explanation.match(1.0f, "Inexplicable"); 
     }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java Thu Aug 13 17:37:15 2015
@@ -142,7 +142,7 @@ public class TestSimilarityProvider exte
     }
 
     @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
       return 1f;
     }
 
@@ -190,7 +190,7 @@ public class TestSimilarityProvider exte
     }
 
     @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
       return 10f;
     }
 

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java Thu Aug 13 17:37:15 2015
@@ -288,7 +288,7 @@ public class TestPayloadTermQuery extend
     }
 
     @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
       return 1;
     }
 

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java Thu Aug 13 17:37:15 2015
@@ -38,6 +38,7 @@ import org.apache.lucene.search.spans.Sp
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
 
 /**
  * Tests against all the similarities we have
@@ -159,6 +160,48 @@ public class TestSimilarity2 extends Luc
     ir.close();
     dir.close();
   }
+  
+  /** make sure scores are not skewed by docs not containing the field */
+  public void testNoFieldSkew() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(newTextField("foo", "bar baz somethingelse", Field.Store.NO));
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    IndexSearcher is = newSearcher(ir);
+    
+    BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
+    queryBuilder.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
+    queryBuilder.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
+    Query query = queryBuilder.build();
+    
+    // collect scores
+    List<Float> scores = new ArrayList<>();
+    for (Similarity sim : sims) {
+      is.setSimilarity(sim);
+      scores.add(is.explain(query, 0).getValue());
+    }
+    ir.close();
+    
+    // add some additional docs without the field
+    int numExtraDocs = TestUtil.nextInt(random(), 1, 1000);
+    for (int i = 0; i < numExtraDocs; i++) {
+      iw.addDocument(new Document());
+    }
+    
+    // check scores are the same
+    ir = iw.getReader();
+    is = newSearcher(ir);
+    for (int i = 0; i < sims.size(); i++) {
+      is.setSimilarity(sims.get(i));
+      assertEquals(scores.get(i).floatValue(), is.explain(query, 0).getValue(), 0F);
+    }
+    
+    iw.close();
+    ir.close();
+    dir.close();
+  }
   
   /** make sure all sims work if TF is omitted */
   public void testOmitTF() throws Exception {

Modified: lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java (original)
+++ lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java Thu Aug 13 17:37:15 2015
@@ -203,10 +203,10 @@ class PreciseDefaultSimilarity extends T
     return 1;
   }
 
-  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
   @Override
-  public float idf(long docFreq, long numDocs) {
-    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  public float idf(long docFreq, long docCount) {
+    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
   }
 
   /**

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java?rev=1695744&r1=1695743&r2=1695744&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java Thu Aug 13 17:37:15 2015
@@ -566,8 +566,8 @@ public class TestExtendedDismaxParser ex
   }
   
   public void testAliasingBoost() throws Exception {
-    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.5"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
-    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.5"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
+    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.1"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
+    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.1"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
   }
   
   public void testCyclicAliasing() throws Exception {