You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/09/09 05:16:16 UTC

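svn commit: r1701895 - in /lucene/dev/trunk: lucene/ lucene/classification/src/test/org/apache/lucene/classification/ lucene/core/src/java/org/apache/lucene/search/similarities/ lucene/core/src/test/org/apache/lucene/search/ lucene/core/src/test/org/apache/lucene/search/similarities/ lucene/core/src/test/org/apache/lucene/search/spans/ lucene/queries/src/test/org/apache/lucene/queries/function/ solr/core/src/test/org/apache/solr/ solr/core/src/test/org/apache/solr/handler/component/ solr/core/src/test/org/apache/solr/search/stats/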

Author: rmuir
Date: Wed Sep  9 03:16:15 2015
New Revision: 1701895

URL: http://svn.apache.org/r1701895
Log:
LUCENE-6758: don't let queries over nonexistent fields screw up querynorm

Removed:
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestDefaultSimilarity.java
    lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestValueSources.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/stats/TestDefaultStatsCache.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Sep  9 03:16:15 2015
@@ -28,6 +28,9 @@ New Features
   length computations, to avoid skew from documents that don't have the field.
   (Ahmet Arslan via Robert Muir)
 
+* LUCENE-6758: Use docCount+1 for DefaultSimilarity's IDF, so that queries 
+  containing nonexistent fields won't screw up querynorm. (Terry Smith, Robert Muir)
+
 * SOLR-7876: The QueryTimeout interface now has a isTimeoutEnabled method
   that can return false to exit from ExitableDirectoryReader wrapping at
   the point fields() is called. (yonik)

Modified: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (original)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Wed Sep  9 03:16:15 2015
@@ -25,6 +25,7 @@ import org.apache.lucene.classification.
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.search.similarities.LMDirichletSimilarity;
 import org.apache.lucene.util.BytesRef;
 import org.junit.Test;
@@ -42,7 +43,7 @@ public class KNearestNeighborClassifierT
       leafReader = getSampleIndex(analyzer);
       checkCorrectClassification(new KNearestNeighborClassifier(leafReader, null, analyzer, null, 1, 0, 0, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
       checkCorrectClassification(new KNearestNeighborClassifier(leafReader, new LMDirichletSimilarity(), analyzer, null, 1, 0, 0, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
-      ClassificationResult<BytesRef> resultDS =  checkCorrectClassification(new KNearestNeighborClassifier(leafReader, null, analyzer, null, 3, 2, 1, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
+      ClassificationResult<BytesRef> resultDS =  checkCorrectClassification(new KNearestNeighborClassifier(leafReader, new BM25Similarity(), analyzer, null, 3, 2, 1, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
       ClassificationResult<BytesRef> resultLMS =  checkCorrectClassification(new KNearestNeighborClassifier(leafReader, new LMDirichletSimilarity(), analyzer, null, 3, 2, 1, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
       assertTrue(resultDS.getScore() != resultLMS.getScore());
     } finally {

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java Wed Sep  9 03:16:15 2015
@@ -133,10 +133,10 @@ public class DefaultSimilarity extends T
     return 1;
   }
 
-  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
+  /** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
   @Override
   public float idf(long docFreq, long docCount) {
-    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
+    return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
   }
     
   /** 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Wed Sep  9 03:16:15 2015
@@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
  *          </td>
  *          <td valign="middle" align="center">
  *            <table summary="inverse document frequency computation">
- *               <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
+ *               <tr><td align="center" style="text-align: center"><small>docCount+1</small></td></tr>
  *               <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
  *               <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
  *            </table>

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java Wed Sep  9 03:16:15 2015
@@ -330,11 +330,11 @@ public class TestPhraseQuery extends Luc
     assertEquals(3, hits.length);
     // Make sure that those matches where the terms appear closer to
     // each other get a higher score:
-    assertEquals(0.71, hits[0].score, 0.01);
+    assertEquals(1.0, hits[0].score, 0.01);
     assertEquals(0, hits[0].doc);
-    assertEquals(0.44, hits[1].score, 0.01);
+    assertEquals(0.62, hits[1].score, 0.01);
     assertEquals(1, hits[1].doc);
-    assertEquals(0.31, hits[2].score, 0.01);
+    assertEquals(0.43, hits[2].score, 0.01);
     assertEquals(2, hits[2].doc);
     QueryUtils.check(random(), query,searcher);
     reader.close();

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java Wed Sep  9 03:16:15 2015
@@ -121,17 +121,6 @@ public class TestTermScorer extends Luce
     // The scores should be the same
     assertTrue(doc0.score + " does not equal: " + doc5.score,
         doc0.score == doc5.score);
-    /*
-     * Score should be (based on Default Sim.: All floats are approximate tf = 1
-     * numDocs = 6 docFreq(all) = 2 idf = ln(6/3) + 1 = 1.693147 idf ^ 2 =
-     * 2.8667 boost = 1 lengthNorm = 1 //there is 1 term in every document coord
-     * = 1 sumOfSquaredWeights = (idf * boost) ^ 2 = 1.693147 ^ 2 = 2.8667
-     * queryNorm = 1 / (sumOfSquaredWeights)^0.5 = 1 /(1.693147) = 0.590
-     * 
-     * score = 1 * 2.8667 * 1 * 1 * 0.590 = 1.69
-     */
-    assertTrue(doc0.score + " does not equal: " + 1.6931472f,
-        doc0.score == 1.6931472f);
   }
   
   public void testNext() throws Exception {
@@ -145,10 +134,8 @@ public class TestTermScorer extends Luce
     Scorer ts = weight.scorer(context);
     assertTrue("next did not return a doc",
         ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
-    assertTrue("score is not correct", ts.score() == 1.6931472f);
     assertTrue("next did not return a doc",
         ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
-    assertTrue("score is not correct", ts.score() == 1.6931472f);
     assertTrue("next returned a doc and it should not have",
         ts.nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
   }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestDefaultSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestDefaultSimilarity.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestDefaultSimilarity.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/similarities/TestDefaultSimilarity.java Wed Sep  9 03:16:15 2015
@@ -17,14 +17,145 @@ package org.apache.lucene.search.similar
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 
 public class TestDefaultSimilarity extends LuceneTestCase {
+  private Directory directory;
+  private IndexReader indexReader;
+  private IndexSearcher indexSearcher;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig())) {
+      Document document = new Document();
+      document.add(new StringField("test", "hit", Store.NO));
+      indexWriter.addDocument(document);
+      indexWriter.commit();
+    }
+    indexReader = DirectoryReader.open(directory);
+    indexSearcher = newSearcher(indexReader);
+    indexSearcher.setSimilarity(new DefaultSimilarity());
+  }
 
+  @Override
+  public void tearDown() throws Exception {
+    IOUtils.close(indexReader, directory);
+    super.tearDown();
+  }
+  
   // Javadocs give this as an example so we test to make sure it's correct:
   public void testPrecisionLoss() throws Exception {
     DefaultSimilarity sim = new DefaultSimilarity();
     float v = sim.decodeNormValue(sim.encodeNormValue(.89f));
     assertEquals(0.875f, v, 0.0001f);
   }
+
+
+  public void testHit() throws IOException {
+    Query query = new TermQuery(new Term("test", "hit"));
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testMiss() throws IOException {
+    Query query = new TermQuery(new Term("test", "miss"));
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(0, topDocs.totalHits);
+  }
+
+  public void testEmpty() throws IOException {
+    Query query = new TermQuery(new Term("empty", "miss"));
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(0, topDocs.totalHits);
+  }
+
+  public void testBQHit() throws IOException {
+    Query query = new BooleanQuery.Builder()
+      .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
+      .build();
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testBQHitOrMiss() throws IOException {
+    Query query = new BooleanQuery.Builder()
+      .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
+      .add(new TermQuery(new Term("test", "miss")), Occur.SHOULD)
+      .build();
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testBQHitOrEmpty() throws IOException {
+    Query query = new BooleanQuery.Builder()
+      .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
+      .add(new TermQuery(new Term("empty", "miss")), Occur.SHOULD)
+      .build();
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testDMQHit() throws IOException {
+    Query query = new DisjunctionMaxQuery(
+      Arrays.asList(
+        new TermQuery(new Term("test", "hit"))),
+      0);
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testDMQHitOrMiss() throws IOException {
+    Query query = new DisjunctionMaxQuery(
+      Arrays.asList(
+        new TermQuery(new Term("test", "hit")),
+        new TermQuery(new Term("test", "miss"))),
+      0);
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
+
+  public void testDMQHitOrEmpty() throws IOException {
+    Query query = new DisjunctionMaxQuery(
+      Arrays.asList(
+        new TermQuery(new Term("test", "hit")),
+        new TermQuery(new Term("empty", "miss"))),
+      0);
+    TopDocs topDocs = indexSearcher.search(query, 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1, topDocs.scoreDocs.length);
+    assertTrue(topDocs.scoreDocs[0].score != 0);
+  }
 }

Modified: lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestValueSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestValueSources.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestValueSources.java (original)
+++ lucene/dev/trunk/lucene/queries/src/test/org/apache/lucene/queries/function/TestValueSources.java Wed Sep  9 03:16:15 2015
@@ -226,7 +226,7 @@ public class TestValueSources extends Lu
     try {
       searcher.setSimilarity(new DefaultSimilarity());
       ValueSource vs = new IDFValueSource("bogus", "bogus", "text", new BytesRef("test"));
-      assertHits(new FunctionQuery(vs), new float[] { 0.5945349f, 0.5945349f });
+      assertHits(new FunctionQuery(vs), new float[] { 1.0f, 1.0f });
       assertAllExist(vs);
     } finally {
       searcher.setSimilarity(saved);
@@ -398,7 +398,7 @@ public class TestValueSources extends Lu
       searcher.setSimilarity(new DefaultSimilarity());
       
       ValueSource vs = new QueryValueSource(new TermQuery(new Term("string","bar")), 42F);
-      assertHits(new FunctionQuery(vs), new float[] { 42F, 1F });
+      assertHits(new FunctionQuery(vs), new float[] { 42F, 1.4054651F });
 
       // valuesource should exist only for things matching the term query
       // sanity check via quick & dirty wrapper arround tf

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java Wed Sep  9 03:16:15 2015
@@ -87,8 +87,8 @@ public class DisMaxRequestHandlerTest ex
             req("cool stuff")
             ,"//*[@numFound='3']"
             ,"//result/doc[1]/int[@name='id'][.='42']"
-            ,"//result/doc[2]/int[@name='id'][.='8675309']"
-            ,"//result/doc[3]/int[@name='id'][.='666']"
+            ,"//result/doc[2]/int[@name='id'][.='666']"
+            ,"//result/doc[3]/int[@name='id'][.='8675309']"
             );
 
     assertQ("multi qf",

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java Wed Sep  9 03:16:15 2015
@@ -1014,15 +1014,15 @@ public class StatsComponentTest extends
     assertQ("functions over a query",
             req("q","*:*", "stats", "true",
                 "stats.field", "{!lucene key=k}foo_t:cow")
-            // scores are: 1.0, 0.625, 0.5, & "missing"
-            , kpre + "double[@name='min'][.='0.5']"
-            , kpre + "double[@name='max'][.='1.0']"
-            , kpre + "double[@name='sum'][.='2.125']"
+            // TODO: change to not rely on exact scores
+            , kpre + "double[@name='min'][.='0.6115717887878418']"
+            , kpre + "double[@name='max'][.='1.2231435775756836']"
+            , kpre + "double[@name='sum'][.='2.5991801023483276']"
             , kpre + "long[@name='count'][.='3']"
             , kpre + "long[@name='missing'][.='1']"
-            , kpre + "double[@name='sumOfSquares'][.='1.640625']"
-            , kpre + "double[@name='mean'][.='0.7083333333333334']"
-            , kpre + "double[@name='stddev'][.='0.2602082499332666']"
+            , kpre + "double[@name='sumOfSquares'][.='2.4545065967701163']"
+            , kpre + "double[@name='mean'][.='0.8663933674494425']"
+            , kpre + "double[@name='stddev'][.='0.3182720497380833']"
             );
     
   }

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/stats/TestDefaultStatsCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/stats/TestDefaultStatsCache.java?rev=1701895&r1=1701894&r2=1701895&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/stats/TestDefaultStatsCache.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/stats/TestDefaultStatsCache.java Wed Sep  9 03:16:15 2015
@@ -79,10 +79,6 @@ public class TestDefaultStatsCache exten
     if (clients.size() == 1) {
       // only one shard
       assertEquals(controlScore, shardScore);
-    } else {
-      assertTrue("control:" + controlScore.floatValue() + " shard:"
-          + shardScore.floatValue(),
-          controlScore.floatValue() > shardScore.floatValue());
     }
   }