You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/11/30 19:21:48 UTC

[lucene] branch main updated: LUCENE-10272: cross-check norms with postings in checkindex (#493)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 46a5a57  LUCENE-10272: cross-check norms with postings in checkindex (#493)
46a5a57 is described below

commit 46a5a57724519f349728a7aa613d0b1fe77a8c14
Author: Robert Muir <rm...@apache.org>
AuthorDate: Tue Nov 30 14:21:40 2021 -0500

    LUCENE-10272: cross-check norms with postings in checkindex (#493)
    
    Previously, CheckIndex iterated over the norms and validated each one, but if norms that should have been present were missing, nothing failed. Now it computes the expected count of norms from the postings and ensures that it saw them all.
---
 .../java/org/apache/lucene/index/CheckIndex.java   | 36 ++++++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 285bed2..64bc28c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -2115,6 +2115,8 @@ public final class CheckIndex implements Closeable {
 
         if (fieldInfo.hasNorms() && isVectors == false) {
           final NumericDocValues norms = normsProducer.getNorms(fieldInfo);
+          // count of valid norm values found for the field
+          int actualCount = 0;
           // Cross-check terms with norms
           for (int doc = norms.nextDoc();
               doc != DocIdSetIterator.NO_MORE_DOCS;
@@ -2126,12 +2128,15 @@ public final class CheckIndex implements Closeable {
               continue;
             }
             final long norm = norms.longValue();
-            if (norm != 0 && visitedDocs.get(doc) == false) {
-              throw new CheckIndexException(
-                  "Document "
-                      + doc
-                      + " doesn't have terms according to postings but has a norm value that is not zero: "
-                      + Long.toUnsignedString(norm));
+            if (norm != 0) {
+              actualCount++;
+              if (visitedDocs.get(doc) == false) {
+                throw new CheckIndexException(
+                    "Document "
+                        + doc
+                        + " doesn't have terms according to postings but has a norm value that is not zero: "
+                        + Long.toUnsignedString(norm));
+              }
             } else if (norm == 0 && visitedDocs.get(doc)) {
               throw new CheckIndexException(
                   "Document "
@@ -2139,6 +2144,25 @@ public final class CheckIndex implements Closeable {
                       + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms");
             }
           }
+          int expectedCount = 0;
+          for (int doc = visitedDocs.nextSetBit(0);
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc =
+                  doc + 1 >= visitedDocs.length()
+                      ? DocIdSetIterator.NO_MORE_DOCS
+                      : visitedDocs.nextSetBit(doc + 1)) {
+            if (liveDocs != null && liveDocs.get(doc) == false) {
+              // Norms may only be out of sync with terms on deleted documents.
+              // This happens when a document fails indexing and in that case it
+              // should be immediately marked as deleted by the IndexWriter.
+              continue;
+            }
+            expectedCount++;
+          }
+          if (expectedCount != actualCount) {
+            throw new CheckIndexException(
+                "actual norm count: " + actualCount + " but expected: " + expectedCount);
+          }
         }
 
         // Test seek to last term: