You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/11/04 16:58:26 UTC
[GitHub] [lucene] mikemccand commented on a change in pull request #420: [DRAFT] LUCENE-10122 Explore using NumericDocValue to store taxonomy parent array

mikemccand commented on a change in pull request #420:
URL: https://github.com/apache/lucene/pull/420#discussion_r743030399



##########
File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java
##########
@@ -129,39 +124,19 @@ private void initParents(IndexReader reader, int first) throws IOException {
     if (reader.maxDoc() == first) {
       return;
     }
-
-    // it's ok to use MultiTerms because we only iterate on one posting list.
-    // breaking it to loop over the leaves() only complicates code for no
-    // apparent gain.
-    PostingsEnum positions =
-        MultiTerms.getTermPostingsEnum(
-            reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
-
-    // shouldn't really happen, if it does, something's wrong
-    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
-      throw new CorruptIndexException(
-          "Missing parent data for category " + first, reader.toString());
-    }
-
-    int num = reader.maxDoc();
-    for (int i = first; i < num; i++) {
-      if (positions.docID() == i) {
-        if (positions.freq() == 0) { // shouldn't happen
-          throw new CorruptIndexException(
-              "Missing parent data for category " + i, reader.toString());
-        }
-
-        parents[i] = positions.nextPosition();
-
-        if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-          if (i + 1 < num) {
-            throw new CorruptIndexException(
-                "Missing parent data for category " + (i + 1), reader.toString());
-          }
-          break;
+    for (LeafReaderContext leafContext: reader.leaves()) {
+      int leafDocNum = leafContext.reader().maxDoc();
+      if (leafContext.docBase + leafDocNum <= first) {
+        // skip this leaf if it does not contain new categories
+        continue;
+      }
+      NumericDocValues parentValues = leafContext.reader().getNumericDocValues(Consts.FIELD_PAYLOADS);
+      for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
+        if (parentValues.advanceExact(doc) == false) {
+          throw new CorruptIndexException("Missing parent data for category " + doc + leafContext.docBase, reader.toString());
         }
-      } else { // this shouldn't happen
-        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
+        // we're putting a int and converting it back so it should be safe
+        parents[doc + leafContext.docBase] = (int) parentValues.longValue();

Review comment:
       Maybe use `Math.toIntExact` for paranoia?

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java
##########
@@ -129,39 +124,19 @@ private void initParents(IndexReader reader, int first) throws IOException {
     if (reader.maxDoc() == first) {
       return;
     }
-
-    // it's ok to use MultiTerms because we only iterate on one posting list.
-    // breaking it to loop over the leaves() only complicates code for no
-    // apparent gain.
-    PostingsEnum positions =
-        MultiTerms.getTermPostingsEnum(
-            reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
-
-    // shouldn't really happen, if it does, something's wrong
-    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
-      throw new CorruptIndexException(
-          "Missing parent data for category " + first, reader.toString());
-    }
-
-    int num = reader.maxDoc();
-    for (int i = first; i < num; i++) {
-      if (positions.docID() == i) {
-        if (positions.freq() == 0) { // shouldn't happen
-          throw new CorruptIndexException(
-              "Missing parent data for category " + i, reader.toString());
-        }
-
-        parents[i] = positions.nextPosition();
-
-        if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-          if (i + 1 < num) {
-            throw new CorruptIndexException(
-                "Missing parent data for category " + (i + 1), reader.toString());
-          }
-          break;
+    for (LeafReaderContext leafContext: reader.leaves()) {
+      int leafDocNum = leafContext.reader().maxDoc();
+      if (leafContext.docBase + leafDocNum <= first) {
+        // skip this leaf if it does not contain new categories
+        continue;
+      }
+      NumericDocValues parentValues = leafContext.reader().getNumericDocValues(Consts.FIELD_PAYLOADS);
+      for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
+        if (parentValues.advanceExact(doc) == false) {
+          throw new CorruptIndexException("Missing parent data for category " + doc + leafContext.docBase, reader.toString());
         }
-      } else { // this shouldn't happen
-        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
+        // we're putting a int and converting it back so it should be safe

Review comment:
       s/`a int`/`an int`

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
##########
@@ -466,18 +463,8 @@ protected final void ensureOpen() {
    * effectively synchronized as well.
    */
   private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
-    // Before Lucene 2.9, position increments >=0 were supported, so we
-    // added 1 to parent to allow the parent -1 (the parent of the root).
-    // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is

Review comment:
       Phew, I'm so happy we can remove this comment!  That issue was a hellacious back-compat nightmare.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org