You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2020/11/02 15:48:27 UTC

[lucene-solr] branch branch_8x updated (50e68ee -> 9bd2859)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a change to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from 50e68ee  SOLR-14865: 'Index Merge Metrics' documentation correction (#1870)
     new 169ce0c  LUCENE-9536: Optimize OrdinalMap when one segment contains all distinct values. (#1948)
     new 9bd2859  LUCENE-9536: CHANGES entry.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 lucene/CHANGES.txt                                 |  4 +-
 .../java/org/apache/lucene/index/OrdinalMap.java   | 36 +++++++++----
 .../org/apache/lucene/index/TestOrdinalMap.java    | 59 ++++++++++++++++++++--
 3 files changed, 82 insertions(+), 17 deletions(-)


[lucene-solr] 01/02: LUCENE-9536: Optimize OrdinalMap when one segment contains all distinct values. (#1948)

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 169ce0c9581f023fed4c8f2cf7aa25af8d0b192c
Author: Julie Tibshirani <ju...@elastic.co>
AuthorDate: Mon Nov 2 07:40:16 2020 -0800

    LUCENE-9536: Optimize OrdinalMap when one segment contains all distinct values. (#1948)
    
    For doc values that are not too high cardinality, it is common for some large
    segments to contain all distinct values. In this case, we can check if the first
    segment ords map perfectly to global ords, and if so store the global ord deltas
    and first segment indices as `LongValues.ZEROES`
    to save some space.
---
 .../java/org/apache/lucene/index/OrdinalMap.java   | 36 +++++++++----
 .../org/apache/lucene/index/TestOrdinalMap.java    | 59 ++++++++++++++++++++--
 2 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java b/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
index bbb643f..1e73f1c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
+++ b/lucene/core/src/java/org/apache/lucene/index/OrdinalMap.java
@@ -172,10 +172,12 @@ public class OrdinalMap implements Accountable {
 
   /** Cache key of whoever asked for this awful thing */
   public final IndexReader.CacheKey owner;
+  // number of global ordinals
+  final long valueCount;
   // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the the ordinal in the first segment that contains this term
-  final PackedLongValues globalOrdDeltas;
+  final LongValues globalOrdDeltas;
   // globalOrd -> first segment container
-  final PackedLongValues firstSegments;
+  final LongValues firstSegments;
   // for every segment, segmentOrd -> globalOrd
   final LongValues segmentToGlobalOrds[];
   // the map from/to segment ids
@@ -271,13 +273,25 @@ public class OrdinalMap implements Accountable {
       globalOrd++;
     }
 
-    this.firstSegments = firstSegments.build();
-    this.globalOrdDeltas = globalOrdDeltas.build();
+    long ramBytesUsed = BASE_RAM_BYTES_USED + segmentMap.ramBytesUsed();
+    this.valueCount = globalOrd;
+
+    // If the first segment contains all of the global ords, then we can apply a small optimization
+    // and hardcode the first segment indices and global ord deltas as all zeroes.
+    if (ordDeltaBits.length > 0 && ordDeltaBits[0] == 0L && ordDeltas[0].size() == this.valueCount) {
+      this.firstSegments = LongValues.ZEROES;
+      this.globalOrdDeltas = LongValues.ZEROES;
+    } else {
+      PackedLongValues packedFirstSegments = firstSegments.build();
+      PackedLongValues packedGlobalOrdDeltas = globalOrdDeltas.build();
+      this.firstSegments = packedFirstSegments;
+      this.globalOrdDeltas = packedGlobalOrdDeltas;
+      ramBytesUsed += packedFirstSegments.ramBytesUsed() + packedGlobalOrdDeltas.ramBytesUsed();
+    }
+
     // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
     segmentToGlobalOrds = new LongValues[subs.length];
-    long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
-      + this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
-      + segmentMap.ramBytesUsed();
+    ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds);
     for (int i = 0; i < ordDeltas.length; ++i) {
       final PackedLongValues deltas = ordDeltas[i].build();
       if (ordDeltaBits[i] == 0L) {
@@ -317,6 +331,7 @@ public class OrdinalMap implements Accountable {
         ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
       }
     }
+
     this.ramBytesUsed = ramBytesUsed;
   }
 
@@ -348,7 +363,7 @@ public class OrdinalMap implements Accountable {
    * Returns the total number of unique terms in global ord space.
    */
   public long getValueCount() {
-    return globalOrdDeltas.size();
+    return valueCount;
   }
 
   @Override
@@ -359,10 +374,9 @@ public class OrdinalMap implements Accountable {
   @Override
   public Collection<Accountable> getChildResources() {
     List<Accountable> resources = new ArrayList<>();
-    resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
-    resources.add(Accountables.namedAccountable("first segments", firstSegments));
     resources.add(Accountables.namedAccountable("segment map", segmentMap));
-    // TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
+    // TODO: would be nice to return the ordinal and segment maps too, but it's not straightforward
+    //  because of optimizations.
     return resources;
   }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java b/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
index 2b2ec3d..e7b2777 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestOrdinalMap.java
@@ -17,10 +17,6 @@
 package org.apache.lucene.index;
 
 
-import java.io.IOException;
-import java.lang.reflect.Field;
-import java.util.HashMap;
-
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.SortedDocValuesField;
@@ -32,6 +28,10 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.RamUsageTester;
 import org.apache.lucene.util.TestUtil;
 
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.HashMap;
+
 public class TestOrdinalMap extends LuceneTestCase {
 
   private static final Field ORDINAL_MAP_OWNER_FIELD;
@@ -46,7 +46,7 @@ public class TestOrdinalMap extends LuceneTestCase {
   private static final RamUsageTester.Accumulator ORDINAL_MAP_ACCUMULATOR = new RamUsageTester.Accumulator() {
 
     public long accumulateObject(Object o, long shallowSize, java.util.Map<Field,Object> fieldValues, java.util.Collection<Object> queue) {
-      if (o == LongValues.IDENTITY) {
+      if (o == LongValues.ZEROES || o == LongValues.IDENTITY) {
         return 0L;
       }
       if (o instanceof OrdinalMap) {
@@ -95,4 +95,53 @@ public class TestOrdinalMap extends LuceneTestCase {
     dir.close();
   }
 
+  /**
+   * Tests the case where one segment contains all of the global ords. In this case, we apply a
+   * small optimization and hardcode the first segment indices and global ord deltas as all zeroes.
+   */
+  public void testOneSegmentWithAllValues() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random())).setCodec(
+            TestUtil.alwaysDocValuesFormat(TestUtil.getDefaultDocValuesFormat()));
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, cfg);
+
+    int numTerms = 1000;
+    for (int i = 0; i < numTerms; ++i) {
+      Document d = new Document();
+      String term = String.valueOf(i);
+      d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
+      iw.addDocument(d);
+    }
+    iw.forceMerge(1);
+
+    for (int i = 0; i < 10; ++i) {
+      Document d = new Document();
+      String term = String.valueOf(random().nextInt(numTerms));
+      d.add(new SortedDocValuesField("sdv", new BytesRef(term)));
+      iw.addDocument(d);
+    }
+    iw.commit();
+
+    DirectoryReader r = iw.getReader();
+    SortedDocValues sdv = MultiDocValues.getSortedValues(r, "sdv");
+    assertNotNull(sdv);
+    assertTrue(sdv instanceof MultiDocValues.MultiSortedDocValues);
+
+    // Check that the optimization kicks in.
+    OrdinalMap map = ((MultiDocValues.MultiSortedDocValues) sdv).mapping;
+    assertEquals(LongValues.ZEROES, map.firstSegments);
+    assertEquals(LongValues.ZEROES, map.globalOrdDeltas);
+
+    // Check the map's basic behavior.
+    assertEquals(numTerms, (int) map.getValueCount());
+    for (int i = 0; i < numTerms; i++) {
+      assertEquals(0, map.getFirstSegmentNumber(i));
+      assertEquals(i, map.getFirstSegmentOrd(i));
+    }
+
+    iw.close();
+    r.close();
+    dir.close();
+  }
+
 }


[lucene-solr] 02/02: LUCENE-9536: CHANGES entry.

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 9bd28598d62a380504d76096bcfd40bc0c92d5d4
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Nov 2 16:46:45 2020 +0100

    LUCENE-9536: CHANGES entry.
---
 lucene/CHANGES.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2919f45..85e4176 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -21,7 +21,9 @@ Improvements
 
 Optimizations
 ---------------------
-(No changes)
+
+* LUCENE-9536: Reduced memory usage for OrdinalMap when a segment has all
+  values. (Julie Tibshirani via Adrien Grand)
 
 Bug Fixes
 ---------------------