You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2019/01/09 04:11:55 UTC

[GitHub] Ben-Zvi closed pull request #1598: DRILL-6880: For Hash-Join hash-table build - treat null keys as an equal match

Ben-Zvi closed pull request #1598: DRILL-6880: For Hash-Join hash-table build - treat null keys as an equal match 
URL: https://github.com/apache/drill/pull/1598
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/ChainedHashTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/ChainedHashTable.java
index dcdac954f24..e7abd980a45 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/ChainedHashTable.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/ChainedHashTable.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.util.List;
 
+import com.sun.codemodel.JExpression;
 import org.apache.drill.common.expression.ErrorCollector;
 import org.apache.drill.common.expression.ErrorCollectorImpl;
 import org.apache.drill.common.expression.LogicalExpression;
@@ -63,6 +64,10 @@
       GeneratorMapping.create("setupInterior" /* setup method */, "isKeyMatchInternalBuild" /* eval method */,
           null /* reset */, null /* cleanup */);
 
+  private static final GeneratorMapping BOTH_KEYS_NULL =
+    GeneratorMapping.create("setupInterior" /* setup method */, "areBothKeysNull" /* eval method */,
+      null /* reset */, null /* cleanup */);
+
   private static final GeneratorMapping KEY_MATCH_PROBE =
       GeneratorMapping.create("setupInterior" /* setup method */, "isKeyMatchInternalProbe" /* eval method */,
           null /* reset */, null /* cleanup */);
@@ -95,10 +100,14 @@
 
   private final MappingSet KeyMatchIncomingBuildMapping =
       new MappingSet("incomingRowIdx", null, "incomingBuild", null, SETUP_INTERIOR_CONSTANT, KEY_MATCH_BUILD);
+  private final MappingSet bothKeysNullIncomingBuildMapping =
+    new MappingSet("incomingRowIdx", null, "incomingBuild", null, SETUP_INTERIOR_CONSTANT, BOTH_KEYS_NULL);
   private final MappingSet KeyMatchIncomingProbeMapping =
       new MappingSet("incomingRowIdx", null, "incomingProbe", null, SETUP_INTERIOR_CONSTANT, KEY_MATCH_PROBE);
   private final MappingSet KeyMatchHtableMapping =
       new MappingSet("htRowIdx", null, "htContainer", null, SETUP_INTERIOR_CONSTANT, KEY_MATCH_BUILD);
+  private final MappingSet bothKeysNullHtableMapping =
+    new MappingSet("htRowIdx", null, "htContainer", null, SETUP_INTERIOR_CONSTANT, BOTH_KEYS_NULL);
   private final MappingSet KeyMatchHtableProbeMapping =
       new MappingSet("htRowIdx", null, "htContainer", null, SETUP_INTERIOR_CONSTANT, KEY_MATCH_PROBE);
   private final MappingSet GetHashIncomingBuildMapping =
@@ -121,6 +130,8 @@
   private RecordBatch incomingProbe;
   private final RecordBatch outgoing;
 
+  private enum SetupWork {DO_BUILD, DO_PROBE, CHECK_BOTH_NULLS};
+
   public ChainedHashTable(HashTableConfig htConfig, FragmentContext context, BufferAllocator allocator,
                           RecordBatch incomingBuild, RecordBatch incomingProbe, RecordBatch outgoing) {
 
@@ -208,12 +219,16 @@ public HashTable createAndSetupHashTable(TypedFieldId[] outKeyFieldIds) throws C
       i++;
     }
 
-
+    // Only in case of a join: Generate a special method to check if both the new key and the existing key (in this HT bucket) are nulls
+    // (used by Hash-Join to avoid creating a long hash-table chain of null keys, which can lead to useless O(n^2) work on that chain.)
+    // The logic is: Nulls match on build, and don't match on probe. Note that this logic covers outer joins as well.
+    setupIsKeyMatchInternal(cgInner, bothKeysNullIncomingBuildMapping, bothKeysNullHtableMapping, keyExprsBuild,
+        htConfig.getComparators(), htKeyFieldIds, SetupWork.CHECK_BOTH_NULLS);
     // generate code for isKeyMatch(), setValue(), getHash() and outputRecordKeys()
     setupIsKeyMatchInternal(cgInner, KeyMatchIncomingBuildMapping, KeyMatchHtableMapping, keyExprsBuild,
-        htConfig.getComparators(), htKeyFieldIds);
+        htConfig.getComparators(), htKeyFieldIds, SetupWork.DO_BUILD);
     setupIsKeyMatchInternal(cgInner, KeyMatchIncomingProbeMapping, KeyMatchHtableProbeMapping, keyExprsProbe,
-        htConfig.getComparators(), htKeyFieldIds);
+        htConfig.getComparators(), htKeyFieldIds, SetupWork.DO_PROBE);
 
     setupSetValue(cgInner, keyExprsBuild, htKeyFieldIds);
     if (outgoing != null) {
@@ -224,8 +239,8 @@ public HashTable createAndSetupHashTable(TypedFieldId[] outKeyFieldIds) throws C
     }
     setupOutputRecordKeys(cgInner, htKeyFieldIds, outKeyFieldIds);
 
-    setupGetHash(cg /* use top level code generator for getHash */, GetHashIncomingBuildMapping, incomingBuild, keyExprsBuild, false);
-    setupGetHash(cg /* use top level code generator for getHash */, GetHashIncomingProbeMapping, incomingProbe, keyExprsProbe, true);
+    setupGetHash(cg /* use top level code generator for getHash */, GetHashIncomingBuildMapping, incomingBuild, keyExprsBuild);
+    setupGetHash(cg /* use top level code generator for getHash */, GetHashIncomingProbeMapping, incomingProbe, keyExprsProbe);
 
     HashTable ht = context.getImplementationClass(top);
     ht.setup(htConfig, allocator, incomingBuild.getContainer(), incomingProbe, outgoing, htContainerOrig, context, cgInner);
@@ -234,10 +249,19 @@ public HashTable createAndSetupHashTable(TypedFieldId[] outKeyFieldIds) throws C
   }
 
   private void setupIsKeyMatchInternal(ClassGenerator<HashTable> cg, MappingSet incomingMapping, MappingSet htableMapping,
-      LogicalExpression[] keyExprs, List<Comparator> comparators, TypedFieldId[] htKeyFieldIds) {
+      LogicalExpression[] keyExprs, List<Comparator> comparators, TypedFieldId[] htKeyFieldIds, SetupWork work) {
+
+    boolean checkIfBothNulls = work == SetupWork.CHECK_BOTH_NULLS;
+
+    // Regular key matching may return false in the middle (i.e., some pair of columns did not match), and true only if all matched;
+    // but "both nulls" check returns the opposite logic (i.e., true when one pair of nulls is found, need check no more)
+    JExpression midPointResult = checkIfBothNulls ? JExpr.TRUE : JExpr.FALSE;
+    JExpression finalResult = checkIfBothNulls ? JExpr.FALSE : JExpr.TRUE;
+
     cg.setMappingSet(incomingMapping);
 
-    if (keyExprs == null || keyExprs.length == 0) {
+    if (keyExprs == null || keyExprs.length == 0 ||
+        checkIfBothNulls && ! comparators.contains(Comparator.EQUALS)) { // e.g. for Hash-Aggr, or non-equi join
       cg.getEvalBlock()._return(JExpr.FALSE);
       return;
     }
@@ -253,28 +277,29 @@ private void setupIsKeyMatchInternal(ClassGenerator<HashTable> cg, MappingSet in
 
       JConditional jc;
 
-      // codegen for nullable columns if nulls are not equal
-      if (comparators.get(i) == Comparator.EQUALS
-          && left.isOptional() && right.isOptional()) {
-        jc = cg.getEvalBlock()._if(left.getIsSet().eq(JExpr.lit(0)).
+      if ( work != SetupWork.DO_BUILD ) {  // BUILD runs this logic in a separate method - areBothKeysNull()
+        // codegen for the special case when both columns are null (i.e., return early with midPointResult)
+        if (comparators.get(i) == Comparator.EQUALS
+            && left.isOptional() && right.isOptional()) {
+          jc = cg.getEvalBlock()._if(left.getIsSet().eq(JExpr.lit(0)).
             cand(right.getIsSet().eq(JExpr.lit(0))));
-        jc._then()._return(JExpr.FALSE);
+          jc._then()._return(midPointResult);
+        }
       }
+      if ( ! checkIfBothNulls ) { // generate comparison code (at least one of the two columns' values is non-null)
+        final LogicalExpression f = FunctionGenerationHelper.getOrderingComparatorNullsHigh(left, right, context.getFunctionRegistry());
 
-      final LogicalExpression f =
-          FunctionGenerationHelper
-          .getOrderingComparatorNullsHigh(left, right, context.getFunctionRegistry());
+        HoldingContainer out = cg.addExpr(f, ClassGenerator.BlkCreateMode.FALSE);
 
-      HoldingContainer out = cg.addExpr(f, ClassGenerator.BlkCreateMode.FALSE);
+        // check if two values are not equal (comparator result != 0)
+        jc = cg.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));
 
-      // check if two values are not equal (comparator result != 0)
-      jc = cg.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));
-
-      jc._then()._return(JExpr.FALSE);
+        jc._then()._return(midPointResult);
+      }
     }
 
-    // All key expressions compared equal, so return TRUE
-    cg.getEvalBlock()._return(JExpr.TRUE);
+    // All key expressions compared the same way, so return the appropriate final result
+    cg.getEvalBlock()._return(finalResult);
   }
 
   private void setupSetValue(ClassGenerator<HashTable> cg, LogicalExpression[] keyExprs,
@@ -304,8 +329,8 @@ private void setupOutputRecordKeys(ClassGenerator<HashTable> cg, TypedFieldId[]
     }
   }
 
-  private void setupGetHash(ClassGenerator<HashTable> cg, MappingSet incomingMapping, VectorAccessible batch, LogicalExpression[] keyExprs,
-                            boolean isProbe) throws SchemaChangeException {
+  private void setupGetHash(ClassGenerator<HashTable> cg, MappingSet incomingMapping, VectorAccessible batch, LogicalExpression[] keyExprs)
+    throws SchemaChangeException {
 
     cg.setMappingSet(incomingMapping);
 
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/HashTableTemplate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/HashTableTemplate.java
index 25ada28ee15..1d8323984bf 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/HashTableTemplate.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/common/HashTableTemplate.java
@@ -87,9 +87,6 @@
   // current available (free) slot globally across all batch holders
   private int freeIndex = 0;
 
-  // Placeholder for the current index while probing the hash table
-  private IndexPointer currentIdxHolder;
-
   private BufferAllocator allocator;
 
   // The incoming build side record batch
@@ -205,32 +202,41 @@ protected void setup() throws SchemaChangeException {
       setupInterior(incomingBuild, incomingProbe, outgoing, htContainer);
     }
 
-    // Check if the key at the currentIdx position in hash table matches the key
-    // at the incomingRowIdx. if the key does not match, update the
-    // currentIdxHolder with the index of the next link.
+    // Check if the key at the current Index position in hash table matches the key
+    // at the incomingRowIdx.
     private boolean isKeyMatch(int incomingRowIdx,
-        IndexPointer currentIdxHolder,
+        int currentIndex,
         boolean isProbe) throws SchemaChangeException {
-      int currentIdxWithinBatch = currentIdxHolder.value & BATCH_MASK;
-      boolean match;
+      int currentIdxWithinBatch = currentIndex & BATCH_MASK;
 
-      if (currentIdxWithinBatch >= batchHolders.get((currentIdxHolder.value >>> 16) & BATCH_MASK).getTargetBatchRowCount()) {
+      if (currentIdxWithinBatch >= batchHolders.get((currentIndex >>> 16) & BATCH_MASK).getTargetBatchRowCount()) {
         logger.debug("Batch size = {}, incomingRowIdx = {}, currentIdxWithinBatch = {}.",
-          batchHolders.get((currentIdxHolder.value >>> 16) & BATCH_MASK).getTargetBatchRowCount(), incomingRowIdx, currentIdxWithinBatch);
+          batchHolders.get((currentIndex >>> 16) & BATCH_MASK).getTargetBatchRowCount(), incomingRowIdx, currentIdxWithinBatch);
       }
-      assert (currentIdxWithinBatch < batchHolders.get((currentIdxHolder.value >>> 16) & BATCH_MASK).getTargetBatchRowCount());
+      assert (currentIdxWithinBatch < batchHolders.get((currentIndex >>> 16) & BATCH_MASK).getTargetBatchRowCount());
       assert (incomingRowIdx < HashTable.BATCH_SIZE);
 
       if (isProbe) {
-        match = isKeyMatchInternalProbe(incomingRowIdx, currentIdxWithinBatch);
-      } else {
-        match = isKeyMatchInternalBuild(incomingRowIdx, currentIdxWithinBatch);
+        return isKeyMatchInternalProbe(incomingRowIdx, currentIdxWithinBatch);
       }
 
-      if (!match) {
-        currentIdxHolder.value = links.getAccessor().get(currentIdxWithinBatch);
-      }
-      return match;
+      // in case of a hash-join build, where both the new incoming key and the current key are null, treat them as
+      // a match; i.e. the new would be added into the helper (but not the Hash-Table !), though it would never
+      // be used (not putting it into the helper would take a bigger code change, and some performance cost, hence
+      // not worth it).  In the past such a new null key was added into the Hash-Table (i.e., no match), which
+      // created long costly chains - SEE DRILL-6880)
+      if ( areBothKeysNull(incomingRowIdx, currentIdxWithinBatch) ) { return true; }
+
+      return isKeyMatchInternalBuild(incomingRowIdx, currentIdxWithinBatch);
+    }
+
+    // This method should only be used in an "iterator like" next() fashion, to traverse a hash table chain looking for a match.
+    // Starting from the first element (i.e., index) in the chain, _isKeyMatch()_ should be called on that element; if "false" is returned,
+    // then this method should be called to return the (index to the) next element in the chain (or an EMPTY_SLOT to terminate), and then
+    // _isKeyMatch()_ should be called on that next element; and so on until a match is found - where the loop is exited with the found result.
+    // (This was not implemented as a real Java iterator as each index may point to another BatchHolder).
+    private int nextLinkInHashChain(int currentIndex) {
+      return links.getAccessor().get(currentIndex & BATCH_MASK);
     }
 
     // Insert a new <key1, key2...keyN> entry coming from the incoming batch into the hash table
@@ -423,6 +429,12 @@ protected boolean isKeyMatchInternalBuild(
       return false;
     }
 
+    @RuntimeOverridden
+    protected boolean areBothKeysNull(
+      @Named("incomingRowIdx") int incomingRowIdx, @Named("htRowIdx") int htRowIdx) throws SchemaChangeException {
+      return false;
+    }
+
     @RuntimeOverridden
     protected boolean isKeyMatchInternalProbe(
         @Named("incomingRowIdx") int incomingRowIdx, @Named("htRowIdx") int htRowIdx) throws SchemaChangeException {
@@ -511,7 +523,6 @@ public void setup(HashTableConfig htConfig, BufferAllocator allocator, VectorCon
       throw new IllegalStateException("Unexpected schema change", e);
     }
 
-    currentIdxHolder = new IndexPointer();
   }
 
   @Override
@@ -661,16 +672,15 @@ public PutStatus put(int incomingRowIdx, IndexPointer htIdxHolder, int hashCode,
 
     // if startIdx is non-empty, follow the hash chain links until we find a matching
     // key or reach the end of the chain (and remember the last link there)
-    for ( currentIdxHolder.value = startIdx;
-          currentIdxHolder.value != EMPTY_SLOT;
-          /* isKeyMatch() below also advances the currentIdxHolder to the next link */) {
-
+    for ( int currentIndex = startIdx;
+          currentIndex != EMPTY_SLOT;
+          currentIndex = lastEntryBatch.nextLinkInHashChain(currentIndex)) {
       // remember the current link, which would be the last when the next link is empty
-      lastEntryBatch = batchHolders.get((currentIdxHolder.value >>> 16) & BATCH_MASK);
-      lastEntryIdxWithinBatch = currentIdxHolder.value & BATCH_MASK;
+      lastEntryBatch = batchHolders.get((currentIndex >>> 16) & BATCH_MASK);
+      lastEntryIdxWithinBatch = currentIndex & BATCH_MASK;
 
-      if (lastEntryBatch.isKeyMatch(incomingRowIdx, currentIdxHolder, false)) {
-        htIdxHolder.value = currentIdxHolder.value;
+      if (lastEntryBatch.isKeyMatch(incomingRowIdx, currentIndex, false)) {
+        htIdxHolder.value = currentIndex;
         return PutStatus.KEY_PRESENT;
       }
     }
@@ -738,12 +748,15 @@ public PutStatus put(int incomingRowIdx, IndexPointer htIdxHolder, int hashCode,
    @Override
   public int probeForKey(int incomingRowIdx, int hashCode) throws SchemaChangeException {
     int bucketIndex = getBucketIndex(hashCode, numBuckets());
-
-    for ( currentIdxHolder.value = startIndices.getAccessor().get(bucketIndex);
-          currentIdxHolder.value != EMPTY_SLOT; ) {
-      BatchHolder bh = batchHolders.get((currentIdxHolder.value >>> 16) & BATCH_MASK);
-      if (bh.isKeyMatch(incomingRowIdx, currentIdxHolder, true /* isProbe */)) {
-        return currentIdxHolder.value;
+     int startIdx = startIndices.getAccessor().get(bucketIndex);
+     BatchHolder lastEntryBatch = null;
+
+     for ( int currentIndex = startIdx;
+           currentIndex != EMPTY_SLOT;
+           currentIndex = lastEntryBatch.nextLinkInHashChain(currentIndex)) {
+      lastEntryBatch = batchHolders.get((currentIndex >>> 16) & BATCH_MASK);
+      if (lastEntryBatch.isKeyMatch(incomingRowIdx, currentIndex, true /* isProbe */)) {
+        return currentIndex;
       }
     }
     return -1;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services