You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by lv...@apache.org on 2017/02/16 21:03:44 UTC

[3/5] incubator-impala git commit: IMPALA-4916: Fix maintenance of set of item sets in DisjointSet.

IMPALA-4916: Fix maintenance of set of item sets in DisjointSet.

The bug: The DisjointSet maintains a set of unique item sets
using a HashSet<Set<T>>. The problem is that we modified
the Set<T> elements after inserting them into the HashSet.
This caused the removal of elements from the HashSet to fail.
Removal is required for maintaining a consistent DisjointSet.
The removal could even fail for the same Set<T> instance because
the hashCode() changed from when the Set<T> was originally
inserted to when the removal was attempted due to mutation
of the Set<T>.
An inconsistent DisjointSet can lead to incorrect equivalence
classes, which can lead to missing, redundant and even
non-executable predicates. Incorrect results and crashes are
possible.

For most queries, an inconsistent DisjointSet does not alter
the equivalence classes, and even fewer queries have incorrect
plans.
In fact, many of our existing planner tests trigger this bug,
but only 3 of them lead to an incorrect value transfer graph,
and all 3 had correct plans.

The fix: Use an IdentityHashMap to store the set of item sets.
It does not rely on the hashCode() and equals() of the stored
elements, so the same object can be added and later removed,
even when mutated in the meantime.

Testing:
- Added a Preconditions check in DisjointSet that asserts
  correct removal of an item set. Many of our existing tests
  hit the check before this fix.
- Added a new unit test for DisjointSet which triggers
  the bug.
- Augmented DisjointSet.checkConsistency() to check for
  inconsistency in the set of item sets.
- Added validation of the value-transfer graph in
  single-node planner tests.
- A private core/hdfs run succeeded.

Change-Id: I609c8795c09becd78815605ea8e82e2f99e82212
Reviewed-on: http://gerrit.cloudera.org:8080/5980
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/ffd297bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/ffd297bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/ffd297bd

Branch: refs/heads/master
Commit: ffd297bdea5d58db3d1791a2528b7f1009a459c6
Parents: cec3fe3
Author: Alex Behm <al...@cloudera.com>
Authored: Fri Feb 10 07:05:07 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Feb 15 22:16:30 2017 +0000

----------------------------------------------------------------------
 .../org/apache/impala/analysis/Analyzer.java    |  9 ++++-
 .../main/java/org/apache/impala/common/Id.java  |  4 +--
 .../org/apache/impala/util/DisjointSet.java     | 37 +++++++++++++++++---
 .../org/apache/impala/planner/PlannerTest.java  |  2 +-
 .../org/apache/impala/util/TestDisjointSet.java | 17 +++++++++
 .../queries/PlannerTest/joins.test              | 15 ++++----
 6 files changed, 65 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index 34cf565..2809851 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -56,6 +56,7 @@ import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.InternalException;
 import org.apache.impala.common.Pair;
 import org.apache.impala.common.PrintUtils;
+import org.apache.impala.common.RuntimeEnv;
 import org.apache.impala.planner.PlanNode;
 import org.apache.impala.rewrite.BetweenToCompoundRule;
 import org.apache.impala.rewrite.ExprRewriter;
@@ -1979,6 +1980,12 @@ public class Analyzer {
     globalState_.valueTransferGraph = new ValueTransferGraph();
     globalState_.valueTransferGraph.computeValueTransfers();
 
+    // Validate the value-transfer graph in single-node planner tests.
+    if (RuntimeEnv.INSTANCE.isTestEnv() && getQueryOptions().num_nodes == 1) {
+      Preconditions.checkState(validateValueTransferGraph(),
+          "Failed to validate value-transfer graph.");
+    }
+
     // we start out by assigning each slot to its own equiv class
     int numSlots = globalState_.descTbl.getMaxSlotId().asInt() + 1;
     for (int i = 0; i < numSlots; ++i) {
@@ -2941,7 +2948,7 @@ public class Analyzer {
       for (Pair<SlotId, SlotId> vt: valueTransfers) {
         expectedValueTransfer[vt.first.asInt()][vt.second.asInt()] = true;
       }
-      // Set registered value tranfers in expectedValueTransfer.
+      // Set registered value transfers in expectedValueTransfer.
       for (Pair<SlotId, SlotId> vt: globalState_.registeredValueTransfers) {
         expectedValueTransfer[vt.first.asInt()][vt.second.asInt()] = true;
       }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/fe/src/main/java/org/apache/impala/common/Id.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/Id.java b/fe/src/main/java/org/apache/impala/common/Id.java
index c7357aa..b89b56d 100644
--- a/fe/src/main/java/org/apache/impala/common/Id.java
+++ b/fe/src/main/java/org/apache/impala/common/Id.java
@@ -36,9 +36,7 @@ public class Id<IdType extends Id<IdType>> implements Comparable<Id<IdType>> {
   public int asInt() { return id_; }
 
   @Override
-  public int hashCode() {
-    return Integer.valueOf(id_).hashCode();
-  }
+  public int hashCode() { return id_; }
 
   @Override
   public String toString() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/fe/src/main/java/org/apache/impala/util/DisjointSet.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/util/DisjointSet.java b/fe/src/main/java/org/apache/impala/util/DisjointSet.java
index d5543b1..1665cc7 100644
--- a/fe/src/main/java/org/apache/impala/util/DisjointSet.java
+++ b/fe/src/main/java/org/apache/impala/util/DisjointSet.java
@@ -18,10 +18,12 @@
 package org.apache.impala.util;
 
 import java.util.Collection;
+import java.util.IdentityHashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
@@ -35,7 +37,18 @@ import com.google.common.collect.Sets;
 public class DisjointSet<T> {
   // Maps from an item to its item set.
   private final Map<T, Set<T>> itemSets_ = Maps.newHashMap();
-  private final Set<Set<T>> uniqueSets_ = Sets.newHashSet();
+
+  // Stores the set of item sets in its keySet(). All keys are mapped to the same
+  // dummy value which is only used for validating the removal of entries.
+  // An IdentityHashMap is needed here instead of a regular HashSet because the Set<T>
+  // elements are mutated after inserting them. In a conventional HashSet mutating
+  // elements after they are inserted may cause problems because the hashCode() of the
+  // object during insertion may be different than its hashCode() after mutation. So,
+  // after changing the element it may not be possible to remove/find it again in the
+  // HashSet (looking in the wrong hash bucket), even using the same object reference.
+  private final IdentityHashMap<Set<T>, Object> uniqueSets_ =
+      new IdentityHashMap<Set<T>, Object>();
+  private static final Object DUMMY_VALUE = new Object();
 
   /**
    * Returns the item set corresponding to the given item or null if it
@@ -43,7 +56,7 @@ public class DisjointSet<T> {
    */
   public Set<T> get(T item) { return itemSets_.get(item); }
 
-  public Set<Set<T>> getSets() { return uniqueSets_; }
+  public Set<Set<T>> getSets() { return uniqueSets_.keySet(); }
 
   /**
    * Registers a new item set with a single item. Returns the new item set.
@@ -56,7 +69,7 @@ public class DisjointSet<T> {
     }
     Set<T> s = Sets.newHashSet(item);
     itemSets_.put(item, s);
-    uniqueSets_.add(s);
+    uniqueSets_.put(s, DUMMY_VALUE);
     return s;
   }
 
@@ -94,7 +107,8 @@ public class DisjointSet<T> {
       mergedItems.add(item);
       itemSets_.put(item, mergedItems);
     }
-    uniqueSets_.remove(updateItems);
+    Object removedValue = uniqueSets_.remove(updateItems);
+    Preconditions.checkState(removedValue == DUMMY_VALUE);
     return true;
   }
 
@@ -125,6 +139,7 @@ public class DisjointSet<T> {
    * Throws an IllegalStateException if an inconsistency is detected.
    */
   public void checkConsistency() {
+    // Validate map from item to item set.
     Set<Set<T>> validatedSets = Sets.newHashSet();
     for (Set<T> itemSet: itemSets_.values()) {
       // Avoid checking the same item set multiple times.
@@ -133,10 +148,22 @@ public class DisjointSet<T> {
       // the set itself.
       for (T item: itemSet) {
         if (itemSet != itemSets_.get(item)) {
-          throw new IllegalStateException("DisjointSet is in an inconsistent state.");
+          throw new IllegalStateException(
+              "DisjointSet is in an inconsistent state. Failed item set validation.");
         }
       }
       validatedSets.add(itemSet);
     }
+
+    // Validate set of item sets. Every element should appear in exactly one item set.
+    Set<T> seenItems = Sets.newHashSet();
+    for (Set<T> itemSet: uniqueSets_.keySet()) {
+      for (T item: itemSet) {
+        if (!seenItems.add(item)) {
+          throw new IllegalStateException(
+              "DisjointSet is in an inconsistent state. Failed unique set validation.");
+        }
+      }
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index 8eabe8a..a88a3cd 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -31,7 +31,7 @@ import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Test;
 
-import jline.internal.Preconditions;
+import com.google.common.base.Preconditions;
 
 // All planner tests, except for S3 specific tests should go here.
 public class PlannerTest extends PlannerTestBase {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/fe/src/test/java/org/apache/impala/util/TestDisjointSet.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/util/TestDisjointSet.java b/fe/src/test/java/org/apache/impala/util/TestDisjointSet.java
index a932dfc..0dc290e 100644
--- a/fe/src/test/java/org/apache/impala/util/TestDisjointSet.java
+++ b/fe/src/test/java/org/apache/impala/util/TestDisjointSet.java
@@ -157,4 +157,21 @@ public class TestDisjointSet {
         && ds.get(1).containsAll(Lists.newArrayList(1, 2, 3, 4, 5, 6, 7, 8)));
     ds.checkConsistency();
   }
+
+  /**
+   * IMPALA-4916: Tests that the set of item sets is maintained correctly.
+   */
+  @Test
+  public void testUniqueSets() throws Exception {
+    DisjointSet<Integer> ds = new DisjointSet<Integer>();
+
+    int uniqueElements = 100;
+    // Generate several 2-element item sets.
+    for (int i = 0; i < uniqueElements; i += 2) ds.union(i, i + 1);
+    // Connect several item sets into one big item set. This stresses the logic
+    // of maintaining the set of item sets.
+    for (int i = uniqueElements/2; i < uniqueElements; ++i) ds.union(i, i + 1);
+
+    ds.checkConsistency();
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/ffd297bd/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
index 8161833..f28ecb0 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
@@ -1171,12 +1171,7 @@ PLAN-ROOT SINK
    partitions=24/24 files=24 size=478.45KB
    runtime filters: RF000 -> a.id, RF001 -> a.int_col
 ====
-# Test correct removal of redundant join predicates (IMPALA-1353):
-# No predicates are removed from the On-clause. The resulting plan is correct
-# but inconsistent with behavior of Impala behavior for other queries.
-# TODO: Investigate why the equivalence class computation appears to not
-# work for the slots of tables a and b. We expect predicates such as
-# 'id+id = tinyint_col' to be assigned in the union operands.
+# IMPALA-1353/IMPALA-4916: Test correct removal of redundant join predicates.
 select 1
 from functional.alltypes a
 inner join
@@ -1191,20 +1186,22 @@ on a.id = b.x and a.id = b.tinyint_col and
 PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
-|  hash predicates: a.id = tinyint_col, a.id = x, a.int_col = bigint_col, a.int_col = y
-|  runtime filters: RF000 <- tinyint_col, RF001 <- x, RF002 <- bigint_col, RF003 <- y
+|  hash predicates: a.id = x, a.int_col = y
+|  runtime filters: RF000 <- x, RF001 <- y
 |
 |--01:UNION
 |  |
 |  |--03:SCAN HDFS [functional.alltypestiny]
 |  |     partitions=4/4 files=4 size=460B
+|  |     predicates: id - id = tinyint_col, int_col / int_col = bigint_col
 |  |
 |  02:SCAN HDFS [functional.alltypessmall]
 |     partitions=4/4 files=4 size=6.32KB
+|     predicates: int_col * int_col = bigint_col, id + id = tinyint_col
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.id, RF001 -> a.id, RF002 -> a.int_col, RF003 -> a.int_col
+   runtime filters: RF000 -> a.id, RF001 -> a.int_col
 ====
 # Test creation of predicates at a join node for constructing the
 # minimum spanning tree to cover known slot equivalences (IMPALA-1102).