You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rb...@apache.org on 2016/12/15 00:14:14 UTC

hive git commit: HIVE-15422: HiveInputFormat::pushProjectionsAndFilters paths comparison generates huge number of objects for partitioned dataset (Rajesh Balamohan reviewed by Sergey Shelukhin)

Repository: hive
Updated Branches:
  refs/heads/master 5c325f7e0 -> f25ef88df


HIVE-15422: HiveInputFormat::pushProjectionsAndFilters paths comparison generates huge number of objects for partitioned dataset (Rajesh Balamohan reviewed by Sergey Shelukhin)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f25ef88d
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f25ef88d
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f25ef88d

Branch: refs/heads/master
Commit: f25ef88dfb1408c714621f45acb572584b6519f3
Parents: 5c325f7
Author: Rajesh Balamohan <rb...@apache.org>
Authored: Thu Dec 15 05:43:49 2016 +0530
Committer: Rajesh Balamohan <rb...@apache.org>
Committed: Thu Dec 15 05:43:49 2016 +0530

----------------------------------------------------------------------
 .../apache/hadoop/hive/common/FileUtils.java    | 10 ++++
 .../hadoop/hive/common/TestFileUtils.java       | 51 ++++++++++++++++++++
 .../hive/ql/exec/AbstractMapOperator.java       |  1 +
 .../hadoop/hive/ql/io/HiveInputFormat.java      | 19 +++++++-
 4 files changed, 80 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index a8ed8af..ab3439c 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -916,6 +916,16 @@ public final class FileUtils {
     return false;
   }
 
+  public static void populateParentPaths(Set<Path> parents, Path path) {
+    if (parents == null) {
+      return;
+    }
+    while(path != null) {
+      parents.add(path);
+      path = path.getParent();
+    }
+  }
+
   /**
    * Get the URI of the path. Assume to be local file system if no scheme.
    */

http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
----------------------------------------------------------------------
diff --git a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
index d82c531..5705028 100644
--- a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
+++ b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.hadoop.fs.LocalFileSystem;
@@ -146,4 +147,54 @@ public class TestFileUtils {
 
     assertEquals(unchangedPath.toString(), absolutePath.toString());
   }
+
+  @Test
+  public void testIsPathWithinSubtree() throws IOException {
+    Path splitPath = new Path("file:///user/hive/warehouse/src/data.txt");
+    Path splitPathWithNoSchema = Path.getPathWithoutSchemeAndAuthority(splitPath);
+
+    Set<Path> parents = new HashSet<>();
+    FileUtils.populateParentPaths(parents, splitPath);
+    FileUtils.populateParentPaths(parents, splitPathWithNoSchema);
+
+    Path key = new Path("/user/hive/warehouse/src");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("/user/hive/warehouse/src_2");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, false);
+
+    key = new Path("/user/hive/warehouse/src/data.txt");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("file:///user/hive/warehouse/src");
+    verifyIsPathWithInSubTree(splitPath, key, true);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, true);
+
+    key = new Path("file:///user/hive/warehouse/src_2");
+    verifyIsPathWithInSubTree(splitPath, key, false);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, false);
+
+    key = new Path("file:///user/hive/warehouse/src/data.txt");
+    verifyIsPathWithInSubTree(splitPath, key, true);
+    verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+    verifyIfParentsContainPath(key, parents, true);
+  }
+
+  private void verifyIsPathWithInSubTree(Path splitPath, Path key, boolean expected) {
+    boolean result = FileUtils.isPathWithinSubtree(splitPath, key);
+    assertEquals("splitPath=" + splitPath + ", key=" + key, expected, result);
+  }
+
+  private void verifyIfParentsContainPath(Path key, Set<Path> parents, boolean expected) {
+    boolean result = parents.contains(key);
+    assertEquals("key=" + key, expected, result);
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
index 6de2c18..2e0ef74 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
@@ -114,6 +114,7 @@ public abstract class AbstractMapOperator extends Operator<MapWork>
         throw new IllegalStateException("Ambiguous input path " + fpath);
       }
       nominal = onefile;
+      break;
     }
     if (nominal == null) {
       throw new IllegalStateException("Invalid input path " + fpath);

http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 69956ec..94fcd60 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -23,9 +23,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.Map.Entry;
 
@@ -595,6 +597,8 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable>
     Iterator<Entry<Path, ArrayList<String>>> iterator = this.mrwork
         .getPathToAliases().entrySet().iterator();
 
+    Set<Path> splitParentPaths = null;
+    int pathsSize = this.mrwork.getPathToAliases().entrySet().size();
     while (iterator.hasNext()) {
       Entry<Path, ArrayList<String>> entry = iterator.next();
       Path key = entry.getKey();
@@ -610,7 +614,20 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable>
         // subdirectories.  (Unlike non-native tables, prefix mixups don't seem
         // to be a potential problem here since we are always dealing with the
         // path to something deeper than the table location.)
-        match = FileUtils.isPathWithinSubtree(splitPath, key) || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+        if (pathsSize > 1) {
+          // Comparing paths multiple times creates lots of objects &
+          // creates GC pressure for tables having large number of partitions.
+          // In such cases, use pre-computed paths for comparison
+          if (splitParentPaths == null) {
+            splitParentPaths = new HashSet<>();
+            FileUtils.populateParentPaths(splitParentPaths, splitPath);
+            FileUtils.populateParentPaths(splitParentPaths, splitPathWithNoSchema);
+          }
+          match = splitParentPaths.contains(key);
+        } else {
+          match = FileUtils.isPathWithinSubtree(splitPath, key)
+              || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+        }
       }
       if (match) {
         ArrayList<String> list = entry.getValue();