You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rb...@apache.org on 2016/12/15 00:14:14 UTC
hive git commit: HIVE-15422:
HiveInputFormat::pushProjectionsAndFilters paths comparison generates huge
number of objects for partitioned dataset (Rajesh Balamohan reviewed by
Sergey Shelukhin)
Repository: hive
Updated Branches:
refs/heads/master 5c325f7e0 -> f25ef88df
HIVE-15422: HiveInputFormat::pushProjectionsAndFilters paths comparison generates huge number of objects for partitioned dataset (Rajesh Balamohan reviewed by Sergey Shelukhin)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f25ef88d
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f25ef88d
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f25ef88d
Branch: refs/heads/master
Commit: f25ef88dfb1408c714621f45acb572584b6519f3
Parents: 5c325f7
Author: Rajesh Balamohan <rb...@apache.org>
Authored: Thu Dec 15 05:43:49 2016 +0530
Committer: Rajesh Balamohan <rb...@apache.org>
Committed: Thu Dec 15 05:43:49 2016 +0530
----------------------------------------------------------------------
.../apache/hadoop/hive/common/FileUtils.java | 10 ++++
.../hadoop/hive/common/TestFileUtils.java | 51 ++++++++++++++++++++
.../hive/ql/exec/AbstractMapOperator.java | 1 +
.../hadoop/hive/ql/io/HiveInputFormat.java | 19 +++++++-
4 files changed, 80 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index a8ed8af..ab3439c 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -916,6 +916,16 @@ public final class FileUtils {
return false;
}
+ public static void populateParentPaths(Set<Path> parents, Path path) {
+ if (parents == null) {
+ return;
+ }
+ while(path != null) {
+ parents.add(path);
+ path = path.getParent();
+ }
+ }
+
/**
* Get the URI of the path. Assume to be local file system if no scheme.
*/
http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
----------------------------------------------------------------------
diff --git a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
index d82c531..5705028 100644
--- a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
+++ b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.fs.LocalFileSystem;
@@ -146,4 +147,54 @@ public class TestFileUtils {
assertEquals(unchangedPath.toString(), absolutePath.toString());
}
+
+ @Test
+ public void testIsPathWithinSubtree() throws IOException {
+ Path splitPath = new Path("file:///user/hive/warehouse/src/data.txt");
+ Path splitPathWithNoSchema = Path.getPathWithoutSchemeAndAuthority(splitPath);
+
+ Set<Path> parents = new HashSet<>();
+ FileUtils.populateParentPaths(parents, splitPath);
+ FileUtils.populateParentPaths(parents, splitPathWithNoSchema);
+
+ Path key = new Path("/user/hive/warehouse/src");
+ verifyIsPathWithInSubTree(splitPath, key, false);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+ verifyIfParentsContainPath(key, parents, true);
+
+ key = new Path("/user/hive/warehouse/src_2");
+ verifyIsPathWithInSubTree(splitPath, key, false);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+ verifyIfParentsContainPath(key, parents, false);
+
+ key = new Path("/user/hive/warehouse/src/data.txt");
+ verifyIsPathWithInSubTree(splitPath, key, false);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, true);
+ verifyIfParentsContainPath(key, parents, true);
+
+ key = new Path("file:///user/hive/warehouse/src");
+ verifyIsPathWithInSubTree(splitPath, key, true);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+ verifyIfParentsContainPath(key, parents, true);
+
+ key = new Path("file:///user/hive/warehouse/src_2");
+ verifyIsPathWithInSubTree(splitPath, key, false);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+ verifyIfParentsContainPath(key, parents, false);
+
+ key = new Path("file:///user/hive/warehouse/src/data.txt");
+ verifyIsPathWithInSubTree(splitPath, key, true);
+ verifyIsPathWithInSubTree(splitPathWithNoSchema, key, false);
+ verifyIfParentsContainPath(key, parents, true);
+ }
+
+ private void verifyIsPathWithInSubTree(Path splitPath, Path key, boolean expected) {
+ boolean result = FileUtils.isPathWithinSubtree(splitPath, key);
+ assertEquals("splitPath=" + splitPath + ", key=" + key, expected, result);
+ }
+
+ private void verifyIfParentsContainPath(Path key, Set<Path> parents, boolean expected) {
+ boolean result = parents.contains(key);
+ assertEquals("key=" + key, expected, result);
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
index 6de2c18..2e0ef74 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapOperator.java
@@ -114,6 +114,7 @@ public abstract class AbstractMapOperator extends Operator<MapWork>
throw new IllegalStateException("Ambiguous input path " + fpath);
}
nominal = onefile;
+ break;
}
if (nominal == null) {
throw new IllegalStateException("Invalid input path " + fpath);
http://git-wip-us.apache.org/repos/asf/hive/blob/f25ef88d/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 69956ec..94fcd60 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -23,9 +23,11 @@ import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Map.Entry;
@@ -595,6 +597,8 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable>
Iterator<Entry<Path, ArrayList<String>>> iterator = this.mrwork
.getPathToAliases().entrySet().iterator();
+ Set<Path> splitParentPaths = null;
+ int pathsSize = this.mrwork.getPathToAliases().entrySet().size();
while (iterator.hasNext()) {
Entry<Path, ArrayList<String>> entry = iterator.next();
Path key = entry.getKey();
@@ -610,7 +614,20 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable>
// subdirectories. (Unlike non-native tables, prefix mixups don't seem
// to be a potential problem here since we are always dealing with the
// path to something deeper than the table location.)
- match = FileUtils.isPathWithinSubtree(splitPath, key) || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+ if (pathsSize > 1) {
+ // Comparing paths multiple times creates lots of objects &
+ // creates GC pressure for tables having large number of partitions.
+ // In such cases, use pre-computed paths for comparison
+ if (splitParentPaths == null) {
+ splitParentPaths = new HashSet<>();
+ FileUtils.populateParentPaths(splitParentPaths, splitPath);
+ FileUtils.populateParentPaths(splitParentPaths, splitPathWithNoSchema);
+ }
+ match = splitParentPaths.contains(key);
+ } else {
+ match = FileUtils.isPathWithinSubtree(splitPath, key)
+ || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
+ }
}
if (match) {
ArrayList<String> list = entry.getValue();