Posted to commits@hive.apache.org by ab...@apache.org on 2019/11/12 15:34:46 UTC
[hive] branch master updated: HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)
This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 5bbac26 HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)
5bbac26 is described below
commit 5bbac263278bf9aca812c42b184edccd10641e57
Author: Attila Magyar <am...@hortonworks.com>
AuthorDate: Tue Nov 12 16:28:01 2019 +0100
HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)
Signed-off-by: Laszlo Bodor <bo...@gmail.com>
---
.../org/apache/hadoop/hive/common/FileUtils.java | 43 ++++++++---
.../apache/hadoop/hive/common/HiveStatsUtils.java | 41 +----------
.../apache/hadoop/hive/metastore/Warehouse.java | 4 +-
.../hadoop/hive/metastore/utils/FileUtils.java | 83 ++++++++++++----------
4 files changed, 84 insertions(+), 87 deletions(-)
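Some context before the diff: until now, recursive listings either walked the directory tree one listStatus call at a time or globbed level by level; on S3A each of those calls is a round trip to the object store, which is presumably what degraded single row inserts. The patch branches on the file system scheme and, for S3A, uses fs.listFiles(path, true), a single flat paged listing, filtering hidden entries afterwards. A minimal self-contained sketch of that strategy follows (ListingSketch and its isHidden helper are illustrative stand-ins, not code from the patch; the real code uses Hive's HIDDEN_FILES_PATH_FILTER and HIDDEN_FILES_FULL_PATH_FILTER):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public final class ListingSketch {

  /** Lists all non-hidden files below base, choosing the strategy by scheme. */
  public static List<FileStatus> listAll(FileSystem fs, Path base) throws IOException {
    List<FileStatus> results = new ArrayList<>();
    if (isS3a(fs)) {
      // One flat, paged listing against the object store; no per-directory round trips.
      RemoteIterator<LocatedFileStatus> it = fs.listFiles(base, true);
      while (it.hasNext()) {
        LocatedFileStatus status = it.next();
        if (!isHidden(base, status.getPath())) { // listFiles cannot filter, so filter afterwards
          results.add(status);
        }
      }
    } else {
      walk(fs, fs.getFileStatus(base), results); // classic recursive tree walk
    }
    return results;
  }

  private static void walk(FileSystem fs, FileStatus status, List<FileStatus> results)
      throws IOException {
    if (status.isDirectory()) {
      for (FileStatus child : fs.listStatus(status.getPath())) {
        if (!isHidden(status.getPath(), child.getPath())) {
          walk(fs, child, results);
        }
      }
    } else {
      results.add(status);
    }
  }

  private static boolean isS3a(FileSystem fs) {
    try {
      return "s3a".equalsIgnoreCase(fs.getScheme());
    } catch (UnsupportedOperationException e) {
      return false; // the base FileSystem class may not implement getScheme()
    }
  }

  // Illustrative stand-in for Hive's hidden-file filters: any path component
  // below the listing root starting with '.' or '_' marks the entry as hidden.
  private static boolean isHidden(Path base, Path p) {
    String rel = p.toString().replace(base.toString(), "");
    for (String part : rel.split(Path.SEPARATOR)) {
      if (part.startsWith(".") || part.startsWith("_")) {
        return true;
      }
    }
    return false;
  }
}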
diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index 651b842..61aca56 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -47,14 +47,13 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.GlobFilter;
import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
-import org.apache.hadoop.hive.conf.HiveConfUtil;
-import org.apache.hadoop.hive.io.HdfsUtils;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.hive.shims.Utils;
@@ -332,20 +331,48 @@ public final class FileUtils {
*/
public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
List<FileStatus> results) throws IOException {
- listStatusRecursively(fs, fileStatus, HIDDEN_FILES_PATH_FILTER, results);
+ if (isS3a(fs)) {
+ // The S3A file system has an optimized recursive directory listing implementation; however, it doesn't support filtering.
+ // Therefore we filter the result set afterwards. This would be suboptimal in the HDFS case (which walks the tree), where a filter could have been applied during traversal.
+ listS3FilesRecursive(fileStatus, fs, results);
+ } else {
+ generalListStatusRecursively(fs, fileStatus, results);
+ }
}
- public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
- PathFilter filter, List<FileStatus> results) throws IOException {
+ private static void generalListStatusRecursively(FileSystem fs, FileStatus fileStatus, List<FileStatus> results) throws IOException {
if (fileStatus.isDir()) {
- for (FileStatus stat : fs.listStatus(fileStatus.getPath(), filter)) {
- listStatusRecursively(fs, stat, results);
+ for (FileStatus stat : fs.listStatus(fileStatus.getPath(), HIDDEN_FILES_PATH_FILTER)) {
+ generalListStatusRecursively(fs, stat, results);
}
} else {
results.add(fileStatus);
}
}
+ private static void listS3FilesRecursive(FileStatus base, FileSystem fs, List<FileStatus> results) throws IOException {
+ if (!base.isDirectory()) {
+ results.add(base);
+ return;
+ }
+ RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base.getPath(), true);
+ while (remoteIterator.hasNext()) {
+ LocatedFileStatus each = remoteIterator.next();
+ Path relativePath = new Path(each.getPath().toString().replace(base.toString(), ""));
+ if (org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+ results.add(each);
+ }
+ }
+ }
+
+ public static boolean isS3a(FileSystem fs) {
+ try {
+ return "s3a".equalsIgnoreCase(fs.getScheme());
+ } catch (UnsupportedOperationException ex) {
+ return false;
+ }
+ }
+
/**
* Find the parent of path that exists, if path does not exist
*
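A note on the relative-path trick in listS3FilesRecursive above: the base prefix is stripped with String.replace before the full-path hidden filter runs, so only components below the listing root can mark an entry as hidden. A hypothetical walk-through (the accept/reject notes reflect the usual '.'/'_' hidden-name convention; paths are made up):

import org.apache.hadoop.fs.Path;

public class RelativePathSketch {
  public static void main(String[] args) {
    Path base = new Path("s3a://bucket/warehouse/t");
    Path data = new Path("s3a://bucket/warehouse/t/ds=2019-11-12/000000_0");
    Path staging = new Path("s3a://bucket/warehouse/t/.hive-staging_123/000000_0");

    // Stripping the base prefix leaves only the components below the listing root:
    String relData = data.toString().replace(base.toString(), "");
    System.out.println(relData);    // "/ds=2019-11-12/000000_0"     -> no hidden component, accepted
    String relStaging = staging.toString().replace(base.toString(), "");
    System.out.println(relStaging); // "/.hive-staging_123/000000_0" -> '.' component, rejected
  }
}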
diff --git a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
index 09343e5..7641610 100644
--- a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
@@ -19,14 +19,12 @@ package org.apache.hadoop.hive.common;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,24 +54,13 @@ public class HiveStatsUtils {
*/
public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
throws IOException {
- return getFileStatusRecurse(path, level, fs, FileUtils.HIDDEN_FILES_PATH_FILTER, false);
- }
-
- public static List<FileStatus> getFileStatusRecurse(
- Path path, int level, FileSystem fs, PathFilter filter) throws IOException {
- return getFileStatusRecurse(path, level, fs, filter, false);
- }
-
- public static List<FileStatus> getFileStatusRecurse(
- Path path, int level, FileSystem fs, PathFilter filter, boolean allLevelsBelow)
- throws IOException {
// if level is < 0, then return all files/directories under the specified path
if (level < 0) {
List<FileStatus> result = new ArrayList<FileStatus>();
try {
FileStatus fileStatus = fs.getFileStatus(path);
- FileUtils.listStatusRecursively(fs, fileStatus, filter, result);
+ FileUtils.listStatusRecursively(fs, fileStatus, result);
} catch (IOException e) {
// globStatus() API returns empty FileStatus[] when the specified path
// does not exist. But getFileStatus() throws IOException. To mimic the
@@ -90,31 +77,7 @@ public class HiveStatsUtils {
sb.append(Path.SEPARATOR).append("*");
}
Path pathPattern = new Path(path, sb.toString());
- if (!allLevelsBelow) {
- return Lists.newArrayList(fs.globStatus(pathPattern, filter));
- }
- LinkedList<FileStatus> queue = new LinkedList<>();
- List<FileStatus> results = new ArrayList<FileStatus>();
- for (FileStatus status : fs.globStatus(pathPattern)) {
- if (filter.accept(status.getPath())) {
- results.add(status);
- }
- if (status.isDirectory()) {
- queue.add(status);
- }
- }
- while (!queue.isEmpty()) {
- FileStatus status = queue.poll();
- for (FileStatus child : fs.listStatus(status.getPath())) {
- if (filter.accept(child.getPath())) {
- results.add(child);
- }
- if (child.isDirectory()) {
- queue.add(child);
- }
- }
- }
- return results;
+ return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
}
public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
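The level-based globbing retained above builds one wildcard per partition depth; a hypothetical worked example of the surviving code path:

import org.apache.hadoop.fs.Path;

public class GlobPatternSketch {
  // For a table at /warehouse/t partitioned by (ds, hr), callers pass level = 2
  // and this yields the pattern /warehouse/t/*/*, which globStatus expands to
  // the leaf partition directories (e.g. /warehouse/t/ds=2019-11-12/hr=00),
  // while HIDDEN_FILES_PATH_FILTER drops entries such as /warehouse/t/.hive-staging_x.
  static Path partitionPattern(Path tableRoot, int level) {
    StringBuilder sb = new StringBuilder(tableRoot.toUri().getPath());
    for (int i = 0; i < level; i++) {
      sb.append(Path.SEPARATOR).append("*");
    }
    return new Path(tableRoot, sb.toString());
  }
}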
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
index 38e843a..9ce379b 100755
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
@@ -690,7 +690,7 @@ public class Warehouse {
try {
Path path = new Path(location);
FileSystem fileSys = path.getFileSystem(conf);
- return FileUtils.getFileStatusRecurse(path, -1, fileSys);
+ return FileUtils.getFileStatusRecurse(path, fileSys);
} catch (IOException ioe) {
MetaStoreUtils.logAndThrowMetaException(ioe);
}
@@ -708,7 +708,7 @@ public class Warehouse {
Path tablePath = getDnsPath(new Path(table.getSd().getLocation()));
try {
FileSystem fileSys = tablePath.getFileSystem(conf);
- return FileUtils.getFileStatusRecurse(tablePath, -1, fileSys);
+ return FileUtils.getFileStatusRecurse(tablePath, fileSys);
} catch (IOException ioe) {
MetaStoreUtils.logAndThrowMetaException(ioe);
}
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
index bf206ff..e9342e6 100644
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
@@ -17,6 +17,14 @@
*/
package org.apache.hadoop.hive.metastore.utils;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.List;
+import java.util.NoSuchElementException;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
@@ -33,16 +41,6 @@ import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.collect.Lists;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.List;
-import java.util.NoSuchElementException;
-
public class FileUtils {
private static final PathFilter SNAPSHOT_DIR_PATH_FILTER = new PathFilter() {
@Override
@@ -311,41 +309,31 @@ public class FileUtils {
/**
* Get all file status from a root path and recursively go deep into certain levels.
*
- * @param path
+ * @param base
* the root path
- * @param level
- * the depth of directory to explore
* @param fs
* the file system
* @return array of FileStatus
* @throws IOException
*/
- public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
- throws IOException {
-
- // if level is <0, the return all files/directories under the specified path
- if (level < 0) {
- List<FileStatus> result = new ArrayList<>();
- try {
- FileStatus fileStatus = fs.getFileStatus(path);
- FileUtils.listStatusRecursively(fs, fileStatus, result);
- } catch (IOException e) {
- // globStatus() API returns empty FileStatus[] when the specified path
- // does not exist. But getFileStatus() throw IOException. To mimic the
- // similar behavior we will return empty array on exception. For external
- // tables, the path of the table will not exists during table creation
- return new ArrayList<>(0);
+ public static List<FileStatus> getFileStatusRecurse(Path base, FileSystem fs) {
+ try {
+ List<FileStatus> results = new ArrayList<>();
+ if (isS3a(fs)) {
+ // The S3A file system has an optimized recursive directory listing implementation; however, it doesn't support filtering.
+ // Therefore we filter the result set afterwards. This would be suboptimal in the HDFS case (which walks the tree), where a filter could have been applied during traversal.
+ listS3FilesRecursive(base, fs, results);
+ } else {
+ listStatusRecursively(fs, fs.getFileStatus(base), results);
}
- return result;
- }
-
- // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
- StringBuilder sb = new StringBuilder(path.toUri().getPath());
- for (int i = 0; i < level; i++) {
- sb.append(Path.SEPARATOR).append("*");
+ return results;
+ } catch (IOException e) {
+ // The globStatus() API returns an empty FileStatus[] when the specified path
+ // does not exist, but getFileStatus() throws an IOException. To mimic that
+ // behavior we return an empty list on exception. For external tables, the
+ // path of the table may not exist during table creation.
+ return new ArrayList<>(0);
}
- Path pathPattern = new Path(path, sb.toString());
- return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
}
/**
@@ -361,7 +349,7 @@ public class FileUtils {
* @param results
* receives enumeration of all files found
*/
- public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
+ private static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
List<FileStatus> results) throws IOException {
if (fileStatus.isDir()) {
@@ -373,6 +361,25 @@ public class FileUtils {
}
}
+ private static void listS3FilesRecursive(Path base, FileSystem fs, List<FileStatus> results) throws IOException {
+ RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base, true);
+ while (remoteIterator.hasNext()) {
+ LocatedFileStatus each = remoteIterator.next();
+ Path relativePath = new Path(each.getPath().toString().replace(base.toString(), ""));
+ if (RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+ results.add(each);
+ }
+ }
+ }
+
+ public static boolean isS3a(FileSystem fs) {
+ try {
+ return "s3a".equalsIgnoreCase(fs.getScheme());
+ } catch (UnsupportedOperationException ex) {
+ return false;
+ }
+ }
+
public static String makePartName(List<String> partCols, List<String> vals) {
return makePartName(partCols, vals, null);
}
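Call sites shrink accordingly, as the two Warehouse hunks show. A sketch of listing a table's files through the simplified metastore API after this patch (the location and configuration are illustrative):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.utils.FileUtils;

public class RecurseUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path tablePath = new Path("s3a://bucket/warehouse/db.db/t"); // illustrative location
    FileSystem fs = tablePath.getFileSystem(conf);
    // The level argument is gone: the listing is always fully recursive and
    // hidden files/directories are filtered out inside getFileStatusRecurse.
    List<FileStatus> files = FileUtils.getFileStatusRecurse(tablePath, fs);
    files.forEach(f -> System.out.println(f.getPath()));
  }
}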