Posted to commits@hive.apache.org by ab...@apache.org on 2019/11/12 15:34:46 UTC

[hive] branch master updated: HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)

This is an automated email from the ASF dual-hosted git repository.

abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 5bbac26  HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)
5bbac26 is described below

commit 5bbac263278bf9aca812c42b184edccd10641e57
Author: Attila Magyar <am...@hortonworks.com>
AuthorDate: Tue Nov 12 16:28:01 2019 +0100

    HIVE-22411: Performance degradation on single row inserts (Attila Magyar via Ashutosh Chauhan)
    
    Signed-off-by: Laszlo Bodor <bo...@gmail.com>
---
 .../org/apache/hadoop/hive/common/FileUtils.java   | 43 ++++++++---
 .../apache/hadoop/hive/common/HiveStatsUtils.java  | 41 +----------
 .../apache/hadoop/hive/metastore/Warehouse.java    |  4 +-
 .../hadoop/hive/metastore/utils/FileUtils.java     | 83 ++++++++++++----------
 4 files changed, 84 insertions(+), 87 deletions(-)
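
The gist of the change, before the full diff: on S3A, Hive now obtains a
recursive listing with a single FileSystem.listFiles(path, true) call instead
of issuing one listStatus() call per directory. A minimal standalone sketch of
the two strategies (class and method names here are illustrative, not part of
the patch):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.LocatedFileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.RemoteIterator;

    public class ListingSketch {

      // Tree walk: one listStatus() call per directory. Cheap on HDFS, but on
      // an object store every call is a remote round trip, so deep or wide
      // directory trees multiply the latency.
      static void treeWalk(FileSystem fs, FileStatus dir, List<FileStatus> out)
          throws IOException {
        if (dir.isDirectory()) {
          for (FileStatus child : fs.listStatus(dir.getPath())) {
            treeWalk(fs, child, out);
          }
        } else {
          out.add(dir);
        }
      }

      // Flat listing: a single recursive listFiles() call, which S3A serves
      // from paged LIST requests over the key space without per-directory
      // round trips.
      static List<FileStatus> flatList(FileSystem fs, Path base) throws IOException {
        List<FileStatus> out = new ArrayList<>();
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(base, true);
        while (it.hasNext()) {
          out.add(it.next());
        }
        return out;
      }
    }

HDFS keeps the per-directory walk, since there listStatus() is cheap and can
apply the hidden-file PathFilter directly during the walk.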

diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index 651b842..61aca56 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -47,14 +47,13 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.GlobFilter;
 import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.Trash;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
-import org.apache.hadoop.hive.conf.HiveConfUtil;
-import org.apache.hadoop.hive.io.HdfsUtils;
 import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.hive.shims.Utils;
@@ -332,20 +331,48 @@ public final class FileUtils {
    */
   public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
       List<FileStatus> results) throws IOException {
-    listStatusRecursively(fs, fileStatus, HIDDEN_FILES_PATH_FILTER, results);
+    if (isS3a(fs)) {
+      // The S3A file system has an optimized recursive directory listing implementation; however, it doesn't support filtering.
+      // Therefore we filter the result set afterwards. Doing the same on HDFS (which walks the tree) would be suboptimal, since there the filter can be applied during listing.
+      listS3FilesRecursive(fileStatus, fs, results);
+    } else {
+      generalListStatusRecursively(fs, fileStatus, results);
+    }
   }
 
-  public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
-      PathFilter filter, List<FileStatus> results) throws IOException {
+  private static void generalListStatusRecursively(FileSystem fs, FileStatus fileStatus, List<FileStatus> results) throws IOException {
     if (fileStatus.isDir()) {
-      for (FileStatus stat : fs.listStatus(fileStatus.getPath(), filter)) {
-        listStatusRecursively(fs, stat, results);
+      for (FileStatus stat : fs.listStatus(fileStatus.getPath(), HIDDEN_FILES_PATH_FILTER)) {
+        generalListStatusRecursively(fs, stat, results);
       }
     } else {
       results.add(fileStatus);
     }
   }
 
+  private static void listS3FilesRecursive(FileStatus base, FileSystem fs, List<FileStatus> results) throws IOException {
+    if (!base.isDirectory()) {
+      results.add(base);
+      return;
+    }
+    RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base.getPath(), true);
+    while (remoteIterator.hasNext()) {
+      LocatedFileStatus each = remoteIterator.next();
+      Path relativePath = new Path(each.getPath().toString().replace(base.getPath().toString(), ""));
+      if (org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+        results.add(each);
+      }
+    }
+  }
+
+  public static boolean isS3a(FileSystem fs) {
+    try {
+      return "s3a".equalsIgnoreCase(fs.getScheme());
+    } catch (UnsupportedOperationException ex) {
+      return false;
+    }
+  }
+
   /**
    * Find the parent of path that exists, if path does not exist
    *
diff --git a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
index 09343e5..7641610 100644
--- a/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/HiveStatsUtils.java
@@ -19,14 +19,12 @@ package org.apache.hadoop.hive.common;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -56,24 +54,13 @@ public class HiveStatsUtils {
    */
   public static List<FileStatus> getFileStatusRecurse(Path path, int level,  FileSystem fs)
       throws IOException {
-    return getFileStatusRecurse(path, level, fs, FileUtils.HIDDEN_FILES_PATH_FILTER, false);
-  }
-
-  public static List<FileStatus> getFileStatusRecurse(
-      Path path, int level, FileSystem fs, PathFilter filter) throws IOException {
-    return getFileStatusRecurse(path, level, fs, filter, false);
-  }
-
-  public static List<FileStatus> getFileStatusRecurse(
-      Path path, int level, FileSystem fs, PathFilter filter, boolean allLevelsBelow)
-          throws IOException {
 
     // if level is <0, the return all files/directories under the specified path
     if (level < 0) {
       List<FileStatus> result = new ArrayList<FileStatus>();
       try {
         FileStatus fileStatus = fs.getFileStatus(path);
-        FileUtils.listStatusRecursively(fs, fileStatus, filter, result);
+        FileUtils.listStatusRecursively(fs, fileStatus, result);
       } catch (IOException e) {
         // globStatus() API returns empty FileStatus[] when the specified path
         // does not exist. But getFileStatus() throw IOException. To mimic the
@@ -90,31 +77,7 @@ public class HiveStatsUtils {
       sb.append(Path.SEPARATOR).append("*");
     }
     Path pathPattern = new Path(path, sb.toString());
-    if (!allLevelsBelow) {
-      return Lists.newArrayList(fs.globStatus(pathPattern, filter));
-    }
-    LinkedList<FileStatus> queue = new LinkedList<>();
-    List<FileStatus> results = new ArrayList<FileStatus>();
-    for (FileStatus status : fs.globStatus(pathPattern)) {
-      if (filter.accept(status.getPath())) {
-        results.add(status);
-      }
-      if (status.isDirectory()) {
-        queue.add(status);
-      }
-    }
-    while (!queue.isEmpty()) {
-      FileStatus status = queue.poll();
-      for (FileStatus child : fs.listStatus(status.getPath())) {
-        if (filter.accept(child.getPath())) {
-          results.add(child);
-        }
-        if (child.isDirectory()) {
-          queue.add(child);
-        }
-      }
-    }
-    return results;
+    return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
   }
 
   public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
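
For a non-negative level, HiveStatsUtils.getFileStatusRecurse keeps the
glob-based listing shown above: one "*" per level, with the hidden-file filter
applied by globStatus(). A rough standalone sketch of how the pattern is built
(the wrapper class and method name are made up for illustration):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.common.FileUtils;

    public class GlobSketch {
      // For path=/warehouse/t and level=2 this builds the pattern
      // "/warehouse/t/*/*", which matches /warehouse/t/ds=1/hr=0 but not
      // /warehouse/t/ds=1; hidden entries are dropped by the filter.
      static FileStatus[] listLevel(FileSystem fs, Path path, int level)
          throws IOException {
        StringBuilder sb = new StringBuilder(path.toUri().getPath());
        for (int i = 0; i < level; i++) {
          sb.append(Path.SEPARATOR).append("*");
        }
        // sb already holds an absolute path, so new Path(path, sb.toString())
        // resolves to the pattern itself.
        Path pathPattern = new Path(path, sb.toString());
        return fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
      }
    }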
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
index 38e843a..9ce379b 100755
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java
@@ -690,7 +690,7 @@ public class Warehouse {
     try {
       Path path = new Path(location);
       FileSystem fileSys = path.getFileSystem(conf);
-      return FileUtils.getFileStatusRecurse(path, -1, fileSys);
+      return FileUtils.getFileStatusRecurse(path, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }
@@ -708,7 +708,7 @@ public class Warehouse {
     Path tablePath = getDnsPath(new Path(table.getSd().getLocation()));
     try {
       FileSystem fileSys = tablePath.getFileSystem(conf);
-      return FileUtils.getFileStatusRecurse(tablePath, -1, fileSys);
+      return FileUtils.getFileStatusRecurse(tablePath, fileSys);
     } catch (IOException ioe) {
       MetaStoreUtils.logAndThrowMetaException(ioe);
     }
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
index bf206ff..e9342e6 100644
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/utils/FileUtils.java
@@ -17,6 +17,14 @@
  */
 package org.apache.hadoop.hive.metastore.utils;
 
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.List;
+import java.util.NoSuchElementException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileStatus;
@@ -33,16 +41,6 @@ import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.collect.Lists;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.List;
-import java.util.NoSuchElementException;
-
 public class FileUtils {
   private static final PathFilter SNAPSHOT_DIR_PATH_FILTER = new PathFilter() {
     @Override
@@ -311,41 +309,31 @@ public class FileUtils {
   /**
    * Get all file status from a root path and recursively go deep into certain levels.
    *
-   * @param path
+   * @param base
    *          the root path
-   * @param level
-   *          the depth of directory to explore
    * @param fs
    *          the file system
    * @return array of FileStatus
    * @throws IOException
    */
-  public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
-      throws IOException {
-
-    // if level is <0, the return all files/directories under the specified path
-    if (level < 0) {
-      List<FileStatus> result = new ArrayList<>();
-      try {
-        FileStatus fileStatus = fs.getFileStatus(path);
-        FileUtils.listStatusRecursively(fs, fileStatus, result);
-      } catch (IOException e) {
-        // globStatus() API returns empty FileStatus[] when the specified path
-        // does not exist. But getFileStatus() throw IOException. To mimic the
-        // similar behavior we will return empty array on exception. For external
-        // tables, the path of the table will not exists during table creation
-        return new ArrayList<>(0);
+  public static List<FileStatus> getFileStatusRecurse(Path base, FileSystem fs) {
+    try {
+      List<FileStatus> results = new ArrayList<>();
+      if (isS3a(fs)) {
+        // The S3A file system has an optimized recursive directory listing implementation; however, it doesn't support filtering.
+        // Therefore we filter the result set afterwards. Doing the same on HDFS (which walks the tree) would be suboptimal, since there the filter can be applied during listing.
+        listS3FilesRecursive(base, fs, results);
+      } else {
+        listStatusRecursively(fs, fs.getFileStatus(base), results);
       }
-      return result;
-    }
-
-    // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
-    StringBuilder sb = new StringBuilder(path.toUri().getPath());
-    for (int i = 0; i < level; i++) {
-      sb.append(Path.SEPARATOR).append("*");
+      return results;
+    } catch (IOException e) {
+      // The globStatus() API returns an empty FileStatus[] when the specified
+      // path does not exist, but getFileStatus() throws an IOException. To
+      // mimic that behavior we return an empty list on exception. For external
+      // tables, the table path does not yet exist at table creation time.
+      return new ArrayList<>(0);
     }
-    Path pathPattern = new Path(path, sb.toString());
-    return Lists.newArrayList(fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER));
   }
 
   /**
@@ -361,7 +349,7 @@ public class FileUtils {
    * @param results
    *          receives enumeration of all files found
    */
-  public static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
+  private static void listStatusRecursively(FileSystem fs, FileStatus fileStatus,
                                            List<FileStatus> results) throws IOException {
 
     if (fileStatus.isDir()) {
@@ -373,6 +361,25 @@ public class FileUtils {
     }
   }
 
+  private static void listS3FilesRecursive(Path base, FileSystem fs, List<FileStatus> results) throws IOException {
+    RemoteIterator<LocatedFileStatus> remoteIterator = fs.listFiles(base, true);
+    while (remoteIterator.hasNext()) {
+      LocatedFileStatus each = remoteIterator.next();
+      Path relativePath = new Path(each.getPath().toString().replace(base.toString(), ""));
+      if (RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER.accept(relativePath)) {
+        results.add(each);
+      }
+    }
+  }
+
+  public static boolean isS3a(FileSystem fs) {
+    try {
+      return "s3a".equalsIgnoreCase(fs.getScheme());
+    } catch (UnsupportedOperationException ex) {
+      return false;
+    }
+  }
+
   public static String makePartName(List<String> partCols, List<String> vals) {
     return makePartName(partCols, vals, null);
   }
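
One detail worth spelling out: listFiles() accepts no PathFilter, so both
copies of listS3FilesRecursive strip the base prefix from each returned path
and apply the hidden-file check to the remainder only; a "." or "_" component
above the listed root therefore does not hide the files beneath it. A rough
stand-in for that check (the patch uses
RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER; this reimplementation
and the paths are illustrative):

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.PathFilter;

    public class HiddenPathSketch {
      // Illustrative stand-in for HIDDEN_FILES_FULL_PATH_FILTER: reject a
      // path if any of its components starts with "." or "_".
      static final PathFilter HIDDEN_FILTER = p -> {
        for (Path cur = p; cur != null && !cur.getName().isEmpty(); cur = cur.getParent()) {
          String name = cur.getName();
          if (name.startsWith(".") || name.startsWith("_")) {
            return false;
          }
        }
        return true;
      };

      public static void main(String[] args) {
        Path base = new Path("s3a://bucket/warehouse/t");
        Path file = new Path("s3a://bucket/warehouse/t/part=1/.hidden/data");
        // Strip the base prefix so only components below the listed root are
        // inspected, mirroring the relativePath computation in the patch.
        Path relative = new Path(file.toString().replace(base.toString(), ""));
        System.out.println(HIDDEN_FILTER.accept(relative)); // prints false
      }
    }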