You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by to...@apache.org on 2019/05/13 22:54:37 UTC

[impala] 01/04: IMPALA-8428: Add support for caching file handles on s3

This is an automated email from the ASF dual-hosted git repository.

todd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7188ad32e6e94e90cbee492f3ad3628855f70925
Author: Sahil Takiar <st...@cloudera.com>
AuthorDate: Tue Mar 26 15:05:02 2019 -0500

    IMPALA-8428: Add support for caching file handles on s3
    
    This patch is based on work done by Joe McDonnell. This change adds
    support for caching file handles from S3. It adds a new configuration
    flag 'cache_s3_file_handles' (set to true by default) which controls
    whether or not caching of S3 file handles is enabled.
    
    The S3 file handle cache is dependent on HADOOP-14747 (S3AInputStream to
    implement CanUnbuffer). HADOOP-14747 adds support for hdfsUnbufferFile
    to S3A streams. The call to unbuffer closes the underlying S3 object
    stream. Without this change the S3 file handle cache would quickly cause
    an impalad to crash because all S3 file handles in the cache would have
    a dangling HTTP(S) connection open to S3.
    
    Testing:
    * Modified test_hdfs_fd_caching.py so it is enabled for S3 as well as
    remote HDFS
    * Ran core tests
    * Ran TPC-DS on a real cluster and validated that the S3 file handle
    cache works as expected
    * Ran several test queries on a real cluster with S3Guard enabled and
    validated that the S3 file handle cache works as expected
    
    Change-Id: I5b304d37bc724377fbe7955441cce0cec6fb7f19
    Reviewed-on: http://gerrit.cloudera.org:8080/13221
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/io/disk-io-mgr.cc             | 6 +++++-
 be/src/runtime/io/scan-range.cc              | 4 +++-
 tests/custom_cluster/test_hdfs_fd_caching.py | 9 +++------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/be/src/runtime/io/disk-io-mgr.cc b/be/src/runtime/io/disk-io-mgr.cc
index 0edc7e6..6c256b0 100644
--- a/be/src/runtime/io/disk-io-mgr.cc
+++ b/be/src/runtime/io/disk-io-mgr.cc
@@ -136,10 +136,14 @@ DEFINE_uint64(num_file_handle_cache_partitions, 16, "Number of partitions used b
     "file handle cache.");
 
 // This parameter controls whether remote HDFS file handles are cached. It does not impact
-// S3, ADLS, or ABFS file handles. This is enabled by default.
+// S3, ADLS, or ABFS file handles.
 DEFINE_bool(cache_remote_file_handles, true, "Enable the file handle cache for "
     "remote HDFS files.");
 
+// This parameter controls whether S3 file handles are cached.
+DEFINE_bool(cache_s3_file_handles, true, "Enable the file handle cache for "
+    "S3 files.");
+
 AtomicInt32 DiskIoMgr::next_disk_id_;
 
 string DiskIoMgr::DebugString() {
diff --git a/be/src/runtime/io/scan-range.cc b/be/src/runtime/io/scan-range.cc
index 6e3ca8c..5910387 100644
--- a/be/src/runtime/io/scan-range.cc
+++ b/be/src/runtime/io/scan-range.cc
@@ -38,6 +38,7 @@ DEFINE_int64(abfs_read_chunk_size, 128 * 1024, "The maximum read chunk size to u
     "reading from ABFS.");
 
 DECLARE_bool(cache_remote_file_handles);
+DECLARE_bool(cache_s3_file_handles);
 
 // Implementation of the ScanRange functionality. Each ScanRange contains a queue
 // of ready buffers. For each ScanRange, there is only a single producer and
@@ -207,7 +208,8 @@ ReadOutcome ScanRange::DoRead(int disk_id) {
   bool use_file_handle_cache = false;
   if (is_file_handle_caching_enabled() && !is_erasure_coded_ &&
       (expected_local_ ||
-       (FLAGS_cache_remote_file_handles && disk_id_ == io_mgr_->RemoteDfsDiskId()))) {
+       (FLAGS_cache_remote_file_handles && disk_id_ == io_mgr_->RemoteDfsDiskId()) ||
+       (FLAGS_cache_s3_file_handles && disk_id_ == io_mgr_->RemoteS3DiskId()))) {
     use_file_handle_cache = true;
   }
   Status read_status = file_reader_->Open(use_file_handle_cache);
diff --git a/tests/custom_cluster/test_hdfs_fd_caching.py b/tests/custom_cluster/test_hdfs_fd_caching.py
index 8849363..bdcfb7b 100644
--- a/tests/custom_cluster/test_hdfs_fd_caching.py
+++ b/tests/custom_cluster/test_hdfs_fd_caching.py
@@ -21,7 +21,6 @@ from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.skip import SkipIfLocal, SkipIfEC
 from tests.util.filesystem_utils import (
     IS_ISILON,
-    IS_S3,
     IS_ABFS,
     IS_ADLS)
 from time import sleep
@@ -131,10 +130,9 @@ class TestHdfsFdCaching(CustomClusterTestSuite):
 
     cache_capacity = 16
 
-    # Caching only applies to local HDFS files. If this is local HDFS, then verify
+    # Caching applies to HDFS and S3 files. If this is HDFS or S3, then verify
     # that caching works. Otherwise, verify that file handles are not cached.
-    if IS_S3 or IS_ABFS or IS_ADLS or IS_ISILON or \
-        pytest.config.option.testing_remote_cluster:
+    if IS_ABFS or IS_ADLS or IS_ISILON:
       caching_expected = False
     else:
       caching_expected = True
@@ -150,8 +148,7 @@ class TestHdfsFdCaching(CustomClusterTestSuite):
     handle_timeout = 5
 
     # Only test eviction on platforms where caching is enabled.
-    if IS_S3 or IS_ABFS or IS_ADLS or IS_ISILON or \
-        pytest.config.option.testing_remote_cluster:
+    if IS_ABFS or IS_ADLS or IS_ISILON:
       return
     caching_expected = True
     self.run_fd_caching_test(vector, caching_expected, cache_capacity, handle_timeout)