You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/10/26 22:16:42 UTC

[2/3] incubator-impala git commit: IMPALA-1291: Parquet read fails if io buffer size is less than the footer size

IMPALA-1291: Parquet read fails if io buffer size is less than the footer size

This commit introduces the compile-time constant READ_SIZE_MIN_VALUE
in the newly created common/global-flags.h

A static_assert checks if READ_SIZE_MIN_VALUE is greater than or equal to
HdfsParquetScanner::FOOTER_SIZE.

Also, moved the definition of read_size flag to global-flags.cc,
and declared it in global-flags.h.

Runtime checks test if read_size is greater than or eaqual to
READ_SIZE_MIN_VALUE. If not, the process is terminated.

Change-Id: Ib7364147e7daf5380f11368871f8479646b448da
Reviewed-on: http://gerrit.cloudera.org:8080/8366
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/bb73a296
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/bb73a296
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/bb73a296

Branch: refs/heads/master
Commit: bb73a2964bf35b3349267c0a79a89563ed232ef1
Parents: 00fd838
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Tue Oct 24 12:13:04 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Oct 26 21:32:34 2017 +0000

----------------------------------------------------------------------
 be/src/common/global-flags.cc      |  9 +++++++++
 be/src/common/global-flags.h       | 33 +++++++++++++++++++++++++++++++++
 be/src/common/init.cc              |  6 ++++++
 be/src/exec/hdfs-parquet-scanner.h |  5 +++++
 be/src/runtime/disk-io-mgr.cc      | 11 +++--------
 be/src/util/backend-gflag-util.cc  |  2 +-
 6 files changed, 57 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/global-flags.cc
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index cec27eb..29c16a5 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -172,3 +172,12 @@ DEFINE_bool(redirect_stdout_stderr, true,
 DEFINE_int32(max_log_files, 10, "Maximum number of log files to retain per severity "
     "level. The most recent log files are retained. If set to 0, all log files are "
     "retained.");
+
+// The read size is the preferred size of the reads issued to HDFS or the local FS.
+// There is a trade off of latency and throughout, trying to keep disks busy but
+// not introduce seeks.  The literature seems to agree that with 8 MB reads, random
+// io and sequential io perform similarly.
+DEFINE_int32(read_size, 8 * 1024 * 1024, "(Advanced) The preferred I/O request size in "
+    "bytes to issue to HDFS or the local filesystem. Increasing the read size will "
+    "increase memory requirements. Decreasing the read size may decrease I/O "
+    "throughput.");

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/global-flags.h
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.h b/be/src/common/global-flags.h
new file mode 100644
index 0000000..07ae00b
--- /dev/null
+++ b/be/src/common/global-flags.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_COMMON_GLOBAL_FLAGS_H
+#define IMPALA_COMMON_GLOBAL_FLAGS_H
+
+#include <gflags/gflags.h>
+
+namespace impala {
+
+constexpr int64_t READ_SIZE_MIN_VALUE = 1024 * 128;
+
+}
+
+DECLARE_int32(read_size);
+
+#endif
+
+

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/init.cc
----------------------------------------------------------------------
diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index 003ebf9..abbe416 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -20,11 +20,13 @@
 #include <gperftools/heap-profiler.h>
 #include <gperftools/malloc_extension.h>
 
+#include "common/global-flags.h"
 #include "common/logging.h"
 #include "common/status.h"
 #include "exec/kudu-util.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "gutil/atomicops.h"
+#include "gutil/strings/substitute.h"
 #include "rpc/authentication.h"
 #include "rpc/thrift-util.h"
 #include "runtime/bufferpool/buffer-pool.h"
@@ -193,6 +195,10 @@ void impala::InitCommonRuntime(int argc, char** argv, bool init_jvm,
     const string& error_message = SetRedactionRulesFromFile(FLAGS_redaction_rules_file);
     if (!error_message.empty()) CLEAN_EXIT_WITH_ERROR(error_message);
   }
+  if (FLAGS_read_size < READ_SIZE_MIN_VALUE) {
+    CLEAN_EXIT_WITH_ERROR(Substitute("read_size can not be lower than $0",
+        READ_SIZE_MIN_VALUE));
+  }
   impala::InitGoogleLoggingSafe(argv[0]);
   // Breakpad needs flags and logging to initialize.
   ABORT_IF_ERROR(RegisterMinidump(argv[0]));

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/exec/hdfs-parquet-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h
index 5f67036..a5333fe 100644
--- a/be/src/exec/hdfs-parquet-scanner.h
+++ b/be/src/exec/hdfs-parquet-scanner.h
@@ -20,6 +20,7 @@
 #define IMPALA_EXEC_HDFS_PARQUET_SCANNER_H
 
 #include "codegen/impala-ir.h"
+#include "common/global-flags.h"
 #include "exec/hdfs-scanner.h"
 #include "exec/parquet-common.h"
 #include "exec/parquet-scratch-tuple-batch.h"
@@ -359,6 +360,10 @@ class HdfsParquetScanner : public HdfsScanner {
   /// Size of the file footer.  This is a guess.  If this value is too little, we will
   /// need to issue another read.
   static const int64_t FOOTER_SIZE = 1024 * 100;
+  static_assert(FOOTER_SIZE <= READ_SIZE_MIN_VALUE,
+      "FOOTER_SIZE can not be greater than READ_SIZE_MIN_VALUE.\n"
+      "You can increase FOOTER_SIZE if you want, "
+      "just don't forget to increase READ_SIZE_MIN_VALUE as well.");
 
   /// Class name in LLVM IR.
   static const char* LLVM_CLASS_NAME;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index 7cc2af7..54dfc98 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "common/global-flags.h"
 #include "runtime/disk-io-mgr.h"
 #include "runtime/disk-io-mgr-handle-cache.inline.h"
 #include "runtime/disk-io-mgr-internal.h"
@@ -78,14 +79,7 @@ DEFINE_int32(num_s3_io_threads, 16, "Number of S3 I/O threads");
 // enforced by ADLS for a cluster, which spans between 500-700. For smaller clusters
 // (~10 nodes), 64 threads would be more ideal.
 DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
-// The read size is the preferred size of the reads issued to HDFS or the local FS.
-// There is a trade off of latency and throughout, trying to keep disks busy but
-// not introduce seeks.  The literature seems to agree that with 8 MB reads, random
-// io and sequential io perform similarly.
-DEFINE_int32(read_size, 8 * 1024 * 1024, "(Advanced) The preferred I/O request size in "
-    "bytes to issue to HDFS or the local filesystem. Increasing the read size will "
-    "increase memory requirements. Decreasing the read size may decrease I/O "
-    "throughput.");
+
 DECLARE_int64(min_buffer_size);
 
 // With 1024B through 8MB buffers, this is up to ~2GB of buffers.
@@ -304,6 +298,7 @@ DiskIoMgr::DiskIoMgr() :
     file_handle_cache_(min(FLAGS_max_cached_file_handles,
         FileSystemUtil::MaxNumFileHandles()),
         FLAGS_unused_file_handle_timeout_sec) {
+  DCHECK_LE(READ_SIZE_MIN_VALUE, FLAGS_read_size);
   int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
   free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
   int num_local_disks = DiskInfo::num_disks();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/util/backend-gflag-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index 598ba08..f5d2e64 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "common/global-flags.h"
 #include "util/backend-gflag-util.h"
 
 #include "gen-cpp/BackendGflags_types.h"
@@ -28,7 +29,6 @@ DECLARE_bool(load_catalog_in_background);
 DECLARE_bool(load_auth_to_local_rules);
 DECLARE_bool(enable_stats_extrapolation);
 DECLARE_int32(non_impala_java_vlog);
-DECLARE_int32(read_size);
 DECLARE_int32(num_metadata_loading_threads);
 DECLARE_int32(initial_hms_cnxn_timeout_s);
 DECLARE_int32(kudu_operation_timeout_ms);