You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/10/26 22:16:42 UTC
[2/3] incubator-impala git commit: IMPALA-1291: Parquet read fails if
io buffer size is less than the footer size
IMPALA-1291: Parquet read fails if io buffer size is less than the footer size
This commit introduces the compile-time constant READ_SIZE_MIN_VALUE
in the newly created common/global-flags.h
A static_assert checks if READ_SIZE_MIN_VALUE is greater than or equal to
HdfsParquetScanner::FOOTER_SIZE.
Also, moved the definition of read_size flag to global-flags.cc,
and declared it in global-flags.h.
Runtime checks test if read_size is greater than or eaqual to
READ_SIZE_MIN_VALUE. If not, the process is terminated.
Change-Id: Ib7364147e7daf5380f11368871f8479646b448da
Reviewed-on: http://gerrit.cloudera.org:8080/8366
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/bb73a296
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/bb73a296
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/bb73a296
Branch: refs/heads/master
Commit: bb73a2964bf35b3349267c0a79a89563ed232ef1
Parents: 00fd838
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Tue Oct 24 12:13:04 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Oct 26 21:32:34 2017 +0000
----------------------------------------------------------------------
be/src/common/global-flags.cc | 9 +++++++++
be/src/common/global-flags.h | 33 +++++++++++++++++++++++++++++++++
be/src/common/init.cc | 6 ++++++
be/src/exec/hdfs-parquet-scanner.h | 5 +++++
be/src/runtime/disk-io-mgr.cc | 11 +++--------
be/src/util/backend-gflag-util.cc | 2 +-
6 files changed, 57 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/global-flags.cc
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index cec27eb..29c16a5 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -172,3 +172,12 @@ DEFINE_bool(redirect_stdout_stderr, true,
DEFINE_int32(max_log_files, 10, "Maximum number of log files to retain per severity "
"level. The most recent log files are retained. If set to 0, all log files are "
"retained.");
+
+// The read size is the preferred size of the reads issued to HDFS or the local FS.
+// There is a trade off of latency and throughout, trying to keep disks busy but
+// not introduce seeks. The literature seems to agree that with 8 MB reads, random
+// io and sequential io perform similarly.
+DEFINE_int32(read_size, 8 * 1024 * 1024, "(Advanced) The preferred I/O request size in "
+ "bytes to issue to HDFS or the local filesystem. Increasing the read size will "
+ "increase memory requirements. Decreasing the read size may decrease I/O "
+ "throughput.");
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/global-flags.h
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.h b/be/src/common/global-flags.h
new file mode 100644
index 0000000..07ae00b
--- /dev/null
+++ b/be/src/common/global-flags.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_COMMON_GLOBAL_FLAGS_H
+#define IMPALA_COMMON_GLOBAL_FLAGS_H
+
+#include <gflags/gflags.h>
+
+namespace impala {
+
+constexpr int64_t READ_SIZE_MIN_VALUE = 1024 * 128;
+
+}
+
+DECLARE_int32(read_size);
+
+#endif
+
+
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/common/init.cc
----------------------------------------------------------------------
diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index 003ebf9..abbe416 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -20,11 +20,13 @@
#include <gperftools/heap-profiler.h>
#include <gperftools/malloc_extension.h>
+#include "common/global-flags.h"
#include "common/logging.h"
#include "common/status.h"
#include "exec/kudu-util.h"
#include "exprs/scalar-expr-evaluator.h"
#include "gutil/atomicops.h"
+#include "gutil/strings/substitute.h"
#include "rpc/authentication.h"
#include "rpc/thrift-util.h"
#include "runtime/bufferpool/buffer-pool.h"
@@ -193,6 +195,10 @@ void impala::InitCommonRuntime(int argc, char** argv, bool init_jvm,
const string& error_message = SetRedactionRulesFromFile(FLAGS_redaction_rules_file);
if (!error_message.empty()) CLEAN_EXIT_WITH_ERROR(error_message);
}
+ if (FLAGS_read_size < READ_SIZE_MIN_VALUE) {
+ CLEAN_EXIT_WITH_ERROR(Substitute("read_size can not be lower than $0",
+ READ_SIZE_MIN_VALUE));
+ }
impala::InitGoogleLoggingSafe(argv[0]);
// Breakpad needs flags and logging to initialize.
ABORT_IF_ERROR(RegisterMinidump(argv[0]));
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/exec/hdfs-parquet-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h
index 5f67036..a5333fe 100644
--- a/be/src/exec/hdfs-parquet-scanner.h
+++ b/be/src/exec/hdfs-parquet-scanner.h
@@ -20,6 +20,7 @@
#define IMPALA_EXEC_HDFS_PARQUET_SCANNER_H
#include "codegen/impala-ir.h"
+#include "common/global-flags.h"
#include "exec/hdfs-scanner.h"
#include "exec/parquet-common.h"
#include "exec/parquet-scratch-tuple-batch.h"
@@ -359,6 +360,10 @@ class HdfsParquetScanner : public HdfsScanner {
/// Size of the file footer. This is a guess. If this value is too little, we will
/// need to issue another read.
static const int64_t FOOTER_SIZE = 1024 * 100;
+ static_assert(FOOTER_SIZE <= READ_SIZE_MIN_VALUE,
+ "FOOTER_SIZE can not be greater than READ_SIZE_MIN_VALUE.\n"
+ "You can increase FOOTER_SIZE if you want, "
+ "just don't forget to increase READ_SIZE_MIN_VALUE as well.");
/// Class name in LLVM IR.
static const char* LLVM_CLASS_NAME;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index 7cc2af7..54dfc98 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include "common/global-flags.h"
#include "runtime/disk-io-mgr.h"
#include "runtime/disk-io-mgr-handle-cache.inline.h"
#include "runtime/disk-io-mgr-internal.h"
@@ -78,14 +79,7 @@ DEFINE_int32(num_s3_io_threads, 16, "Number of S3 I/O threads");
// enforced by ADLS for a cluster, which spans between 500-700. For smaller clusters
// (~10 nodes), 64 threads would be more ideal.
DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
-// The read size is the preferred size of the reads issued to HDFS or the local FS.
-// There is a trade off of latency and throughout, trying to keep disks busy but
-// not introduce seeks. The literature seems to agree that with 8 MB reads, random
-// io and sequential io perform similarly.
-DEFINE_int32(read_size, 8 * 1024 * 1024, "(Advanced) The preferred I/O request size in "
- "bytes to issue to HDFS or the local filesystem. Increasing the read size will "
- "increase memory requirements. Decreasing the read size may decrease I/O "
- "throughput.");
+
DECLARE_int64(min_buffer_size);
// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
@@ -304,6 +298,7 @@ DiskIoMgr::DiskIoMgr() :
file_handle_cache_(min(FLAGS_max_cached_file_handles,
FileSystemUtil::MaxNumFileHandles()),
FLAGS_unused_file_handle_timeout_sec) {
+ DCHECK_LE(READ_SIZE_MIN_VALUE, FLAGS_read_size);
int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
int num_local_disks = DiskInfo::num_disks();
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb73a296/be/src/util/backend-gflag-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index 598ba08..f5d2e64 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include "common/global-flags.h"
#include "util/backend-gflag-util.h"
#include "gen-cpp/BackendGflags_types.h"
@@ -28,7 +29,6 @@ DECLARE_bool(load_catalog_in_background);
DECLARE_bool(load_auth_to_local_rules);
DECLARE_bool(enable_stats_extrapolation);
DECLARE_int32(non_impala_java_vlog);
-DECLARE_int32(read_size);
DECLARE_int32(num_metadata_loading_threads);
DECLARE_int32(initial_hms_cnxn_timeout_s);
DECLARE_int32(kudu_operation_timeout_ms);