Posted to commits@impala.apache.org by ta...@apache.org on 2018/01/29 04:21:36 UTC

[1/4] impala git commit: IMPALA-6377: Bump breakpad version to include the fix for Breakpad #752

Repository: impala
Updated Branches:
  refs/heads/2.x 42ce1f959 -> 8ae6080a9


IMPALA-6377: Bump breakpad version to include the fix for Breakpad #752

This change bumps the Breakpad version to pull in the fix for
https://bugs.chromium.org/p/google-breakpad/issues/detail?id=752 .

Change-Id: I9b6212ab77eff2df0cb51339c25183ae4f22b07b
Reviewed-on: http://gerrit.cloudera.org:8080/9131
Reviewed-by: Sailesh Mukil <sa...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/8ae6080a
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/8ae6080a
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/8ae6080a

Branch: refs/heads/2.x
Commit: 8ae6080a956496f5f76b5e5282fb6d36751e6d80
Parents: 1b1bd7c
Author: Lars Volker <lv...@cloudera.com>
Authored: Wed Jan 24 18:31:40 2018 -0800
Committer: Philip Zeyliger <ph...@cloudera.com>
Committed: Thu Jan 25 16:30:18 2018 -0800

----------------------------------------------------------------------
 bin/impala-config.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/8ae6080a/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 058dc01..bb1cb15 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -72,7 +72,7 @@ fi
 # moving to a different build of the toolchain, e.g. when a version is bumped or a
 # compile option is changed. The build id can be found in the output of the toolchain
 # build jobs, it is constructed from the build number and toolchain git hash prefix.
-export IMPALA_TOOLCHAIN_BUILD_ID=482-c2361403fc
+export IMPALA_TOOLCHAIN_BUILD_ID=39-a1bbd2851a
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p4
@@ -81,7 +81,7 @@ export IMPALA_BINUTILS_VERSION=2.26.1
 unset IMPALA_BINUTILS_URL
 export IMPALA_BOOST_VERSION=1.57.0-p3
 unset IMPALA_BOOST_URL
-export IMPALA_BREAKPAD_VERSION=1b704857f1e78a864e6942e613457e55f1aecb60-p3
+export IMPALA_BREAKPAD_VERSION=97a98836768f8f0154f8f86e5e14c2bb7e74132e-p2
 unset IMPALA_BREAKPAD_URL
 export IMPALA_BZIP2_VERSION=1.0.6-p2
 unset IMPALA_BZIP2_URL


[4/4] impala git commit: IMPALA-6383: free memory after skipping parquet row groups

Posted by ta...@apache.org.
IMPALA-6383: free memory after skipping parquet row groups

Before this patch, resources were only flushed after breaking out of
NextRowGroup(). This is a problem because resources can be allocated
for skipped row groups (e.g. for reading dictionaries).

Testing:
Tested in conjunction with a prototype buffer pool patch that was
DCHECKing before the change.

Added DCHECKs to the current version to ensure the streams are
cleaned up as expected.

Ran the repro for IMPALA-6419 to confirm this iteration of the patch
fixed the original problem.
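
To make the resource lifecycle concrete, here is a minimal
standalone C++ sketch of the pattern this patch establishes (all
names here are hypothetical stand-ins, not the scanner's real API):
every path that skips a row group after acquiring resources must
release them before continuing the loop.

  #include <vector>

  // Hypothetical stand-ins for the scanner's real types; the point
  // is the control flow, not the types.
  struct RowGroup { bool skip; };

  struct Scanner {
    int live_resources = 0;

    void AcquireResources() { ++live_resources; }  // e.g. dictionary buffers
    void ReleaseSkippedRowGroupResources() { --live_resources; }

    // Mirrors the fixed NextRowGroup(): every skip path releases the
    // resources acquired for that row group before 'continue'.
    void NextRowGroup(const std::vector<RowGroup>& groups) {
      for (const RowGroup& rg : groups) {
        AcquireResources();
        if (rg.skip) {
          ReleaseSkippedRowGroupResources();  // the fix: free before skipping
          continue;
        }
        break;  // found a row group to scan; its resources stay live
      }
    }
  };

  int main() {
    Scanner s;
    s.NextRowGroup({{true}, {true}, {false}});
    // Only the selected (third) row group's resources remain live.
    return s.live_resources == 1 ? 0 : 1;
  }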

Change-Id: I95713675455f7635fa3f72616b166f35e2a46c1a
Reviewed-on: http://gerrit.cloudera.org:8080/9059
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c0c3ba7f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c0c3ba7f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c0c3ba7f

Branch: refs/heads/2.x
Commit: c0c3ba7f45d2897083c5d6d4442b95ea7da1c3e0
Parents: 42ce1f9
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Jan 10 15:35:41 2018 -0800
Committer: Philip Zeyliger <ph...@cloudera.com>
Committed: Thu Jan 25 16:30:18 2018 -0800

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc | 52 +++++++++++++++++++++-----------
 be/src/exec/hdfs-parquet-scanner.h  |  5 +++
 be/src/exec/scanner-context.h       |  8 ++---
 3 files changed, 44 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c0c3ba7f/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 3a17a3b..c14edd7 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -229,6 +229,7 @@ Status HdfsParquetScanner::Open(ScannerContext* context) {
   // Release I/O buffers immediately to make sure they are cleaned up
   // in case we return a non-OK status anywhere below.
   context_->ReleaseCompletedResources(true);
+  context_->ClearStreams();
   RETURN_IF_ERROR(footer_status);
 
   // Parse the file schema into an internal representation for schema resolution.
@@ -264,7 +265,7 @@ void HdfsParquetScanner::Close(RowBatch* row_batch) {
     }
   } else {
     template_tuple_pool_->FreeAll();
-    dictionary_pool_.get()->FreeAll();
+    dictionary_pool_->FreeAll();
     context_->ReleaseCompletedResources(true);
     for (ParquetColumnReader* col_reader : column_readers_) col_reader->Close(nullptr);
     // The scratch batch may still contain tuple data. We can get into this case if
@@ -479,7 +480,6 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) {
     // Transfer resources and clear streams if there is any leftover from the previous
     // row group. We will create new streams for the next row group.
     FlushRowGroupResources(row_batch);
-    context_->ClearStreams();
     if (!advance_row_group_) {
       Status status =
           ValidateEndOfRowGroup(column_readers_, row_group_idx_, row_group_rows_read_);
@@ -620,6 +620,9 @@ Status HdfsParquetScanner::NextRowGroup() {
   while (true) {
     // Reset the parse status for the next row group.
     parse_status_ = Status::OK();
+    // Make sure that we don't have leftover resources from the file metadata scan range
+    // or previous row groups.
+    DCHECK_EQ(0, context_->NumStreams());
 
     ++row_group_idx_;
     if (row_group_idx_ >= file_metadata_.row_groups.size()) {
@@ -672,6 +675,9 @@ Status HdfsParquetScanner::NextRowGroup() {
     // of the column.
     RETURN_IF_ERROR(InitScalarColumns(row_group_idx_, dict_filterable_readers_));
 
+    // InitColumns() may have allocated resources to scan columns. If we skip this row
+    // group below, we must call ReleaseSkippedRowGroupResources() before continuing.
+
     // If there is a dictionary-encoded column where every value is eliminated
     // by a conjunct, the row group can be eliminated. This initializes dictionaries
     // for all columns visited.
@@ -680,10 +686,12 @@ Status HdfsParquetScanner::NextRowGroup() {
     if (!status.ok()) {
       // Either return an error or skip this row group if it is ok to ignore errors
       RETURN_IF_ERROR(state_->LogOrReturnError(status.msg()));
+      ReleaseSkippedRowGroupResources();
       continue;
     }
     if (skip_row_group_on_dict_filters) {
       COUNTER_ADD(num_dict_filtered_row_groups_counter_, 1);
+      ReleaseSkippedRowGroupResources();
       continue;
     }
 
@@ -695,10 +703,11 @@ Status HdfsParquetScanner::NextRowGroup() {
     if (!status.ok()) {
       // Either return an error or skip this row group if it is ok to ignore errors
       RETURN_IF_ERROR(state_->LogOrReturnError(status.msg()));
+      ReleaseSkippedRowGroupResources();
       continue;
     }
 
-    bool seeding_ok = true;
+    bool seeding_failed = false;
     for (ParquetColumnReader* col_reader: column_readers_) {
       // Seed collection and boolean column readers with NextLevel().
       // The ScalarColumnReaders use an optimized ReadValueBatch() that
@@ -707,19 +716,21 @@ Status HdfsParquetScanner::NextRowGroup() {
       // ScalarColumnReader::ReadValueBatch() which does not need seeding. This
       // will allow better sharing of code between the row-wise and column-wise
       // materialization strategies.
-      if (col_reader->NeedsSeedingForBatchedReading()) {
-        if (!col_reader->NextLevels()) {
-          seeding_ok = false;
-          break;
-        }
+      if (col_reader->NeedsSeedingForBatchedReading()
+          && !col_reader->NextLevels()) {
+        seeding_failed = true;
+        break;
       }
-      DCHECK(parse_status_.ok()) << "Invalid parse_status_" << parse_status_.GetDetail();
     }
-
-    if (!parse_status_.ok()) {
-      RETURN_IF_ERROR(state_->LogOrReturnError(parse_status_.msg()));
-    } else if (seeding_ok) {
-      // Found a non-empty row group and successfully initialized the column readers.
+    if (seeding_failed) {
+      if (!parse_status_.ok()) {
+        RETURN_IF_ERROR(state_->LogOrReturnError(parse_status_.msg()));
+      }
+      ReleaseSkippedRowGroupResources();
+      continue;
+    } else {
+      // Seeding succeeded - we're ready to read the row group.
+      DCHECK(parse_status_.ok()) << "Invalid parse_status_" << parse_status_.GetDetail();
       break;
     }
   }
@@ -733,9 +744,16 @@ void HdfsParquetScanner::FlushRowGroupResources(RowBatch* row_batch) {
   row_batch->tuple_data_pool()->AcquireData(dictionary_pool_.get(), false);
   scratch_batch_->ReleaseResources(row_batch->tuple_data_pool());
   context_->ReleaseCompletedResources(true);
-  for (ParquetColumnReader* col_reader : column_readers_) {
-    col_reader->Close(row_batch);
-  }
+  for (ParquetColumnReader* col_reader : column_readers_) col_reader->Close(row_batch);
+  context_->ClearStreams();
+}
+
+void HdfsParquetScanner::ReleaseSkippedRowGroupResources() {
+  dictionary_pool_->FreeAll();
+  scratch_batch_->ReleaseResources(nullptr);
+  context_->ReleaseCompletedResources(true);
+  for (ParquetColumnReader* col_reader : column_readers_) col_reader->Close(nullptr);
+  context_->ClearStreams();
 }
 
 bool HdfsParquetScanner::IsDictFilterable(BaseScalarColumnReader* col_reader) {

http://git-wip-us.apache.org/repos/asf/impala/blob/c0c3ba7f/be/src/exec/hdfs-parquet-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h
index 2ddf0fc..b1409d7 100644
--- a/be/src/exec/hdfs-parquet-scanner.h
+++ b/be/src/exec/hdfs-parquet-scanner.h
@@ -654,6 +654,11 @@ class HdfsParquetScanner : public HdfsScanner {
   /// Should be called after completing a row group and when returning the last batch.
   void FlushRowGroupResources(RowBatch* row_batch);
 
+  /// Releases resources associated with a row group that was skipped and closes all
+  /// column readers. Should be called after skipping a row group from which no rows
+  /// were returned.
+  void ReleaseSkippedRowGroupResources();
+
   /// Evaluates whether the column reader is eligible for dictionary predicates
   bool IsDictFilterable(ParquetColumnReader* col_reader);
 

http://git-wip-us.apache.org/repos/asf/impala/blob/c0c3ba7f/be/src/exec/scanner-context.h
----------------------------------------------------------------------
diff --git a/be/src/exec/scanner-context.h b/be/src/exec/scanner-context.h
index e316063..09a4bdc 100644
--- a/be/src/exec/scanner-context.h
+++ b/be/src/exec/scanner-context.h
@@ -89,7 +89,6 @@ class ScannerContext {
   ScannerContext(RuntimeState*, HdfsScanNodeBase*, HdfsPartitionDescriptor*,
       io::ScanRange* scan_range, const std::vector<FilterContext>& filter_ctxs,
       MemPool* expr_results_pool);
-
   /// Destructor verifies that all stream objects have been released.
   ~ScannerContext();
 
@@ -338,6 +337,8 @@ class ScannerContext {
     return streams_[idx].get();
   }
 
+  int NumStreams() const { return streams_.size(); }
+
   /// Release completed resources for all streams, e.g. the last buffer in each stream if
   /// the current read position is at the end of the buffer. If 'done' is true all
   /// resources are freed, even if the caller has not read that data yet. After calling
@@ -354,8 +355,8 @@ class ScannerContext {
   /// size to 0.
   void ClearStreams();
 
-  /// Add a stream to this ScannerContext for 'range'. Returns the added stream.
-  /// The stream is created in the runtime state's object pool
+  /// Add a stream to this ScannerContext for 'range'. The stream is owned by this
+  /// context.
   Stream* AddStream(io::ScanRange* range);
 
   /// Returns false if scan_node_ is multi-threaded and has been cancelled.
@@ -370,7 +371,6 @@ class ScannerContext {
 
   RuntimeState* state_;
   HdfsScanNodeBase* scan_node_;
-
   HdfsPartitionDescriptor* partition_desc_;
 
   /// Vector of streams. Non-columnar formats will always have one stream per context.


[3/4] impala git commit: IMPALA-6410: Use subprocess in compare_branches.py.

Posted by ta...@apache.org.
IMPALA-6410: Use subprocess in compare_branches.py.

Switches bin/compare_branches.py to use 'subprocess' instead
of 'sh'. We often use 'sh' in Impala testing code for its
friendly API, but it has to be installed separately. So that
automation that only performs git operations does not need to
build the Impala Python environment or install extra libraries,
I converted the usages.

As a side effect, the script now prints the stdout of 'git
cherry-pick', which it used to swallow. I like it better this way.

I tested this by running it in an environment which needed
some cherry-picks.

Change-Id: I509a548a129e7ad67aaf800a8ba03cffad51dd81
Reviewed-on: http://gerrit.cloudera.org:8080/9130
Reviewed-by: Jim Apple <jb...@apache.org>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1b1bd7c3
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1b1bd7c3
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1b1bd7c3

Branch: refs/heads/2.x
Commit: 1b1bd7c31b3a1efac6b1285ea055145b2b27c1d3
Parents: b08e6eb
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Wed Jan 24 15:54:27 2018 -0800
Committer: Philip Zeyliger <ph...@cloudera.com>
Committed: Thu Jan 25 16:30:18 2018 -0800

----------------------------------------------------------------------
 bin/compare_branches.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1b1bd7c3/bin/compare_branches.py
----------------------------------------------------------------------
diff --git a/bin/compare_branches.py b/bin/compare_branches.py
index 6a81901..7050924 100755
--- a/bin/compare_branches.py
+++ b/bin/compare_branches.py
@@ -64,7 +64,7 @@ import json
 import logging
 import os
 import re
-import sh
+import subprocess
 import sys
 
 from collections import defaultdict
@@ -136,8 +136,8 @@ def build_commit_map(branch, merge_base):
   fields = ['%H', '%s', '%an', '%cd', '%b']
   pretty_format = '\x1f'.join(fields) + '\x1e'
   result = OrderedDict()
-  for line in sh.git.log(
-      branch, "^" + merge_base, pretty=pretty_format, color='never').split('\x1e'):
+  for line in subprocess.check_output(["git", "log", branch, "^" + merge_base,
+    "--pretty=" + pretty_format, "--color=never"]).split('\x1e'):
     if line == "":
       # if no changes are identified by the git log, we get an empty string
       continue
@@ -174,8 +174,9 @@ def cherrypick(cherry_pick_hashes, full_target_branch_name):
     return
 
   # Cherrypicking only makes sense if we're on the equivalent of the target branch.
-  head_sha = sh.git('rev-parse', 'HEAD').strip()
-  target_branch_sha = sh.git('rev-parse', full_target_branch_name).strip()
+  head_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
+  target_branch_sha = subprocess.check_output(
+      ['git', 'rev-parse', full_target_branch_name]).strip()
   if head_sha != target_branch_sha:
     print "Cannot cherrypick because %s (%s) and HEAD (%s) are divergent." % (
         full_target_branch_name, target_branch_sha, head_sha)
@@ -183,7 +184,8 @@ def cherrypick(cherry_pick_hashes, full_target_branch_name):
 
   cherry_pick_hashes.reverse()
   for cherry_pick_hash in cherry_pick_hashes:
-    sh.git('cherry-pick', '--keep-redundant-commits', cherry_pick_hash)
+    subprocess.check_call(
+        ['git', 'cherry-pick', '--keep-redundant-commits', cherry_pick_hash])
 
 
 def main():
@@ -202,19 +204,19 @@ def main():
   # Ensure all branches are up to date, unless remotes are disabled
   # by specifying them with an empty string.
   if options.source_remote_name != "":
-    sh.git.fetch(options.source_remote_name)
+    subprocess.check_call(['git', 'fetch', options.source_remote_name])
     full_source_branch_name = options.source_remote_name + '/' + options.source_branch
   else:
     full_source_branch_name = options.source_branch
   if options.target_remote_name != "":
     if options.source_remote_name != options.target_remote_name:
-      sh.git.fetch(options.target_remote_name)
+      subprocess.check_call(['git', 'fetch', options.target_remote_name])
     full_target_branch_name = options.target_remote_name + '/' + options.target_branch
   else:
     full_target_branch_name = options.target_branch
 
-  merge_base = sh.git("merge-base",
-      full_source_branch_name, full_target_branch_name).strip()
+  merge_base = subprocess.check_output(["git", "merge-base",
+      full_source_branch_name, full_target_branch_name]).strip()
   source_commits = build_commit_map(full_source_branch_name, merge_base)
   target_commits = build_commit_map(full_target_branch_name, merge_base)
 


[2/4] impala git commit: KUDU-2270: Add a flag to control logging in RpczStore::LogTrace()

Posted by ta...@apache.org.
KUDU-2270: Add a flag to control logging in RpczStore::LogTrace()

This change adds a new flag FLAGS_rpc_duration_too_long_ms
which controls the duration above which an RPC is considered
too long and is logged at INFO level. Previously, this
threshold was hardcoded to 1000ms, which may be too short
for a busy impalad daemon, leading to massive log spew.
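
The mechanics of the change are in the diff below; as a rough
standalone sketch of the same gflags pattern (a hypothetical toy
program, not Kudu's actual rpcz_store.cc), replacing a hardcoded
threshold with a flag looks like this:

  #include <gflags/gflags.h>
  #include <iostream>
  #include <string>

  // A runtime-configurable threshold instead of a hardcoded 1000ms.
  DEFINE_int32(rpc_duration_too_long_ms, 1000,
               "Threshold (in milliseconds) above which an RPC is "
               "considered too long and is logged at INFO level.");

  void MaybeLogSlowRpc(const std::string& method, int duration_ms) {
    if (duration_ms > FLAGS_rpc_duration_too_long_ms) {
      std::cout << method << " took " << duration_ms << "ms\n";
    }
  }

  int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
    MaybeLogSlowRpc("EchoRpc", 1500);  // logged with the default threshold
    return 0;
  }

A busy deployment could then presumably raise the threshold at
startup, e.g. --rpc_duration_too_long_ms=5000, since the flag is
tagged as a runtime flag.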

Change-Id: Ie587ee602e83bb65d74f7ee622a9bc47897f2574
Reviewed-on: http://gerrit.cloudera.org:8080/9117
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Kudu Jenkins
Reviewed-on: http://gerrit.cloudera.org:8080/9121
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b08e6eb4
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b08e6eb4
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b08e6eb4

Branch: refs/heads/2.x
Commit: b08e6eb4f36f3dccfc409dd3804e4fbe3fe57acd
Parents: c0c3ba7
Author: Michael Ho <kw...@cloudera.com>
Authored: Wed Jan 24 00:05:11 2018 -0800
Committer: Philip Zeyliger <ph...@cloudera.com>
Committed: Thu Jan 25 16:30:18 2018 -0800

----------------------------------------------------------------------
 be/src/kudu/rpc/rpc-test.cc   | 1 +
 be/src/kudu/rpc/rpcz_store.cc | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b08e6eb4/be/src/kudu/rpc/rpc-test.cc
----------------------------------------------------------------------
diff --git a/be/src/kudu/rpc/rpc-test.cc b/be/src/kudu/rpc/rpc-test.cc
index dc29323..6d9d156 100644
--- a/be/src/kudu/rpc/rpc-test.cc
+++ b/be/src/kudu/rpc/rpc-test.cc
@@ -43,6 +43,7 @@ METRIC_DECLARE_histogram(rpc_incoming_queue_time);
 
 DECLARE_bool(rpc_reopen_outbound_connections);
 DECLARE_int32(rpc_negotiation_inject_delay_ms);
+DECLARE_int32(rpc_duration_too_long_ms);
 
 using std::shared_ptr;
 using std::string;

http://git-wip-us.apache.org/repos/asf/impala/blob/b08e6eb4/be/src/kudu/rpc/rpcz_store.cc
----------------------------------------------------------------------
diff --git a/be/src/kudu/rpc/rpcz_store.cc b/be/src/kudu/rpc/rpcz_store.cc
index b1a0591..710a57e 100644
--- a/be/src/kudu/rpc/rpcz_store.cc
+++ b/be/src/kudu/rpc/rpcz_store.cc
@@ -40,6 +40,13 @@ DEFINE_bool_hidden(rpc_dump_all_traces, false,
 TAG_FLAG(rpc_dump_all_traces, advanced);
 TAG_FLAG(rpc_dump_all_traces, runtime);
 
+DEFINE_int32_hidden(rpc_duration_too_long_ms, 1000,
+             "Threshold (in milliseconds) above which a RPC is considered too long and its "
+             "duration and method name are logged at INFO level. The time measured is between "
+             "when a RPC is accepted and when its call handler completes.");
+TAG_FLAG(rpc_duration_too_long_ms, advanced);
+TAG_FLAG(rpc_duration_too_long_ms, runtime);
+
 using std::pair;
 using std::vector;
 using std::unique_ptr;
@@ -244,7 +251,7 @@ void RpczStore::LogTrace(InboundCall* call) {
   if (PREDICT_FALSE(FLAGS_rpc_dump_all_traces)) {
     LOG(INFO) << call->ToString() << " took " << duration_ms << "ms. Trace:";
     call->trace()->Dump(&LOG(INFO), true);
-  } else if (duration_ms > 1000) {
+  } else if (duration_ms > FLAGS_rpc_duration_too_long_ms) {
     LOG(INFO) << call->ToString() << " took " << duration_ms << "ms. "
               << "Request Metrics: " << call->trace()->MetricsAsJSON();
   }