You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/01/09 07:00:35 UTC

[1/3] impala git commit: IMPALA-7446: enable buffer pool GC when near process mem limit

Repository: impala
Updated Branches:
  refs/heads/master 78da2b135 -> 274e96bd1


IMPALA-7446: enable buffer pool GC when near process mem limit

GC is performed when:
* The amount of memory allocated from the system for the buffer pool
  exceeds the reservation (i.e. free buffers and clean pages are not
  offset by unused reservation).
* The soft or hard process memory limit would otherwise cause an
  allocation to fail.

Testing:
Looped the old version of the semi_joins_exhaustive test, which
reliably reproduced the issue. I confirmed that the buffer pool GC was
running and that it preventing the query failures.

Added a backend test that reproed the issue. A large chunk of the code
change is to add infrastructure to use TCMalloc memory metrics
for the process memory tracker in backend tests.

Ran exhaustive tests.

Change-Id: I81e8e29f1ba319f1b499032f9518d32c511b4b21
Reviewed-on: http://gerrit.cloudera.org:8080/12133
Reviewed-by: Bikramjeet Vig <bi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/ae65ff83
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/ae65ff83
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/ae65ff83

Branch: refs/heads/master
Commit: ae65ff831966eb99417e15738c1aa96013a66f39
Parents: 78da2b1
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Thu Dec 27 09:59:57 2018 -0800
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Jan 9 05:26:50 2019 +0000

----------------------------------------------------------------------
 be/src/runtime/bufferpool/buffer-allocator.cc   |  4 +-
 be/src/runtime/bufferpool/buffer-pool-test.cc   | 42 ++++++++++++-
 be/src/runtime/exec-env.cc                      | 66 +++++++++++++-------
 be/src/runtime/exec-env.h                       |  4 ++
 be/src/runtime/mem-tracker.h                    |  2 +-
 be/src/runtime/test-env.cc                      | 15 ++++-
 be/src/runtime/test-env.h                       | 10 +++
 .../QueryTest/semi-joins-exhaustive.test        | 15 ++---
 tests/query_test/test_join_queries.py           |  5 +-
 9 files changed, 125 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/bufferpool/buffer-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator.cc b/be/src/runtime/bufferpool/buffer-allocator.cc
index 12ba151..7f6db92 100644
--- a/be/src/runtime/bufferpool/buffer-allocator.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator.cc
@@ -475,8 +475,10 @@ void BufferPool::BufferAllocator::ReleaseMemory(int64_t bytes_to_free) {
     FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
     // Free but don't claim any memory.
     bytes_freed += arena->FreeSystemMemory(bytes_to_free - bytes_freed, 0, nullptr).first;
-    if (bytes_freed >= bytes_to_free) return;
+    if (bytes_freed >= bytes_to_free) break;
   }
+  VLOG(1) << "BufferAllocator::ReleaseMemory(): Freed "
+          << PrettyPrinter::PrintBytes(bytes_freed);
 }
 
 int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) {

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/bufferpool/buffer-pool-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc
index 87723d9..58744a6 100644
--- a/be/src/runtime/bufferpool/buffer-pool-test.cc
+++ b/be/src/runtime/bufferpool/buffer-pool-test.cc
@@ -33,8 +33,8 @@
 #include "runtime/bufferpool/buffer-pool-internal.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
-#include "runtime/test-env.h"
 #include "runtime/query-state.h"
+#include "runtime/test-env.h"
 #include "service/fe-support.h"
 #include "testutil/cpu-util.h"
 #include "testutil/death-test-util.h"
@@ -42,8 +42,8 @@
 #include "testutil/rand-util.h"
 #include "util/blocking-queue.h"
 #include "util/filesystem-util.h"
-#include "util/spinlock.h"
 #include "util/metrics.h"
+#include "util/spinlock.h"
 
 #include "common/names.h"
 
@@ -2247,7 +2247,45 @@ TEST_F(BufferPoolTest, ConcurrentBufferOperations) {
   pool.DeregisterClient(&client);
   global_reservations_.Close();
 }
+
+// IMPALA-7446: the buffer pool GC hook that's set up in ExecEnv should
+// free cached buffers.
+TEST_F(BufferPoolTest, BufferPoolGc) {
+  const int64_t BUFFER_SIZE = 1024L * 1024L * 1024L;
+  // Set up a small buffer pool and process mem limit that fits only a single buffer.
+  // A large buffer size is used so that untracked memory is small relative to the
+  // buffer.
+  test_env_.reset(new TestEnv);
+  test_env_->SetBufferPoolArgs(1024, BUFFER_SIZE);
+  // Make sure we have a process memory tracker that uses TCMalloc metrics to match
+  // GC behaviour of a real impalad and reproduce IMPALA-7446. We need to add some
+  // extra headroom for other allocations and overhead.
+  test_env_->SetProcessMemTrackerArgs(BUFFER_SIZE * 3 / 2, true);
+  ASSERT_OK(test_env_->Init());
+  BufferPool* buffer_pool = test_env_->exec_env()->buffer_pool();
+  // Set up a client with unlimited reservation.
+  MemTracker* client_tracker = obj_pool_.Add(
+      new MemTracker(-1, "client", test_env_->exec_env()->process_mem_tracker()));
+  BufferPool::ClientHandle client;
+  ASSERT_OK(buffer_pool->RegisterClient("", nullptr,
+      test_env_->exec_env()->buffer_reservation(), client_tracker,
+      numeric_limits<int>::max(), NewProfile(), &client));
+
+  BufferPool::BufferHandle buffer;
+  ASSERT_TRUE(client.IncreaseReservation(BUFFER_SIZE));
+  // Make sure that buffers/pages were gc'ed and/or recycled.
+  EXPECT_EQ(0, buffer_pool->GetSystemBytesAllocated());
+  ASSERT_OK(buffer_pool->AllocateBuffer(&client, BUFFER_SIZE, &buffer));
+  buffer_pool->FreeBuffer(&client, &buffer);
+  ASSERT_OK(client.DecreaseReservationTo(numeric_limits<int64_t>::max(), 0));
+  // Before IMPALA-7446 was fixed, this reservation increase would fail because the
+  // free buffer counted against the process memory limit.
+  ASSERT_TRUE(client.IncreaseReservation(BUFFER_SIZE));
+  ASSERT_OK(buffer_pool->AllocateBuffer(&client, BUFFER_SIZE, &buffer));
+  buffer_pool->FreeBuffer(&client, &buffer);
+  buffer_pool->DeregisterClient(&client);
 }
+} // namespace impala
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/exec-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index 18c6b85..e60d752 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -62,6 +62,7 @@
 #include "util/openssl-util.h"
 #include "util/parse-util.h"
 #include "util/pretty-printer.h"
+#include "util/test-info.h"
 #include "util/thread-pool.h"
 #include "util/webserver.h"
 
@@ -305,29 +306,7 @@ Status ExecEnv::Init() {
   // Resolve hostname to IP address.
   RETURN_IF_ERROR(HostnameToIpAddr(FLAGS_hostname, &ip_address_));
 
-  mem_tracker_.reset(
-      new MemTracker(AggregateMemoryMetrics::TOTAL_USED, bytes_limit, "Process"));
-  // Add BufferPool MemTrackers for cached memory that is not tracked against queries
-  // but is included in process memory consumption.
-  obj_pool_->Add(new MemTracker(BufferPoolMetric::FREE_BUFFER_BYTES, -1,
-      "Buffer Pool: Free Buffers", mem_tracker_.get()));
-  obj_pool_->Add(new MemTracker(BufferPoolMetric::CLEAN_PAGE_BYTES, -1,
-      "Buffer Pool: Clean Pages", mem_tracker_.get()));
-  if (FLAGS_mem_limit_includes_jvm) {
-    // Add JVM metrics that should count against the process memory limit.
-    obj_pool_->Add(new MemTracker(
-        JvmMemoryMetric::HEAP_MAX_USAGE, -1, "JVM: max heap size", mem_tracker_.get()));
-    obj_pool_->Add(new MemTracker(JvmMemoryMetric::NON_HEAP_COMMITTED, -1,
-        "JVM: non-heap committed", mem_tracker_.get()));
-  }
-  // Also need a MemTracker for unused reservations as a negative value. Unused
-  // reservations are counted against queries but not against the process memory
-  // consumption. This accounts for that difference.
-  IntGauge* negated_unused_reservation = obj_pool_->Add(new NegatedGauge(
-      MakeTMetricDef("negated_unused_reservation", TMetricKind::GAUGE, TUnit::BYTES),
-      BufferPoolMetric::UNUSED_RESERVATION_BYTES));
-  obj_pool_->Add(new MemTracker(negated_unused_reservation, -1,
-      "Buffer Pool: Unused Reservation", mem_tracker_.get()));
+  InitMemTracker(bytes_limit);
 
   // Initializes the RPCMgr, ControlServices and DataStreamServices.
   krpc_address_.__set_hostname(ip_address_);
@@ -453,6 +432,47 @@ void ExecEnv::InitBufferPool(int64_t min_buffer_size, int64_t capacity,
   buffer_reservation_->InitRootTracker(nullptr, capacity);
 }
 
+void ExecEnv::InitMemTracker(int64_t bytes_limit) {
+  DCHECK(AggregateMemoryMetrics::TOTAL_USED != nullptr) << "Memory metrics not reg'd";
+  mem_tracker_.reset(
+      new MemTracker(AggregateMemoryMetrics::TOTAL_USED, bytes_limit, "Process"));
+  if (FLAGS_mem_limit_includes_jvm) {
+    // Add JVM metrics that should count against the process memory limit.
+    obj_pool_->Add(new MemTracker(
+        JvmMemoryMetric::HEAP_MAX_USAGE, -1, "JVM: max heap size", mem_tracker_.get()));
+    obj_pool_->Add(new MemTracker(JvmMemoryMetric::NON_HEAP_COMMITTED, -1,
+        "JVM: non-heap committed", mem_tracker_.get()));
+  }
+  if (buffer_pool_ != nullptr) {
+    // Tests can create multiple buffer pools, meaning that the metrics are not usable.
+    if (!TestInfo::is_test()) {
+      // Add BufferPool MemTrackers for cached memory that is not tracked against queries
+      // but is included in process memory consumption.
+      obj_pool_->Add(new MemTracker(BufferPoolMetric::FREE_BUFFER_BYTES, -1,
+          "Buffer Pool: Free Buffers", mem_tracker_.get()));
+      obj_pool_->Add(new MemTracker(BufferPoolMetric::CLEAN_PAGE_BYTES, -1,
+          "Buffer Pool: Clean Pages", mem_tracker_.get()));
+      // Also need a MemTracker for unused reservations as a negative value. Unused
+      // reservations are counted against queries but not against the process memory
+      // consumption. This accounts for that difference.
+      IntGauge* negated_unused_reservation = obj_pool_->Add(new NegatedGauge(
+          MakeTMetricDef("negated_unused_reservation", TMetricKind::GAUGE, TUnit::BYTES),
+          BufferPoolMetric::UNUSED_RESERVATION_BYTES));
+      obj_pool_->Add(new MemTracker(negated_unused_reservation, -1,
+          "Buffer Pool: Unused Reservation", mem_tracker_.get()));
+    }
+    mem_tracker_->AddGcFunction([buffer_pool=buffer_pool_.get()] (int64_t bytes_to_free)
+    {
+        // Only free memory in excess of the current reservation - the buffer pool
+        // does not need to give up cached memory that is offset by an unused reservation.
+        int64_t reserved = BufferPoolMetric::RESERVED->GetValue();
+        int64_t allocated_from_sys = BufferPoolMetric::SYSTEM_ALLOCATED->GetValue();
+        if (reserved >= allocated_from_sys) return;
+        buffer_pool->ReleaseMemory(min(bytes_to_free, allocated_from_sys - reserved));
+    });
+  }
+}
+
 TNetworkAddress ExecEnv::GetThriftBackendAddress() const {
   DCHECK(impala_server_ != nullptr);
   return impala_server_->GetThriftBackendAddress();

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/exec-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.h b/be/src/runtime/exec-env.h
index 9047521..465fe7a 100644
--- a/be/src/runtime/exec-env.h
+++ b/be/src/runtime/exec-env.h
@@ -250,6 +250,10 @@ class ExecEnv {
 
   /// Initialise 'buffer_pool_' and 'buffer_reservation_' with given capacity.
   void InitBufferPool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
+
+  /// Initialise 'mem_tracker_' with a limit of 'bytes_limit'. Must be called after
+  /// InitBufferPool() and RegisterMemoryMetrics().
+  void InitMemTracker(int64_t bytes_limit);
 };
 
 } // namespace impala

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/mem-tracker.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/mem-tracker.h b/be/src/runtime/mem-tracker.h
index 756a22a..8a66228 100644
--- a/be/src/runtime/mem-tracker.h
+++ b/be/src/runtime/mem-tracker.h
@@ -266,7 +266,7 @@ class MemTracker {
       if (mode == MemLimit::HARD && bytes_over_limit_metric_ != nullptr) {
         bytes_over_limit_metric_->SetValue(consumption() - limit_);
       }
-      return mode == MemLimit::HARD ? GcMemory(limit_) : true;
+      return GcMemory(GetLimit(mode));
     }
     return false;
   }

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/test-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.cc b/be/src/runtime/test-env.cc
index a82cc9d..14adc00 100644
--- a/be/src/runtime/test-env.cc
+++ b/be/src/runtime/test-env.cc
@@ -23,11 +23,12 @@
 #include "gutil/strings/substitute.h"
 #include "rpc/rpc-mgr.h"
 #include "runtime/query-exec-mgr.h"
-#include "runtime/tmp-file-mgr.h"
 #include "runtime/query-state.h"
+#include "runtime/tmp-file-mgr.h"
 #include "service/control-service.h"
 #include "util/disk-info.h"
 #include "util/impalad-metrics.h"
+#include "util/memory-metrics.h"
 
 #include "common/names.h"
 
@@ -49,11 +50,11 @@ Status TestEnv::Init() {
   if (static_metrics_ == NULL) {
     static_metrics_.reset(new MetricGroup("test-env-static-metrics"));
     ImpaladMetrics::CreateMetrics(static_metrics_.get());
+    RETURN_IF_ERROR(RegisterMemoryMetrics(static_metrics_.get(), true, nullptr, nullptr));
   }
 
   exec_env_.reset(new ExecEnv);
   // Populate the ExecEnv state that the backend tests need.
-  exec_env_->mem_tracker_.reset(new MemTracker(-1, "Process"));
   RETURN_IF_ERROR(exec_env_->disk_io_mgr()->Init());
   exec_env_->tmp_file_mgr_.reset(new TmpFileMgr);
   if (have_tmp_file_mgr_args_) {
@@ -66,6 +67,11 @@ Status TestEnv::Init() {
     exec_env_->InitBufferPool(buffer_pool_min_buffer_len_, buffer_pool_capacity_,
         static_cast<int64_t>(0.1 * buffer_pool_capacity_));
   }
+  if (process_mem_tracker_use_metrics_) {
+    exec_env_->InitMemTracker(process_mem_limit_);
+  } else {
+    exec_env_->mem_tracker_.reset(new MemTracker(process_mem_limit_, "Process"));
+  }
 
   // Initialize RpcMgr and control service.
   IpAddr ip_address;
@@ -90,6 +96,11 @@ void TestEnv::SetBufferPoolArgs(int64_t min_buffer_len, int64_t capacity) {
   buffer_pool_capacity_ = capacity;
 }
 
+void TestEnv::SetProcessMemTrackerArgs(int64_t bytes_limit, bool use_metrics) {
+  process_mem_limit_ = bytes_limit;
+  process_mem_tracker_use_metrics_ = use_metrics;
+}
+
 TestEnv::~TestEnv() {
   // Queries must be torn down first since they are dependent on global state.
   TearDownQueries();

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/be/src/runtime/test-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h
index 13a05a8..d146c8c 100644
--- a/be/src/runtime/test-env.h
+++ b/be/src/runtime/test-env.h
@@ -48,6 +48,12 @@ class TestEnv {
   /// If not called, a buffer pool with no capacity is created.
   void SetBufferPoolArgs(int64_t min_buffer_len, int64_t capacity);
 
+  /// Set configuration for the process memory tracker. Only has effect if called before
+  /// Init(). If 'use_metrics' is true, the usual process memory tracker that uses
+  /// memory consumption metrics is created, otherwise no metrics are used.
+  /// If not called, a process memory tracker with no limit is created.
+  void SetProcessMemTrackerArgs(int64_t bytes_limit, bool use_metrics);
+
   /// Initialize the TestEnv with the specified arguments.
   Status Init();
 
@@ -89,6 +95,10 @@ class TestEnv {
   int64_t buffer_pool_min_buffer_len_;
   int64_t buffer_pool_capacity_;
 
+  /// Arguments for process memory tracker, used in Init().
+  int64_t process_mem_limit_ = -1;
+  bool process_mem_tracker_use_metrics_ = false;
+
   /// Global state for test environment.
   static boost::scoped_ptr<MetricGroup> static_metrics_;
   boost::scoped_ptr<ExecEnv> exec_env_;

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/testdata/workloads/functional-query/queries/QueryTest/semi-joins-exhaustive.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/semi-joins-exhaustive.test b/testdata/workloads/functional-query/queries/QueryTest/semi-joins-exhaustive.test
index 2a7f101..a73b0ec 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/semi-joins-exhaustive.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/semi-joins-exhaustive.test
@@ -1,17 +1,18 @@
 ====
 ---- QUERY
-# Regression test for IMPALA-2256. Join whose right side has high cardinality and
-# zero materialized slots. Because the right side of the join here is always the
-# same key, this query can run out of memory and fail to spill; see IMPALA-4856.
-# The cardinality (~12M) is chosen so that the test run successfully in parallel
-# with other tests when impalad has a 7.8GB memlimit. (The peak memory usage of
-# the relevant fragment is ~850MB when tested.)
+# Regression test for IMPALA-2256. Join whose right side has very high
+# cardinality (60M) and zero materialized slots.
+# Because the right side of the join here is always
+# the same key, this query can run out of memory and fail to spill; see
+# IMPALA-4857. The cardinality (60M) is chosen so that the test
+# runs when impalad has a 7.8GB memlimit. (The peak memory usage
+# of the relevant fragment is 3.6GB when tested.)
 SELECT straight_join
 COUNT(*) FROM alltypesagg t1
 WHERE t1.int_col IN (
  SELECT 1 FROM alltypesagg t1
  CROSS JOIN alltypesagg t2
- WHERE t1.int_col < 100)
+ WHERE t1.int_col < 500)
 ---- RESULTS
 10
 ---- TYPES

http://git-wip-us.apache.org/repos/asf/impala/blob/ae65ff83/tests/query_test/test_join_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_join_queries.py b/tests/query_test/test_join_queries.py
index 51020c7..9bb2997 100644
--- a/tests/query_test/test_join_queries.py
+++ b/tests/query_test/test_join_queries.py
@@ -172,7 +172,8 @@ class TestSemiJoinQueries(ImpalaTestSuite):
     self.__load_semi_join_tables(unique_database)
     self.run_test_case('QueryTest/semi-joins', new_vector, unique_database)
 
+  @pytest.mark.execute_serially
   def test_semi_joins_exhaustive(self, vector):
+    """Expensive and memory-intensive semi-join tests."""
     if self.exploration_strategy() != 'exhaustive': pytest.skip()
-    new_vector = copy(vector)
-    self.run_test_case('QueryTest/semi-joins-exhaustive', new_vector)
+    self.run_test_case('QueryTest/semi-joins-exhaustive', vector)


[3/3] impala git commit: IMPALA-8059: Disable broken tests

Posted by ta...@apache.org.
IMPALA-8059: Disable broken tests

IMPALA-7625 caused some tests to fail but because the change otherwise
also addressed test failures we explicitly disable the affected tests
here instead of reverting IMPALA-7625.

Change-Id: Ibbd11840aac63dc7d483cafc9ee9b419dc840f37
Reviewed-on: http://gerrit.cloudera.org:8080/12190
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/274e96bd
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/274e96bd
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/274e96bd

Branch: refs/heads/master
Commit: 274e96bd147b5d91872c441c3a600fa8d5295bbe
Parents: fa78c59
Author: Lars Volker <lv...@cloudera.com>
Authored: Tue Jan 8 17:35:54 2019 -0800
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Jan 9 06:27:41 2019 +0000

----------------------------------------------------------------------
 tests/webserver/test_web_pages.py | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/274e96bd/tests/webserver/test_web_pages.py
----------------------------------------------------------------------
diff --git a/tests/webserver/test_web_pages.py b/tests/webserver/test_web_pages.py
index e70c27a..07c8da3 100644
--- a/tests/webserver/test_web_pages.py
+++ b/tests/webserver/test_web_pages.py
@@ -304,6 +304,7 @@ class TestWebPage(ImpalaTestSuite):
       self.client.cancel(query_handle)
     return response_json
 
+  @pytest.mark.xfail(run=True, reason="IMPALA-8059")
   def test_backend_states(self, unique_database):
     """Test that /query_backends returns the list of backend states for DML or
     queries; nothing for DDL statements"""
@@ -334,6 +335,7 @@ class TestWebPage(ImpalaTestSuite):
                                                         self.QUERY_BACKENDS_URL)
     assert 'backend_states' not in response_json
 
+  @pytest.mark.xfail(run=True, reason="IMPALA-8059")
   def test_backend_instances(self, unique_database, query_options=None):
     """Test that /query_finstances returns the list of fragment instances for DML or queries;
     nothing for DDL statements"""
@@ -368,6 +370,7 @@ class TestWebPage(ImpalaTestSuite):
                                                          self.QUERY_BACKENDS_URL)
     assert 'backend_instances' not in response_json
 
+  @pytest.mark.xfail(run=True, reason="IMPALA-8059")
   def test_backend_instances_mt_dop(self, unique_database):
     """Test that accessing /query_finstances does not crash the backend when running with
     mt_dop."""


[2/3] impala git commit: IMPALA-7924: Generate Thrift 11 Python Code

Posted by ta...@apache.org.
IMPALA-7924: Generate Thrift 11 Python Code

Upgrades the version of the toolchain in order to pull in Thrift 0.11.0.
Updates the CMake build to write generated Python code using Thrift 0.11
to shell/build/thrift-11-gen/gen-py/.

The Thrift 0.11 Python deserialization code has some big performance
improvements that allow faster parsing of runtime profiles. By adding
the ability to generate the Thrift Python code using Thrift 0.11 we can
take advantage of the Python performance improvements without going
through a full Thrift upgrade from 0.9 to 0.11.

Set USE_THRIFT11_GEN_PY=true and then run bin/set-pythonpath.sh to add
the Thrift 0.11 Python generated code to the PYTHONPATH rather than the
0.9 generated code.

Testing:
- Ran core tests

Change-Id: I3432c3e29d28ec3ef6a0a22156a18910f511fed0
Reviewed-on: http://gerrit.cloudera.org:8080/12036
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/fa78c594
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/fa78c594
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/fa78c594

Branch: refs/heads/master
Commit: fa78c594de39878902f3887a209f29f7976583d0
Parents: ae65ff8
Author: Sahil Takiar <ta...@gmail.com>
Authored: Mon Dec 3 16:09:04 2018 -0500
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Jan 9 05:54:59 2019 +0000

----------------------------------------------------------------------
 CMakeLists.txt               |   7 +++
 bin/bootstrap_toolchain.py   |   1 +
 bin/impala-config.sh         |   4 +-
 bin/set-pythonpath.sh        |  10 +++-
 common/thrift/CMakeLists.txt | 112 ++++++++++++++++++++++++++------------
 5 files changed, 96 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3fcc855..285e617 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,7 @@ set_dep_root(RE2)
 set_dep_root(RAPIDJSON)
 set_dep_root(SNAPPY)
 set_dep_root(THRIFT)
+set(THRIFT11_ROOT $ENV{IMPALA_TOOLCHAIN}/thrift-$ENV{IMPALA_THRIFT11_VERSION})
 set_dep_root(ZLIB)
 set_dep_root(CCTZ)
 
@@ -263,6 +264,12 @@ message(STATUS "Thrift version: ${THRIFT_VERSION}")
 message(STATUS "Thrift contrib dir: ${THRIFT_CONTRIB_DIR}")
 message(STATUS "Thrift compiler: ${THRIFT_COMPILER}")
 
+if (NOT EXISTS ${THRIFT11_ROOT}/bin/thrift)
+  message(FATAL_ERROR "Unable to find Thrift 11 compiler")
+endif(NOT EXISTS ${THRIFT11_ROOT}/bin/thrift)
+
+set(THRIFT11_COMPILER ${THRIFT11_ROOT}/bin/thrift)
+
 # find flatbuffers headers, lib and compiler
 find_package(FlatBuffers REQUIRED)
 IMPALA_ADD_THIRDPARTY_LIB(flatbuffers ${FLATBUFFERS_INCLUDE_DIR}

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index d7bf5b4..286aaaa 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -466,6 +466,7 @@ if __name__ == "__main__":
       "libunwind", "lz4", "openldap", "openssl", "orc", "protobuf",
       "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"])
   packages.insert(0, Package("llvm", "5.0.1-asserts-p1"))
+  packages.insert(0, Package("thrift", os.environ.get("IMPALA_THRIFT11_VERSION")))
   bootstrap(toolchain_root, packages)
 
   # Download the CDH components if necessary.

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 9e27f54..230554f 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -68,7 +68,7 @@ fi
 # moving to a different build of the toolchain, e.g. when a version is bumped or a
 # compile option is changed. The build id can be found in the output of the toolchain
 # build jobs, it is constructed from the build number and toolchain git hash prefix.
-export IMPALA_TOOLCHAIN_BUILD_ID=235-57748561c0
+export IMPALA_TOOLCHAIN_BUILD_ID=241-259ecff082
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p4
@@ -143,6 +143,8 @@ export IMPALA_TPC_H_VERSION=2.17.0
 unset IMPALA_TPC_H_URL
 export IMPALA_THRIFT_VERSION=0.9.3-p4
 unset IMPALA_THRIFT_URL
+export IMPALA_THRIFT11_VERSION=0.11.0-p2
+unset IMPALA_THRIFT11_URL
 export IMPALA_ZLIB_VERSION=1.2.8
 unset IMPALA_ZLIB_URL
 

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/bin/set-pythonpath.sh
----------------------------------------------------------------------
diff --git a/bin/set-pythonpath.sh b/bin/set-pythonpath.sh
index 10d0c67..86a5fe7 100755
--- a/bin/set-pythonpath.sh
+++ b/bin/set-pythonpath.sh
@@ -16,7 +16,15 @@
 # under the License.
 
 # set the python path for test modules and beeswax
-PYTHONPATH=${IMPALA_HOME}:${IMPALA_HOME}/shell/gen-py:${IMPALA_HOME}/testdata/
+# setting USE_THRIFT11_GEN_PY will add Thrift 11 Python generated code rather than the
+# default Thrift Python code
+PYTHONPATH=${IMPALA_HOME}
+if [ -n "${USE_THRIFT11_GEN_PY:-}" ]; then
+  PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/shell/build/thrift-11-gen/gen-py
+else
+  PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/shell/gen-py
+fi
+PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/testdata/
 
 # There should be just a single version of python that created the
 # site-packages directory. We find it by performing shell independent expansion

http://git-wip-us.apache.org/repos/asf/impala/blob/fa78c594/common/thrift/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/common/thrift/CMakeLists.txt b/common/thrift/CMakeLists.txt
index 87dcede..4d8646a 100644
--- a/common/thrift/CMakeLists.txt
+++ b/common/thrift/CMakeLists.txt
@@ -39,19 +39,19 @@ function(THRIFT_GEN VAR)
   ENDIF(NOT ARGN)
 
   set(${VAR})
-  foreach(FIL ${ARGN})
+  foreach(THRIFT_FILE ${ARGN})
     # Get full path
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    # Get basename
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    get_filename_component(ABS_THRIFT_FILE ${THRIFT_FILE} ABSOLUTE)
+    # Get basename without the file extension
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
 
     # All the output files we can determine based on filename.
     #   - Does not include .skeleton.cpp files
     #   - Does not include java output files
-    set(OUTPUT_BE_FILE "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_types.cpp")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_types.h")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_constants.cpp")
-    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${FIL_WE}_constants.h")
+    set(OUTPUT_BE_FILE "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_types.cpp")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_types.h")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_constants.cpp")
+    set(OUTPUT_BE_FILE ${OUTPUT_BE_FILE} "${BE_OUTPUT_DIR}/gen-cpp/${THRIFT_FILE_WE}_constants.h")
     list(APPEND ${VAR} ${OUTPUT_BE_FILE})
 
     # BeeswaxService thrift generation
@@ -60,38 +60,38 @@ function(THRIFT_GEN VAR)
     # We need to generate C++ src file for the parent dependencies using the "-r" option.
     set(CPP_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen cpp:moveable_types -o
         ${BE_OUTPUT_DIR})
-    IF (FIL STREQUAL "beeswax.thrift")
+    IF (THRIFT_FILE STREQUAL "beeswax.thrift")
       set(CPP_ARGS -r ${CPP_ARGS})
-    ENDIF(FIL STREQUAL "beeswax.thrift")
+    ENDIF(THRIFT_FILE STREQUAL "beeswax.thrift")
 
-    IF (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
+    IF (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
       # Do not generate Java HiveServer2 and Parquet files because we should use the jar
       # from Hive or Parquet.
       # Also do not generate ImpalaService.thrift because the generated code doesn't
       # compile with hive if the thrift version in hive is 0.9.0
       add_custom_command(
         OUTPUT ${OUTPUT_BE_FILE}
-        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${FIL}
-        DEPENDS ${ABS_FIL}
-        COMMENT "Running thrift compiler on ${FIL}"
+        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${THRIFT_FILE}
+        DEPENDS ${ABS_THRIFT_FILE}
+        COMMENT "Running thrift compiler on ${THRIFT_FILE}"
         VERBATIM
       )
-    ELSE (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
+    ELSE (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
       add_custom_command(
         OUTPUT ${OUTPUT_BE_FILE}
-        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${JAVA_FE_ARGS} ${FIL}
-        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${FIL}
-        DEPENDS ${ABS_FIL}
-        COMMENT "Running thrift compiler on ${FIL}"
+        COMMAND ${THRIFT_COMPILER} ${CPP_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${JAVA_FE_ARGS} ${THRIFT_FILE}
+        COMMAND ${THRIFT_COMPILER} ${PYTHON_ARGS} ${THRIFT_FILE}
+        DEPENDS ${ABS_THRIFT_FILE}
+        COMMENT "Running thrift compiler on ${THRIFT_FILE}"
         VERBATIM
     )
-    ENDIF (FIL STREQUAL ${TCLI_SERVICE_THRIFT} OR FIL STREQUAL "parquet.thrift" OR
-        FIL STREQUAL "ImpalaService.thrift")
-  endforeach(FIL)
+    ENDIF (THRIFT_FILE STREQUAL ${TCLI_SERVICE_THRIFT} OR THRIFT_FILE STREQUAL "parquet.thrift" OR
+        THRIFT_FILE STREQUAL "ImpalaService.thrift")
+  endforeach(THRIFT_FILE)
 
   set(${VAR} ${${VAR}} PARENT_SCOPE)
 endfunction(THRIFT_GEN)
@@ -103,28 +103,61 @@ function(THRIFT_GEN_DS VAR)
   ENDIF(NOT ARGN)
 
   set(${VAR})
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
+  foreach(THRIFT_FILE ${ARGN})
+    get_filename_component(ABS_THRIFT_FILE ${THRIFT_FILE} ABSOLUTE)
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
 
     # This isn't the exact output file that's created, just a unique file
-    set(OUTPUT_FILE "${EXT_DS_OUTPUT_DIR}/${FIL_WE}.java")
+    set(OUTPUT_FILE "${EXT_DS_OUTPUT_DIR}/${THRIFT_FILE_WE}.java")
     list(APPEND ${VAR} ${OUTPUT_FILE})
     add_custom_command(
       OUTPUT ${OUTPUT_FILE}
-      COMMAND ${THRIFT_COMPILER} ${JAVA_EXT_DS_ARGS} ${FIL}
-      DEPENDS ${ABS_FIL}
-      COMMENT "Running thrift compiler for ext-data-source on ${FIL}"
+      COMMAND ${THRIFT_COMPILER} ${JAVA_EXT_DS_ARGS} ${THRIFT_FILE}
+      DEPENDS ${ABS_THRIFT_FILE}
+      COMMENT "Running thrift compiler for ext-data-source on ${THRIFT_FILE}"
       VERBATIM
     )
-  endforeach(FIL)
+  endforeach(THRIFT_FILE)
   set(${VAR} ${${VAR}} PARENT_SCOPE)
 endfunction(THRIFT_GEN_DS)
 
+# Helper function similar to THRIFT_GEN and THRIFT_GEN_DS, but specific to Thrift 11. For
+# now the method only generates Python code. The generated Python code is not used by
+# any other Impala module, it is merely meant to provide devs access to Thrift 11 Python
+# generated code until a full upgrade from Thrift 0.9 to 0.11 has been completed. Once
+# the complete upgrade to Thrift 11 has been complete (see IMPALA-7825), this method
+# should be removed.
+function(THRIFT11_GEN VAR)
+  IF (NOT ARGN)
+    MESSAGE(SEND_ERROR "Error: THRIFT11_GEN called without any src files")
+    RETURN()
+  ENDIF(NOT ARGN)
+
+  set(${VAR})
+  foreach(THRIFT_FILE ${ARGN})
+    # Get full path
+    get_filename_component(ABS_THRIFT__FILE ${THRIFT_FILE} ABSOLUTE)
+    # Get basename without the file extension
+    get_filename_component(THRIFT_FILE_WE ${THRIFT_FILE} NAME_WE)
+
+    # This isn't the exact output file that's created, just a unique file
+    set(OUTPUT_FILE "${THRIFT11_PYTHON_OUTPUT_DIR}/${THRIFT_FILE_WE}.py")
+    list(APPEND ${VAR} ${OUTPUT_FILE})
+    add_custom_command(
+      OUTPUT ${OUTPUT_FILE}
+      COMMAND ${THRIFT11_COMPILER} ${THRIFT11_PYTHON_ARGS} ${THRIFT_FILE}
+      DEPENDS ${ABS_THRIFT_FILE}
+      COMMENT "Running thrift 11 compiler on ${THRIFT_FILE}"
+      VERBATIM
+    )
+  endforeach(THRIFT_FILE)
+  set(${VAR} ${${VAR}} PARENT_SCOPE)
+endfunction(THRIFT11_GEN)
 
 set(HIVE_THRIFT_SOURCE_DIR "hive-$ENV{IMPALA_HIVE_MAJOR_VERSION}-api")
 set(TCLI_SERVICE_THRIFT "${HIVE_THRIFT_SOURCE_DIR}/TCLIService.thrift")
 message("Using Thrift compiler: ${THRIFT_COMPILER}")
+message("Using Thrift 11 compiler: ${THRIFT11_COMPILER}")
 set(THRIFT_INCLUDE_DIR_OPTION -I ${THRIFT_CONTRIB_DIR} -I $ENV{HIVE_SRC_DIR}/metastore/if
   -I ${HIVE_THRIFT_SOURCE_DIR})
 set(BE_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/be/generated-sources)
@@ -132,17 +165,21 @@ set(FE_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/fe/generated-sources)
 # TODO: avoid duplicating generated java classes
 set(EXT_DS_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/ext-data-source/api/generated-sources)
 set(PYTHON_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/shell/)
+set(THRIFT11_PYTHON_OUTPUT_DIR ${PYTHON_OUTPUT_DIR}/build/thrift-11-gen)
 MESSAGE("Found output dir: " ${PYTHON_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${BE_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${FE_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${EXT_DS_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${PYTHON_OUTPUT_DIR})
 file(MAKE_DIRECTORY ${HIVE_THRIFT_SOURCE_DIR})
+file(MAKE_DIRECTORY ${THRIFT11_PYTHON_OUTPUT_DIR})
 
 # Args passed to thrift for Java gen
 set(JAVA_FE_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen java -o ${FE_OUTPUT_DIR})
 set(JAVA_EXT_DS_ARGS ${THRIFT_INCLUDE_DIR_OPTION} --gen java -o ${EXT_DS_OUTPUT_DIR})
 set(PYTHON_ARGS ${THRIFT_INCLUDE_DIR_OPTION} -r --gen py -o ${PYTHON_OUTPUT_DIR})
+set(THRIFT11_PYTHON_ARGS ${THRIFT_INCLUDE_DIR_OPTION} -r --gen py -o
+                         ${THRIFT11_PYTHON_OUTPUT_DIR})
 
 set (EXT_DATA_SRC_FILES
   ErrorCodes.thrift
@@ -211,7 +248,7 @@ add_custom_command(OUTPUT hive-2-api/TCLIService.thrift
 # a list of files they produce
 THRIFT_GEN(THRIFT_ALL_FILES ${SRC_FILES})
 THRIFT_GEN_DS(THRIFT_DATA_SRC_FILES ${EXT_DATA_SRC_FILES})
-
+THRIFT11_GEN(THRIFT11_ALL_FILES ${SRC_FILES})
 
 add_custom_target(thrift-generated-files-error DEPENDS ErrorCodes.thrift)
 add_custom_target(thrift-generated-files-metrics DEPENDS MetricDefs.thrift)
@@ -225,6 +262,9 @@ add_dependencies(thrift-cpp thrift-generated-files-metrics thrift-generated-file
 add_custom_target(thrift-ext-data-src ALL DEPENDS ${THRIFT_DATA_SRC_FILES})
 add_dependencies(thrift-ext-data-src thrift-cpp)
 
+add_custom_target(thrift11-py ALL DEPENDS ${THRIFT11_ALL_FILES})
+add_dependencies(thrift11-py thrift-ext-data-src)
+
 # Combined target for all thrift dependencies
 add_custom_target(thrift-deps ALL)
-add_dependencies(thrift-deps thrift-ext-data-src)
+add_dependencies(thrift-deps thrift11-py)