You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/15 17:08:50 UTC

[impala] 02/06: IMPALA-8537: Negative values reported for tmp-file-mgr.scratch-space-bytes-used under heavy spilling load

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 454d85be8a5b003c725586d2f75ee672925f1962
Author: Abhishek <ar...@cloudera.com>
AuthorDate: Mon May 13 16:43:08 2019 -0700

    IMPALA-8537: Negative values reported for
    tmp-file-mgr.scratch-space-bytes-used under heavy spilling load
    
    Whenever closing a FileGroup, the TmpFileMgr::scratch_bytes_used_metric_
    was incorrectly being decremented by the total scratch space bytes
    across the entire FileGroup
    (i.e FileGroup::scratch_space_bytes_used_counter_), for every File in
    the FileGroup. This was resulting in the -ive value for the current
    scratch space bytes.
    
    The fix is to decrement the TmpFileMgr::scratch_bytes_used_metric_ by
    the FileGroup::scratch_space_bytes_used_counter_, only once when the
    FileGroup is closed.
    
    Testing Done:
    - Added checks for expected current value and HWM of the scratch space
      bytes in some of the existing test units in tmp-file-mgr-test.cc.
    - Added a new scenario in tmp-file-mgr-test.cc which mimics concurrent
      spilling queries and checks for propper current and HWM value for
      the scratch space bytes.
    - Ad-hoc tests forcing multiple scratch space dirs/files and running
      concurrent spilling queries while making sure the current value is
      never -ive.
    
    Change-Id: I338ecc06ddfad414091bd50f683b767b61abdcc4
    Reviewed-on: http://gerrit.cloudera.org:8080/13326
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/tmp-file-mgr-test.cc | 82 ++++++++++++++++++++++++++++++++++++-
 be/src/runtime/tmp-file-mgr.cc      |  8 ++--
 be/src/util/metrics.h               |  2 +
 3 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index bd53dd6..f838c47 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -92,6 +92,16 @@ class TmpFileMgrTest : public ::testing::Test {
     }
   }
 
+  /// Check that current scratch space bytes and HWM match the expected values.
+  void checkHWMMetrics(int64_t exp_current_value, int64_t exp_hwm_value) {
+    AtomicHighWaterMarkGauge* hwm_value =
+        metrics_->FindMetricForTesting<AtomicHighWaterMarkGauge>(
+            "tmp-file-mgr.scratch-space-bytes-used-high-water-mark");
+    IntGauge* current_value = hwm_value->current_value_;
+    ASSERT_EQ(current_value->GetValue(), exp_current_value);
+    ASSERT_EQ(hwm_value->GetValue(), exp_hwm_value);
+  }
+
   void RemoveAndCreateDirs(const vector<string>& dirs) {
     for (const string& dir: dirs) {
       ASSERT_OK(FileSystemUtil::RemoveAndCreateDirectory(dir));
@@ -393,15 +403,24 @@ TEST_F(TmpFileMgrTest, TestScratchLimit) {
   ASSERT_EQ(status.code(), TErrorCode::SCRATCH_LIMIT_EXCEEDED);
   ASSERT_NE(string::npos, status.msg().msg().find(GetBackendString()));
 
+  // Check HWM metrics
+  checkHWMMetrics(LIMIT, LIMIT);
   file_group.Close();
+  checkHWMMetrics(0, LIMIT);
 }
 
 // Test that scratch file ranges of varying length are recycled as expected.
 TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
+  vector<string> tmp_dirs({"/tmp/tmp-file-mgr-test.1", "/tmp/tmp-file-mgr-test.2"});
+  RemoveAndCreateDirs(tmp_dirs);
+  TmpFileMgr tmp_file_mgr;
+  ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
+
+  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
   int64_t expected_scratch_bytes_allocated = 0;
   // Test some different allocation sizes.
+  checkHWMMetrics(0, 0);
   for (int alloc_size = 64; alloc_size <= 64 * 1024; alloc_size *= 2) {
     // Generate some data.
     const int BLOCKS = 5;
@@ -417,6 +436,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
     // 'file_group' should allocate extra scratch bytes for this 'alloc_size'.
     expected_scratch_bytes_allocated += alloc_size * BLOCKS;
     const int TEST_ITERS = 5;
+
     // Make sure free space doesn't grow over several iterations.
     for (int i = 0; i < TEST_ITERS; ++i) {
       cb_counter_ = 0;
@@ -426,6 +446,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
       }
       WaitForCallbacks(BLOCKS);
       EXPECT_EQ(expected_scratch_bytes_allocated, BytesAllocated(&file_group));
+      checkHWMMetrics(expected_scratch_bytes_allocated, expected_scratch_bytes_allocated);
 
       // Read back and validate.
       for (int j = 0; j < BLOCKS; ++j) {
@@ -437,10 +458,11 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
       // Check that the space is still in use - it should be recycled by the next
       // iteration.
       EXPECT_EQ(expected_scratch_bytes_allocated, BytesAllocated(&file_group));
+      checkHWMMetrics(expected_scratch_bytes_allocated, expected_scratch_bytes_allocated);
     }
   }
   file_group.Close();
-  test_env_->TearDownQueries();
+  checkHWMMetrics(0, expected_scratch_bytes_allocated);
 }
 
 // Regression test for IMPALA-4748, where hitting the process memory limit caused
@@ -565,6 +587,62 @@ void TmpFileMgrTest::TestBlockVerification() {
   file_group.Close();
   test_env_->TearDownQueries();
 }
+
+// Test that the current scratch space bytes and HWM values are proper when different
+// FileGroups are used concurrently. This test unit mimics concurrent spilling queries.
+TEST_F(TmpFileMgrTest, TestHWMMetric) {
+  RuntimeProfile* profile_1 = RuntimeProfile::Create(&obj_pool_, "tmp-file-mgr-test-1");
+  RuntimeProfile* profile_2 = RuntimeProfile::Create(&obj_pool_, "tmp-file-mgr-test-2");
+  vector<string> tmp_dirs({"/tmp/tmp-file-mgr-test.1", "/tmp/tmp-file-mgr-test.2"});
+  RemoveAndCreateDirs(tmp_dirs);
+  TmpFileMgr tmp_file_mgr;
+  ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
+
+  const int64_t LIMIT = 128;
+  // A power-of-two so that FileGroup allocates exactly this amount of scratch space.
+  const int64_t ALLOC_SIZE = 64;
+  TUniqueId id_1;
+  TmpFileMgr::FileGroup file_group_1(&tmp_file_mgr, io_mgr(), profile_1, id_1, LIMIT);
+  TUniqueId id_2;
+  TmpFileMgr::FileGroup file_group_2(&tmp_file_mgr, io_mgr(), profile_2, id_2, LIMIT);
+
+  vector<TmpFileMgr::File*> files;
+  ASSERT_OK(CreateFiles(&file_group_1, &files));
+  ASSERT_OK(CreateFiles(&file_group_2, &files));
+
+  Status status;
+  int64_t offset;
+  TmpFileMgr::File* alloc_file;
+
+  // Alloc from file_group_1 and file_group_2 interleaving allocations.
+  SetNextAllocationIndex(&file_group_1, 0);
+  ASSERT_OK(GroupAllocateSpace(&file_group_1, ALLOC_SIZE, &alloc_file, &offset));
+  ASSERT_EQ(alloc_file, files[0]);
+  ASSERT_EQ(0, offset);
+
+  SetNextAllocationIndex(&file_group_2, 0);
+  ASSERT_OK(GroupAllocateSpace(&file_group_2, ALLOC_SIZE, &alloc_file, &offset));
+  ASSERT_EQ(alloc_file, files[2]);
+  ASSERT_EQ(0, offset);
+
+  ASSERT_OK(GroupAllocateSpace(&file_group_1, ALLOC_SIZE, &alloc_file, &offset));
+  ASSERT_EQ(0, offset);
+  ASSERT_EQ(alloc_file, files[1]);
+
+  ASSERT_OK(GroupAllocateSpace(&file_group_2, ALLOC_SIZE, &alloc_file, &offset));
+  ASSERT_EQ(0, offset);
+  ASSERT_EQ(alloc_file, files[3]);
+
+  EXPECT_EQ(LIMIT, BytesAllocated(&file_group_1));
+  EXPECT_EQ(LIMIT, BytesAllocated(&file_group_2));
+
+  // Check HWM metrics
+  checkHWMMetrics(2 * LIMIT, 2 * LIMIT);
+  file_group_1.Close();
+  checkHWMMetrics(LIMIT, 2 * LIMIT);
+  file_group_2.Close();
+  checkHWMMetrics(0, 2 * LIMIT);
+}
 }
 
 int main(int argc, char** argv) {
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 32c1509..372e301 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -283,14 +283,14 @@ void TmpFileMgr::FileGroup::Close() {
   if (io_ctx_ != nullptr) io_mgr_->UnregisterContext(io_ctx_.get());
   for (std::unique_ptr<TmpFileMgr::File>& file : tmp_files_) {
     Status status = file->Remove();
-    if (status.ok()) {
-      tmp_file_mgr_->scratch_bytes_used_metric_->Increment(
-          -1 * scratch_space_bytes_used_counter_->value());
-    } else {
+    if (!status.ok()) {
       LOG(WARNING) << "Error removing scratch file '" << file->path()
                    << "': " << status.msg().msg();
     }
   }
+  tmp_file_mgr_->scratch_bytes_used_metric_->Increment(
+      -1 * scratch_space_bytes_used_counter_->value());
+
   tmp_files_.clear();
 }
 
diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h
index 80b863b..80f899d 100644
--- a/be/src/util/metrics.h
+++ b/be/src/util/metrics.h
@@ -288,6 +288,8 @@ class AtomicHighWaterMarkGauge : public ScalarMetric<int64_t, TMetricKind::GAUGE
 
  private:
   FRIEND_TEST(MetricsTest, AtomicHighWaterMarkGauge);
+  friend class TmpFileMgrTest;
+
   /// Set 'hwm_value_' to 'v' if 'v' is larger than 'hwm_value_'. The entire operation is
   /// atomic.
   void UpdateMax(int64_t v) {