You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/15 17:08:50 UTC
[impala] 02/06: IMPALA-8537: Negative values reported for
tmp-file-mgr.scratch-space-bytes-used under heavy spilling load
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 454d85be8a5b003c725586d2f75ee672925f1962
Author: Abhishek <ar...@cloudera.com>
AuthorDate: Mon May 13 16:43:08 2019 -0700
IMPALA-8537: Negative values reported for
tmp-file-mgr.scratch-space-bytes-used under heavy spilling load
Whenever closing a FileGroup, the TmpFileMgr::scratch_bytes_used_metric_
was incorrectly being decremented by the total scratch space bytes
across the entire FileGroup
(i.e FileGroup::scratch_space_bytes_used_counter_), for every File in
the FileGroup. This was resulting in the -ive value for the current
scratch space bytes.
The fix is to decrement the TmpFileMgr::scratch_bytes_used_metric_ by
the FileGroup::scratch_space_bytes_used_counter_, only once when the
FileGroup is closed.
Testing Done:
- Added checks for expected current value and HWM of the scratch space
bytes in some of the existing test units in tmp-file-mgr-test.cc.
- Added a new scenario in tmp-file-mgr-test.cc which mimics concurrent
spilling queries and checks for propper current and HWM value for
the scratch space bytes.
- Ad-hoc tests forcing multiple scratch space dirs/files and running
concurrent spilling queries while making sure the current value is
never -ive.
Change-Id: I338ecc06ddfad414091bd50f683b767b61abdcc4
Reviewed-on: http://gerrit.cloudera.org:8080/13326
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/runtime/tmp-file-mgr-test.cc | 82 ++++++++++++++++++++++++++++++++++++-
be/src/runtime/tmp-file-mgr.cc | 8 ++--
be/src/util/metrics.h | 2 +
3 files changed, 86 insertions(+), 6 deletions(-)
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index bd53dd6..f838c47 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -92,6 +92,16 @@ class TmpFileMgrTest : public ::testing::Test {
}
}
+ /// Check that current scratch space bytes and HWM match the expected values.
+ void checkHWMMetrics(int64_t exp_current_value, int64_t exp_hwm_value) {
+ AtomicHighWaterMarkGauge* hwm_value =
+ metrics_->FindMetricForTesting<AtomicHighWaterMarkGauge>(
+ "tmp-file-mgr.scratch-space-bytes-used-high-water-mark");
+ IntGauge* current_value = hwm_value->current_value_;
+ ASSERT_EQ(current_value->GetValue(), exp_current_value);
+ ASSERT_EQ(hwm_value->GetValue(), exp_hwm_value);
+ }
+
void RemoveAndCreateDirs(const vector<string>& dirs) {
for (const string& dir: dirs) {
ASSERT_OK(FileSystemUtil::RemoveAndCreateDirectory(dir));
@@ -393,15 +403,24 @@ TEST_F(TmpFileMgrTest, TestScratchLimit) {
ASSERT_EQ(status.code(), TErrorCode::SCRATCH_LIMIT_EXCEEDED);
ASSERT_NE(string::npos, status.msg().msg().find(GetBackendString()));
+ // Check HWM metrics
+ checkHWMMetrics(LIMIT, LIMIT);
file_group.Close();
+ checkHWMMetrics(0, LIMIT);
}
// Test that scratch file ranges of varying length are recycled as expected.
TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
+ vector<string> tmp_dirs({"/tmp/tmp-file-mgr-test.1", "/tmp/tmp-file-mgr-test.2"});
+ RemoveAndCreateDirs(tmp_dirs);
+ TmpFileMgr tmp_file_mgr;
+ ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
TUniqueId id;
- TmpFileMgr::FileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
+
+ TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
int64_t expected_scratch_bytes_allocated = 0;
// Test some different allocation sizes.
+ checkHWMMetrics(0, 0);
for (int alloc_size = 64; alloc_size <= 64 * 1024; alloc_size *= 2) {
// Generate some data.
const int BLOCKS = 5;
@@ -417,6 +436,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
// 'file_group' should allocate extra scratch bytes for this 'alloc_size'.
expected_scratch_bytes_allocated += alloc_size * BLOCKS;
const int TEST_ITERS = 5;
+
// Make sure free space doesn't grow over several iterations.
for (int i = 0; i < TEST_ITERS; ++i) {
cb_counter_ = 0;
@@ -426,6 +446,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
}
WaitForCallbacks(BLOCKS);
EXPECT_EQ(expected_scratch_bytes_allocated, BytesAllocated(&file_group));
+ checkHWMMetrics(expected_scratch_bytes_allocated, expected_scratch_bytes_allocated);
// Read back and validate.
for (int j = 0; j < BLOCKS; ++j) {
@@ -437,10 +458,11 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
// Check that the space is still in use - it should be recycled by the next
// iteration.
EXPECT_EQ(expected_scratch_bytes_allocated, BytesAllocated(&file_group));
+ checkHWMMetrics(expected_scratch_bytes_allocated, expected_scratch_bytes_allocated);
}
}
file_group.Close();
- test_env_->TearDownQueries();
+ checkHWMMetrics(0, expected_scratch_bytes_allocated);
}
// Regression test for IMPALA-4748, where hitting the process memory limit caused
@@ -565,6 +587,62 @@ void TmpFileMgrTest::TestBlockVerification() {
file_group.Close();
test_env_->TearDownQueries();
}
+
+// Test that the current scratch space bytes and HWM values are proper when different
+// FileGroups are used concurrently. This test unit mimics concurrent spilling queries.
+TEST_F(TmpFileMgrTest, TestHWMMetric) {
+ RuntimeProfile* profile_1 = RuntimeProfile::Create(&obj_pool_, "tmp-file-mgr-test-1");
+ RuntimeProfile* profile_2 = RuntimeProfile::Create(&obj_pool_, "tmp-file-mgr-test-2");
+ vector<string> tmp_dirs({"/tmp/tmp-file-mgr-test.1", "/tmp/tmp-file-mgr-test.2"});
+ RemoveAndCreateDirs(tmp_dirs);
+ TmpFileMgr tmp_file_mgr;
+ ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
+
+ const int64_t LIMIT = 128;
+ // A power-of-two so that FileGroup allocates exactly this amount of scratch space.
+ const int64_t ALLOC_SIZE = 64;
+ TUniqueId id_1;
+ TmpFileMgr::FileGroup file_group_1(&tmp_file_mgr, io_mgr(), profile_1, id_1, LIMIT);
+ TUniqueId id_2;
+ TmpFileMgr::FileGroup file_group_2(&tmp_file_mgr, io_mgr(), profile_2, id_2, LIMIT);
+
+ vector<TmpFileMgr::File*> files;
+ ASSERT_OK(CreateFiles(&file_group_1, &files));
+ ASSERT_OK(CreateFiles(&file_group_2, &files));
+
+ Status status;
+ int64_t offset;
+ TmpFileMgr::File* alloc_file;
+
+ // Alloc from file_group_1 and file_group_2 interleaving allocations.
+ SetNextAllocationIndex(&file_group_1, 0);
+ ASSERT_OK(GroupAllocateSpace(&file_group_1, ALLOC_SIZE, &alloc_file, &offset));
+ ASSERT_EQ(alloc_file, files[0]);
+ ASSERT_EQ(0, offset);
+
+ SetNextAllocationIndex(&file_group_2, 0);
+ ASSERT_OK(GroupAllocateSpace(&file_group_2, ALLOC_SIZE, &alloc_file, &offset));
+ ASSERT_EQ(alloc_file, files[2]);
+ ASSERT_EQ(0, offset);
+
+ ASSERT_OK(GroupAllocateSpace(&file_group_1, ALLOC_SIZE, &alloc_file, &offset));
+ ASSERT_EQ(0, offset);
+ ASSERT_EQ(alloc_file, files[1]);
+
+ ASSERT_OK(GroupAllocateSpace(&file_group_2, ALLOC_SIZE, &alloc_file, &offset));
+ ASSERT_EQ(0, offset);
+ ASSERT_EQ(alloc_file, files[3]);
+
+ EXPECT_EQ(LIMIT, BytesAllocated(&file_group_1));
+ EXPECT_EQ(LIMIT, BytesAllocated(&file_group_2));
+
+ // Check HWM metrics
+ checkHWMMetrics(2 * LIMIT, 2 * LIMIT);
+ file_group_1.Close();
+ checkHWMMetrics(LIMIT, 2 * LIMIT);
+ file_group_2.Close();
+ checkHWMMetrics(0, 2 * LIMIT);
+}
}
int main(int argc, char** argv) {
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 32c1509..372e301 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -283,14 +283,14 @@ void TmpFileMgr::FileGroup::Close() {
if (io_ctx_ != nullptr) io_mgr_->UnregisterContext(io_ctx_.get());
for (std::unique_ptr<TmpFileMgr::File>& file : tmp_files_) {
Status status = file->Remove();
- if (status.ok()) {
- tmp_file_mgr_->scratch_bytes_used_metric_->Increment(
- -1 * scratch_space_bytes_used_counter_->value());
- } else {
+ if (!status.ok()) {
LOG(WARNING) << "Error removing scratch file '" << file->path()
<< "': " << status.msg().msg();
}
}
+ tmp_file_mgr_->scratch_bytes_used_metric_->Increment(
+ -1 * scratch_space_bytes_used_counter_->value());
+
tmp_files_.clear();
}
diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h
index 80b863b..80f899d 100644
--- a/be/src/util/metrics.h
+++ b/be/src/util/metrics.h
@@ -288,6 +288,8 @@ class AtomicHighWaterMarkGauge : public ScalarMetric<int64_t, TMetricKind::GAUGE
private:
FRIEND_TEST(MetricsTest, AtomicHighWaterMarkGauge);
+ friend class TmpFileMgrTest;
+
/// Set 'hwm_value_' to 'v' if 'v' is larger than 'hwm_value_'. The entire operation is
/// atomic.
void UpdateMax(int64_t v) {