You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2023/02/08 17:25:39 UTC
[impala] 02/04: IMPALA-11883: Calculate erasure-coded bytes read directly
This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit b858f2acdee77b4651ac84a7063fddf8fb0caae1
Author: Michael Smith <mi...@cloudera.com>
AuthorDate: Mon Jan 30 12:12:27 2023 -0800
IMPALA-11883: Calculate erasure-coded bytes read directly
Calculate the metric erasure-coded-bytes-read directly from HDFS reads
rather than through hdfsFileGetReadStatistics. This allows us to use it
for other filesystem implementations (Ozone).
Also renumbers is_erasure_coded in THdfsFileSplit to 8, where it was
originally before it was removed by IMPALA-9485 (and never replaced).
Testing:
- ran updated test_io_metrics.py with Ozone, with and without EC
- ran updated test_io_metrics.py with HDFS, with and without EC
Change-Id: Ide0fc806590b2328df8068a9a54645d1d1fb137c
Reviewed-on: http://gerrit.cloudera.org:8080/19460
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Michael Smith <mi...@cloudera.com>
---
be/src/runtime/io/hdfs-file-reader.cc | 7 ++++---
be/src/runtime/io/request-context.h | 2 +-
common/thrift/PlanNodes.thrift | 6 +++---
tests/query_test/test_io_metrics.py | 3 +--
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/be/src/runtime/io/hdfs-file-reader.cc b/be/src/runtime/io/hdfs-file-reader.cc
index d880264c5..1d814dcc8 100644
--- a/be/src/runtime/io/hdfs-file-reader.cc
+++ b/be/src/runtime/io/hdfs-file-reader.cc
@@ -238,6 +238,10 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_
bool is_first_read = (num_remote_bytes_ == 0);
// Collect and accumulate statistics
GetHdfsStatistics(hdfs_file, log_slow_read);
+ if (scan_range_->is_erasure_coded()) {
+ scan_range_->reader_->bytes_read_ec_.Add(current_bytes_read);
+ }
+
if (FLAGS_fs_trace_remote_reads && expected_local_ &&
num_remote_bytes_ > 0 && is_first_read) {
// Only log the first unexpected remote read for scan range
@@ -404,9 +408,6 @@ void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file, bool log_stats) {
scan_range_->reader_->bytes_read_short_circuit_.Add(
stats->totalShortCircuitBytesRead);
scan_range_->reader_->bytes_read_dn_cache_.Add(stats->totalZeroCopyBytesRead);
- if (scan_range_->is_erasure_coded()) {
- scan_range_->reader_->bytes_read_ec_.Add(stats->totalBytesRead);
- }
if (stats->totalLocalBytesRead != stats->totalBytesRead) {
num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead;
}
diff --git a/be/src/runtime/io/request-context.h b/be/src/runtime/io/request-context.h
index 646908374..585a28f47 100644
--- a/be/src/runtime/io/request-context.h
+++ b/be/src/runtime/io/request-context.h
@@ -400,7 +400,7 @@ class RequestContext {
/// Total number of bytes read from date node cache, updated at end of each range scan
AtomicInt64 bytes_read_dn_cache_{0};
- /// Total number of erasure-coded bytes read, updated at end of each range scan
+ /// Total number of erasure-coded bytes read
AtomicInt64 bytes_read_ec_{0};
/// Total number of bytes from remote reads that were expected to be local.
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index d76a25fa6..86519e4d1 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -218,6 +218,9 @@ struct THdfsFileSplit {
// last modified time of the file
7: required i64 mtime
+ // Whether the HDFS file is stored with erasure coding.
+ 8: optional bool is_erasure_coded
+
// Hash of the partition's path. This must be hashed with a hash algorithm that is
// consistent across different processes and machines. This is currently using
// Java's String.hashCode(), which is consistent. For testing purposes, this can use
@@ -227,9 +230,6 @@ struct THdfsFileSplit {
// The absolute path of the file, it's used only when data files are outside of
// the Iceberg table location (IMPALA-11507).
10: optional string absolute_path
-
- // Whether the HDFS file is stored with erasure coding.
- 11: optional bool is_erasure_coded
}
// Key range for single THBaseScanNode. Corresponds to HBaseKeyRangePB and should be kept
diff --git a/tests/query_test/test_io_metrics.py b/tests/query_test/test_io_metrics.py
index f75d14997..b45a0f8e7 100644
--- a/tests/query_test/test_io_metrics.py
+++ b/tests/query_test/test_io_metrics.py
@@ -47,8 +47,7 @@ class TestIOMetrics(ImpalaTestSuite):
def append_metric(metric, expect_nonzero):
(expect_nonzero_metrics if expect_nonzero else expect_zero_metrics).append(metric)
- # IMPALA-11697: these come from getReadStatistics, which is only implemented for HDFS
- append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_HDFS and IS_EC)
+ append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_EC)
append_metric("impala-server.io-mgr.short-circuit-bytes-read",
IS_HDFS and not IS_DOCKERIZED_TEST_CLUSTER)
append_metric("impala-server.io-mgr.local-bytes-read",