You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2023/02/08 17:25:39 UTC

[impala] 02/04: IMPALA-11883: Calculate erasure-coded bytes read directly

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit b858f2acdee77b4651ac84a7063fddf8fb0caae1
Author: Michael Smith <mi...@cloudera.com>
AuthorDate: Mon Jan 30 12:12:27 2023 -0800

    IMPALA-11883: Calculate erasure-coded bytes read directly
    
    Calculate the metric erasure-coded-bytes-read directly from HDFS reads
    rather than through hdfsFileGetReadStatistics. This allows us to use it
    for other filesystem implementations (Ozone).
    
    Also renumbers is_erasure_coded in THdfsFileSplit to 8, where it was
    originally before it was removed by IMPALA-9485 (and never replaced).
    
    Testing:
    - ran updated test_io_metrics.py with Ozone, with and without EC
    - ran updated test_io_metrics.py with HDFS, with and without EC
    
    Change-Id: Ide0fc806590b2328df8068a9a54645d1d1fb137c
    Reviewed-on: http://gerrit.cloudera.org:8080/19460
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Michael Smith <mi...@cloudera.com>
---
 be/src/runtime/io/hdfs-file-reader.cc | 7 ++++---
 be/src/runtime/io/request-context.h   | 2 +-
 common/thrift/PlanNodes.thrift        | 6 +++---
 tests/query_test/test_io_metrics.py   | 3 +--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/be/src/runtime/io/hdfs-file-reader.cc b/be/src/runtime/io/hdfs-file-reader.cc
index d880264c5..1d814dcc8 100644
--- a/be/src/runtime/io/hdfs-file-reader.cc
+++ b/be/src/runtime/io/hdfs-file-reader.cc
@@ -238,6 +238,10 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_
       bool is_first_read = (num_remote_bytes_ == 0);
       // Collect and accumulate statistics
       GetHdfsStatistics(hdfs_file, log_slow_read);
+      if (scan_range_->is_erasure_coded()) {
+        scan_range_->reader_->bytes_read_ec_.Add(current_bytes_read);
+      }
+
       if (FLAGS_fs_trace_remote_reads && expected_local_ &&
           num_remote_bytes_ > 0 && is_first_read) {
         // Only log the first unexpected remote read for scan range
@@ -404,9 +408,6 @@ void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file, bool log_stats) {
       scan_range_->reader_->bytes_read_short_circuit_.Add(
           stats->totalShortCircuitBytesRead);
       scan_range_->reader_->bytes_read_dn_cache_.Add(stats->totalZeroCopyBytesRead);
-      if (scan_range_->is_erasure_coded()) {
-        scan_range_->reader_->bytes_read_ec_.Add(stats->totalBytesRead);
-      }
       if (stats->totalLocalBytesRead != stats->totalBytesRead) {
         num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead;
       }
diff --git a/be/src/runtime/io/request-context.h b/be/src/runtime/io/request-context.h
index 646908374..585a28f47 100644
--- a/be/src/runtime/io/request-context.h
+++ b/be/src/runtime/io/request-context.h
@@ -400,7 +400,7 @@ class RequestContext {
   /// Total number of bytes read from date node cache, updated at end of each range scan
   AtomicInt64 bytes_read_dn_cache_{0};
 
-  /// Total number of erasure-coded bytes read, updated at end of each range scan
+  /// Total number of erasure-coded bytes read
   AtomicInt64 bytes_read_ec_{0};
 
   /// Total number of bytes from remote reads that were expected to be local.
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index d76a25fa6..86519e4d1 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -218,6 +218,9 @@ struct THdfsFileSplit {
   // last modified time of the file
   7: required i64 mtime
 
+  // Whether the HDFS file is stored with erasure coding.
+  8: optional bool is_erasure_coded
+
   // Hash of the partition's path. This must be hashed with a hash algorithm that is
   // consistent across different processes and machines. This is currently using
   // Java's String.hashCode(), which is consistent. For testing purposes, this can use
@@ -227,9 +230,6 @@ struct THdfsFileSplit {
   // The absolute path of the file, it's used only when data files are outside of
   // the Iceberg table location (IMPALA-11507).
   10: optional string absolute_path
-
-  // Whether the HDFS file is stored with erasure coding.
-  11: optional bool is_erasure_coded
 }
 
 // Key range for single THBaseScanNode. Corresponds to HBaseKeyRangePB and should be kept
diff --git a/tests/query_test/test_io_metrics.py b/tests/query_test/test_io_metrics.py
index f75d14997..b45a0f8e7 100644
--- a/tests/query_test/test_io_metrics.py
+++ b/tests/query_test/test_io_metrics.py
@@ -47,8 +47,7 @@ class TestIOMetrics(ImpalaTestSuite):
     def append_metric(metric, expect_nonzero):
       (expect_nonzero_metrics if expect_nonzero else expect_zero_metrics).append(metric)
 
-    # IMPALA-11697: these come from getReadStatistics, which is only implemented for HDFS
-    append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_HDFS and IS_EC)
+    append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_EC)
     append_metric("impala-server.io-mgr.short-circuit-bytes-read",
         IS_HDFS and not IS_DOCKERIZED_TEST_CLUSTER)
     append_metric("impala-server.io-mgr.local-bytes-read",