Posted to commits@doris.apache.org by mo...@apache.org on 2023/06/01 03:15:54 UTC

[doris] branch branch-1.2-lts updated: [branch1.2](parquet) optimize parquet reader profile (#20286)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
     new 279b1f0842 [branch1.2](parquet) optimize parquet reader profile (#20286)
279b1f0842 is described below

commit 279b1f0842db417d42202094315d3622208dbddf
Author: Mingyu Chen <mo...@163.com>
AuthorDate: Thu Jun 1 11:15:47 2023 +0800

    [branch1.2](parquet) optimize parquet reader profile (#20286)
    
    Only for branch 1.2.
    1. Optimize the parquet reader profile by separating the open reader time from the meta parse time.
    2. Optimize the result of `explain verbose` for the external file scan node (already on the master branch).
---
 .github/actions/clang-format-lint-action           |  2 +-
 .github/workflows/title-checker.yml                |  2 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp |  9 ++++-
 be/src/vec/exec/format/parquet/vparquet_reader.h   |  2 +
 be/src/vec/exec/scan/vfile_scanner.cpp             |  2 +
 be/src/vec/exec/scan/vfile_scanner.h               |  1 +
 be/src/vec/exec/scan/vscan_node.cpp                |  2 +-
 be/test/vec/exec/parquet/parquet_reader_test.cpp   |  1 +
 .../planner/external/ExternalFileScanNode.java     | 46 +++++++++++++++++-----
 9 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/.github/actions/clang-format-lint-action b/.github/actions/clang-format-lint-action
index c3b2c943e9..1566bcec08 160000
--- a/.github/actions/clang-format-lint-action
+++ b/.github/actions/clang-format-lint-action
@@ -1 +1 @@
-Subproject commit c3b2c943e924028b93a707a5b1b017976ab8d50c
+Subproject commit 1566bcec081dcb246ab02e7c5f9786c0b629dd4d
diff --git a/.github/workflows/title-checker.yml b/.github/workflows/title-checker.yml
index fac1bfdb01..2d0ffd8541 100644
--- a/.github/workflows/title-checker.yml
+++ b/.github/workflows/title-checker.yml
@@ -35,5 +35,5 @@ jobs:
       - name: Check Title
         uses: ./.github/actions/action-pr-title
         with:
-          regex: '\[([a-zA-Z0-9 \-_])+\]\(([a-zA-Z0-9 \-_])+\)(.*)'
+          regex: '\[([a-zA-Z0-9 \-_\.])+\]\(([a-zA-Z0-9 \-_])+\)(.*)'
           github_token: ${{ github.token }}
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index d873260010..56775a53d3 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -73,6 +73,8 @@ void ParquetReader::_init_profile() {
                 ADD_CHILD_COUNTER(_profile, "ReadBytes", TUnit::BYTES, parquet_profile);
         _parquet_profile.column_read_time =
                 ADD_CHILD_TIMER(_profile, "ColumnReadTime", parquet_profile);
+        _parquet_profile.open_reader_time =
+                ADD_CHILD_TIMER(_profile, "OpenReaderTime", parquet_profile);
         _parquet_profile.parse_meta_time =
                 ADD_CHILD_TIMER(_profile, "ParseMetaTime", parquet_profile);
         _parquet_profile.page_index_filter_time =
@@ -114,6 +116,7 @@ void ParquetReader::close() {
             COUNTER_UPDATE(_parquet_profile.to_read_bytes, _statistics.read_bytes);
             COUNTER_UPDATE(_parquet_profile.column_read_time, _statistics.column_read_time);
             COUNTER_UPDATE(_parquet_profile.parse_meta_time, _statistics.parse_meta_time);
+            COUNTER_UPDATE(_parquet_profile.open_reader_time, _statistics.open_reader_time);
             COUNTER_UPDATE(_parquet_profile.page_index_filter_time,
                            _statistics.page_index_filter_time);
             COUNTER_UPDATE(_parquet_profile.row_group_filter_time,
@@ -140,15 +143,17 @@ void ParquetReader::close() {
 
 Status ParquetReader::_open_file() {
     if (_file_reader == nullptr) {
+        SCOPED_RAW_TIMER(&_statistics.open_reader_time);
         RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _scan_params, _scan_range.path,
                                                         _scan_range.start_offset,
                                                         _scan_range.file_size, 0, _file_reader));
+        RETURN_IF_ERROR(_file_reader->open());
     }
     if (_file_metadata == nullptr) {
-        RETURN_IF_ERROR(_file_reader->open());
         if (_file_reader->size() == 0) {
             return Status::EndOfFile("open file failed, empty parquet file: " + _scan_range.path);
         }
+        SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
         RETURN_IF_ERROR(parse_thrift_footer(_file_reader.get(), _file_metadata));
     }
     return Status::OK();
@@ -161,8 +166,8 @@ std::vector<tparquet::KeyValue> ParquetReader::get_metadata_key_values() {
 }
 
 Status ParquetReader::open() {
-    SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
     RETURN_IF_ERROR(_open_file());
+    SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
     _t_metadata = &_file_metadata->to_thrift();
     return Status::OK();
 }
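
For context, the timing split above relies on Doris's SCOPED_RAW_TIMER macro, which accumulates the elapsed time of the enclosing scope into a plain int64_t field of the reader's statistics. Below is a minimal standalone sketch of that pattern; the class and field names are illustrative, not the actual Doris implementation. Wrapping each phase (opening the file reader, parsing the footer) in its own scoped timer is what lets OpenReaderTime and ParseMetaTime show up as separate profile entries.

#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>

// Minimal analog of a "scoped raw timer": on destruction it adds the
// elapsed nanoseconds of its scope to the counter it was given.
class ScopedRawTimer {
public:
    explicit ScopedRawTimer(int64_t* counter)
            : _counter(counter), _start(std::chrono::steady_clock::now()) {}
    ~ScopedRawTimer() {
        auto end = std::chrono::steady_clock::now();
        *_counter += std::chrono::duration_cast<std::chrono::nanoseconds>(end - _start).count();
    }

private:
    int64_t* _counter;
    std::chrono::steady_clock::time_point _start;
};

// Illustrative stand-in for the reader's statistics struct.
struct Statistics {
    int64_t open_reader_time = 0; // time spent creating/opening the file reader
    int64_t parse_meta_time = 0;  // time spent parsing the parquet footer/metadata
};

int main() {
    Statistics stats;
    {
        // Phase 1: open the reader (simulated work), timed on its own.
        ScopedRawTimer timer(&stats.open_reader_time);
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
    }
    {
        // Phase 2: parse the footer (simulated work), timed separately.
        ScopedRawTimer timer(&stats.parse_meta_time);
        std::this_thread::sleep_for(std::chrono::milliseconds(2));
    }
    std::cout << "open_reader_time(ns)=" << stats.open_reader_time
              << " parse_meta_time(ns)=" << stats.parse_meta_time << std::endl;
    return 0;
}
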
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 81c0e2b28a..33aec14603 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -52,6 +52,7 @@ public:
         int64_t parse_meta_time = 0;
         int64_t row_group_filter_time = 0;
         int64_t page_index_filter_time = 0;
+        int64_t open_reader_time = 0;
     };
 
     ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
@@ -128,6 +129,7 @@ private:
         RuntimeProfile::Counter* decode_dict_time;
         RuntimeProfile::Counter* decode_level_time;
         RuntimeProfile::Counter* decode_null_map_time;
+        RuntimeProfile::Counter* open_reader_time;
     };
 
     Status _open_file();
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp
index bde638504e..23915c5c3f 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -73,6 +73,7 @@ Status VFileScanner::prepare(
     _pre_filter_timer = ADD_TIMER(_parent->_scanner_profile, "FileScannerPreFilterTimer");
     _convert_to_output_block_timer =
             ADD_TIMER(_parent->_scanner_profile, "FileScannerConvertOuputBlockTime");
+    _file_counter = ADD_COUNTER(_parent->_scanner_profile, "FileReaderCounter", TUnit::UNIT);
 
     if (vconjunct_ctx_ptr != nullptr) {
         // Copy vconjunct_ctx_ptr from scan node to this scanner's _vconjunct_ctx.
@@ -465,6 +466,7 @@ Status VFileScanner::_get_next_reader() {
             return Status::OK();
         }
         const TFileRangeDesc& range = _ranges[_next_range++];
+        COUNTER_UPDATE(_file_counter, 1);
 
         // create reader for specific format
         // TODO: add json, avro
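
The new FileReaderCounter above is simply a per-scanner tally of how many file ranges the scanner creates a reader for. A rough standalone sketch of that bookkeeping, using an ordinary integer in place of the RuntimeProfile counter (the names here are illustrative):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for a scanner that bumps a counter once per range.
struct FileScanner {
    int64_t file_counter = 0;

    void scan(const std::vector<std::string>& ranges) {
        for (const auto& path : ranges) {
            ++file_counter; // one reader created per file range
            // ... create a reader for `path` and read it ...
        }
    }
};

int main() {
    FileScanner scanner;
    scanner.scan({"part-0.parquet", "part-1.parquet", "part-2.parquet"});
    std::cout << "FileReaderCounter: " << scanner.file_counter << std::endl;
    return 0;
}
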
diff --git a/be/src/vec/exec/scan/vfile_scanner.h b/be/src/vec/exec/scan/vfile_scanner.h
index 993e472576..506908486d 100644
--- a/be/src/vec/exec/scan/vfile_scanner.h
+++ b/be/src/vec/exec/scan/vfile_scanner.h
@@ -125,6 +125,7 @@ private:
     RuntimeProfile::Counter* _fill_missing_columns_timer = nullptr;
     RuntimeProfile::Counter* _pre_filter_timer = nullptr;
     RuntimeProfile::Counter* _convert_to_output_block_timer = nullptr;
+    RuntimeProfile::Counter* _file_counter = nullptr;
 
 private:
     Status _init_expr_ctxes();
diff --git a/be/src/vec/exec/scan/vscan_node.cpp b/be/src/vec/exec/scan/vscan_node.cpp
index 0dd68ea0cb..eabdfc7dcb 100644
--- a/be/src/vec/exec/scan/vscan_node.cpp
+++ b/be/src/vec/exec/scan/vscan_node.cpp
@@ -152,7 +152,7 @@ Status VScanNode::_init_profile() {
             runtime_profile()->add_rate_counter("TotalReadThroughput", _rows_read_counter);
     _num_scanners = ADD_COUNTER(_runtime_profile, "NumScanners", TUnit::UNIT);
     _get_next_timer = ADD_TIMER(_runtime_profile, "GetNextTime");
-    _acquire_runtime_filter_timer = ADD_TIMER(_runtime_profile, "AcuireRuntimeFilterTime");
+    _acquire_runtime_filter_timer = ADD_TIMER(_runtime_profile, "AcquireRuntimeFilterTime");
 
     // 2. counters for scanners
     _scanner_profile.reset(new RuntimeProfile("VScanner"));
diff --git a/be/test/vec/exec/parquet/parquet_reader_test.cpp b/be/test/vec/exec/parquet/parquet_reader_test.cpp
index 078736d9c3..373c93d0c4 100644
--- a/be/test/vec/exec/parquet/parquet_reader_test.cpp
+++ b/be/test/vec/exec/parquet/parquet_reader_test.cpp
@@ -91,6 +91,7 @@ TEST_F(ParquetReaderTest, normal) {
     auto slot_descs = desc_tbl->get_tuple_descriptor(0)->slots();
     LocalFileReader* reader =
             new LocalFileReader("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", 0);
+    reader->open();
 
     cctz::time_zone ctz;
     TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
index 9c65533cc3..1aadb1e5c6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
@@ -66,11 +66,15 @@ import org.apache.doris.thrift.TScanRangeLocations;
 import org.apache.doris.thrift.TUniqueId;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 
@@ -613,18 +617,40 @@ public class ExternalFileScanNode extends ExternalScanNode {
 
         if (detailLevel == TExplainLevel.VERBOSE) {
             output.append(prefix).append("backends:").append("\n");
+            Multimap<Long, TFileRangeDesc> scanRangeLocationsMap = ArrayListMultimap.create();
+            // 1. group by backend id
             for (TScanRangeLocations locations : scanRangeLocations) {
-                output.append(prefix).append("  ").append(locations.getLocations().get(0).backend_id).append("\n");
-                List<TFileRangeDesc> files = locations.getScanRange().getExtScanRange().getFileScanRange().getRanges();
-                for (int i = 0; i < 3; i++) {
-                    if (i >= files.size()) {
-                        break;
+                scanRangeLocationsMap.putAll(locations.getLocations().get(0).backend_id,
+                        locations.getScanRange().getExtScanRange().getFileScanRange().getRanges());
+            }
+            for (long beId : scanRangeLocationsMap.keySet()) {
+                List<TFileRangeDesc> fileRangeDescs = Lists.newArrayList(scanRangeLocationsMap.get(beId));
+                // 2. sort by file start offset
+                Collections.sort(fileRangeDescs, new Comparator<TFileRangeDesc>() {
+                    @Override
+                    public int compare(TFileRangeDesc o1, TFileRangeDesc o2) {
+                        return Long.compare(o1.getStartOffset(), o2.getStartOffset());
+                    }
+                });
+                // 3. if size <= 4, print all. if size > 4, print first 3 and last 1
+                int size = fileRangeDescs.size();
+                output.append(prefix).append("  ").append(beId).append(": ").append(size).append(" files\n");
+                if (size <= 4) {
+                    for (TFileRangeDesc file : fileRangeDescs) {
+                        output.append(prefix).append("    ").append(file.getPath()).append(" start: ")
+                                .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
+                    }
+                } else {
+                    for (int i = 0; i < 3; i++) {
+                        TFileRangeDesc file = fileRangeDescs.get(i);
+                        output.append(prefix).append("    ").append(file.getPath()).append(" start: ")
+                                .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
                     }
-                    TFileRangeDesc file = files.get(i);
-                    output.append(prefix).append("    ").append(file.getPath())
-                            .append(" start: ").append(file.getStartOffset())
-                            .append(" length: ").append(file.getSize())
-                            .append("\n");
+                    int other = size - 4;
+                    output.append(prefix).append("    ... other ").append(other).append(" files ...\n");
+                    TFileRangeDesc file = fileRangeDescs.get(size - 1);
+                    output.append(prefix).append("    ").append(file.getPath()).append(" start: ")
+                            .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
                 }
             }
         }
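
The explain-verbose change above groups scan ranges by backend id, sorts each backend's ranges by start offset, and truncates long listings: if a backend has at most four files it prints them all, otherwise it prints the first three, an "... other N files ..." marker, and the last one. The following C++ sketch reproduces that formatting logic with illustrative types in place of the Doris/Thrift classes (TScanRangeLocations, TFileRangeDesc); the paths and backend id are made up.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct FileRange {
    std::string path;
    int64_t start_offset;
    int64_t size;
};

void print_range(const FileRange& r) {
    std::cout << "    " << r.path << " start: " << r.start_offset
              << " length: " << r.size << "\n";
}

int main() {
    // 1. group file ranges by backend id.
    std::map<int64_t, std::vector<FileRange>> by_backend = {
            {10001,
             {{"s3://bucket/f1.parquet", 512, 128}, {"s3://bucket/f1.parquet", 0, 128},
              {"s3://bucket/f1.parquet", 256, 128}, {"s3://bucket/f1.parquet", 128, 128},
              {"s3://bucket/f2.parquet", 0, 64}, {"s3://bucket/f3.parquet", 0, 32}}}};

    for (auto& [be_id, ranges] : by_backend) {
        // 2. sort by file start offset.
        std::sort(ranges.begin(), ranges.end(), [](const FileRange& a, const FileRange& b) {
            return a.start_offset < b.start_offset;
        });
        std::cout << "  " << be_id << ": " << ranges.size() << " files\n";
        // 3. if size <= 4, print all; otherwise print the first 3 and the last 1.
        if (ranges.size() <= 4) {
            for (const auto& r : ranges) print_range(r);
        } else {
            for (size_t i = 0; i < 3; ++i) print_range(ranges[i]);
            std::cout << "    ... other " << ranges.size() - 4 << " files ...\n";
            print_range(ranges.back());
        }
    }
    return 0;
}
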

