You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/06/01 03:15:54 UTC
[doris] branch branch-1.2-lts updated: [branch1.2](parquet) optimize parquet reader profile (#20286)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
new 279b1f0842 [branch1.2](parquet) optimize parquet reader profile (#20286)
279b1f0842 is described below
commit 279b1f0842db417d42202094315d3622208dbddf
Author: Mingyu Chen <mo...@163.com>
AuthorDate: Thu Jun 1 11:15:47 2023 +0800
[branch1.2](parquet) optimize parquet reader profile (#20286)
Only for branch 1.2.
1. Optimize the parquet reader profile: report the open-reader time separately from the meta-parse time.
2. Optimize the result of `explain verbose` for the external file scan node (already on the master branch).
---
.github/actions/clang-format-lint-action | 2 +-
.github/workflows/title-checker.yml | 2 +-
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 9 ++++-
be/src/vec/exec/format/parquet/vparquet_reader.h | 2 +
be/src/vec/exec/scan/vfile_scanner.cpp | 2 +
be/src/vec/exec/scan/vfile_scanner.h | 1 +
be/src/vec/exec/scan/vscan_node.cpp | 2 +-
be/test/vec/exec/parquet/parquet_reader_test.cpp | 1 +
.../planner/external/ExternalFileScanNode.java | 46 +++++++++++++++++-----
9 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/.github/actions/clang-format-lint-action b/.github/actions/clang-format-lint-action
index c3b2c943e9..1566bcec08 160000
--- a/.github/actions/clang-format-lint-action
+++ b/.github/actions/clang-format-lint-action
@@ -1 +1 @@
-Subproject commit c3b2c943e924028b93a707a5b1b017976ab8d50c
+Subproject commit 1566bcec081dcb246ab02e7c5f9786c0b629dd4d
diff --git a/.github/workflows/title-checker.yml b/.github/workflows/title-checker.yml
index fac1bfdb01..2d0ffd8541 100644
--- a/.github/workflows/title-checker.yml
+++ b/.github/workflows/title-checker.yml
@@ -35,5 +35,5 @@ jobs:
- name: Check Title
uses: ./.github/actions/action-pr-title
with:
- regex: '\[([a-zA-Z0-9 \-_])+\]\(([a-zA-Z0-9 \-_])+\)(.*)'
+ regex: '\[([a-zA-Z0-9 \-_\.])+\]\(([a-zA-Z0-9 \-_])+\)(.*)'
github_token: ${{ github.token }}
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index d873260010..56775a53d3 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -73,6 +73,8 @@ void ParquetReader::_init_profile() {
ADD_CHILD_COUNTER(_profile, "ReadBytes", TUnit::BYTES, parquet_profile);
_parquet_profile.column_read_time =
ADD_CHILD_TIMER(_profile, "ColumnReadTime", parquet_profile);
+ _parquet_profile.open_reader_time =
+ ADD_CHILD_TIMER(_profile, "OpenReaderTime", parquet_profile);
_parquet_profile.parse_meta_time =
ADD_CHILD_TIMER(_profile, "ParseMetaTime", parquet_profile);
_parquet_profile.page_index_filter_time =
@@ -114,6 +116,7 @@ void ParquetReader::close() {
COUNTER_UPDATE(_parquet_profile.to_read_bytes, _statistics.read_bytes);
COUNTER_UPDATE(_parquet_profile.column_read_time, _statistics.column_read_time);
COUNTER_UPDATE(_parquet_profile.parse_meta_time, _statistics.parse_meta_time);
+ COUNTER_UPDATE(_parquet_profile.open_reader_time, _statistics.open_reader_time);
COUNTER_UPDATE(_parquet_profile.page_index_filter_time,
_statistics.page_index_filter_time);
COUNTER_UPDATE(_parquet_profile.row_group_filter_time,
@@ -140,15 +143,17 @@ void ParquetReader::close() {
Status ParquetReader::_open_file() {
if (_file_reader == nullptr) {
+ SCOPED_RAW_TIMER(&_statistics.open_reader_time);
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _scan_params, _scan_range.path,
_scan_range.start_offset,
_scan_range.file_size, 0, _file_reader));
+ RETURN_IF_ERROR(_file_reader->open());
}
if (_file_metadata == nullptr) {
- RETURN_IF_ERROR(_file_reader->open());
if (_file_reader->size() == 0) {
return Status::EndOfFile("open file failed, empty parquet file: " + _scan_range.path);
}
+ SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
RETURN_IF_ERROR(parse_thrift_footer(_file_reader.get(), _file_metadata));
}
return Status::OK();
@@ -161,8 +166,8 @@ std::vector<tparquet::KeyValue> ParquetReader::get_metadata_key_values() {
}
Status ParquetReader::open() {
- SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
RETURN_IF_ERROR(_open_file());
+ SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
_t_metadata = &_file_metadata->to_thrift();
return Status::OK();
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 81c0e2b28a..33aec14603 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -52,6 +52,7 @@ public:
int64_t parse_meta_time = 0;
int64_t row_group_filter_time = 0;
int64_t page_index_filter_time = 0;
+ int64_t open_reader_time = 0;
};
ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
@@ -128,6 +129,7 @@ private:
RuntimeProfile::Counter* decode_dict_time;
RuntimeProfile::Counter* decode_level_time;
RuntimeProfile::Counter* decode_null_map_time;
+ RuntimeProfile::Counter* open_reader_time;
};
Status _open_file();
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp
index bde638504e..23915c5c3f 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -73,6 +73,7 @@ Status VFileScanner::prepare(
_pre_filter_timer = ADD_TIMER(_parent->_scanner_profile, "FileScannerPreFilterTimer");
_convert_to_output_block_timer =
ADD_TIMER(_parent->_scanner_profile, "FileScannerConvertOuputBlockTime");
+ _file_counter = ADD_COUNTER(_parent->_scanner_profile, "FileReaderCounter", TUnit::UNIT);
if (vconjunct_ctx_ptr != nullptr) {
// Copy vconjunct_ctx_ptr from scan node to this scanner's _vconjunct_ctx.
@@ -465,6 +466,7 @@ Status VFileScanner::_get_next_reader() {
return Status::OK();
}
const TFileRangeDesc& range = _ranges[_next_range++];
+ COUNTER_UPDATE(_file_counter, 1);
// create reader for specific format
// TODO: add json, avro
diff --git a/be/src/vec/exec/scan/vfile_scanner.h b/be/src/vec/exec/scan/vfile_scanner.h
index 993e472576..506908486d 100644
--- a/be/src/vec/exec/scan/vfile_scanner.h
+++ b/be/src/vec/exec/scan/vfile_scanner.h
@@ -125,6 +125,7 @@ private:
RuntimeProfile::Counter* _fill_missing_columns_timer = nullptr;
RuntimeProfile::Counter* _pre_filter_timer = nullptr;
RuntimeProfile::Counter* _convert_to_output_block_timer = nullptr;
+ RuntimeProfile::Counter* _file_counter = nullptr;
private:
Status _init_expr_ctxes();
diff --git a/be/src/vec/exec/scan/vscan_node.cpp b/be/src/vec/exec/scan/vscan_node.cpp
index 0dd68ea0cb..eabdfc7dcb 100644
--- a/be/src/vec/exec/scan/vscan_node.cpp
+++ b/be/src/vec/exec/scan/vscan_node.cpp
@@ -152,7 +152,7 @@ Status VScanNode::_init_profile() {
runtime_profile()->add_rate_counter("TotalReadThroughput", _rows_read_counter);
_num_scanners = ADD_COUNTER(_runtime_profile, "NumScanners", TUnit::UNIT);
_get_next_timer = ADD_TIMER(_runtime_profile, "GetNextTime");
- _acquire_runtime_filter_timer = ADD_TIMER(_runtime_profile, "AcuireRuntimeFilterTime");
+ _acquire_runtime_filter_timer = ADD_TIMER(_runtime_profile, "AcquireRuntimeFilterTime");
// 2. counters for scanners
_scanner_profile.reset(new RuntimeProfile("VScanner"));
diff --git a/be/test/vec/exec/parquet/parquet_reader_test.cpp b/be/test/vec/exec/parquet/parquet_reader_test.cpp
index 078736d9c3..373c93d0c4 100644
--- a/be/test/vec/exec/parquet/parquet_reader_test.cpp
+++ b/be/test/vec/exec/parquet/parquet_reader_test.cpp
@@ -91,6 +91,7 @@ TEST_F(ParquetReaderTest, normal) {
auto slot_descs = desc_tbl->get_tuple_descriptor(0)->slots();
LocalFileReader* reader =
new LocalFileReader("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", 0);
+ reader->open();
cctz::time_zone ctz;
TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
index 9c65533cc3..1aadb1e5c6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/ExternalFileScanNode.java
@@ -66,11 +66,15 @@ import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TUniqueId;
import com.google.common.base.Preconditions;
+import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import java.util.Collections;
+import java.util.Comparator;
import java.util.List;
import java.util.Map;
@@ -613,18 +617,40 @@ public class ExternalFileScanNode extends ExternalScanNode {
if (detailLevel == TExplainLevel.VERBOSE) {
output.append(prefix).append("backends:").append("\n");
+ Multimap<Long, TFileRangeDesc> scanRangeLocationsMap = ArrayListMultimap.create();
+ // 1. group by backend id
for (TScanRangeLocations locations : scanRangeLocations) {
- output.append(prefix).append(" ").append(locations.getLocations().get(0).backend_id).append("\n");
- List<TFileRangeDesc> files = locations.getScanRange().getExtScanRange().getFileScanRange().getRanges();
- for (int i = 0; i < 3; i++) {
- if (i >= files.size()) {
- break;
+ scanRangeLocationsMap.putAll(locations.getLocations().get(0).backend_id,
+ locations.getScanRange().getExtScanRange().getFileScanRange().getRanges());
+ }
+ for (long beId : scanRangeLocationsMap.keySet()) {
+ List<TFileRangeDesc> fileRangeDescs = Lists.newArrayList(scanRangeLocationsMap.get(beId));
+ // 2. sort by file start offset
+ Collections.sort(fileRangeDescs, new Comparator<TFileRangeDesc>() {
+ @Override
+ public int compare(TFileRangeDesc o1, TFileRangeDesc o2) {
+ return Long.compare(o1.getStartOffset(), o2.getStartOffset());
+ }
+ });
+ // 3. if size <= 4, print all. if size > 4, print first 3 and last 1
+ int size = fileRangeDescs.size();
+ output.append(prefix).append(" ").append(beId).append(": ").append(size).append(" files\n");
+ if (size <= 4) {
+ for (TFileRangeDesc file : fileRangeDescs) {
+ output.append(prefix).append(" ").append(file.getPath()).append(" start: ")
+ .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
+ }
+ } else {
+ for (int i = 0; i < 3; i++) {
+ TFileRangeDesc file = fileRangeDescs.get(i);
+ output.append(prefix).append(" ").append(file.getPath()).append(" start: ")
+ .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
}
- TFileRangeDesc file = files.get(i);
- output.append(prefix).append(" ").append(file.getPath())
- .append(" start: ").append(file.getStartOffset())
- .append(" length: ").append(file.getSize())
- .append("\n");
+ int other = size - 4;
+ output.append(prefix).append(" ... other ").append(other).append(" files ...\n");
+ TFileRangeDesc file = fileRangeDescs.get(size - 1);
+ output.append(prefix).append(" ").append(file.getPath()).append(" start: ")
+ .append(file.getStartOffset()).append(" length: ").append(file.getSize()).append("\n");
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org