Posted to commits@doris.apache.org by mo...@apache.org on 2020/06/13 08:25:53 UTC

[incubator-doris] branch master updated: [Doc] Add docs to OLAP_SCAN_NODE query profile (#3808)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b8ee84a  [Doc] Add docs to OLAP_SCAN_NODE query profile (#3808)
b8ee84a is described below

commit b8ee84a120813181733c6ac8cd5b9fd3c9b2f674
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Sat Jun 13 16:25:40 2020 +0800

    [Doc] Add docs to OLAP_SCAN_NODE query profile (#3808)
---
 be/src/exec/olap_scan_node.cpp                     |  33 ++----
 be/src/exec/olap_scan_node.h                       |   1 +
 be/src/exec/olap_scanner.cpp                       |   3 +-
 be/src/exec/scan_node.cpp                          |   3 -
 be/src/exec/scan_node.h                            |   5 -
 be/src/olap/file_stream.cpp                        |   2 +-
 be/src/olap/olap_common.h                          |   7 +-
 be/src/olap/rowset/beta_rowset_reader.cpp          |   2 +-
 be/src/olap/rowset/segment_v2/page_io.cpp          |   2 +
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |   4 +-
 docs/en/administrator-guide/running-profile.md     | 118 +++++++++++++++++---
 docs/zh-CN/administrator-guide/running-profile.md  | 124 ++++++++++++++++++---
 .../apache/doris/common/util/RuntimeProfile.java   |  20 ++--
 13 files changed, 247 insertions(+), 77 deletions(-)

diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp
index db3fe88..06f64f8 100644
--- a/be/src/exec/olap_scan_node.cpp
+++ b/be/src/exec/olap_scan_node.cpp
@@ -92,32 +92,23 @@ void OlapScanNode::_init_counter(RuntimeState* state) {
     ADD_TIMER(_runtime_profile, "ShowHintsTime");
 
     _reader_init_timer = ADD_TIMER(_runtime_profile, "ReaderInitTime");
-    _read_compressed_counter =
-        ADD_COUNTER(_runtime_profile, "CompressedBytesRead", TUnit::BYTES);
-    _read_uncompressed_counter =
-        ADD_COUNTER(_runtime_profile, "UncompressedBytesRead", TUnit::BYTES);
+    _read_compressed_counter = ADD_COUNTER(_runtime_profile, "CompressedBytesRead", TUnit::BYTES);
+    _read_uncompressed_counter = ADD_COUNTER(_runtime_profile, "UncompressedBytesRead", TUnit::BYTES);
     _block_load_timer = ADD_TIMER(_runtime_profile, "BlockLoadTime");
-    _block_load_counter =
-        ADD_COUNTER(_runtime_profile, "BlocksLoad", TUnit::UNIT);
-    _block_fetch_timer =
-        ADD_TIMER(_runtime_profile, "BlockFetchTime");
-    _raw_rows_counter =
-        ADD_COUNTER(_runtime_profile, "RawRowsRead", TUnit::UNIT);
+    _block_load_counter = ADD_COUNTER(_runtime_profile, "BlocksLoad", TUnit::UNIT);
+    _block_fetch_timer = ADD_TIMER(_runtime_profile, "BlockFetchTime");
+    _raw_rows_counter = ADD_COUNTER(_runtime_profile, "RawRowsRead", TUnit::UNIT);
     _block_convert_timer = ADD_TIMER(_runtime_profile, "BlockConvertTime");
     _block_seek_timer = ADD_TIMER(_runtime_profile, "BlockSeekTime");
     _block_seek_counter = ADD_COUNTER(_runtime_profile, "BlockSeekCount", TUnit::UNIT);
 
-    _rows_vec_cond_counter =
-        ADD_COUNTER(_runtime_profile, "RowsVectorPredFiltered", TUnit::UNIT);
-    _vec_cond_timer =
-        ADD_TIMER(_runtime_profile, "VectorPredEvalTime");
+    _rows_vec_cond_counter = ADD_COUNTER(_runtime_profile, "RowsVectorPredFiltered", TUnit::UNIT);
+    _vec_cond_timer = ADD_TIMER(_runtime_profile, "VectorPredEvalTime");
 
-    _stats_filtered_counter =
-        ADD_COUNTER(_runtime_profile, "RowsStatsFiltered", TUnit::UNIT);
-    _bf_filtered_counter =
-        ADD_COUNTER(_runtime_profile, "RowsBloomFilterFiltered", TUnit::UNIT);
-    _del_filtered_counter =
-        ADD_COUNTER(_runtime_profile, "RowsDelFiltered", TUnit::UNIT);
+    _stats_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsStatsFiltered", TUnit::UNIT);
+    _bf_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsBloomFilterFiltered", TUnit::UNIT);
+    _del_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsDelFiltered", TUnit::UNIT);
+    _key_range_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsKeyRangeFiltered", TUnit::UNIT);
 
     _io_timer = ADD_TIMER(_runtime_profile, "IOTimer");
     _decompressor_timer = ADD_TIMER(_runtime_profile, "DecompressorTimer");
@@ -128,7 +119,7 @@ void OlapScanNode::_init_counter(RuntimeState* state) {
     _total_pages_num_counter = ADD_COUNTER(_runtime_profile, "TotalPagesNum", TUnit::UNIT);
     _cached_pages_num_counter = ADD_COUNTER(_runtime_profile, "CachedPagesNum", TUnit::UNIT);
 
-    _bitmap_index_filter_counter = ADD_COUNTER(_runtime_profile, "BitmapIndexFilterCount", TUnit::UNIT);
+    _bitmap_index_filter_counter = ADD_COUNTER(_runtime_profile, "RowsBitmapIndexFiltered", TUnit::UNIT);
     _bitmap_index_filter_timer = ADD_TIMER(_runtime_profile, "BitmapIndexFilterTimer");
 
     _num_scanners = ADD_COUNTER(_runtime_profile, "NumScanners", TUnit::UNIT);
diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h
index c91eb34..335a100 100644
--- a/be/src/exec/olap_scan_node.h
+++ b/be/src/exec/olap_scan_node.h
@@ -271,6 +271,7 @@ private:
     RuntimeProfile::Counter* _stats_filtered_counter = nullptr;
     RuntimeProfile::Counter* _bf_filtered_counter = nullptr;
     RuntimeProfile::Counter* _del_filtered_counter = nullptr;
+    RuntimeProfile::Counter* _key_range_filtered_counter = nullptr;
 
     RuntimeProfile::Counter* _block_seek_timer = nullptr;
     RuntimeProfile::Counter* _block_seek_counter = nullptr;
diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp
index f0af8e9..394a0d8 100644
--- a/be/src/exec/olap_scanner.cpp
+++ b/be/src/exec/olap_scanner.cpp
@@ -469,13 +469,14 @@ void OlapScanner::update_counter() {
     COUNTER_UPDATE(_parent->_stats_filtered_counter, _reader->stats().rows_stats_filtered);
     COUNTER_UPDATE(_parent->_bf_filtered_counter, _reader->stats().rows_bf_filtered);
     COUNTER_UPDATE(_parent->_del_filtered_counter, _reader->stats().rows_del_filtered);
+    COUNTER_UPDATE(_parent->_key_range_filtered_counter, _reader->stats().rows_key_range_filtered);
 
     COUNTER_UPDATE(_parent->_index_load_timer, _reader->stats().index_load_ns);
 
     COUNTER_UPDATE(_parent->_total_pages_num_counter, _reader->stats().total_pages_num);
     COUNTER_UPDATE(_parent->_cached_pages_num_counter, _reader->stats().cached_pages_num);
 
-    COUNTER_UPDATE(_parent->_bitmap_index_filter_counter, _reader->stats().bitmap_index_filter_count);
+    COUNTER_UPDATE(_parent->_bitmap_index_filter_counter, _reader->stats().rows_bitmap_index_filtered);
     COUNTER_UPDATE(_parent->_bitmap_index_filter_timer, _reader->stats().bitmap_index_filter_timer);
     COUNTER_UPDATE(_parent->_block_seek_counter, _reader->stats().block_seek_num);
 
diff --git a/be/src/exec/scan_node.cpp b/be/src/exec/scan_node.cpp
index 6a9ae67..6021abf 100644
--- a/be/src/exec/scan_node.cpp
+++ b/be/src/exec/scan_node.cpp
@@ -29,7 +29,6 @@ const string ScanNode::_s_materialize_tuple_timer = "MaterializeTupleTime(*)";
 const string ScanNode::_s_per_read_thread_throughput_counter =
     "PerReadThreadRawHdfsThroughput";
 const string ScanNode::_s_num_disks_accessed_counter = "NumDiskAccess";
-const string ScanNode::_s_scan_ranges_complete_counter = "ScanRangesComplete";
 const string ScanNode::_s_scanner_thread_counters_prefix = "ScannerThreads";
 const string ScanNode::_s_scanner_thread_total_wallclock_time =
     "ScannerThreadsTotalWallClockTime";
@@ -59,8 +58,6 @@ Status ScanNode::prepare(RuntimeState* state) {
                                  _bytes_read_counter,
                                  _read_timer),
             "");
-    _scan_ranges_complete_counter =
-        ADD_COUNTER(runtime_profile(), _s_scan_ranges_complete_counter, TUnit::UNIT);
     _num_disks_accessed_counter =
         ADD_COUNTER(runtime_profile(), _s_num_disks_accessed_counter, TUnit::UNIT);
 
diff --git a/be/src/exec/scan_node.h b/be/src/exec/scan_node.h
index 0ca7bd1..54ef5dc 100644
--- a/be/src/exec/scan_node.h
+++ b/be/src/exec/scan_node.h
@@ -108,9 +108,6 @@ public:
     RuntimeProfile::Counter* materialize_tuple_timer() const {
         return _materialize_tuple_timer;
     }
-    RuntimeProfile::Counter* scan_ranges_complete_counter() const {
-        return _scan_ranges_complete_counter;
-    }
     RuntimeProfile::ThreadCounters* scanner_thread_counters() const {
         return _scanner_thread_counters;
     }
@@ -123,7 +120,6 @@ public:
     static const std::string _s_per_read_thread_throughput_counter;
     static const std::string _s_num_disks_accessed_counter;
     static const std::string _s_materialize_tuple_timer;
-    static const std::string _s_scan_ranges_complete_counter;
     static const std::string _s_scanner_thread_counters_prefix;
     static const std::string _s_scanner_thread_total_wallclock_time;
     static const std::string _s_average_io_mgr_queue_capacity;
@@ -140,7 +136,6 @@ protected:
     RuntimeProfile::Counter* _per_read_thread_throughput_counter;
     RuntimeProfile::Counter* _num_disks_accessed_counter;
     RuntimeProfile::Counter* _materialize_tuple_timer;  // time writing tuple slots
-    RuntimeProfile::Counter* _scan_ranges_complete_counter;
     // Aggregated scanner thread counters
     RuntimeProfile::ThreadCounters* _scanner_thread_counters;
     RuntimeProfile::Counter* _num_scanner_threads_started_counter;
diff --git a/be/src/olap/file_stream.cpp b/be/src/olap/file_stream.cpp
index 08ebf2a..d8db9c2 100755
--- a/be/src/olap/file_stream.cpp
+++ b/be/src/olap/file_stream.cpp
@@ -98,9 +98,9 @@ OLAPStatus ReadOnlyFileStream::_assure_data() {
                 OLAP_LOG_WARNING("fail to decompress err=%d", res);
                 return res;
             }
-            _stats->uncompressed_bytes_read += _compressed_helper->limit();
         }
     }
+    _stats->uncompressed_bytes_read += _compressed_helper->limit();
 
     _uncompressed = _compressed_helper;
     _current_compress_position = file_cursor_used;
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 3a62b63..84e5fd1 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -243,9 +243,9 @@ struct OlapReaderStatistics {
     // total read bytes in memory
     int64_t bytes_read = 0;
 
-    int64_t block_load_ns = 0;
+    int64_t block_load_ns = 0; 
     int64_t blocks_load = 0;
-    int64_t block_fetch_ns = 0;
+    int64_t block_fetch_ns = 0;  // time of rowset reader's `next_batch()` call
     int64_t block_seek_num = 0;
     int64_t block_seek_ns = 0;
     int64_t block_convert_ns = 0;
@@ -255,6 +255,7 @@ struct OlapReaderStatistics {
     int64_t rows_vec_cond_filtered = 0;
     int64_t vec_cond_ns = 0;
 
+    int64_t rows_key_range_filtered = 0;
     int64_t rows_stats_filtered = 0;
     int64_t rows_bf_filtered = 0;
     int64_t rows_del_filtered = 0;
@@ -264,7 +265,7 @@ struct OlapReaderStatistics {
     int64_t total_pages_num = 0;
     int64_t cached_pages_num = 0;
 
-    int64_t bitmap_index_filter_count = 0;
+    int64_t rows_bitmap_index_filtered = 0;
     int64_t bitmap_index_filter_timer = 0;
 };
 
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp
index 9098500..148de34 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -114,10 +114,10 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) {
 }
 
 OLAPStatus BetaRowsetReader::next_block(RowBlock** block) {
+    SCOPED_RAW_TIMER(&_stats->block_fetch_ns);
     // read next input block
     _input_block->clear();
     {
-        SCOPED_RAW_TIMER(&_stats->block_fetch_ns);
         auto s = _iterator->next_batch(_input_block.get());
         if (!s.ok()) {
             if (s.is_end_of_file()) {
diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp
index a9c79ae..4871b90 100644
--- a/be/src/olap/rowset/segment_v2/page_io.cpp
+++ b/be/src/olap/rowset/segment_v2/page_io.cpp
@@ -191,6 +191,8 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts,
         page = std::move(decompressed_page);
         page_slice = Slice(page.get(), footer->uncompressed_size() + footer_size + 4);
         opts.stats->uncompressed_bytes_read += page_slice.size;
+    } else {
+        opts.stats->uncompressed_bytes_read += body_size;
     }
 
     *body = Slice(page_slice.data, page_slice.size - 4 - footer_size);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 50e81d0..645784c 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -166,7 +166,9 @@ Status SegmentIterator::_get_row_ranges_by_keys() {
         RowRanges::ranges_union(result_ranges, row_range, &result_ranges);
     }
     // pre-condition: _row_ranges == [0, num_rows)
+    size_t pre_size = _row_bitmap.cardinality();
     _row_bitmap = RowRanges::ranges_to_roaring(result_ranges);
+    _opts.stats->rows_key_range_filtered += (pre_size - _row_bitmap.cardinality());
     DorisMetrics::instance()->segment_rows_by_short_key.increment(_row_bitmap.cardinality());
 
     return Status::OK();
@@ -301,7 +303,7 @@ Status SegmentIterator::_apply_bitmap_index() {
         }
     }
     _col_predicates = std::move(remaining_predicates);
-    _opts.stats->bitmap_index_filter_count += (input_rows - _row_bitmap.cardinality());
+    _opts.stats->rows_bitmap_index_filtered += (input_rows - _row_bitmap.cardinality());
     return Status::OK();
 }
 
diff --git a/docs/en/administrator-guide/running-profile.md b/docs/en/administrator-guide/running-profile.md
index e411bf5..c038867 100644
--- a/docs/en/administrator-guide/running-profile.md
+++ b/docs/en/administrator-guide/running-profile.md
@@ -89,7 +89,7 @@ The fragment ID is listed here; ``` hostname ``` show the be node executing the
 
 There is a lot of statistical information collected at the BE, so the corresponding meanings of the profile fields are listed below:
 
-#### Fragment
+#### `Fragment`
    - AverageThreadTokens: Number of threads used to execute fragment, excluding the usage of thread pool
    - PeakReservation: Peak memory used by buffer pool
    - MemoryLimit: Memory limit at query
@@ -98,14 +98,14 @@ There are many statistical information collected at BE.  so we list the correspo
    - BytesReceived: Size of bytes received by network
    - DeserializeRowBatchTimer: Time consuming to receive data deserialization
 
-#### DataStreamSender
+#### `DataStreamSender`
  - BytesSent: Total bytes data sent
  - IgnoreRows: Rows filtered
  - OverallThroughput: Total throughput = BytesSent / Time
  - SerializeBatchTime: Sending data serialization time
  - UncompressedRowBatchSize: Size of rowbatch before sending data compression
 
-#### SORT_NODE
+#### `SORT_NODE`
   - InMemorySortTime: In memory sort time
   - InitialRunsCreated: Number of initialize sort run
   - MergeGetNext: Time cost of MergeSort from multiple sort_run to get the next batch (only show spilled disk)
@@ -113,7 +113,7 @@ There are many statistical information collected at BE.  so we list the correspo
   - SortDataSize: Total sorted data
   - TotalMergesPerformed: Number of external sort merges
 
-#### AGGREGATION_NODE:
+#### `AGGREGATION_NODE`
   - PartitionsCreated: Number of partition split by aggregate
   - GetResultsTime: Time to get aggregate results from each partition
   - HTResizeTime:  Time spent in resizing hashtable
@@ -127,17 +127,104 @@ There are many statistical information collected at BE.  so we list the correspo
   - HashProbe:  Number of hashtable probe
   - HashTravelLength:  The number of steps moved when hashtable queries
 
-#### OLAP_SCAN_NODE:
- - BytesRead: Total data
- - TotalReadThroughput:Throughput = BytesRead / Time
- - TabletCount: Number of scanned tablets
- - RowsPushedCondFiltered:Number of filters pushed down
- - RawRowsRead: Number of rows read
- - RowsReturned: Number of rows returned by the node
- - RowsReturnedRate: Rate of rows returned
- - PeakMemoryUsage: Peak memory usage of the node
-
-#### Buffer pool:
+#### `OLAP_SCAN_NODE`
+
+The `OLAP_SCAN_NODE` is responsible for specific data scanning tasks. One `OLAP_SCAN_NODE` will generate one or more `OlapScanner` threads. Each Scanner thread is responsible for scanning part of the data.
+
+Some or all of the predicate conditions in the query will be pushed to `OLAP_SCAN_NODE`. Some of these predicate conditions will continue to be pushed down to the storage engine in order to use the storage engine's index for data filtering. The other part will be kept in `OLAP_SCAN_NODE` to filter the data returned from the storage engine.
+
+The profile of a typical `OLAP_SCAN_NODE` is as follows. Some indicators will have different meanings depending on the storage format (V1 or V2).
+
+```
+OLAP_SCAN_NODE (id=0): (Active: 4.050ms, non-child: 35.68%)
+   -BitmapIndexFilterTimer: 0.000ns # Time spent filtering data with the bitmap index.
+   -BlockConvertTime: 7.433ms   # Time spent converting a vectorized block into the row-oriented RowBlock. The vectorized block is VectorizedRowBatch in V1 and RowBlockV2 in V2.
+   -BlockFetchTime: 36.934ms    # Time for the Rowset Reader to get blocks.
+   -BlockLoadTime: 23.368ms # Time for SegmentReader (V1) or SegmentIterator (V2) to get a block.
+   -BlockSeekCount: 0   # Number of block seeks while reading segments.
+   -BlockSeekTime: 3.062ms  # Time spent on block seeks while reading segments.
+   -BlocksLoad: 221 # number of blocks read
+   -BytesRead: 6.59 MB  # The amount of data read from the data file. Assuming that 10 32-bit integers are read, the amount of data is 10 * 4B = 40 Bytes. This data only represents the fully expanded size of the data in memory, and does not represent the actual IO size.
+   -CachedPagesNum: 0   # In V2 only, when PageCache is enabled, the number of pages that hit Cache.
+   -CompressedBytesRead: 1.36 MB    # In V1, the size of the data read from the file before decompression. In V2, the uncompressed size of pages read that did not hit the PageCache.
+   -DecompressorTimer: 4.194ms  # Time spent decompressing data.
+   -IOTimer: 1.404ms    # IO time spent actually reading data from the operating system.
+   -IndexLoadTime: 1.521ms  # In V1 only, the time spent reading the Index Stream.
+   -NumDiskAccess: 6    # The number of disks involved in this ScanNode.
+   -NumScanners: 25 # The number of Scanners generated by this ScanNode.
+   -PeakMemoryUsage: 0  # meaningless
+   -PerReadThreadRawHdfsThroughput: 0.00 /sec   # meaningless
+   -RawRowsRead: 141.71K    # The number of raw rows read in the storage engine. See below for details.
+   -ReaderInitTime: 16.515ms    # OlapScanner time to initialize Reader. V1 includes the time to form MergeHeap. V2 includes the time to generate Iterators at all levels and read the first block.
+   -RowsBitmapIndexFiltered: 0   # Number of rows filtered by the bitmap index.
+   -RowsBloomFilterFiltered: 0  # In V2 only, the number of rows filtered by the BloomFilter index.
+   -RowsDelFiltered: 0  # In V1, the number of rows filtered by the delete conditions. In V2, this also includes the rows filtered by the BloomFilter and some predicate conditions.
+   -RowsPushedCondFiltered: 0   # Number of rows filtered by predicates passed down at runtime, such as conditions passed from the BuildTable to the ProbeTable in a Join. This value is inaccurate, because filtering stops once its effect turns out to be poor.
+   -RowsRead: 132.78K   # The number of rows returned from the storage engine to the Scanner, excluding the number of rows filtered by the Scanner.
+   -RowsReturned: 132.78K   # The number of rows returned from ScanNode to the upper node.
+   -RowsReturnedRate: 32.78 M/sec   # RowsReturned/ActiveTime
+   -RowsStatsFiltered: 0    # In V2, the number of rows filtered by predicate conditions using the Zonemap. In V1, this also includes the rows filtered by the BloomFilter.
+   -RowsVectorPredFiltered: 0   # The number of rows filtered by the vectorized conditional filtering operation.
+   -ScanTime: 49.239ms # Accumulated time of the Scanner's get_next() calls.
+   -ScannerThreadsInvoluntaryContextSwitches: 0 # meaningless
+   -ScannerThreadsTotalWallClockTime: 0.000ns   # meaningless
+     -MaterializeTupleTime(*): 0.000ns  # meaningless
+     -ScannerThreadsSysTime: 0.000ns    # meaningless
+     -ScannerThreadsUserTime: 0.000ns   # meaningless
+   -ScannerThreadsVoluntaryContextSwitches: 0   # meaningless
+   -ShowHintsTime: 0.000ns  # Meaningless in V2. In V1, part of the data is read to split ScanRanges.
+   -TabletCount: 25 # The number of tablets involved in this ScanNode.
+   -TotalPagesNum: 0    # In V2 only, the total number of pages read.
+   -TotalRawReadTime(*): 0.000ns    # meaningless
+   -TotalReadThroughput: 0.00 /sec  # meaningless
+   -UncompressedBytesRead: 4.28 MB  # In V1, the decompressed size of the data files read (if a file does not need decompression, the file size is counted directly). In V2, only the decompressed size of pages that miss the PageCache is counted (if a page does not need decompression, the page size is counted directly).
+   -VectorPredEvalTime: 0.000ns # Time spent on vectorized predicate evaluation.
+```
+
+* Some notes on the number of rows in Profile
+
+    The metrics related to the number of rows in the Profile are:
+    
+    * RowsKeyRangeFiltered
+    * RowsBitmapIndexFiltered
+    * RowsBloomFilterFiltered
+    * RowsStatsFiltered
+    * RowsDelFiltered
+    * RawRowsRead
+    * RowsRead
+    * RowsReturned
+
+    The predicate conditions in a query are applied partly in the storage engine and partly in the Scanner. Among the metrics above, the `Rows***Filtered` group describes the number of rows filtered in the storage engine, while the last three metrics describe the number of rows processed in the Scanner.
+
+    The following only describes the process of reading data in the Segment V2 format. In the Segment V1 format, the meanings of these metrics are slightly different.
+
+    When reading a V2 format segment, the data is first filtered by the key range (the query range composed of the prefix key), and the number of filtered rows is recorded in `RowsKeyRangeFiltered`. After that, the data is filtered using the Bitmap index, and the filtered rows are recorded in `RowsBitmapIndexFiltered`. After that, the data is filtered using the BloomFilter index and recorded in `RowsBloomFilterFiltered`. The value of `RowsBloomFilterFiltered` is the difference between the total number of rows in the Segment (not the number of rows left after Bitmap index filtering) and the number of rows remaining after BloomFilter filtering, so the rows filtered by the BloomFilter may overlap with the rows filtered by the Bitmap index.
+
+    `RowsStatsFiltered` records the number of rows filtered by other predicate conditions. This includes the predicate conditions pushed down to the storage engine and the Delete condition in the storage engine.
+    
+    `RowsDelFiltered` contains the number of filtered rows recorded by `RowsBloomFilterFiltered` and `RowsStatsFiltered`.
+    
+    `RawRowsRead` is the number of rows that need to be read after the above filtering. `RowsRead` is the number of rows actually returned to the Scanner. `RowsRead` is usually smaller than `RawRowsRead`, because the data may go through a round of aggregation on its way from the storage engine to the Scanner.
+    
+    `RowsReturned` is the number of rows that the ScanNode eventually returns to the upper node. `RowsReturned` is usually smaller than `RowsRead`, because some predicate conditions are not pushed down to the storage engine and are applied in the Scanner instead.
+
+    Through the above metrics, you can roughly analyze how many rows the storage engine processed and how many rows remain after filtering. Through the `Rows***Filtered` group of metrics, you can also tell whether the query conditions were pushed down to the storage engine and how effective the different indexes were.
+
+    If the gap between `RawRowsRead` and `RowsRead` is large, a large number of rows were aggregated, and the aggregation may be time-consuming. If the gap between `RowsRead` and `RowsReturned` is large, many rows were filtered in the Scanner, which indicates that many highly selective conditions were not pushed down to the storage engine; filtering in the Scanner is less efficient than filtering in the storage engine.
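+
+    For instance, reading the three Scanner-side counters from the sample profile above gives a quick sanity check (a rough reading only, following the rules described in this section):
+
+    ```
+    RawRowsRead:  141.71K
+    RowsRead:     132.78K   # ~9K fewer rows, most likely merged by storage-engine aggregation
+    RowsReturned: 132.78K   # equal to RowsRead, so no additional rows were filtered in the Scanner
+    ```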
+
+* Simple analysis of Scan Node Profile
+
+    OlapScanNode's Profile is usually used to analyze the efficiency of data scanning. In addition to the row-count metrics described above, which can be used to infer predicate pushdown and index usage, the following aspects are also useful for a quick analysis.
+    
+    * First of all, many indicators, such as `IOTimer` and `BlockFetchTime`, are accumulated across all Scanner threads, so their values may be relatively large. Also, because Scanner threads read data asynchronously, these accumulated indicators only reflect the accumulated working time of the Scanners and do not directly represent the time cost of the ScanNode. The proportion of time spent by the ScanNode in the entire query plan is the value recorded in the `Active` field. Sometimes, for example, `IOTimer` shows tens of seconds while `Active` is only a few seconds. This is usually because: 1. `IOTimer` is the accumulated time of many Scanners, and the number of Scanners is large. 2. The upper-level node is time-consuming. For example, if the upper node takes 100 seconds while the underlying ScanNode only needs 10 seconds, the value reflected in `Active` may be only a few milliseconds, because while the upper node is processing data, the ScanNode has already scanned and prepared the data asynchronously. When the upper node fetches data from the ScanNode, the data is already prepared, so the Active time is very short.
+    * `IOTimer` is the accumulated IO time of all Scanner threads, so it directly reflects how much time was spent on IO operations.
+    * `NumScanners` indicates the number of Scanner threads. Too many or too few threads will affect query efficiency. Some accumulated indicators can also be divided by the number of threads to roughly estimate the per-thread cost (see the example after this list).
+    * `TabletCount` represents the number of tablets that need to be scanned. An excessively large number may imply a large number of random reads and data merge operations.
+    * `UncompressedBytesRead` indirectly reflects the amount of data read. A large value indicates that there may be a lot of IO.
+    * `CachedPagesNum` and `TotalPagesNum`: for the V2 format, these show how well the PageCache is hit. The higher the hit rate, the less time IO and decompression take.
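+
+    For instance, with the sample profile above, a couple of back-of-the-envelope estimates (rough illustrations only, not authoritative formulas) would be:
+
+    ```
+    Average IO time per Scanner thread = IOTimer / NumScanners = 1.404ms / 25 ≈ 0.06ms
+    PageCache hit rate = CachedPagesNum / TotalPagesNum (both are 0 in this sample, so the ratio is not meaningful here)
+    ```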
+
+#### `Buffer pool`
  - AllocTime: Memory allocation time
  - CumulativeAllocationBytes: Cumulative amount of memory allocated
  - CumulativeAllocations: Cumulative number of memory allocations
@@ -145,3 +232,4 @@ There are many statistical information collected at BE.  so we list the correspo
  - PeakUnpinnedBytes: Amount of memory data of unpin
  - PeakUsedReservation: Peak usage of reservation
  - ReservationLimit: Limit of reservation of bufferpool
+
diff --git a/docs/zh-CN/administrator-guide/running-profile.md b/docs/zh-CN/administrator-guide/running-profile.md
index a662d30..34cce34 100644
--- a/docs/zh-CN/administrator-guide/running-profile.md
+++ b/docs/zh-CN/administrator-guide/running-profile.md
@@ -84,10 +84,11 @@ Query:
            - RowsReturnedRate: 811
 ```
 这里列出了Fragment的ID;```hostname```指的是执行Fragment的BE节点;```Active:10s270ms```表示该节点的执行总时间;```non-child: 0.14%```表示执行节点自身的执行时间,不包含子节点的执行时间。后续依次打印子节点的统计信息,**这里可以通过缩进区分节点之间的父子关系**。
+
 ## Profile参数解析
 BE端收集的统计信息较多,下面列出了各个参数的对应含义:
 
-#### Fragment
+#### `Fragment`
    - AverageThreadTokens: 执行Fragment使用线程数目,不包含线程池的使用情况
    - Buffer Pool PeakReservation: Buffer Pool使用的内存的峰值
    - MemoryLimit: 查询时的内存限制
@@ -95,20 +96,22 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义:
    - RowsProduced: 处理列的行数
    - BytesReceived: 通过网络接收的Bytes大小
    - DeserializeRowBatchTimer: 反序列化的耗时
-#### BlockMgr:
+
+#### `BlockMgr`
   - BlocksCreated: 落盘时创建的Blocks数目
   - BlocksRecycled: 重用的Blocks数目
   - BytesWritten: 总的落盘写数据量
   - MaxBlockSize: 单个Block的大小
   - TotalReadBlockTime: 读Block的总耗时
-#### DataStreamSender
+
+#### `DataStreamSender`
    - BytesSent: 发送的总数据量 = 接受者 * 发送数据量
    - IgnoreRows: 过滤的行数
    - OverallThroughput: 总的吞吐量 = BytesSent / 时间
    - SerializeBatchTime: 发送数据序列化消耗的时间
    - UncompressedRowBatchSize: 发送数据压缩前的RowBatch的大小
 
- #### SORT_NODE
+#### `SORT_NODE`
   - InMemorySortTime: 内存之中的排序耗时
   - InitialRunsCreated: 初始化排序的趟数(如果内存排序的话,该数为1)
   - SortDataSize: 总的排序数据量
@@ -116,7 +119,7 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义:
   - MergeGetNextBatch: MergeSort提取下一个sort_run的batch的耗时 (仅在落盘时计时)
   - TotalMergesPerformed: 进行外排merge的次数
  
-#### AGGREGATION_NODE:
+#### `AGGREGATION_NODE`
   - PartitionsCreated: 聚合查询拆分成Partition的个数
   - GetResultsTime: 从各个partition之中获取聚合结果的时间
   - HTResizeTime:  HashTable进行resize消耗的时间
@@ -130,18 +133,104 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义:
   - HashProbe:  HashTable查询的次数
   - HashTravelLength:  HashTable查询时移动的步数
 
-#### OLAP_SCAN_NODE:
- - RowsProduced: 生成结果的行数
- - BytesRead: scan node 扫描数据的总量
- - TotalReadThroughput:吞吐量
- - TabletCount: tablet 的数目
- - RowsPushedCondFiltered:下推的过滤器
- - RawRowsRead: 实际读取的行数
- - RowsReturned: 该节点返回的行数
- - RowsReturnedRate: 返回行数的速率
- - PeakMemoryUsage 内存使用的峰值  
-
-#### Buffer pool:
+#### `OLAP_SCAN_NODE`
+
+`OLAP_SCAN_NODE` 节点负责具体的数据扫描任务。一个 `OLAP_SCAN_NODE` 会生成一个或多个 `OlapScanner` 线程。每个 Scanner 线程负责扫描部分数据。
+
+查询中的部分或全部谓词条件会推送给 `OLAP_SCAN_NODE`。这些谓词条件中一部分会继续下推给存储引擎,以便利用存储引擎的索引进行数据过滤。另一部分会保留在 `OLAP_SCAN_NODE` 中,用于过滤从存储引擎中返回的数据。
+
+一个典型的 `OLAP_SCAN_NODE` 节点的 Profile 如下。部分指标会因存储格式的不同(V1 或 V2)而有不同含义。
+
+```
+OLAP_SCAN_NODE (id=0):(Active: 4.050ms, non-child: 35.68%)
+   - BitmapIndexFilterTimer: 0.000ns    # 利用 bitmap 索引过滤数据的耗时。
+   - BlockConvertTime: 7.433ms  # 将向量化Block转换为行结构的 RowBlock 的耗时。向量化 Block 在 V1 中为 VectorizedRowBatch,V2中为 RowBlockV2。
+   - BlockFetchTime: 36.934ms   # Rowset Reader 获取 Block 的时间。
+   - BlockLoadTime: 23.368ms    # SegmentReader(V1) 或 SegmentIterator(V2) 获取 block 的时间。
+   - BlockSeekCount: 0  # 读取 Segment 时进行 block seek 的次数。
+   - BlockSeekTime: 3.062ms # 读取 Segment 时进行 block seek 的耗时。
+   - BlocksLoad: 221    # 读取 Block 的数量
+   - BytesRead: 6.59 MB # 从数据文件中读取到的数据量。假设读取到了是10个32位整型,则数据量为 10 * 4B = 40 Bytes。这个数据仅表示数据在内存中全展开的大小,并不代表实际的 IO 大小。
+   - CachedPagesNum: 0  # 仅 V2 中,当开启 PageCache 后,命中 Cache 的 Page 数量。
+   - CompressedBytesRead: 1.36 MB   # V1 中,从文件中读取的解压前的数据大小。V2 中,读取到的没有命中 PageCache 的 Page 的压缩前的大小。
+   - DecompressorTimer: 4.194ms # 数据解压耗时。
+   - IOTimer: 1.404ms   # 实际从操作系统读取数据的 IO 时间。
+   - IndexLoadTime: 1.521ms # 仅 V1 中,读取 Index Stream 的耗时。
+   - NumDiskAccess: 6   # 该 ScanNode 节点涉及到的磁盘数量。
+   - NumScanners: 25    # 该 ScanNode 生成的 Scanner 数量。
+   - PeakMemoryUsage: 0     # 无意义
+   - PerReadThreadRawHdfsThroughput: 0.00 /sec  # 无意义
+   - RawRowsRead: 141.71K   # 存储引擎中读取的原始行数。详情见下文。
+   - ReaderInitTime: 16.515ms   # OlapScanner 初始化 Reader 的时间。V1 中包括组建 MergeHeap 的时间。V2 中包括生成各级 Iterator 并读取第一组Block的时间。
+   - RowsBitmapIndexFiltered: 0  # 利用 bitmap 索引过滤掉的行数。
+   - RowsBloomFilterFiltered: 0 # 仅 V2 中,通过 BloomFilter 索引过滤掉的行数。
+   - RowsDelFiltered: 0     # V1 中表示根据 delete 条件过滤掉的行数。V2 中还包括通过 BloomFilter 和部分谓词条件过滤掉的行数。
+   - RowsPushedCondFiltered: 0  # 根据传递下推的谓词过滤掉的条件,比如 Join 计算中从 BuildTable 传递给 ProbeTable 的条件。该数值不准确,因为如果过滤效果差,就不再过滤了。
+   - RowsRead: 132.78K  # 从存储引擎返回到 Scanner 的行数,不包括经 Scanner 过滤的行数。
+   - RowsReturned: 132.78K  # 从 ScanNode 返回给上层节点的行数。
+   - RowsReturnedRate: 32.78 M/sec  # RowsReturned/ActiveTime
+   - RowsStatsFiltered: 0   # V2 中,包含谓词条件根据 Zonemap 过滤掉的行数。V1 中还包含通过 BloomFilter 过滤掉的行数。
+   - RowsVectorPredFiltered: 0  # 通过向量化条件过滤操作过滤掉的行数。
+   - ScanTime: 49.239ms  # Scanner 调用 get_next() 方法的耗时统计。
+   - ScannerThreadsInvoluntaryContextSwitches: 0    # 无意义
+   - ScannerThreadsTotalWallClockTime: 0.000ns  # 无意义
+     - MaterializeTupleTime(*): 0.000ns # 无意义
+     - ScannerThreadsSysTime: 0.000ns   # 无意义
+     - ScannerThreadsUserTime: 0.000ns  # 无意义
+   - ScannerThreadsVoluntaryContextSwitches: 0  # 无意义
+   - ShowHintsTime: 0.000ns # V2 中无意义。V1 中读取部分数据来进行 ScanRange 的切分。
+   - TabletCount : 25   # 该 ScanNode 涉及的 Tablet 数量。
+   - TotalPagesNum: 0   # 仅 V2 中,读取的总 Page 数量。
+   - TotalRawReadTime(*): 0.000ns   # 无意义
+   - TotalReadThroughput: 0.00 /sec # 无意义
+   - UncompressedBytesRead: 4.28 MB # V1 中为读取的数据文件解压后的大小(如果文件无需解压,则直接统计文件大小)。V2 中,仅统计未命中 PageCache 的 Page 解压后的大小(如果Page无需解压,直接统计Page大小)
+   - VectorPredEvalTime: 0.000ns    # 向量化条件过滤操作的耗时。
+```
+
+* Profile 中关于行数的一些说明
+
+    在 Profile 中和行数相关的指标有:
+    
+    * RowsKeyRangeFiltered
+    * RowsBitmapIndexFiltered
+    * RowsBloomFilterFiltered
+    * RowsStatsFiltered
+    * RowsDelFiltered
+    * RawRowsRead
+    * RowsRead
+    * RowsReturned
+
+    一个查询中的谓词条件会分别在存储引擎和 Scanner 中进行过滤。以上指标中,`Rows***Filtered` 这组指标描述了在存储引擎中被过滤的行数。后三个指标描述了在 Scanner 中被处理的行数。
+
+    以下仅针对 Segment V2 格式数据读取的流程进行说明。Segment V1 格式中,这些指标的含义略有不同。
+
+    当读取一个 V2 格式的 Segment 时,首先会根据 Key range(前缀key组成的查询范围)进行一次过滤,过滤掉的行数记录在 `RowsKeyRangeFiltered` 中。之后,再利用 Bitmap 索引过滤数据,过滤掉的行数记录在 `RowsBitmapIndexFiltered` 中。之后,再利用 BloomFilter 索引过滤数据,记录在 `RowsBloomFilterFiltered` 中。`RowsBloomFilterFiltered` 的值是 Segment 的总行数(而不是Bitmap索引过滤后的行数)和经过 BloomFilter 过滤后剩余行数的差值,因此 BloomFilter 过滤掉的数据可能会和 Bitmap 过滤掉的数据有重叠。
+
+    `RowsStatsFiltered` 中记录的是经过其他谓词条件过滤掉的行数,这里包括下推到存储引擎的谓词条件,以及存储引擎中的 Delete 条件。
+    
+    `RowsDelFiltered` 中包含了 `RowsBloomFilterFiltered` 和 `RowsStatsFiltered` 记录的过滤行数。
+    
+    `RawRowsRead` 是经过上述过滤后,最终需要读取的行数。而 `RowsRead` 是最终返回给 Scanner 的行数。`RowsRead` 通常小于 `RawRowsRead`,是因为从存储引擎返回到 Scanner,可能会经过一次数据聚合。
+    
+    `RowsReturned` 是 ScanNode 最终返回给上层节点的行数。`RowsReturned` 通常也会小于 
+`RowsRead`。因为在 Scanner 上会有一些没有下推给存储引擎的谓词条件,会进行一次过滤。
+
+    通过以上指标,可以大致分析出存储引擎处理的行数以及最终过滤后的结果行数大小。通过 `Rows***Filtered` 这组指标,也可以分析查询条件是否下推到了存储引擎,以及不同索引的过滤效果。
+    
+    如果 `RawRowsRead` 和 `RowsRead` 差距较大,则说明大量的行被聚合,而聚合可能比较耗时。如果 `RowsRead` 和 `RowsReturned` 差距较大,则说明很多行在 Scanner 中进行了过滤。这说明很多选择度高的谓词条件并没有推送给存储引擎。而在 Scanner 中的过滤效率会比在存储引擎中过滤效率差。
+    
+* Scan Node Profile 的简单分析
+
+    OlapScanNode 的 Profile 通常用于分析数据扫描的效率。除了前面介绍的通过行数相关信息可以推断谓词条件下推和索引使用情况外,还可以通过以下几个方面进行简单的分析。
+    
+    * 首先,很多指标,如 `IOTimer`,`BlockFetchTime` 等都是所有 Scanner 线程指标的累加,因此数值可能会比较大。并且因为 Scanner 线程是异步读取数据的,所以这些累加指标只能反映 Scanner 累加的工作时间,并不直接代表 ScanNode 的耗时。ScanNode 在整个查询计划中的耗时占比为 `Active` 字段记录的值。有时会出现比如 `IOTimer` 有几十秒,而 `Active` 实际只有几秒钟。这种情况通常因为:1. `IOTimer` 为多个 Scanner 的累加时间,而 Scanner 数量较多。2. 上层节点比较耗时。比如上层节点耗时 100秒,而底层 ScanNode 只需 10秒。则反映在 `Active` 的字段可能只有几毫秒。因为在上层处理数据的同时,ScanNode 已经异步的进行了数据扫描并准备好了数据。当上层节点从 ScanNode 获取数据时,可以获取到已经准备好的数据,因此 Active 时间很短。
+    * IOTimer 是 IO 时间,能够直接反映 IO 操作耗时。这里是所有 Scanner 线程累加的 IO 时间。
+    * NumScanners 表示 Scanner 线程数。线程数过多或过少都会影响查询效率。同时可以用一些汇总指标除以线程数来大致的估算每个线程的耗时。
+    * TabletCount 表示需要扫描的 tablet 数量。数量过多可能意味着需要大量的随机读取和数据合并操作。
+    * UncompressedBytesRead 间接反映了读取的数据量。如果该数值较大,说明可能有大量的 IO 操作。
+    * CachedPagesNum 和 TotalPagesNum。对于 V2 格式,可以查看命中 PageCache 的情况。命中率越高,说明 IO 和解压操作耗时越少。
+
+#### `Buffer pool`
  - AllocTime: 内存分配耗时
  - CumulativeAllocationBytes: 累计内存分配的量
  - CumulativeAllocations: 累计的内存分配次数
@@ -149,3 +238,4 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义:
  - PeakUnpinnedBytes: unpin的内存数据量
  - PeakUsedReservation: Reservation的内存使用量
  - ReservationLimit: BufferPool的Reservation的限制量
+
diff --git a/fe/src/main/java/org/apache/doris/common/util/RuntimeProfile.java b/fe/src/main/java/org/apache/doris/common/util/RuntimeProfile.java
index 6a86b87..a6ee648 100644
--- a/fe/src/main/java/org/apache/doris/common/util/RuntimeProfile.java
+++ b/fe/src/main/java/org/apache/doris/common/util/RuntimeProfile.java
@@ -34,10 +34,10 @@ import org.apache.logging.log4j.Logger;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Formatter;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 /**
  * It is accessed by two kinds of thread, one is to create this RuntimeProfile
@@ -57,7 +57,7 @@ public class RuntimeProfile {
     private Map<String, Counter> counterMap = Maps.newConcurrentMap();
     private Map<String, RuntimeProfile> childMap = Maps.newConcurrentMap();
 
-    private Map<String, Set<String> > childCounterMap = Maps.newHashMap();
+    private Map<String, TreeSet<String>> childCounterMap = Maps.newHashMap();
     private List<Pair<RuntimeProfile, Boolean>> childList = Lists.newArrayList();
 
     private String name;
@@ -101,7 +101,7 @@ public class RuntimeProfile {
             
             Set<String> childCounters = childCounterMap.get(parentCounterName);
             if (childCounters == null) {
-                childCounterMap.put(parentCounterName, new HashSet<String>());
+                childCounterMap.put(parentCounterName, new TreeSet<String>());
                 childCounters = childCounterMap.get(parentCounterName);
             }
             childCounters.add(name);
@@ -142,7 +142,7 @@ public class RuntimeProfile {
                     String parentCounterName = entry.getKey();                
                     Set<String> childCounters = childCounterMap.get(parentCounterName);
                     if (childCounters == null) {
-                        childCounterMap.put(parentCounterName, new HashSet<String>());
+                        childCounterMap.put(parentCounterName, new TreeSet<String>());
                         childCounters = childCounterMap.get(parentCounterName);
                     } 
                     childCounters.addAll(entry.getValue());
@@ -193,11 +193,12 @@ public class RuntimeProfile {
         builder.append(prefix).append(name).append(":");
         // total time
         if (counter.getValue() != 0) {
-            Formatter fmt = new Formatter();
-            builder.append("(Active: ")
-                    .append(this.printCounter(counter.getValue(), counter.getType()))
-                    .append(", % non-child: ").append(fmt.format("%.2f", localTimePercent))
-                    .append("%)");
+            try (Formatter fmt = new Formatter()) {
+                builder.append("(Active: ")
+                .append(this.printCounter(counter.getValue(), counter.getType()))
+                .append(", % non-child: ").append(fmt.format("%.2f", localTimePercent))
+                .append("%)");
+            }
         }
         builder.append("\n");
         
@@ -385,3 +386,4 @@ public class RuntimeProfile {
         return infoStrings.get(key);
     }
 }
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org