You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/09 13:54:58 UTC

[doris] branch master updated: [enhancement](index) Nereids support no need to read raw data for index column that only in filter conditions (#20605)

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 656b9ad3da [enhancement](index) Nereids support no need to read raw data for index column that only in filter conditions (#20605)
656b9ad3da is described below

commit 656b9ad3dacf3549836fdb494df4e11d0c58ffba
Author: YueW <45...@users.noreply.github.com>
AuthorDate: Fri Jun 9 21:54:48 2023 +0800

    [enhancement](index) Nereids support no need to read raw data for index column that only in filter conditions (#20605)
---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  19 ++-
 be/src/olap/rowset/segment_v2/segment_iterator.h   |   1 +
 be/src/vec/exec/scan/new_olap_scan_node.cpp        |   3 -
 .../org/apache/doris/planner/OriginalPlanner.java  |  44 +------
 .../test_index_no_need_read_data.out               | 129 +++++++++++++++++++++
 .../test_index_no_need_read_data.groovy            |  88 ++++++++++++++
 6 files changed, 241 insertions(+), 43 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 92717945b1..53d89e6395 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -250,6 +250,10 @@ Status SegmentIterator::init(const StorageReadOptions& opts) {
     if (_char_type_idx.empty() && _char_type_idx_no_0.empty()) {
         _vec_init_char_column_id();
     }
+
+    if (opts.output_columns != nullptr) {
+        _output_columns = *(opts.output_columns);
+    }
     return Status::OK();
 }
 
@@ -917,7 +921,20 @@ Status SegmentIterator::_apply_inverted_index_on_block_column_predicate(
 }
 
 bool SegmentIterator::_need_read_data(ColumnId cid) {
-    // TODO(xk) impl right logic
+    if (_output_columns.count(-1)) {
+        // If _output_columns contains -1, light weight schema change may
+        // not be enabled, or the column unique_id was not set for some other
+        // reason. To prevent errors, return true here so that the column
+        // data is still read.
+        return true;
+    }
+    int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
+    if (_need_read_data_indices.count(unique_id) > 0 && !_need_read_data_indices[unique_id] &&
+        _output_columns.count(unique_id) < 1) {
+        VLOG_DEBUG << "SegmentIterator no need read data for column: "
+                   << _opts.tablet_schema->column_by_uid(unique_id).name();
+        return false;
+    }
     return true;
 }
 
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 899ee46e3a..93b8b398e7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -431,6 +431,7 @@ private:
     std::vector<ColumnPredicate*> _filter_info_id;
     bool _record_rowids = false;
     int32_t _tablet_id = 0;
+    std::set<int32_t> _output_columns;
 };
 
 } // namespace segment_v2
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index ac916bfcd1..740a57e793 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -440,9 +440,6 @@ Status NewOlapScanNode::_init_scanners(std::list<VScannerSPtr>* scanners) {
 
     if (!_olap_scan_node.output_column_unique_ids.empty()) {
         for (auto uid : _olap_scan_node.output_column_unique_ids) {
-            if (uid < 0) {
-                continue;
-            }
             _maybe_read_column_ids.emplace(uid);
         }
     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java
index 024ea0647b..1fbd140d45 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java
@@ -572,50 +572,16 @@ public class OriginalPlanner extends Planner {
      * column unique id for `A` and `B` will put into outputColumnUniqueIds.
      *
     */
+    // this opt will only work with nereidsPlanner
     private void pushOutColumnUniqueIdsToOlapScan(PlanFragment rootFragment, Analyzer analyzer) {
         Set<Integer> outputColumnUniqueIds = new HashSet<>();
-        ArrayList<Expr> outputExprs = rootFragment.getOutputExprs();
-        for (Expr expr : outputExprs) {
-            if (expr instanceof SlotRef) {
-                if (((SlotRef) expr).getColumn() != null) {
-                    outputColumnUniqueIds.add(((SlotRef) expr).getColumn().getUniqueId());
-                }
-            }
-        }
+        // Add '-1' so that this optimization is never applied to plans from OriginalPlanner:
+        // the storage layer skips the optimization when outputColumnUniqueIds contains '-1',
+        // which ensures that it only takes effect with NereidsPlanner.
+        outputColumnUniqueIds.add(-1);
 
         for (PlanFragment fragment : fragments) {
             PlanNode node = fragment.getPlanRoot();
-            PlanNode parent = null;
-            while (node.getChildren().size() != 0) {
-                for (PlanNode childNode : node.getChildren()) {
-                    List<SlotId> outputSlotIds = childNode.getOutputSlotIds();
-                    if (outputSlotIds != null) {
-                        for (SlotId sid : outputSlotIds) {
-                            SlotDescriptor slotDesc = analyzer.getSlotDesc(sid);
-                            outputColumnUniqueIds.add(slotDesc.getUniqueId());
-                        }
-                    }
-                }
-                // OlapScanNode is the last node.
-                // So, just get the two node and check if they are SortNode and OlapScan.
-                parent = node;
-                node = node.getChildren().get(0);
-            }
-
-            if (parent instanceof SortNode) {
-                SortNode sortNode = (SortNode) parent;
-                List<Expr> orderingExprs = sortNode.getSortInfo().getOrigOrderingExprs();
-                if (orderingExprs != null) {
-                    for (Expr expr : orderingExprs) {
-                        if (expr instanceof SlotRef) {
-                            if (((SlotRef) expr).getColumn() != null) {
-                                outputColumnUniqueIds.add(((SlotRef) expr).getColumn().getUniqueId());
-                            }
-                        }
-                    }
-                }
-            }
-
             if (!(node instanceof OlapScanNode)) {
                 continue;
             }
diff --git a/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out b/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out
new file mode 100644
index 0000000000..01a08f324d
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out
@@ -0,0 +1,129 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_nereids_0 --
+1	\N	addr qie3	yy	lj	100
+2	\N	hehe	\N	lala	200
+3	beijing	addr xuanwu	wugui	\N	300
+4	beijing	addr fengtai	fengtai1	fengtai2	\N
+5	beijing	addr chaoyang	wangjing	donghuqu	500
+6	shanghai	hehe	\N	haha	\N
+7	tengxun	qie	addr gg	lj	\N
+8	tengxun2	qie	\N	lj	800
+
+-- !select_nereids_1 --
+4
+
+-- !select_nereids_2 --
+3
+
+-- !select_nereids_3 --
+3
+
+-- !select_nereids_4 --
+3	beijing	addr xuanwu	wugui	\N	300
+4	beijing	addr fengtai	fengtai1	fengtai2	\N
+5	beijing	addr chaoyang	wangjing	donghuqu	500
+
+-- !select_nereids_5 --
+beijing	addr xuanwu	wugui
+beijing	addr fengtai	fengtai1
+beijing	addr chaoyang	wangjing
+
+-- !select_nereids_6 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_nereids_7 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_nereids_8 --
+SHANGHAI	\N
+TENGXUN	addr gg
+TENGXUN2	\N
+
+-- !select_nereids_9 --
+4	\N
+3	addr gg
+3	\N
+
+-- !select_nereids_10 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_nereids_11 --
+hehe	\N	SHANGHAI
+qie	addr gg	TENGXUN
+qie	\N	TENGXUN2
+
+-- !select_nereids_12 --
+300
+\N
+500
+
+-- !select_0 --
+1	\N	addr qie3	yy	lj	100
+2	\N	hehe	\N	lala	200
+3	beijing	addr xuanwu	wugui	\N	300
+4	beijing	addr fengtai	fengtai1	fengtai2	\N
+5	beijing	addr chaoyang	wangjing	donghuqu	500
+6	shanghai	hehe	\N	haha	\N
+7	tengxun	qie	addr gg	lj	\N
+8	tengxun2	qie	\N	lj	800
+
+-- !select_1 --
+4
+
+-- !select_2 --
+3
+
+-- !select_3 --
+3
+
+-- !select_4 --
+3	beijing	addr xuanwu	wugui	\N	300
+4	beijing	addr fengtai	fengtai1	fengtai2	\N
+5	beijing	addr chaoyang	wangjing	donghuqu	500
+
+-- !select_5 --
+beijing	addr xuanwu	wugui
+beijing	addr fengtai	fengtai1
+beijing	addr chaoyang	wangjing
+
+-- !select_6 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_7 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_8 --
+SHANGHAI	\N
+TENGXUN	addr gg
+TENGXUN2	\N
+
+-- !select_9 --
+4	\N
+3	addr gg
+3	\N
+
+-- !select_10 --
+hehe	\N
+qie	addr gg
+qie	\N
+
+-- !select_11 --
+hehe	\N	SHANGHAI
+qie	addr gg	TENGXUN
+qie	\N	TENGXUN2
+
+-- !select_12 --
+300
+\N
+500
+
diff --git a/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy b/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy
new file mode 100644
index 0000000000..e5bc37e2fd
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_index_no_need_read_data", "inverted_index_select"){
+    def table1 = "test_index_no_need_read_data"
+
+    sql "drop table if exists ${table1}"
+
+    sql """
+       CREATE TABLE IF NOT EXISTS `${table1}` (
+      `id` int NULL COMMENT "",
+      `city` varchar(20) NULL COMMENT "",
+      `addr` varchar(20) NULL COMMENT "",
+      `name` varchar(20) NULL COMMENT "",
+      `compy` varchar(20) NULL COMMENT "",
+      `n` int NULL COMMENT "",
+      INDEX idx_city(city) USING INVERTED,
+      INDEX idx_addr(addr) USING INVERTED PROPERTIES("parser"="english"),
+      INDEX idx_n(n) USING INVERTED
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    COMMENT "OLAP"
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES (
+    "replication_allocation" = "tag.location.default: 1",
+    "in_memory" = "false",
+    "storage_format" = "V2"
+    )
+    """
+
+    sql """insert into ${table1} values
+            (1,null,'addr qie3','yy','lj',100),
+            (2,null,'hehe',null,'lala',200),
+            (3,'beijing','addr xuanwu','wugui',null,300),
+            (4,'beijing','addr fengtai','fengtai1','fengtai2',null),
+            (5,'beijing','addr chaoyang','wangjing','donghuqu',500),
+            (6,'shanghai','hehe',null,'haha',null),
+            (7,'tengxun','qie','addr gg','lj',null),
+            (8,'tengxun2','qie',null,'lj',800)
+    """
+
+    // case1: enable nereids planner
+    sql "set enable_nereids_planner = true"
+
+    qt_select_nereids_0 "SELECT * FROM ${table1} ORDER BY id"
+    qt_select_nereids_1 "SELECT count() FROM ${table1} WHERE n > 100"
+    qt_select_nereids_2 "SELECT count() FROM ${table1} WHERE city = 'beijing'"
+    qt_select_nereids_3 "SELECT count(*) FROM ${table1} WHERE city = 'beijing'"
+    qt_select_nereids_4 "SELECT * FROM ${table1} WHERE city = 'beijing' ORDER BY id"
+    qt_select_nereids_5 "SELECT city, addr, name FROM ${table1} WHERE city = 'beijing' ORDER BY id"
+    qt_select_nereids_6 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY city"
+    qt_select_nereids_7 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY id"
+    qt_select_nereids_8 "SELECT upper(city), name FROM ${table1} WHERE city != 'beijing' ORDER BY id"
+    qt_select_nereids_9 "SELECT length(addr), name FROM ${table1} WHERE city != 'beijing' ORDER BY id"
+    qt_select_nereids_10 "SELECT addr, name FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t"
+    qt_select_nereids_11 "SELECT addr, name, upper(city) FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t"
+    qt_select_nereids_12 "SELECT sum(n) FROM ${table1} WHERE city = 'beijing' group by id ORDER BY id"
+
+    // case2: disable nereids planner
+    sql "set enable_nereids_planner = false"
+    
+    qt_select_0 "SELECT * FROM ${table1} ORDER BY id"
+    qt_select_1 "SELECT count() FROM ${table1} WHERE n > 100"
+    qt_select_2 "SELECT count() FROM ${table1} WHERE city = 'beijing'"
+    qt_select_3 "SELECT count(*) FROM ${table1} WHERE city = 'beijing'"
+    qt_select_4 "SELECT * FROM ${table1} WHERE city = 'beijing' ORDER BY id"
+    qt_select_5 "SELECT city, addr, name FROM ${table1} WHERE city = 'beijing' ORDER BY id"
+    qt_select_6 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY city"
+    qt_select_7 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY id"
+    qt_select_8 "SELECT upper(city), name FROM ${table1} WHERE city != 'beijing' ORDER BY id"
+    qt_select_9 "SELECT length(addr), name FROM ${table1} WHERE city != 'beijing' ORDER BY id"
+    qt_select_10 "SELECT addr, name FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t"
+    qt_select_11 "SELECT addr, name, upper(city) FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t"
+    qt_select_12 "SELECT sum(n) FROM ${table1} WHERE city = 'beijing' group by id ORDER BY id"
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org