You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/12/19 07:29:13 UTC

[GitHub] [doris] xiaokang commented on a diff in pull request #14211: [feature-wip](inverted index) API for inverted index reader and syntax for fulltext match

xiaokang commented on code in PR #14211:
URL: https://github.com/apache/doris/pull/14211#discussion_r1051848978


##########
be/src/exprs/expr.cpp:
##########
@@ -383,6 +384,22 @@ Status Expr::create_expr(ObjectPool* pool, const TExprNode& texpr_node, Expr** e
         return Status::OK();
     }
 
+    case TExprNodeType::MATCH_PRED: {
+        DCHECK(texpr_node.__isset.fn);
+        if (texpr_node.fn.name.function_name == "match_any" ||

Review Comment:
   Define constant strings for these function names instead of repeating the literals.



##########
fe/fe-core/src/main/cup/sql_parser.cup:
##########
@@ -6124,6 +6136,27 @@ like_predicate ::=
     new LikePredicate(LikePredicate.Operator.REGEXP, e1, e2), null); :}
   ;
 
+match_predicate ::=
+  expr:e1 KW_MATCH_ANY expr:e2

Review Comment:
   Can we limit e1 to a column and e2 to a constant?



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.catalog.ArrayType;
+import org.apache.doris.catalog.Function;
+import org.apache.doris.catalog.FunctionSet;
+import org.apache.doris.catalog.ScalarFunction;
+import org.apache.doris.catalog.Type;
+import org.apache.doris.common.AnalysisException;
+import org.apache.doris.thrift.TExprNode;
+import org.apache.doris.thrift.TExprNodeType;
+import org.apache.doris.thrift.TExprOpcode;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.util.Objects;
+
+/**
+ * filed MATCH query_str
+ */
+public class MatchPredicate extends Predicate {
+    private static final Logger LOG = LogManager.getLogger(MatchPredicate.class);
+
+    public enum Operator {
+        MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY),
+        MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
+        MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
+        MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ),
+        MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT),
+        MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT),
+        MATCH_ELEMENT_LE("MATCH_ELEMENT_LE", "match_element_le", TExprOpcode.MATCH_ELEMENT_LE),
+        MATCH_ELEMENT_GE("MATCH_ELEMENT_GE", "match_element_ge", TExprOpcode.MATCH_ELEMENT_GE);
+
+
+        private final String description;
+        private final String name;
+        private final TExprOpcode opcode;
+
+        Operator(String description,
+                 String name,
+                 TExprOpcode opcode) {
+            this.description = description;
+            this.name = name;
+            this.opcode = opcode;
+        }
+
+        @Override
+        public String toString() {
+            return description;
+        }
+
+        public String getName() {
+            return name;
+        }
+
+        public TExprOpcode getOpcode() {
+            return opcode;
+        }
+    }
+
+    public static void initBuiltins(FunctionSet functionSet) {
+        String symbolNotUsed = "symbol_not_used";
+
+        for (Type t : Type.getNumericDateTimeTypes()) {
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_EQ.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_LT.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_GT.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_LE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_GE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+        }
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ANY.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ANY.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ALL.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ALL.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_PHRASE.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_PHRASE.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+    }
+
+    private final Operator op;
+
+    public MatchPredicate(Operator op, Expr e1, Expr e2) {
+        super();
+        this.op = op;
+        Preconditions.checkNotNull(e1);
+        children.add(e1);
+        Preconditions.checkNotNull(e2);
+        children.add(e2);
+        // TODO: Calculate selectivity
+        selectivity = Expr.DEFAULT_SELECTIVITY;
+    }
+
+    public Boolean isMatchElement(Operator op) {
+        return Objects.equals(op.getName(), "match_element_eq") || Objects.equals(op.getName(), "match_element_lt")

Review Comment:
   Use the constants Operator.xxx.getName() instead of string literals, or define an equals method in the Operator class.



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.catalog.ArrayType;
+import org.apache.doris.catalog.Function;
+import org.apache.doris.catalog.FunctionSet;
+import org.apache.doris.catalog.ScalarFunction;
+import org.apache.doris.catalog.Type;
+import org.apache.doris.common.AnalysisException;
+import org.apache.doris.thrift.TExprNode;
+import org.apache.doris.thrift.TExprNodeType;
+import org.apache.doris.thrift.TExprOpcode;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.util.Objects;
+
+/**
+ * filed MATCH query_str
+ */
+public class MatchPredicate extends Predicate {
+    private static final Logger LOG = LogManager.getLogger(MatchPredicate.class);
+
+    public enum Operator {
+        MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY),
+        MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
+        MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
+        MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ),
+        MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT),
+        MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT),
+        MATCH_ELEMENT_LE("MATCH_ELEMENT_LE", "match_element_le", TExprOpcode.MATCH_ELEMENT_LE),
+        MATCH_ELEMENT_GE("MATCH_ELEMENT_GE", "match_element_ge", TExprOpcode.MATCH_ELEMENT_GE);
+
+
+        private final String description;
+        private final String name;
+        private final TExprOpcode opcode;
+
+        Operator(String description,
+                 String name,
+                 TExprOpcode opcode) {
+            this.description = description;
+            this.name = name;
+            this.opcode = opcode;
+        }
+
+        @Override
+        public String toString() {
+            return description;
+        }
+
+        public String getName() {
+            return name;
+        }
+
+        public TExprOpcode getOpcode() {
+            return opcode;
+        }
+    }
+
+    public static void initBuiltins(FunctionSet functionSet) {
+        String symbolNotUsed = "symbol_not_used";
+
+        for (Type t : Type.getNumericDateTimeTypes()) {
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_EQ.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_LT.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_GT.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_LE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+            functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_ELEMENT_GE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
+        }
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ANY.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ANY.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ALL.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_ALL.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_PHRASE.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(Type.VARCHAR, Type.VARCHAR),
+                Type.BOOLEAN));
+        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                Operator.MATCH_PHRASE.getName(),
+                symbolNotUsed,
+                Lists.<Type>newArrayList(new ArrayType(Type.VARCHAR), Type.VARCHAR),
+                Type.BOOLEAN));
+    }
+
+    private final Operator op;
+
+    public MatchPredicate(Operator op, Expr e1, Expr e2) {

Review Comment:
   Can we limit the types of e1 and e2?



##########
be/src/vec/exec/scan/new_olap_scan_node.cpp:
##########
@@ -201,7 +201,6 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
     for (auto& iter : _colname_to_value_range) {
         std::vector<TCondition> filters;
         std::visit([&](auto&& range) { range.to_olap_filter(filters); }, iter.second);
-

Review Comment:
   just delete blank line?



##########
be/src/vec/exprs/vexpr.h:
##########
@@ -100,6 +100,8 @@ class VExpr {
 
     TExprNodeType::type node_type() const { return _node_type; }
 
+    TExprOpcode::type op() const { return _opcode; }

Review Comment:
   opcode()?



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -373,6 +382,55 @@ Status SegmentIterator::_apply_bitmap_index() {
     return Status::OK();
 }
 
+Status SegmentIterator::_apply_inverted_index() {
+    std::vector<ColumnPredicate*> remaining_predicates;
+
+    for (auto pred : _col_predicates) {
+        int32_t unique_id = _schema.unique_id(pred->column_id());
+        if (_inverted_index_iterators.count(unique_id) < 1 ||
+            _inverted_index_iterators[unique_id] == nullptr) {
+            // 1. this column no inverted index
+            remaining_predicates.push_back(pred);
+        } else {
+            roaring::Roaring bitmap = _row_bitmap;
+            Status res = pred->evaluate(_schema, _inverted_index_iterators[unique_id], num_rows(), &bitmap);
+            if (!res.ok()) {
+                LOG(WARNING) << "failed to evaluate index"
+                             << ", column predicate type: " << pred->pred_type_string(pred->type())
+                             << ", error msg: " << res.get_error_msg();
+                return res;
+            }
+
+            std::string pred_sign = _gen_predicate_sign(pred);
+            auto pred_type = pred->type();
+            if (pred_type == PredicateType::MATCH) {
+                _rowid_result_for_index.emplace(
+                        std::make_pair(pred_sign, std::make_pair(false, bitmap)));
+            }
+
+            _row_bitmap &= bitmap;
+            if (_row_bitmap.isEmpty()) {
+                break; // all rows have been pruned, no need to process further predicates
+            }
+        }
+    }
+    _col_predicates = std::move(remaining_predicates);
+    return Status::OK();
+}
+
+std::string SegmentIterator::_gen_predicate_sign(ColumnPredicate* predicate) {

Review Comment:
   Consider a more concrete name, such as predicate_result_column_name.



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -1209,6 +1291,53 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
     return Status::OK();
 }
 
+void SegmentIterator::_output_index_return_column(uint16_t* sel_rowid_idx, uint16_t select_size, vectorized::Block* block) {
+    if (block->rows() == 0) {
+        return;
+    }
+
+    for (auto column_sign : _rowid_result_for_index) {
+        block->insert({vectorized::ColumnUInt8::create(),
+                       std::make_shared<vectorized::DataTypeUInt8>(), column_sign.first});
+        if (!column_sign.second.first) {
+            // predicate not in compound query
+            block->get_by_name(column_sign.first).column =
+                    vectorized::DataTypeUInt8().create_column_const(block->rows(), 1u);
+            continue;
+        }
+        _build_index_return_column(sel_rowid_idx, select_size, block, column_sign.first, column_sign.second.second);
+    }
+}
+
+void SegmentIterator::_build_index_return_column(uint16_t* sel_rowid_idx, uint16_t select_size,

Review Comment:
   Add a comment explaining the logic for the row range and id split.



##########
be/src/exprs/match_predicate.cpp:
##########
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exprs/match_predicate.h"
+
+#include <string.h>
+
+#include <memory>
+#include <sstream>
+
+#include "exec/olap_utils.h"
+#include "exprs/string_functions.h"
+#include "olap/schema.h"
+#include "runtime/string_value.hpp"
+
+namespace doris {
+
+MatchPredicate::MatchPredicate(uint32_t column_id, const std::string& value, MatchType match_type)
+        : ColumnPredicate(column_id), _value(value), _match_type(match_type) {}
+
+PredicateType MatchPredicate::type() const {
+    return PredicateType::MATCH;
+}
+
+Status MatchPredicate::evaluate(const Schema& schema, InvertedIndexIterator* iterator,
+                            uint32_t num_rows, roaring::Roaring* bitmap) const {
+    if (iterator == nullptr) {
+        return Status::OK();
+    }
+    auto column_desc = schema.column(_column_id);
+    roaring::Roaring roaring;
+    Status s = Status::OK();
+    auto inverted_index_query_type = _to_inverted_index_query_type(_match_type);
+
+    if (is_string_type(column_desc->type()) ||
+        (column_desc->type() == OLAP_FIELD_TYPE_ARRAY &&
+         is_string_type(column_desc->get_sub_field(0)->type_info()->type()))) {
+        StringValue match_value;
+        int32_t length = _value.length();
+        char* buffer = const_cast<char*>(_value.c_str());
+        match_value.replace(buffer, length);
+        s = iterator->read_from_inverted_index(column_desc->name(), &match_value,
+                                               inverted_index_query_type, num_rows, &roaring);
+    } else if (column_desc->type() == OLAP_FIELD_TYPE_ARRAY &&
+               is_numeric_type(column_desc->get_sub_field(0)->type_info()->type())) {
+        char buf[column_desc->get_sub_field(0)->type_info()->size()];
+        column_desc->get_sub_field(0)->from_string(buf, _value);
+        s = iterator->read_from_inverted_index(column_desc->name(), buf, inverted_index_query_type,
+                                               num_rows, &roaring);
+    }
+    *bitmap &= roaring;
+    return s;
+}
+
+InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType match_type) const {
+    auto ret = InvertedIndexQueryType::UNKNOWN_QUERY;

Review Comment:
   why not just use MatchType enum?



##########
be/src/exprs/expr.cpp:
##########
@@ -383,6 +384,22 @@ Status Expr::create_expr(ObjectPool* pool, const TExprNode& texpr_node, Expr** e
         return Status::OK();
     }
 
+    case TExprNodeType::MATCH_PRED: {
+        DCHECK(texpr_node.__isset.fn);
+        if (texpr_node.fn.name.function_name == "match_any" ||
+            texpr_node.fn.name.function_name == "match_all" ||
+            texpr_node.fn.name.function_name == "match_phrase" ||
+            texpr_node.fn.name.function_name == "match_element_eq" ||
+            texpr_node.fn.name.function_name == "match_element_lt" ||
+            texpr_node.fn.name.function_name == "match_element_gt" ||
+            texpr_node.fn.name.function_name == "match_element_le" ||
+            texpr_node.fn.name.function_name == "match_element_ge") {
+            //*expr = pool->add(new ScalarFnCall(texpr_node));

Review Comment:
   Remove the commented-out code.



##########
be/src/exec/olap_common.h:
##########
@@ -94,8 +94,12 @@ class ColumnValueRange {
 
     Status add_range(SQLFilterOp op, CppType value);
 
+    Status add_match_value(MatchType match_type, const CppType& value);

Review Comment:
   Is function filter pushdown more reasonable for MATCH operator?



##########
be/src/exprs/match_predicate.cpp:
##########
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exprs/match_predicate.h"
+
+#include <string.h>
+
+#include <memory>
+#include <sstream>
+
+#include "exec/olap_utils.h"
+#include "exprs/string_functions.h"
+#include "olap/schema.h"
+#include "runtime/string_value.hpp"
+
+namespace doris {
+
+MatchPredicate::MatchPredicate(uint32_t column_id, const std::string& value, MatchType match_type)
+        : ColumnPredicate(column_id), _value(value), _match_type(match_type) {}
+
+PredicateType MatchPredicate::type() const {
+    return PredicateType::MATCH;
+}
+
+Status MatchPredicate::evaluate(const Schema& schema, InvertedIndexIterator* iterator,
+                            uint32_t num_rows, roaring::Roaring* bitmap) const {
+    if (iterator == nullptr) {
+        return Status::OK();
+    }
+    auto column_desc = schema.column(_column_id);
+    roaring::Roaring roaring;
+    Status s = Status::OK();
+    auto inverted_index_query_type = _to_inverted_index_query_type(_match_type);
+
+    if (is_string_type(column_desc->type()) ||
+        (column_desc->type() == OLAP_FIELD_TYPE_ARRAY &&
+         is_string_type(column_desc->get_sub_field(0)->type_info()->type()))) {
+        StringValue match_value;
+        int32_t length = _value.length();
+        char* buffer = const_cast<char*>(_value.c_str());
+        match_value.replace(buffer, length);
+        s = iterator->read_from_inverted_index(column_desc->name(), &match_value,
+                                               inverted_index_query_type, num_rows, &roaring);
+    } else if (column_desc->type() == OLAP_FIELD_TYPE_ARRAY &&

Review Comment:
   What about a column for which is_numeric_type is true but which is not an array?



##########
fe/fe-core/src/main/jflex/sql_scanner.flex:
##########
@@ -293,6 +293,15 @@ import org.apache.doris.qe.SqlModeHelper;
         keywordMap.put("lock", new Integer(SqlParserSymbols.KW_LOCK));
         keywordMap.put("low_priority", new Integer(SqlParserSymbols.KW_LOW_PRIORITY));
         keywordMap.put("map", new Integer(SqlParserSymbols.KW_MAP));
+        keywordMap.put("match", new Integer(SqlParserSymbols.KW_MATCH));
+        keywordMap.put("match_any", new Integer(SqlParserSymbols.KW_MATCH_ANY));
+        keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL));
+        keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE));
+        keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));

Review Comment:
   Why is there no match_ prefix for the element_* keywords?



##########
be/src/vec/functions/match.cpp:
##########
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <limits>
+#include <type_traits>
+
+#include "common/consts.h"
+#include "common/logging.h"
+#include "vec/columns/column_string.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris::vectorized {
+
+class FunctionMatchBase : public IFunction {
+public:
+    size_t get_number_of_arguments() const override { return 2; }
+
+    String get_name() const override { return "match"; }
+
+    /// Get result types by argument types. If the function does not apply to these arguments, throw an exception.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeUInt8>();
+    }
+    
+    Status execute_impl(FunctionContext* context, Block& block,
+                               const ColumnNumbers& arguments, size_t result,
+                               size_t input_rows_count) override {
+        //const auto match_query_col = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        auto column_with_name = block.get_by_position(arguments[1]);
+        /*const auto* match_query = check_and_get_column<ColumnString>(match_query_col.get());
+        if (!match_query) {
+            return Status::InternalError("Not supported input arguments types");
+        }*/
+        auto match_query_str = column_with_name.to_string(0);
+        //std::string match_query_str = match_query_col->get_data_at(0).to_string();
+        std::string column_name = block.get_by_position(arguments[0]).name;
+        auto match_pred_name = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str;
+        if (!block.has(match_pred_name)) {
+            if (!config::enable_storage_vectorization) {
+                return Status::Cancelled("please check whether turn on the configuration 'enable_storage_vectorization'");
+            }
+            LOG(WARNING) << "execute match query meet error, block no column: " << match_pred_name;
+            return Status::InternalError("match query meet error");

Review Comment:
   Add more information to the error message.



##########
be/src/vec/exec/scan/vscan_node.cpp:
##########
@@ -822,6 +837,48 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
     return Status::OK();
 }
 
+template <PrimitiveType T>
+Status VScanNode::_normalize_match_predicate(VExpr* expr, VExprContext* expr_ctx,
+                                             SlotDescriptor* slot,
+                                             ColumnValueRange<T>& range, PushDownType* pdt) {
+    if (TExprNodeType::MATCH_PRED == expr->node_type()) {
+        DCHECK(expr->children().size() == 2);
+
+        // create empty range as temp range, temp range should do intersection on range
+        auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(
+                slot->type().precision, slot->type().scale);
+        // Normalize match conjuncts like 'where col match value'
+
+        auto match_checker = [](const std::string& fn_name) {
+            return true; // TODO xk fn_name == "match";

Review Comment:
   fix comment



##########
be/src/vec/functions/match.cpp:
##########
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <limits>
+#include <type_traits>
+
+#include "common/consts.h"
+#include "common/logging.h"
+#include "vec/columns/column_string.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris::vectorized {
+
+class FunctionMatchBase : public IFunction {
+public:
+    size_t get_number_of_arguments() const override { return 2; }
+
+    String get_name() const override { return "match"; }
+
+    /// Get result types by argument types. If the function does not apply to these arguments, throw an exception.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeUInt8>();
+    }
+    
+    Status execute_impl(FunctionContext* context, Block& block,
+                               const ColumnNumbers& arguments, size_t result,
+                               size_t input_rows_count) override {
+        //const auto match_query_col = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        auto column_with_name = block.get_by_position(arguments[1]);
+        /*const auto* match_query = check_and_get_column<ColumnString>(match_query_col.get());
+        if (!match_query) {
+            return Status::InternalError("Not supported input arguments types");
+        }*/
+        auto match_query_str = column_with_name.to_string(0);
+        //std::string match_query_str = match_query_col->get_data_at(0).to_string();
+        std::string column_name = block.get_by_position(arguments[0]).name;
+        auto match_pred_name = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str;

Review Comment:
   match_pred_column_name



##########
be/src/olap/comparison_predicate.h:
##########
@@ -180,6 +181,52 @@ class ComparisonPredicateBase : public ColumnPredicate {
                                bitmap);
     }
 
+    Status evaluate(const Schema& schema, InvertedIndexIterator* iterator,
+                            uint32_t num_rows, roaring::Roaring* bitmap) const override {
+        if (iterator == nullptr) {
+            return Status::OK();
+        }
+        auto column_desc = schema.column(_column_id);
+        std::string column_name = column_desc->name();
+
+        InvertedIndexQueryType query_type;
+        switch (PT)
+        {
+        case PredicateType::EQ:
+            query_type = InvertedIndexQueryType::EQUAL_QUERY;
+            break;
+        case PredicateType::NE:
+            query_type = InvertedIndexQueryType::EQUAL_QUERY;
+            break;
+        case PredicateType::LT:
+            query_type = InvertedIndexQueryType::LESS_THAN_QUERY;
+            break;
+        case PredicateType::LE:
+            query_type = InvertedIndexQueryType::LESS_EQUAL_QUERY;
+            break;
+        case PredicateType::GT:
+            query_type = InvertedIndexQueryType::GREATER_THAN_QUERY;
+            break;
+        case PredicateType::GE:
+            query_type = InvertedIndexQueryType::GREATER_EQUAL_QUERY;
+            break;
+        default:
+            return Status::InvalidArgument("invalid comparison predicate type {}", PT);
+        }
+
+        roaring::Roaring roaring;
+        RETURN_IF_ERROR(
+            iterator->read_from_inverted_index(column_name, &_value, query_type, num_rows, &roaring));

Review Comment:
   If each iterator is bound one-to-one to a column, it should already know the column name, so the `column_name` argument and the `schema` parameter are not needed.



##########
be/src/olap/rowset/segment_v2/segment_iterator.cpp:
##########
@@ -659,6 +732,9 @@ Status SegmentIterator::next_batch(RowBlockV2* block) {
         for (auto column_predicate : _col_predicates) {
             auto column_id = column_predicate->column_id();
             auto column_block = block->column_block(column_id);
+            if (column_predicate->type() == PredicateType::MATCH) {

Review Comment:
   Is there any chance that this condition is true here?



##########
be/src/olap/column_predicate.h:
##########
@@ -130,6 +139,13 @@ class ColumnPredicate {
     virtual Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows,
                             roaring::Roaring* roaring) const = 0;
 
+    //evaluate predicate on inverted
+    virtual Status evaluate(const Schema& schema, InvertedIndexIterator* iterator,

Review Comment:
   Is the `schema` parameter necessary?



##########
be/src/olap/in_list_predicate.h:
##########
@@ -241,6 +242,29 @@ class InListPredicateBase : public ColumnPredicate {
         return Status::OK();
     }
 
+    Status evaluate(const Schema& schema, InvertedIndexIterator* iterator,
+                    uint32_t num_rows, roaring::Roaring* result) const override {
+        if (iterator == nullptr) {
+            return Status::OK();
+        }
+        auto column_desc = schema.column(_column_id);
+        std::string column_name = column_desc->name();
+        roaring::Roaring indices;
+        for (auto value : *_values) {
+            InvertedIndexQueryType query_type = InvertedIndexQueryType::EQUAL_QUERY;
+            roaring::Roaring index;
+            RETURN_IF_ERROR(
+                    iterator->read_from_inverted_index(column_name, &value, query_type, num_rows, &index));
+            indices |= index;
+        }
+        if constexpr (PT == PredicateType::IN_LIST) {
+            *result &= indices;
+        } else {
+            *result -= indices;

Review Comment:
   should check NOT_IN_LIST explicitly



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -211,9 +224,11 @@ class ColumnReader {
     const BloomFilterIndexPB* _bf_index_meta = nullptr;
 
     DorisCallOnce<Status> _load_index_once;
+    mutable std::mutex _load_index_lock;

Review Comment:
   use doris::mutex for compatibility



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org