You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/06/19 09:07:23 UTC

[incubator-doris] branch master updated: [Doris On ES] Support fetch `_id` field from ES (#3900)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 355df12  [Doris On ES] Support fetch  `_id` field from ES  (#3900)
355df12 is described below

commit 355df127b7232aeb080855455b3d4458b81beacb
Author: Yunfeng,Wu <wu...@baidu.com>
AuthorDate: Fri Jun 19 17:07:07 2020 +0800

    [Doris On ES] Support fetch  `_id` field from ES  (#3900)
    
    More information can be found: https://github.com/apache/incubator-doris/issues/3901
    
    The created ES external table must contain an `_id` column if you want to fetch the Elasticsearch document `_id`.
    ```
    CREATE EXTERNAL TABLE `doe_id2` (
      `_id` varchar COMMENT "",
       `city`  varchar COMMENT ""
    ) ENGINE=ELASTICSEARCH
    PROPERTIES (
    "hosts" = "http://10.74.167.16:8200",
    "user" = "root",
    "password" = "root",
    "index" = "doe",
    "type" = "doc",
    "version" = "6.5.3",
    "enable_docvalue_scan" = "true",
    "transport" = "http"
    );
    ```
    
    Query:
    
    ```
    mysql> select * from doe_id2 limit 10;
    +----------------------+------+
    | _id                  | city |
    +----------------------+------+
    | iRHNc3IB8XwmcbhB7lEB | gz   |
    | jBHNc3IB8XwmcbhB71Ef | gz   |
    | jRHNc3IB8XwmcbhB71GI | gz   |
    | jhHNc3IB8XwmcbhB71Hx | gz   |
    | ThHNc3IB8XwmcbhBkFHB | sh   |
    | TxHNc3IB8XwmcbhBkFH9 | sh   |
    | URHNc3IB8XwmcbhBklFA | sh   |
    | ahHNc3IB8XwmcbhBxlFq | gz   |
    | axHNc3IB8XwmcbhBxlHw | gz   |
    | bxHNc3IB8XwmcbhByVFO | gz   |
    +----------------------+------+
    ```
    
    NOTICE:
    This changes the column name format to allow column names that start with "_".
---
 be/src/exec/es/es_scan_reader.cpp                  |  4 +-
 be/src/exec/es/es_scroll_parser.cpp                | 37 ++++++++++++++
 .../java/org/apache/doris/common/FeNameFormat.java |  3 +-
 .../org/apache/doris/common/ExceptionChecker.java  | 56 ++++++++++++++++++++++
 .../org/apache/doris/common/FeNameFormatTest.java  | 34 +++++++++++++
 5 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/be/src/exec/es/es_scan_reader.cpp b/be/src/exec/es/es_scan_reader.cpp
index 43af786..6e4b670 100644
--- a/be/src/exec/es/es_scan_reader.cpp
+++ b/be/src/exec/es/es_scan_reader.cpp
@@ -28,7 +28,9 @@
 
 namespace doris {
 
-const std::string SOURCE_SCROLL_SEARCH_FILTER_PATH = "filter_path=_scroll_id,hits.hits._source,hits.total,_id";
+// hits.hits._id is used to obtain the ES document `_id`
+const std::string SOURCE_SCROLL_SEARCH_FILTER_PATH = "filter_path=_scroll_id,hits.hits._source,hits.total,hits.hits._id";
+// hits.hits._score is used to handle the case where a field does not exist in one batch
 const std::string DOCVALUE_SCROLL_SEARCH_FILTER_PATH = "filter_path=_scroll_id,hits.total,hits.hits._score,hits.hits.fields";
 
 const std::string REQUEST_SCROLL_PATH = "_scroll";
diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp
index 16f913d..0742ae1 100644
--- a/be/src/exec/es/es_scroll_parser.cpp
+++ b/be/src/exec/es/es_scroll_parser.cpp
@@ -40,6 +40,7 @@ static const char* FIELD_HITS = "hits";
 static const char* FIELD_INNER_HITS = "hits";
 static const char* FIELD_SOURCE = "_source";
 static const char* FIELD_TOTAL = "total";
+static const char* FIELD_ID = "_id";
 
 
 // get the original json data type
@@ -296,6 +297,42 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc,
         if (!slot_desc->is_materialized()) {
             continue;
         }
+        // the _id field must exist in every document; this is guaranteed by ES
+        // if _id is found in the tuple, we get the `_id` value from the inner-hit node
+        // the JSON response looks like this:
+        //    "hits": {
+        //            "hits": [
+        //                {
+        //                    "_id": "UhHNc3IB8XwmcbhBk1ES",
+        //                    "_source": {
+        //                          "k": 201,
+        //                    }
+        //                }
+        //            ]
+        //        }
+        if (slot_desc->col_name() == FIELD_ID) {
+            // actually this branch will not be reached, this is guaranteed by Doris FE.
+            if (pure_doc_value) {
+                std::stringstream ss;
+                ss << "obtain `_id` is not supported in doc_values mode";
+                return Status::RuntimeError(ss.str());
+            }
+            tuple->set_not_null(slot_desc->null_indicator_offset());
+            void* slot = tuple->get_slot(slot_desc->tuple_offset());
+            // obj[FIELD_ID] must not be NULL
+            std::string _id = obj[FIELD_ID].GetString();
+            size_t len = _id.length();
+            char* buffer = reinterpret_cast<char*>(tuple_pool->try_allocate_unaligned(len));
+            if (UNLIKELY(buffer == NULL)) {
+                string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow",
+                            len, "string slot");
+                return tuple_pool->mem_tracker()->MemLimitExceeded(NULL, details, len);
+            }
+            memcpy(buffer, _id.data(), len);
+            reinterpret_cast<StringValue*>(slot)->ptr = buffer;
+            reinterpret_cast<StringValue*>(slot)->len = len;
+            continue;
+        }
 
         // if pure_doc_value enabled, docvalue_context must contains the key
         // todo: need move all `pure_docvalue` for every tuple outside fill_tuple
diff --git a/fe/src/main/java/org/apache/doris/common/FeNameFormat.java b/fe/src/main/java/org/apache/doris/common/FeNameFormat.java
index de386ff..1a4526f 100644
--- a/fe/src/main/java/org/apache/doris/common/FeNameFormat.java
+++ b/fe/src/main/java/org/apache/doris/common/FeNameFormat.java
@@ -26,6 +26,7 @@ import com.google.common.base.Strings;
 public class FeNameFormat {
     private static final String LABEL_REGEX = "^[-_A-Za-z0-9]{1,128}$";
     private static final String COMMON_NAME_REGEX = "^[a-zA-Z][a-zA-Z0-9_]{0,63}$";
+    private static final String COLUMN_NAME_REGEX = "^[_a-zA-Z][a-zA-Z0-9_]{0,63}$";
 
     public static final String FORBIDDEN_PARTITION_NAME = "placeholder_";
 
@@ -61,7 +62,7 @@ public class FeNameFormat {
     }
 
     public static void checkColumnName(String columnName) throws AnalysisException {
-        if (Strings.isNullOrEmpty(columnName) || !columnName.matches(COMMON_NAME_REGEX)) {
+        if (Strings.isNullOrEmpty(columnName) || !columnName.matches(COLUMN_NAME_REGEX)) {
             ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_COLUMN_NAME, columnName);
         }
         if (columnName.startsWith(SchemaChangeHandler.SHADOW_NAME_PRFIX)) {
diff --git a/fe/src/test/java/org/apache/doris/common/ExceptionChecker.java b/fe/src/test/java/org/apache/doris/common/ExceptionChecker.java
new file mode 100644
index 0000000..128a8b8
--- /dev/null
+++ b/fe/src/test/java/org/apache/doris/common/ExceptionChecker.java
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.common;
+
+import org.apache.doris.http.DorisHttpTestCase;
+import junit.framework.AssertionFailedError;
+
+public class ExceptionChecker {
+
+    public static void expectThrowsNoException(DorisHttpTestCase.ThrowingRunnable runnable) {
+        try {
+            runnable.run();
+        } catch (Throwable e) { // any Throwable fails the check; only its message is carried over, the cause is not attached
+            throw new AssertionFailedError(e.getMessage());
+        }
+    }
+
+    /**
+     * Runs the runnable and returns the thrown exception cast to expectedType; fails with a default AssertionFailedError message if nothing is thrown.
+     */
+    public static <T extends Throwable> T expectThrows(Class<T> expectedType, DorisHttpTestCase.ThrowingRunnable runnable) {
+        return expectThrows(expectedType, "Expected exception " + expectedType.getSimpleName() + " but no exception was thrown", runnable);
+    }
+
+    /**
+     * Runs the runnable and returns the thrown exception cast to expectedType; throws AssertionFailedError with the wrong-typed exception as its cause, or with noExceptionMessage if nothing was thrown.
+     */
+    public static <T extends Throwable> T expectThrows(Class<T> expectedType, String noExceptionMessage, DorisHttpTestCase.ThrowingRunnable runnable) {
+        try {
+            runnable.run();
+        } catch (Throwable e) {
+            if (expectedType.isInstance(e)) {
+                return expectedType.cast(e);
+            }
+            AssertionFailedError assertion = new AssertionFailedError("Unexpected exception type, expected " + expectedType.getSimpleName() + " but got " + e);
+            assertion.initCause(e);
+            throw assertion;
+        }
+        throw new AssertionFailedError(noExceptionMessage);
+    }
+}
diff --git a/fe/src/test/java/org/apache/doris/common/FeNameFormatTest.java b/fe/src/test/java/org/apache/doris/common/FeNameFormatTest.java
new file mode 100644
index 0000000..2e6dd69
--- /dev/null
+++ b/fe/src/test/java/org/apache/doris/common/FeNameFormatTest.java
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.common;
+
+import org.junit.Test;
+
+public class FeNameFormatTest {
+
+    @Test
+    public void testCheckColumnName() {
+        ExceptionChecker.expectThrowsNoException(() -> FeNameFormat.checkColumnName("_id")); // a single leading underscore is accepted
+        ExceptionChecker.expectThrowsNoException(() -> FeNameFormat.checkColumnName("__id")); // repeated leading underscores are accepted
+        ExceptionChecker.expectThrowsNoException(() -> FeNameFormat.checkColumnName("___id"));
+        ExceptionChecker.expectThrowsNoException(() -> FeNameFormat.checkColumnName("___id_")); // a trailing underscore is accepted as well
+        ExceptionChecker.expectThrows(AnalysisException.class, () -> FeNameFormat.checkColumnName("?id_")); // '?' is not an allowed first character
+        ExceptionChecker.expectThrows(AnalysisException.class, () -> FeNameFormat.checkColumnName("#id_")); // '#' is not an allowed first character
+    }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org