You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2021/11/26 02:13:05 UTC

[incubator-doris] branch master updated: [feat-opt](json-function) optimize get_json_xx function (#7157)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a1bf287  [feat-opt](json-function) optimize get_json_xx function (#7157)
a1bf287 is described below

commit a1bf2878c0a67260c564a9254b1f22c709bb91e0
Author: Hao Tan <45...@users.noreply.github.com>
AuthorDate: Fri Nov 26 10:12:55 2021 +0800

    [feat-opt](json-function) optimize get_json_xx function (#7157)
    
    Avoid repeatedly parsing the JSON string if the first parameter of the function is constant.
---
 be/src/exprs/json_functions.cpp | 90 +++++++++++++++++++++++------------------
 be/src/exprs/json_functions.h   |  5 +++
 2 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp
index e85f214..3727553 100644
--- a/be/src/exprs/json_functions.cpp
+++ b/be/src/exprs/json_functions.cpp
@@ -40,6 +40,8 @@
 #include "rapidjson/error/en.h"
 #include "runtime/string_value.h"
 #include "runtime/tuple_row.h"
+#include "udf/udf.h"
+
 namespace doris {
 
 // static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?");
@@ -308,35 +310,35 @@ rapidjson::Value* JsonFunctions::get_json_object(FunctionContext* context,
     //    '$.text#abc.xyz'  ->  [$, text#abc, xyz]
     //    '$."text.abc".xyz'  ->  [$, text.abc, xyz]
     //    '$."text.abc"[1].xyz'  ->  [$, text.abc[1], xyz]
-    std::vector<JsonPath>* parsed_paths;
-    std::vector<JsonPath> tmp_parsed_paths;
+    JsonState* json_state;
+    JsonState tmp_json_state;
 #ifndef BE_TEST
-    parsed_paths = reinterpret_cast<std::vector<JsonPath>*>(
-            context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
-    if (parsed_paths == nullptr) {
-        // TODO: use std::string_view instead of std::string
-        // avoid use boost::tokenizer
+    json_state = reinterpret_cast<JsonState*>(context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
+    if (json_state == nullptr) {
+        json_state = &tmp_json_state;
+    }
+
+    if (json_state->json_paths.size() == 0) {
         boost::tokenizer<boost::escaped_list_separator<char>> tok(
                 path_string, boost::escaped_list_separator<char>("\\", ".", "\""));
         std::vector<std::string> paths(tok.begin(), tok.end());
-        get_parsed_paths(paths, &tmp_parsed_paths);
-        parsed_paths = &tmp_parsed_paths;
+        get_parsed_paths(paths, &json_state->json_paths);
     }
 #else
+    json_state = &tmp_json_state;
     boost::tokenizer<boost::escaped_list_separator<char>> tok(
             path_string, boost::escaped_list_separator<char>("\\", ".", "\""));
     std::vector<std::string> paths(tok.begin(), tok.end());
-    get_parsed_paths(paths, &tmp_parsed_paths);
-    parsed_paths = &tmp_parsed_paths;
+    get_parsed_paths(paths, &json_state->json_paths);
 #endif
 
-    VLOG_TRACE << "first parsed path: " << (*parsed_paths)[0].debug_string();
+    VLOG_TRACE << "first parsed path: " << json_state->json_paths[0].debug_string();
 
-    if (!(*parsed_paths)[0].is_valid) {
+    if (!json_state->json_paths[0].is_valid) {
         return document;
     }
 
-    if (UNLIKELY((*parsed_paths).size() == 1)) {
+    if (UNLIKELY(json_state->json_paths.size() == 1)) {
         if (fntype == JSON_FUN_STRING) {
             document->SetString(json_string.data(), json_string.length(), document->GetAllocator());
         } else {
@@ -344,15 +346,20 @@ rapidjson::Value* JsonFunctions::get_json_object(FunctionContext* context,
         }
     }
 
-    //rapidjson::Document document;
-    document->Parse(json_string.data(), json_string.length());
-    if (UNLIKELY(document->HasParseError())) {
-        VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
-                      << GetParseError_En(document->GetParseError());
-        document->SetNull();
-        return document;
+    if (!json_state->document.IsNull()) {
+        document = &json_state->document;
+    } else {
+        document->Parse(json_string.data(), json_string.length());
+        //rapidjson::Document document;
+        if (UNLIKELY(document->HasParseError())) {
+            VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
+                    << GetParseError_En(document->GetParseError());
+            document->SetNull();
+            return document;
+        }
     }
-    return match_value(*parsed_paths, document, document->GetAllocator());
+
+    return match_value(json_state->json_paths, document, document->GetAllocator());
 }
 
 rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json(
@@ -418,23 +425,28 @@ void JsonFunctions::json_path_prepare(doris_udf::FunctionContext* context,
         return;
     }
 
-    if (!context->is_arg_constant(1)) {
+    if (!context->is_arg_constant(0) && !context->is_arg_constant(1)) {
         return;
     }
+
+    JsonState* json_state = new JsonState;
+
+    StringVal* json_str = reinterpret_cast<StringVal*>(context->get_constant_arg(0));
+    if (json_str != nullptr && !json_str->is_null) {
+        std::string json_string((char*)json_str->ptr, json_str->len);
+        json_state->document.Parse(json_string.c_str());
+    }
     StringVal* path = reinterpret_cast<StringVal*>(context->get_constant_arg(1));
-    if (path->is_null) {
-        return;
+    if (path != nullptr && !path->is_null) {
+        std::string path_str(reinterpret_cast<char*>(path->ptr), path->len);
+        boost::tokenizer<boost::escaped_list_separator<char>> tok(
+                path_str, boost::escaped_list_separator<char>("\\", ".", "\""));
+        std::vector<std::string> path_exprs(tok.begin(), tok.end());
+        get_parsed_paths(path_exprs, &json_state->json_paths);
     }
 
-    std::string path_str(reinterpret_cast<char*>(path->ptr), path->len);
-    boost::tokenizer<boost::escaped_list_separator<char>> tok(
-            path_str, boost::escaped_list_separator<char>("\\", ".", "\""));
-    std::vector<std::string> path_exprs(tok.begin(), tok.end());
-    std::vector<JsonPath>* parsed_paths = new std::vector<JsonPath>();
-    get_parsed_paths(path_exprs, parsed_paths);
-
-    context->set_function_state(scope, parsed_paths);
-    VLOG_TRACE << "prepare json path. size: " << parsed_paths->size();
+    context->set_function_state(scope, json_state);
+    VLOG_TRACE << "prepare json path. size: " << json_state->json_paths.size();
 }
 
 void JsonFunctions::json_path_close(doris_udf::FunctionContext* context,
@@ -442,11 +454,11 @@ void JsonFunctions::json_path_close(doris_udf::FunctionContext* context,
     if (scope != FunctionContext::FRAGMENT_LOCAL) {
         return;
     }
-    std::vector<JsonPath>* parsed_paths =
-            reinterpret_cast<std::vector<JsonPath>*>(context->get_function_state(scope));
-    if (parsed_paths != nullptr) {
-        delete parsed_paths;
-        VLOG_TRACE << "close json path";
+
+    JsonState* json_state = reinterpret_cast<JsonState*>(context->get_function_state(scope));
+    if (json_state != nullptr) {
+        delete json_state;
+        VLOG_TRACE << "close json state";
     }
 }
 
diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h
index c16d51c..ab0d7e4 100644
--- a/be/src/exprs/json_functions.h
+++ b/be/src/exprs/json_functions.h
@@ -69,6 +69,11 @@ struct JsonPath {
     }
 };
 
+struct JsonState {
+    std::vector<JsonPath> json_paths;
+    rapidjson::Document document;
+};
+
 class JsonFunctions {
 public:
     static void init();

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org