You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2021/11/26 02:13:05 UTC
[incubator-doris] branch master updated: [feat-opt](json-function) optimize get_json_xx function (#7157)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new a1bf287 [feat-opt](json-function) optimize get_json_xx function (#7157)
a1bf287 is described below
commit a1bf2878c0a67260c564a9254b1f22c709bb91e0
Author: Hao Tan <45...@users.noreply.github.com>
AuthorDate: Fri Nov 26 10:12:55 2021 +0800
[feat-opt](json-function) optimize get_json_xx function (#7157)
Avoid repeated parsing json string is the first parameter of function is constant.
---
be/src/exprs/json_functions.cpp | 90 +++++++++++++++++++++++------------------
be/src/exprs/json_functions.h | 5 +++
2 files changed, 56 insertions(+), 39 deletions(-)
diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp
index e85f214..3727553 100644
--- a/be/src/exprs/json_functions.cpp
+++ b/be/src/exprs/json_functions.cpp
@@ -40,6 +40,8 @@
#include "rapidjson/error/en.h"
#include "runtime/string_value.h"
#include "runtime/tuple_row.h"
+#include "udf/udf.h"
+
namespace doris {
// static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?");
@@ -308,35 +310,35 @@ rapidjson::Value* JsonFunctions::get_json_object(FunctionContext* context,
// '$.text#abc.xyz' -> [$, text#abc, xyz]
// '$."text.abc".xyz' -> [$, text.abc, xyz]
// '$."text.abc"[1].xyz' -> [$, text.abc[1], xyz]
- std::vector<JsonPath>* parsed_paths;
- std::vector<JsonPath> tmp_parsed_paths;
+ JsonState* json_state;
+ JsonState tmp_json_state;
#ifndef BE_TEST
- parsed_paths = reinterpret_cast<std::vector<JsonPath>*>(
- context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
- if (parsed_paths == nullptr) {
- // TODO: use std::string_view instead of std::string
- // avoid use boost::tokenizer
+ json_state = reinterpret_cast<JsonState*>(context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
+ if (json_state == nullptr) {
+ json_state = &tmp_json_state;
+ }
+
+ if (json_state->json_paths.size() == 0) {
boost::tokenizer<boost::escaped_list_separator<char>> tok(
path_string, boost::escaped_list_separator<char>("\\", ".", "\""));
std::vector<std::string> paths(tok.begin(), tok.end());
- get_parsed_paths(paths, &tmp_parsed_paths);
- parsed_paths = &tmp_parsed_paths;
+ get_parsed_paths(paths, &json_state->json_paths);
}
#else
+ json_state = &tmp_json_state;
boost::tokenizer<boost::escaped_list_separator<char>> tok(
path_string, boost::escaped_list_separator<char>("\\", ".", "\""));
std::vector<std::string> paths(tok.begin(), tok.end());
- get_parsed_paths(paths, &tmp_parsed_paths);
- parsed_paths = &tmp_parsed_paths;
+ get_parsed_paths(paths, &json_state->json_paths);
#endif
- VLOG_TRACE << "first parsed path: " << (*parsed_paths)[0].debug_string();
+ VLOG_TRACE << "first parsed path: " << json_state->json_paths[0].debug_string();
- if (!(*parsed_paths)[0].is_valid) {
+ if (!json_state->json_paths[0].is_valid) {
return document;
}
- if (UNLIKELY((*parsed_paths).size() == 1)) {
+ if (UNLIKELY(json_state->json_paths.size() == 1)) {
if (fntype == JSON_FUN_STRING) {
document->SetString(json_string.data(), json_string.length(), document->GetAllocator());
} else {
@@ -344,15 +346,20 @@ rapidjson::Value* JsonFunctions::get_json_object(FunctionContext* context,
}
}
- //rapidjson::Document document;
- document->Parse(json_string.data(), json_string.length());
- if (UNLIKELY(document->HasParseError())) {
- VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
- << GetParseError_En(document->GetParseError());
- document->SetNull();
- return document;
+ if (!json_state->document.IsNull()) {
+ document = &json_state->document;
+ } else {
+ document->Parse(json_string.data(), json_string.length());
+ //rapidjson::Document document;
+ if (UNLIKELY(document->HasParseError())) {
+ VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
+ << GetParseError_En(document->GetParseError());
+ document->SetNull();
+ return document;
+ }
}
- return match_value(*parsed_paths, document, document->GetAllocator());
+
+ return match_value(json_state->json_paths, document, document->GetAllocator());
}
rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json(
@@ -418,23 +425,28 @@ void JsonFunctions::json_path_prepare(doris_udf::FunctionContext* context,
return;
}
- if (!context->is_arg_constant(1)) {
+ if (!context->is_arg_constant(0) && !context->is_arg_constant(1)) {
return;
}
+
+ JsonState* json_state = new JsonState;
+
+ StringVal* json_str = reinterpret_cast<StringVal*>(context->get_constant_arg(0));
+ if (json_str != nullptr && !json_str->is_null) {
+ std::string json_string((char*)json_str->ptr, json_str->len);
+ json_state->document.Parse(json_string.c_str());
+ }
StringVal* path = reinterpret_cast<StringVal*>(context->get_constant_arg(1));
- if (path->is_null) {
- return;
+ if (path != nullptr && !path->is_null) {
+ std::string path_str(reinterpret_cast<char*>(path->ptr), path->len);
+ boost::tokenizer<boost::escaped_list_separator<char>> tok(
+ path_str, boost::escaped_list_separator<char>("\\", ".", "\""));
+ std::vector<std::string> path_exprs(tok.begin(), tok.end());
+ get_parsed_paths(path_exprs, &json_state->json_paths);
}
- std::string path_str(reinterpret_cast<char*>(path->ptr), path->len);
- boost::tokenizer<boost::escaped_list_separator<char>> tok(
- path_str, boost::escaped_list_separator<char>("\\", ".", "\""));
- std::vector<std::string> path_exprs(tok.begin(), tok.end());
- std::vector<JsonPath>* parsed_paths = new std::vector<JsonPath>();
- get_parsed_paths(path_exprs, parsed_paths);
-
- context->set_function_state(scope, parsed_paths);
- VLOG_TRACE << "prepare json path. size: " << parsed_paths->size();
+ context->set_function_state(scope, json_state);
+ VLOG_TRACE << "prepare json path. size: " << json_state->json_paths.size();
}
void JsonFunctions::json_path_close(doris_udf::FunctionContext* context,
@@ -442,11 +454,11 @@ void JsonFunctions::json_path_close(doris_udf::FunctionContext* context,
if (scope != FunctionContext::FRAGMENT_LOCAL) {
return;
}
- std::vector<JsonPath>* parsed_paths =
- reinterpret_cast<std::vector<JsonPath>*>(context->get_function_state(scope));
- if (parsed_paths != nullptr) {
- delete parsed_paths;
- VLOG_TRACE << "close json path";
+
+ JsonState* json_state = reinterpret_cast<JsonState*>(context->get_function_state(scope));
+ if (json_state != nullptr) {
+ delete json_state;
+ VLOG_TRACE << "close json state";
}
}
diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h
index c16d51c..ab0d7e4 100644
--- a/be/src/exprs/json_functions.h
+++ b/be/src/exprs/json_functions.h
@@ -69,6 +69,11 @@ struct JsonPath {
}
};
+struct JsonState {
+ std::vector<JsonPath> json_paths;
+ rapidjson::Document document;
+};
+
class JsonFunctions {
public:
static void init();
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org