You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/12/29 04:37:33 UTC
[doris] 02/05: [Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
commit b204d46615608116e645a5f1c060a6528a2a005f
Author: Kang <kx...@gmail.com>
AuthorDate: Thu Dec 29 09:29:09 2022 +0800
[Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219)
test data: https://data.gharchive.org/2020-11-13-18.json.gz, 2GB, 197696 lines
before: String 13s vs. JSONB 28s
after: String 13s vs. JSONB 16s
**NOTICE: simdjson needs to be patched since its BOOL identifier conflicts with the macro BOOL defined in odbc sqltypes.h**
---
.licenserc.yaml | 1 +
be/CMakeLists.txt | 2 +
be/src/runtime/jsonb_value.h | 4 +-
be/src/util/jsonb_error.h | 10 +-
be/src/util/jsonb_parser_simd.h | 350 ++++++++++++++++++++++++++++++++
be/src/vec/functions/function_jsonb.cpp | 9 +-
6 files changed, 369 insertions(+), 7 deletions(-)
diff --git a/.licenserc.yaml b/.licenserc.yaml
index d458e45269..020ee7b4e8 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -52,6 +52,7 @@ header:
- "be/src/util/jsonb_document.h"
- "be/src/util/jsonb_error.h"
- "be/src/util/jsonb_parser.h"
+ - "be/src/util/jsonb_parser_simd.h"
- "be/src/util/jsonb_stream.h"
- "be/src/util/jsonb_updater.h"
- "be/src/util/jsonb_utils.h"
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index cf51ddc492..915c207189 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -492,6 +492,8 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
if (USE_AVX2)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
+ # set -msse4.2 to enable the SSE4.2 instructions used by simdjson
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h
index bdb5ba4976..1e03518d9d 100644
--- a/be/src/runtime/jsonb_value.h
+++ b/be/src/runtime/jsonb_value.h
@@ -22,7 +22,7 @@
#include "util/cpu_info.h"
#include "util/hash_util.hpp"
#include "util/jsonb_error.h"
-#include "util/jsonb_parser.h"
+#include "util/jsonb_parser_simd.h"
#include "util/jsonb_utils.h"
#include "vec/common/string_ref.h"
@@ -38,7 +38,7 @@ struct JsonBinaryValue {
// default nullprt and size 0 for invalid or NULL value
const char* ptr = nullptr;
size_t len = 0;
- JsonbParser parser;
+ JsonbParserSIMD parser;
JsonBinaryValue() : ptr(nullptr), len(0) {}
JsonBinaryValue(char* ptr, int len) { from_json_string(const_cast<const char*>(ptr), len); }
diff --git a/be/src/util/jsonb_error.h b/be/src/util/jsonb_error.h
index 77d6fa16d0..2ad632fb8b 100644
--- a/be/src/util/jsonb_error.h
+++ b/be/src/util/jsonb_error.h
@@ -30,12 +30,14 @@ enum class JsonbErrType {
E_EMPTY_DOCUMENT,
E_OUTPUT_FAIL,
E_INVALID_DOCU,
+ E_INVALID_TYPE,
E_INVALID_SCALAR_VALUE,
E_INVALID_KEY_STRING,
E_INVALID_KEY_LENGTH,
E_INVALID_STR,
E_INVALID_OBJ,
E_INVALID_ARR,
+ E_INVALID_NUMBER,
E_INVALID_HEX,
E_INVALID_OCTAL,
E_INVALID_DECIMAL,
@@ -53,6 +55,7 @@ enum class JsonbErrType {
E_INVALID_JSONB_OBJ,
E_NESTING_LVL_OVERFLOW,
E_INVALID_DOCU_COMPAT,
+ E_EXCEPTION,
// new error code should always be added above
E_NUM_ERRORS
@@ -77,13 +80,15 @@ private:
"Invalid document version",
"Empty document",
"Fatal error in writing JSONB",
- "Invalid document: document must be an object or an array",
+ "Invalid document",
+ "Invalid json value type",
"Invalid scalar value",
"Invalid key string",
"Key length exceeds maximum size allowed (64 bytes)",
"Invalid string value",
"Invalid JSON object",
"Invalid JSON array",
+ "Invalid number",
"Invalid HEX number",
"Invalid octal number",
"Invalid decimal number",
@@ -100,7 +105,8 @@ private:
"Invalid update operation",
"Invalid JSONB object (internal)",
"Object or array has too many nesting levels",
- "Invalid document: document must be an object or an array",
+ "Invalid document",
+ "Exception throwed",
nullptr /* E_NUM_ERRORS */
};
diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h
new file mode 100644
index 0000000000..10d19a3f57
--- /dev/null
+++ b/be/src/util/jsonb_parser_simd.h
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2014, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This file defines JsonbParserTSIMD (template) and JsonbParserSIMD.
+ *
+ * JsonbParserTSIMD is a template class which implements a JSON parser.
+ * JsonbParserTSIMD parses JSON text, and serialize it to JSONB binary format
+ * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
+ * JsonbWriterT object with an output stream object. However, you can also
+ * pass in your JsonbWriterT or any stream object that implements some basic
+ * interface of std::ostream (see JsonbStream.h).
+ *
+ * JsonbParserSIMD specializes JsonbParserTSIMD with JsonbOutStream type (see
+ * JsonbStream.h). So unless you want to provide a different output stream
+ * type, use a JsonbParserSIMD object.
+ *
+ * ** Parsing JSON **
+ * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
+ * packed bytes. There are three ways to parse a JSON string: (1) using
+ * std::string, (2) using c-string, (3) using string with len. You can
+ * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
+ * internally if the input is raw character buffer.
+ *
+ * You can reuse an JsonbParserTSIMD object to parse/serialize multiple JSON
+ * strings, and the previous JSONB will be overwritten.
+ *
+ * If parsing fails (returned false), the error code will be set to one of
+ * JsonbErrType, and can be retrieved by calling getErrorCode().
+ *
+ * ** External dictionary **
+ * During parsing a JSON string, you can pass a call-back function to map a key
+ * string to an id, and store the dictionary id in JSONB to save space. The
+ * purpose of using an external dictionary is more towards a collection of
+ * documents (which has common keys) rather than a single document, so that
+ * space saving will be significant.
+ *
+ * ** Endianness **
+ * Note: JSONB serialization doesn't assume endianness of the server. However
+ * you will need to ensure that the endianness at the reader side is the same
+ * as that at the writer side (if they are on different machines). Otherwise,
+ * proper conversion is needed when a number value is returned to the
+ * caller/writer.
+ *
+ * @author Tian Xia <ti...@fb.com>
+ *
+ * this file is copied from
+ * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
+ * and modified by Doris
+ */
+
+#ifndef JSONB_JSONBJSONPARSERSIMD_H
+#define JSONB_JSONBJSONPARSERSIMD_H
+
+#include <simdjson.h>
+
+#include <cmath>
+#include <limits>
+
+#include "jsonb_document.h"
+#include "jsonb_error.h"
+#include "jsonb_writer.h"
+#include "string_parser.hpp"
+
+namespace doris {
+
+/*
+ * Template JsonbParserTSIMD
+ *
+ * Parses UTF-8 JSON text with the simdjson On Demand API and serializes the
+ * values it visits through a JsonbWriterT<OS_TYPE>.
+ *
+ * A parser object can be reused: every parse(const char*, unsigned int, ...)
+ * call resets the writer and the error code first. When a parse call returns
+ * false, the reason is available from getErrorCode().
+ */
+template <class OS_TYPE>
+class JsonbParserTSIMD {
+public:
+    // Creates a parser that owns a default-constructed writer.
+    JsonbParserTSIMD() : err_(JsonbErrType::E_NONE) {}
+
+    // Creates a parser whose writer is constructed from the given stream.
+    explicit JsonbParserTSIMD(OS_TYPE& os) : writer_(os), err_(JsonbErrType::E_NONE) {}
+
+    // parse a UTF-8 JSON string
+    // Returns true on success; see parse(const char*, unsigned int, hDictInsert).
+    bool parse(const std::string& str, hDictInsert handler = nullptr) {
+        return parse(str.c_str(), static_cast<unsigned int>(str.size()), handler);
+    }
+
+    // parse a UTF-8 JSON c-style string (NULL terminated)
+    // c_str must not be null because its length is taken with strlen().
+    bool parse(const char* c_str, hDictInsert handler = nullptr) {
+        return parse(c_str, static_cast<unsigned int>(strlen(c_str)), handler);
+    }
+
+    // parse a UTF-8 JSON string with length
+    //
+    // pch     - start of the JSON text; null yields E_EMPTY_DOCUMENT
+    // len     - number of bytes to parse; 0 yields E_EMPTY_DOCUMENT
+    // handler - optional callback mapping a key string to a dictionary id
+    //
+    // Returns true when the whole document was written without error. Any
+    // simdjson_error thrown while iterating is caught here and reported as
+    // E_EXCEPTION.
+    bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
+        // reset state before parse
+        reset();
+
+        if (!pch || len == 0) {
+            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
+            LOG(WARNING) << "empty json string";
+            return false;
+        }
+
+        // parse json using simdjson, return false on exception
+        try {
+            // the input is copied into a padded_string before iterating
+            simdjson::padded_string json_str {pch, len};
+            simdjson::ondemand::document doc = parser_.iterate(json_str);
+
+            // simdjson process top level primitive types specially
+            // so some repeated code here
+            switch (doc.type()) {
+            case simdjson::ondemand::json_type::object:
+            case simdjson::ondemand::json_type::array: {
+                parse(doc.get_value(), handler);
+                break;
+            }
+            case simdjson::ondemand::json_type::null: {
+                if (writer_.writeNull() == 0) {
+                    err_ = JsonbErrType::E_OUTPUT_FAIL;
+                    LOG(WARNING) << "writeNull failed";
+                }
+                break;
+            }
+            case simdjson::ondemand::json_type::boolean: {
+                if (writer_.writeBool(doc.get_bool()) == 0) {
+                    err_ = JsonbErrType::E_OUTPUT_FAIL;
+                    LOG(WARNING) << "writeBool failed";
+                }
+                break;
+            }
+            case simdjson::ondemand::json_type::string: {
+                write_string(doc.get_string());
+                break;
+            }
+            case simdjson::ondemand::json_type::number: {
+                write_number(doc.get_number());
+                break;
+            }
+            default: {
+                // mirror the nested-value parser: an unexpected type is an
+                // error rather than a silent success with empty output
+                err_ = JsonbErrType::E_INVALID_TYPE;
+                LOG(WARNING) << "unknown value type: ";
+                break;
+            }
+            }
+
+            return err_ == JsonbErrType::E_NONE;
+        } catch (const simdjson::simdjson_error& e) {
+            err_ = JsonbErrType::E_EXCEPTION;
+            LOG(WARNING) << "simdjson parse exception: " << e.what();
+            return false;
+        }
+    }
+
+    // parse json, recursively if necessary, by simdjson
+    // and serialize to binary format by writer
+    //
+    // Failures are reported through err_ (there is no return value). This
+    // overload has no try/catch of its own; simdjson exceptions raised here
+    // are handled by parse(const char*, unsigned int, hDictInsert) when it is
+    // the caller.
+    void parse(simdjson::ondemand::value value, hDictInsert handler = nullptr) {
+        switch (value.type()) {
+        case simdjson::ondemand::json_type::null: {
+            if (writer_.writeNull() == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeNull failed";
+            }
+            break;
+        }
+        case simdjson::ondemand::json_type::boolean: {
+            if (writer_.writeBool(value.get_bool()) == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeBool failed";
+            }
+            break;
+        }
+        case simdjson::ondemand::json_type::string: {
+            write_string(value.get_string());
+            break;
+        }
+        case simdjson::ondemand::json_type::number: {
+            write_number(value.get_number());
+            break;
+        }
+        case simdjson::ondemand::json_type::object: {
+            if (!writer_.writeStartObject()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeStartObject failed";
+                break;
+            }
+
+            for (auto kv : value.get_object()) {
+                std::string_view key;
+                simdjson::error_code e = kv.unescaped_key().get(key);
+                if (e != simdjson::SUCCESS) {
+                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
+                    LOG(WARNING) << "simdjson get key failed: " << e;
+                    break;
+                }
+
+                // a negative id (or no handler) means: write the key text itself
+                int key_id = -1;
+                if (handler) {
+                    key_id = handler(key.data(), key.size());
+                }
+
+                if (key_id < 0) {
+                    if (writer_.writeKey(key.data(), key.size()) == 0) {
+                        err_ = JsonbErrType::E_OUTPUT_FAIL;
+                        LOG(WARNING) << "writeKey failed key: " << key;
+                        break;
+                    }
+                } else {
+                    if (writer_.writeKey(key_id) == 0) {
+                        err_ = JsonbErrType::E_OUTPUT_FAIL;
+                        LOG(WARNING) << "writeKey failed key_id: " << key_id;
+                        break;
+                    }
+                }
+
+                // parse object value
+                parse(kv.value(), handler);
+                if (err_ != JsonbErrType::E_NONE) {
+                    LOG(WARNING) << "parse object value failed";
+                    break;
+                }
+            }
+            // a failure inside the loop must not be followed by writeEndObject
+            if (err_ != JsonbErrType::E_NONE) {
+                break;
+            }
+
+            if (!writer_.writeEndObject()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeEndObject failed";
+            }
+            break;
+        }
+        case simdjson::ondemand::json_type::array: {
+            if (!writer_.writeStartArray()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeStartArray failed";
+                break;
+            }
+
+            for (auto elem : value.get_array()) {
+                // parse array element
+                parse(elem.value(), handler);
+                if (err_ != JsonbErrType::E_NONE) {
+                    LOG(WARNING) << "parse array element failed";
+                    break;
+                }
+            }
+            // a failure inside the loop must not be followed by writeEndArray
+            if (err_ != JsonbErrType::E_NONE) {
+                break;
+            }
+
+            if (!writer_.writeEndArray()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeEndArray failed";
+            }
+            break;
+        }
+        default: {
+            err_ = JsonbErrType::E_INVALID_TYPE;
+            LOG(WARNING) << "unknown value type: "; // << value;
+            break;
+        }
+
+        } // end of switch
+    }
+
+    // Writes str as a string value: start marker, bytes (skipped when the
+    // string is empty), end marker. Sets E_OUTPUT_FAIL on the first writer
+    // call that fails.
+    void write_string(std::string_view str) {
+        // start writing string
+        if (!writer_.writeStartString()) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeStartString failed";
+            return;
+        }
+
+        // write string
+        if (str.size() > 0) {
+            if (writer_.writeString(str.data(), str.size()) == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeString failed";
+                return;
+            }
+        }
+
+        // end writing string
+        if (!writer_.writeEndString()) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeEndString failed";
+            return;
+        }
+    }
+
+    // Writes num as a double, or as the narrowest signed integer type
+    // (int8/int16/int32/int64) that can represent it.
+    //
+    // Errors: E_OUTPUT_FAIL when the writer reports 0 bytes written,
+    // E_OCTAL_OVERFLOW when an unsigned value exceeds INT64_MAX,
+    // E_INVALID_NUMBER when num is neither a double nor an integer.
+    void write_number(simdjson::ondemand::number num) {
+        if (num.is_double()) {
+            if (writer_.writeDouble(num.get_double()) == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeDouble failed";
+            }
+            return;
+        }
+
+        if (!num.is_int64() && !num.is_uint64()) {
+            err_ = JsonbErrType::E_INVALID_NUMBER;
+            LOG(WARNING) << "invalid number: " << num.as_double();
+            return;
+        }
+
+        int64_t val = 0;
+        if (num.is_int64()) {
+            val = num.get_int64();
+        } else {
+            const uint64_t unsigned_val = num.get_uint64();
+            if (unsigned_val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+                // NOTE(review): the error name says "octal" although this is a
+                // plain integer overflow; kept unchanged for callers that
+                // inspect the code - confirm whether a better code exists.
+                err_ = JsonbErrType::E_OCTAL_OVERFLOW;
+                LOG(WARNING) << "overflow number: " << unsigned_val;
+                return;
+            }
+            val = static_cast<int64_t>(unsigned_val);
+        }
+
+        // Both bounds must be checked: testing only the upper bound would send
+        // every negative value down the int8 branch and truncate anything
+        // below -128.
+        int size = 0;
+        if (val >= std::numeric_limits<int8_t>::min() &&
+            val <= std::numeric_limits<int8_t>::max()) {
+            size = writer_.writeInt8(static_cast<int8_t>(val));
+        } else if (val >= std::numeric_limits<int16_t>::min() &&
+                   val <= std::numeric_limits<int16_t>::max()) {
+            size = writer_.writeInt16(static_cast<int16_t>(val));
+        } else if (val >= std::numeric_limits<int32_t>::min() &&
+                   val <= std::numeric_limits<int32_t>::max()) {
+            size = writer_.writeInt32(static_cast<int32_t>(val));
+        } else { // INT64_MIN <= val <= INT64_MAX
+            size = writer_.writeInt64(val);
+        }
+
+        if (size == 0) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeInt failed";
+            return;
+        }
+    }
+
+    // Gives access to the writer that holds the serialized output.
+    JsonbWriterT<OS_TYPE>& getWriter() { return writer_; }
+
+    // Returns the error recorded by the most recent parse, or E_NONE.
+    JsonbErrType getErrorCode() { return err_; }
+
+    // clear error code
+    void clearErr() { err_ = JsonbErrType::E_NONE; }
+
+    // Resets the writer and clears the error code so the parser can be reused.
+    void reset() {
+        writer_.reset();
+        clearErr();
+    }
+
+private:
+    simdjson::ondemand::parser parser_; // reused across parse() calls
+    JsonbWriterT<OS_TYPE> writer_;      // receives the serialized output
+    JsonbErrType err_;                  // E_NONE unless the last parse failed
+};
+
+using JsonbParserSIMD = JsonbParserTSIMD<JsonbOutStream>;
+
+} // namespace doris
+
+#endif // JSONB_JSONBJSONPARSERSIMD_H
diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp
index ea84ddf3ae..02f352fb57 100644
--- a/be/src/vec/functions/function_jsonb.cpp
+++ b/be/src/vec/functions/function_jsonb.cpp
@@ -18,6 +18,7 @@
#include <boost/token_functions.hpp>
#include <vector>
+// #include "util/jsonb_parser_simd.h"
#include "util/string_parser.hpp"
#include "util/string_util.h"
#include "vec/columns/column.h"
@@ -47,7 +48,7 @@ enum class JsonbParseErrorMode { FAIL = 0, RETURN_NULL, RETURN_VALUE, RETURN_INV
template <NullalbeMode nullable_mode, JsonbParseErrorMode parse_error_handle_mode>
class FunctionJsonbParseBase : public IFunction {
private:
- JsonbParser default_value_parser;
+ JsonbParserSIMD default_value_parser;
bool has_const_default_value = false;
public:
@@ -193,6 +194,10 @@ public:
size_t size = col_from.size();
col_to->reserve(size);
+ // parser can be reused for performance
+ JsonbParserSIMD parser;
+ JsonbErrType error = JsonbErrType::E_NONE;
+
for (size_t i = 0; i < input_rows_count; ++i) {
if (col_from.is_null_at(i)) {
null_map->get_data()[i] = 1;
@@ -201,8 +206,6 @@ public:
}
const auto& val = col_from_string->get_data_at(i);
- JsonbParser parser;
- JsonbErrType error = JsonbErrType::E_NONE;
if (parser.parse(val.data, val.size)) {
// insert jsonb format data
col_to->insert_data(parser.getWriter().getOutput()->getBuffer(),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org