You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/12/29 01:29:15 UTC

[doris] branch master updated: [Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0f3c0b78e3 [Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219)
0f3c0b78e3 is described below

commit 0f3c0b78e3034dea974da576e4ebf9211dc0978c
Author: Kang <kx...@gmail.com>
AuthorDate: Thu Dec 29 09:29:09 2022 +0800

    [Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219)
    
    test data: https://data.gharchive.org/2020-11-13-18.json.gz, 2GB, 197696 lines
    before: String 13s vs. JSONB 28s
    after: String 13s vs. JSONB 16s
    
    **NOTICE: simdjson needs to be patched because its BOOL conflicts with the BOOL macro defined in the ODBC header sqltypes.h**
---
 .licenserc.yaml                         |   1 +
 be/CMakeLists.txt                       |   2 +
 be/src/runtime/jsonb_value.h            |   4 +-
 be/src/util/jsonb_error.h               |  10 +-
 be/src/util/jsonb_parser_simd.h         | 350 ++++++++++++++++++++++++++++++++
 be/src/vec/functions/function_jsonb.cpp |   9 +-
 6 files changed, 369 insertions(+), 7 deletions(-)

diff --git a/.licenserc.yaml b/.licenserc.yaml
index 54c9435c14..8e02ad6e88 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -52,6 +52,7 @@ header:
     - "be/src/util/jsonb_document.h"
     - "be/src/util/jsonb_error.h"
     - "be/src/util/jsonb_parser.h"
+    - "be/src/util/jsonb_parser_simd.h"
     - "be/src/util/jsonb_stream.h"
     - "be/src/util/jsonb_updater.h"
     - "be/src/util/jsonb_utils.h"
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index c85f87e8a4..dfa097223e 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -510,6 +510,8 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
     if (USE_AVX2)
         set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
     endif()
+    # enable SSE4.2 (-msse4.2) for simdjson; NOTE(review): -mlzcnt is a separate flag and is not set here
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
 endif()
 set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG")
 
diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h
index bdb5ba4976..1e03518d9d 100644
--- a/be/src/runtime/jsonb_value.h
+++ b/be/src/runtime/jsonb_value.h
@@ -22,7 +22,7 @@
 #include "util/cpu_info.h"
 #include "util/hash_util.hpp"
 #include "util/jsonb_error.h"
-#include "util/jsonb_parser.h"
+#include "util/jsonb_parser_simd.h"
 #include "util/jsonb_utils.h"
 #include "vec/common/string_ref.h"
 
@@ -38,7 +38,7 @@ struct JsonBinaryValue {
     // default nullprt and size 0 for invalid or NULL value
     const char* ptr = nullptr;
     size_t len = 0;
-    JsonbParser parser;
+    JsonbParserSIMD parser;
 
     JsonBinaryValue() : ptr(nullptr), len(0) {}
     JsonBinaryValue(char* ptr, int len) { from_json_string(const_cast<const char*>(ptr), len); }
diff --git a/be/src/util/jsonb_error.h b/be/src/util/jsonb_error.h
index 77d6fa16d0..2ad632fb8b 100644
--- a/be/src/util/jsonb_error.h
+++ b/be/src/util/jsonb_error.h
@@ -30,12 +30,14 @@ enum class JsonbErrType {
     E_EMPTY_DOCUMENT,
     E_OUTPUT_FAIL,
     E_INVALID_DOCU,
+    E_INVALID_TYPE,
     E_INVALID_SCALAR_VALUE,
     E_INVALID_KEY_STRING,
     E_INVALID_KEY_LENGTH,
     E_INVALID_STR,
     E_INVALID_OBJ,
     E_INVALID_ARR,
+    E_INVALID_NUMBER,
     E_INVALID_HEX,
     E_INVALID_OCTAL,
     E_INVALID_DECIMAL,
@@ -53,6 +55,7 @@ enum class JsonbErrType {
     E_INVALID_JSONB_OBJ,
     E_NESTING_LVL_OVERFLOW,
     E_INVALID_DOCU_COMPAT,
+    E_EXCEPTION,
 
     // new error code should always be added above
     E_NUM_ERRORS
@@ -77,13 +80,15 @@ private:
             "Invalid document version",
             "Empty document",
             "Fatal error in writing JSONB",
-            "Invalid document: document must be an object or an array",
+            "Invalid document",
+            "Invalid json value type",
             "Invalid scalar value",
             "Invalid key string",
             "Key length exceeds maximum size allowed (64 bytes)",
             "Invalid string value",
             "Invalid JSON object",
             "Invalid JSON array",
+            "Invalid number",
             "Invalid HEX number",
             "Invalid octal number",
             "Invalid decimal number",
@@ -100,7 +105,8 @@ private:
             "Invalid update operation",
             "Invalid JSONB object (internal)",
             "Object or array has too many nesting levels",
-            "Invalid document: document must be an object or an array",
+            "Invalid document",
+            "Exception throwed",
 
             nullptr /* E_NUM_ERRORS */
     };
diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h
new file mode 100644
index 0000000000..10d19a3f57
--- /dev/null
+++ b/be/src/util/jsonb_parser_simd.h
@@ -0,0 +1,350 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This file defines JsonbParserTSIMD (template) and JsonbParserSIMD.
+ *
+ * JsonbParserTSIMD is a template class which implements a JSON parser.
+ * JsonbParserTSIMD parses JSON text, and serialize it to JSONB binary format
+ * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
+ * JsonbWriterT object with an output stream object.  However, you can also
+ * pass in your JsonbWriterT or any stream object that implements some basic
+ * interface of std::ostream (see JsonbStream.h).
+ *
+ * JsonbParserSIMD specializes JsonbParserTSIMD with JsonbOutStream type (see
+ * JsonbStream.h). So unless you want to provide a different output stream
+ * type of your own, use a JsonbParserSIMD object.
+ *
+ * ** Parsing JSON **
+ * JsonbParserTSIMD parses a JSON string and directly serializes it into JSONB
+ * packed bytes. There are three ways to parse a JSON string: (1) using a
+ * std::string, (2) using a NULL-terminated c-string, (3) using a character
+ * buffer with an explicit length. You can use a custom streambuf to redirect
+ * output. JsonbOutBuffer is a streambuf used internally if the input is a raw
+ * character buffer.
+ *
+ * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
+ * strings; each parse overwrites the previous JSONB output.
+ *
+ * If parsing fails (returned false), the error code will be set to one of
+ * JsonbErrType, and can be retrieved by calling getErrorCode().
+ *
+ * ** External dictionary **
+ * During parsing a JSON string, you can pass a call-back function to map a key
+ * string to an id, and store the dictionary id in JSONB to save space. The
+ * purpose of using an external dictionary is more towards a collection of
+ * documents (which has common keys) rather than a single document, so that
+ * space saving will be significant.
+ *
+ * ** Endianness **
+ * Note: JSONB serialization doesn't assume endianness of the server. However
+ * you will need to ensure that the endianness at the reader side is the same
+ * as that at the writer side (if they are on different machines). Otherwise,
+ * proper conversion is needed when a number value is returned to the
+ * caller/writer.
+ *
+ * @author Tian Xia <ti...@fb.com>
+ * 
+ * this file is copied from 
+ * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
+ * and modified by Doris
+ */
+
+#ifndef JSONB_JSONBJSONPARSERSIMD_H
+#define JSONB_JSONBJSONPARSERSIMD_H
+
+#include <simdjson.h>
+
+#include <cmath>
+#include <limits>
+
+#include "jsonb_document.h"
+#include "jsonb_error.h"
+#include "jsonb_writer.h"
+#include "string_parser.hpp"
+
+namespace doris {
+
+/*
+ * Template JsonbParserTSIMD
+ */
+template <class OS_TYPE>
+class JsonbParserTSIMD {
+public:
+    // Default-construct the parser: the writer is default-constructed and the
+    // error code starts out as E_NONE.
+    JsonbParserTSIMD() : err_(JsonbErrType::E_NONE) {}
+
+    // Construct the parser with a writer built from the caller-supplied output
+    // stream `os`; the error code starts out as E_NONE.
+    explicit JsonbParserTSIMD(OS_TYPE& os) : writer_(os), err_(JsonbErrType::E_NONE) {}
+
+    // Convenience overload for UTF-8 JSON text held in a std::string.
+    // Hands the string's buffer and byte count to the (pointer, length) overload
+    // and returns its result.
+    bool parse(const std::string& str, hDictInsert handler = nullptr) {
+        const char* text = str.c_str();
+        const unsigned int text_len = static_cast<unsigned int>(str.size());
+        return parse(text, text_len, handler);
+    }
+
+    // Convenience overload for a NULL-terminated UTF-8 JSON c-string.
+    // The length is measured with strlen and the call is forwarded to the
+    // (pointer, length) overload, whose result is returned.
+    bool parse(const char* c_str, hDictInsert handler = nullptr) {
+        const unsigned int text_len = static_cast<unsigned int>(strlen(c_str));
+        return parse(c_str, text_len, handler);
+    }
+
+    // Parse a UTF-8 JSON text of `len` bytes starting at `pch` and serialize it
+    // to JSONB through writer_.
+    //
+    // The writer and the error code are reset first, so one parser object can be
+    // reused for many documents. `handler` (optional) maps object keys to
+    // dictionary ids; it is forwarded to the recursive value overload.
+    //
+    // Returns true on success. On failure returns false and the reason can be
+    // read with getErrorCode():
+    //   E_EMPTY_DOCUMENT - pch is null or len is 0
+    //   E_EXCEPTION      - simdjson threw a simdjson_error
+    //   E_INVALID_TYPE   - simdjson reported a type this parser does not handle
+    //   other codes      - set by the write_* helpers or the recursive overload
+    bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
+        // reset state before parse
+        reset();
+
+        if (!pch || len == 0) {
+            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
+            LOG(WARNING) << "empty json string";
+            return false;
+        }
+
+        // parse json using simdjson, return false on exception
+        try {
+            // padded_string keeps its own padded copy of the input for simdjson
+            simdjson::padded_string json_str {pch, len};
+            simdjson::ondemand::document doc = parser_.iterate(json_str);
+
+            // simdjson process top level primitive types specially
+            // so some repeated code here
+            switch (doc.type()) {
+            case simdjson::ondemand::json_type::object:
+            case simdjson::ondemand::json_type::array: {
+                parse(doc.get_value(), handler);
+                break;
+            }
+            case simdjson::ondemand::json_type::null: {
+                if (writer_.writeNull() == 0) {
+                    err_ = JsonbErrType::E_OUTPUT_FAIL;
+                    LOG(WARNING) << "writeNull failed";
+                }
+                break;
+            }
+            case simdjson::ondemand::json_type::boolean: {
+                if (writer_.writeBool(doc.get_bool()) == 0) {
+                    err_ = JsonbErrType::E_OUTPUT_FAIL;
+                    LOG(WARNING) << "writeBool failed";
+                }
+                break;
+            }
+            case simdjson::ondemand::json_type::string: {
+                write_string(doc.get_string());
+                break;
+            }
+            case simdjson::ondemand::json_type::number: {
+                write_number(doc.get_number());
+                break;
+            }
+            default: {
+                // Same policy as the recursive overload: an unrecognized type
+                // must not be reported as success with nothing written.
+                err_ = JsonbErrType::E_INVALID_TYPE;
+                LOG(WARNING) << "unknown document type";
+                break;
+            }
+            }
+
+            return err_ == JsonbErrType::E_NONE;
+        } catch (const simdjson::simdjson_error& e) {
+            err_ = JsonbErrType::E_EXCEPTION;
+            LOG(WARNING) << "simdjson parse exception: " << e.what();
+            return false;
+        }
+    }
+
+    // Recursive worker: serialize one simdjson value through writer_.
+    //
+    // Scalars are written directly. Objects and arrays are written as a start
+    // marker, their members (each handled by a recursive call) and an end
+    // marker. When `handler` is non-null it is asked for a dictionary id for
+    // every object key; a negative id means the key text itself is written.
+    //
+    // Nothing is returned: a failure is recorded in err_ and stops the
+    // traversal of the enclosing containers.
+    void parse(simdjson::ondemand::value value, hDictInsert handler = nullptr) {
+        using simdjson::ondemand::json_type;
+
+        switch (value.type()) {
+        case json_type::null: {
+            if (writer_.writeNull() == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeNull failed";
+            }
+            return;
+        }
+        case json_type::boolean: {
+            if (writer_.writeBool(value.get_bool()) == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeBool failed";
+            }
+            return;
+        }
+        case json_type::string: {
+            write_string(value.get_string());
+            return;
+        }
+        case json_type::number: {
+            write_number(value.get_number());
+            return;
+        }
+        case json_type::object: {
+            if (!writer_.writeStartObject()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeStartObject failed";
+                return;
+            }
+
+            for (auto member : value.get_object()) {
+                std::string_view name;
+                simdjson::error_code key_err = member.unescaped_key().get(name);
+                if (key_err != simdjson::SUCCESS) {
+                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
+                    LOG(WARNING) << "simdjson get key failed: " << key_err;
+                    return;
+                }
+
+                // without a handler the id stays negative and the key text is written
+                int key_id = handler ? handler(name.data(), name.size()) : -1;
+
+                if (key_id >= 0) {
+                    if (writer_.writeKey(key_id) == 0) {
+                        err_ = JsonbErrType::E_OUTPUT_FAIL;
+                        LOG(WARNING) << "writeKey failed key_id: " << key_id;
+                        return;
+                    }
+                } else if (writer_.writeKey(name.data(), name.size()) == 0) {
+                    err_ = JsonbErrType::E_OUTPUT_FAIL;
+                    LOG(WARNING) << "writeKey failed key: " << name;
+                    return;
+                }
+
+                // the member's value may itself be a container
+                parse(member.value(), handler);
+                if (err_ != JsonbErrType::E_NONE) {
+                    LOG(WARNING) << "parse object value failed";
+                    return;
+                }
+            }
+            // do not close the object when an error is pending
+            if (err_ != JsonbErrType::E_NONE) {
+                return;
+            }
+
+            if (!writer_.writeEndObject()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeEndObject failed";
+            }
+            return;
+        }
+        case json_type::array: {
+            if (!writer_.writeStartArray()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeStartArray failed";
+                return;
+            }
+
+            for (auto item : value.get_array()) {
+                // each element may itself be a container
+                parse(item.value(), handler);
+                if (err_ != JsonbErrType::E_NONE) {
+                    LOG(WARNING) << "parse array element failed";
+                    return;
+                }
+            }
+            // do not close the array when an error is pending
+            if (err_ != JsonbErrType::E_NONE) {
+                return;
+            }
+
+            if (!writer_.writeEndArray()) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeEndArray failed";
+            }
+            return;
+        }
+        default: {
+            err_ = JsonbErrType::E_INVALID_TYPE;
+            LOG(WARNING) << "unknown value type: "; // << value;
+            return;
+        }
+
+        } // end of switch
+    }
+
+    // Emit `str` as one JSONB string value: start marker, payload, end marker.
+    // Any writer failure is recorded as E_OUTPUT_FAIL in err_ and aborts the
+    // write at that step.
+    void write_string(std::string_view str) {
+        // open the string value
+        if (!writer_.writeStartString()) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeStartString failed";
+            return;
+        }
+
+        // payload; an empty string has nothing to write between the markers
+        const bool has_payload = !str.empty();
+        if (has_payload && writer_.writeString(str.data(), str.size()) == 0) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeString failed";
+            return;
+        }
+
+        // close the string value
+        if (!writer_.writeEndString()) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeEndString failed";
+        }
+    }
+
+    // Serialize a simdjson number with the narrowest JSONB numeric type that
+    // represents it exactly.
+    //
+    //  - floating point values are written as double
+    //  - integers are written as int8/int16/int32/int64, whichever is the
+    //    smallest type whose [min, max] range contains the value
+    //  - unsigned values above INT64_MAX cannot be stored and are rejected
+    //
+    // Nothing is returned; failures are recorded in err_:
+    //   E_OUTPUT_FAIL    - the writer reported that nothing was written
+    //   E_OCTAL_OVERFLOW - unsigned value does not fit into int64
+    //   E_INVALID_NUMBER - the number is neither double, int64 nor uint64
+    void write_number(simdjson::ondemand::number num) {
+        if (num.is_double()) {
+            if (writer_.writeDouble(num.get_double()) == 0) {
+                err_ = JsonbErrType::E_OUTPUT_FAIL;
+                LOG(WARNING) << "writeDouble failed";
+            }
+            return;
+        }
+
+        if (!num.is_int64() && !num.is_uint64()) {
+            err_ = JsonbErrType::E_INVALID_NUMBER;
+            LOG(WARNING) << "invalid number: " << num.as_double();
+            return;
+        }
+
+        if (num.is_uint64() &&
+            num.get_uint64() > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+            // NOTE(review): the error code is kept as-is for compatibility even
+            // though the input is not an octal literal.
+            err_ = JsonbErrType::E_OCTAL_OVERFLOW;
+            LOG(WARNING) << "overflow number: " << num.get_uint64();
+            return;
+        }
+
+        // the range check above makes the unsigned -> signed conversion lossless
+        const int64_t val =
+                num.is_int64() ? num.get_int64() : static_cast<int64_t>(num.get_uint64());
+
+        // Both bounds must be tested: checking only the upper bound would send a
+        // value such as -1000 down the int8 path and silently truncate it.
+        int size = 0;
+        if (val >= std::numeric_limits<int8_t>::min() &&
+            val <= std::numeric_limits<int8_t>::max()) {
+            size = writer_.writeInt8(static_cast<int8_t>(val));
+        } else if (val >= std::numeric_limits<int16_t>::min() &&
+                   val <= std::numeric_limits<int16_t>::max()) {
+            size = writer_.writeInt16(static_cast<int16_t>(val));
+        } else if (val >= std::numeric_limits<int32_t>::min() &&
+                   val <= std::numeric_limits<int32_t>::max()) {
+            size = writer_.writeInt32(static_cast<int32_t>(val));
+        } else { // anything else needs the full int64 range
+            size = writer_.writeInt64(val);
+        }
+
+        if (size == 0) {
+            err_ = JsonbErrType::E_OUTPUT_FAIL;
+            LOG(WARNING) << "writeInt failed";
+        }
+    }
+
+    // Returns the internal writer by reference; the parse functions write the
+    // serialized document through this object.
+    JsonbWriterT<OS_TYPE>& getWriter() { return writer_; }
+
+    // Returns the currently recorded error code (E_NONE when no error is recorded).
+    JsonbErrType getErrorCode() { return err_; }
+
+    // Reset the recorded error code to E_NONE; the writer is left untouched.
+    void clearErr() { err_ = JsonbErrType::E_NONE; }
+
+    // Return the parser to its initial state: the writer is reset first, then
+    // the recorded error code goes back to E_NONE.
+    void reset() {
+        writer_.reset();
+        err_ = JsonbErrType::E_NONE;
+    }
+
+private:
+    simdjson::ondemand::parser parser_;
+    JsonbWriterT<OS_TYPE> writer_;
+    JsonbErrType err_;
+};
+
+// Ready-to-use parser type: JsonbParserTSIMD instantiated with JsonbOutStream
+// as its output stream type.
+using JsonbParserSIMD = JsonbParserTSIMD<JsonbOutStream>;
+
+} // namespace doris
+
+#endif // JSONB_JSONBJSONPARSERSIMD_H
diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp
index ea84ddf3ae..02f352fb57 100644
--- a/be/src/vec/functions/function_jsonb.cpp
+++ b/be/src/vec/functions/function_jsonb.cpp
@@ -18,6 +18,7 @@
 #include <boost/token_functions.hpp>
 #include <vector>
 
+// #include "util/jsonb_parser_simd.h"
 #include "util/string_parser.hpp"
 #include "util/string_util.h"
 #include "vec/columns/column.h"
@@ -47,7 +48,7 @@ enum class JsonbParseErrorMode { FAIL = 0, RETURN_NULL, RETURN_VALUE, RETURN_INV
 template <NullalbeMode nullable_mode, JsonbParseErrorMode parse_error_handle_mode>
 class FunctionJsonbParseBase : public IFunction {
 private:
-    JsonbParser default_value_parser;
+    JsonbParserSIMD default_value_parser;
     bool has_const_default_value = false;
 
 public:
@@ -193,6 +194,10 @@ public:
         size_t size = col_from.size();
         col_to->reserve(size);
 
+        // parser can be reused for performance
+        JsonbParserSIMD parser;
+        JsonbErrType error = JsonbErrType::E_NONE;
+
         for (size_t i = 0; i < input_rows_count; ++i) {
             if (col_from.is_null_at(i)) {
                 null_map->get_data()[i] = 1;
@@ -201,8 +206,6 @@ public:
             }
 
             const auto& val = col_from_string->get_data_at(i);
-            JsonbParser parser;
-            JsonbErrType error = JsonbErrType::E_NONE;
             if (parser.parse(val.data, val.size)) {
                 // insert jsonb format data
                 col_to->insert_data(parser.getWriter().getOutput()->getBuffer(),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org