You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/07/20 14:37:22 UTC

[doris] 04/25: [Improve](simdjson) put unescaped string value after parsed (#21866)

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 6bb31b7e1d6f3ce92c3433221ad944090622c583
Author: lihangyu <15...@163.com>
AuthorDate: Thu Jul 20 10:33:17 2023 +0800

    [Improve](simdjson) put unescaped string value after parsed (#21866)
    
    In some cases, it is necessary to unescape the original value, such as when converting a string to JSONB.
    If not unescape, then later jsonb parse will be failed
---
 be/src/vec/exec/format/json/new_json_reader.cpp    | 13 +++-
 be/src/vec/exec/format/json/new_json_reader.h      |  1 +
 .../data/load_p0/stream_load/test_json_load.out    |  3 +
 .../data/load_p0/stream_load/with_jsonb.json       |  1 +
 .../load_p0/stream_load/test_json_load.groovy      | 87 ++++++++++++++--------
 5 files changed, 71 insertions(+), 34 deletions(-)

diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index 94c376ae69..02145c3360 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -1095,6 +1095,7 @@ Status NewJsonReader::_simdjson_init_reader() {
         _slot_desc_index[_file_slot_descs[i]->col_name()] = i;
     }
     _simdjson_ondemand_padding_buffer.resize(_padded_size);
+    _simdjson_ondemand_unscape_padding_buffer.resize(_padded_size);
     return Status::OK();
 }
 
@@ -1479,15 +1480,18 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
         break;
     }
     default: {
-        auto str_view = simdjson::to_json_string(value).value();
         if (value.type() == simdjson::ondemand::json_type::string) {
+            uint8_t* unescape_buffer =
+                    reinterpret_cast<uint8_t*>(&_simdjson_ondemand_unscape_padding_buffer[0]);
+            std::string_view unescaped_value =
+                    _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer);
             nullable_column->get_null_map_data().push_back(0);
-            // trim
-            column_string->insert_data(str_view.data() + 1, str_view.length() - 2);
+            column_string->insert_data(unescaped_value.data(), unescaped_value.length());
             break;
         }
+        auto value_str = simdjson::to_json_string(value).value();
         nullable_column->get_null_map_data().push_back(0);
-        column_string->insert_data(str_view.data(), str_view.length());
+        column_string->insert_data(value_str.data(), value_str.length());
     }
     }
     *valid = true;
@@ -1570,6 +1574,7 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
         // For efficiency reasons, simdjson requires a string with a few bytes (simdjson::SIMDJSON_PADDING) at the end.
         // Hence, a re-allocation is needed if the space is not enough.
         _simdjson_ondemand_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING);
+        _simdjson_ondemand_unscape_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING);
         _padded_size = *size + simdjson::SIMDJSON_PADDING;
     }
     // trim BOM since simdjson does not handle UTF-8 Unicode (with BOM)
diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h
index 5dbd5b5c28..99106d100f 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -254,6 +254,7 @@ private:
     static constexpr size_t _init_buffer_size = 1024 * 1024 * 8;
     size_t _padded_size = _init_buffer_size + simdjson::SIMDJSON_PADDING;
     std::string _simdjson_ondemand_padding_buffer;
+    std::string _simdjson_ondemand_unscape_padding_buffer;
     // char _simdjson_ondemand_padding_buffer[_padded_size];
     simdjson::ondemand::document _original_json_doc;
     simdjson::ondemand::value _json_value;
diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out
index f7e7d61607..6296f37099 100644
--- a/regression-test/data/load_p0/stream_load/test_json_load.out
+++ b/regression-test/data/load_p0/stream_load/test_json_load.out
@@ -199,3 +199,6 @@
 10	hefei	23456710
 200	changsha	3456789
 
+-- !select1 --
+John	30	New York	{"email":"john@example.com","phone":"+1-123-456-7890"}
+
diff --git a/regression-test/data/load_p0/stream_load/with_jsonb.json b/regression-test/data/load_p0/stream_load/with_jsonb.json
new file mode 100644
index 0000000000..97c537fa0e
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/with_jsonb.json
@@ -0,0 +1 @@
+{ "name": "John", "age": 30, "city": "New York", "contact": "{ \"email\": \"john@example.com\", \"phone\": \"+1-123-456-7890\" }" }
diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
index 4e3d2dbf81..7a69114d11 100644
--- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
@@ -114,13 +114,13 @@ suite("test_json_load", "p0") {
         assertTrue(result1[0][0] == 0, "Create table should update 0 rows")
     }
     
-    def load_json_data = {label, strip_flag, read_flag, format_flag, exprs, json_paths, 
+    def load_json_data = {table_name, label, strip_flag, read_flag, format_flag, exprs, json_paths, 
                         json_root, where_expr, fuzzy_flag, file_name, ignore_failure=false,
                         expected_succ_rows = -1, load_to_single_tablet = 'true' ->
         
         // load the json data
         streamLoad {
-            table "test_json_load"
+            table "${table_name}"
             
             // set http request header params
             set 'label', label + "_" + UUID.randomUUID().toString()
@@ -216,7 +216,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
 
-        load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json')
+        load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json')
 
         sql "sync"
         qt_select1 "select * from ${testTable} order by id"
@@ -231,7 +231,7 @@ suite("test_json_load", "p0") {
 
         create_test_table1.call(testTable)
 
-        load_json_data.call('test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json')
+        load_json_data.call("test_json_load", 'test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json')
 
         sql "sync" 
         qt_select2 "select * from ${testTable} order by id"
@@ -246,7 +246,7 @@ suite("test_json_load", "p0") {
         
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]',
                             '', '', '', 'simple_json.json')
 
         sql "sync"
@@ -262,7 +262,7 @@ suite("test_json_load", "p0") {
         
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]',
                             '', '', '', 'simple_json.json')
 
         sql "sync"
@@ -278,7 +278,7 @@ suite("test_json_load", "p0") {
         
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]',
                             '', '', '', 'multi_line_json.json')
         
         sql "sync"
@@ -294,7 +294,7 @@ suite("test_json_load", "p0") {
 
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
                             '', '', '', 'multi_line_json.json')
 
         sql "sync"
@@ -310,7 +310,7 @@ suite("test_json_load", "p0") {
 
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
                             '', 'id > 50', '', 'multi_line_json.json')
 
         sql "sync"
@@ -326,7 +326,7 @@ suite("test_json_load", "p0") {
 
         create_test_table2.call(testTable)
         
-        load_json_data.call('test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
                             '', 'id > 50', 'true', 'multi_line_json.json')
 
         sql "sync"
@@ -342,7 +342,7 @@ suite("test_json_load", "p0") {
 
         create_test_table1.call(testTable)
         
-        load_json_data.call('test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '',
+        load_json_data.call("test_json_load", 'test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '',
                             '$.item', '', 'true', 'nest_json.json')
 
         sql "sync"
@@ -358,7 +358,7 @@ suite("test_json_load", "p0") {
 
         create_test_table1.call(testTable)
         
-        load_json_data.call('test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '',
+        load_json_data.call("test_json_load", 'test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '',
                             '$.item', '', 'false', 'invalid_json.json', false, 4)
 
         sql "sync"
@@ -374,7 +374,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
 
-        load_json_data.call('test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10)
+        load_json_data.call("test_json_load", 'test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10)
 
         sql "sync"
         qt_select11 "select * from ${testTable} order by id"
@@ -389,7 +389,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
 
-        load_json_data.call('test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json')
+        load_json_data.call("test_json_load", 'test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json')
 
         sql "sync"
         qt_select12 "select * from ${testTable} order by id"
@@ -441,15 +441,15 @@ suite("test_json_load", "p0") {
 
         create_test_table1.call(testTable)
         
-        load_json_data.call('test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
                             '$.item', '', 'true', 'nest_json.json')
 
         // invalid nest_json
-        load_json_data.call('test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
                             '$.item', '', 'true', 'invalid_nest_json1.json', true) 
-        load_json_data.call('test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
                             '$.item', '', 'true', 'invalid_nest_json2.json', false, 7) 
-        load_json_data.call('test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
+        load_json_data.call("test_json_load", 'test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\",  \"$.city\", \"$.code\"]',
                             '$.item', '', 'true', 'invalid_nest_json3.json', true) 
 
         sql "sync"
@@ -465,7 +465,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
         
-        load_json_data.call('test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10',
+        load_json_data.call("test_json_load", 'test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10',
                             '[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'nest_json.json')
 
         sql "sync"
@@ -481,7 +481,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
         
-        load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+        load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
                             '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'nest_json_array.json', false, 7)
 
         sql "sync"
@@ -496,13 +496,13 @@ suite("test_json_load", "p0") {
         sql "DROP TABLE IF EXISTS ${testTable}"
 
         test_invalid_json_array_table.call(testTable)
-        load_json_data.call('test_json_load_case17', 'true', '', 'json', '', '',
+        load_json_data.call("test_json_load", 'test_json_load_case17', 'true', '', 'json', '', '',
                 '', '', '', 'invalid_json_array.json', false, 0, 'false')
-        load_json_data.call('test_json_load_case17_1', 'true', '', 'json', '', '',
+        load_json_data.call("test_json_load", 'test_json_load_case17_1', 'true', '', 'json', '', '',
                 '$.item', '', '', 'invalid_json_array1.json', false, 0, 'false')
-        load_json_data.call('test_json_load_case17_2', 'true', '', 'json', '', '',
+        load_json_data.call("test_json_load", 'test_json_load_case17_2', 'true', '', 'json', '', '',
                 '$.item', '', '', 'invalid_json_array2.json', false, 0, 'false')
-        load_json_data.call('test_json_load_case17_3', 'true', '', 'json', '', '',
+        load_json_data.call("test_json_load", 'test_json_load_case17_3', 'true', '', 'json', '', '',
                 '$.item', '', '', 'invalid_json_array3.json', false, 0, 'false')
         sql "sync"
         qt_select17 "select * from ${testTable}"
@@ -516,13 +516,13 @@ suite("test_json_load", "p0") {
         sql "DROP TABLE IF EXISTS ${testTable}"
 
         create_test_table1.call(testTable)
-        load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+        load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
                             '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array.json', true) 
-        load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+        load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
                             '[\"$.id\", \"$.code\", \"$.city[100]\"]', '$.item', '', 'true', 'invalid_nest_json_array1.json', true) 
-        load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+        load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
                             '[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'invalid_nest_json_array2.json', true) 
-        load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+        load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
                             '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array3.json', true) 
 
         sql "sync"
@@ -537,7 +537,7 @@ suite("test_json_load", "p0") {
         sql "DROP TABLE IF EXISTS ${testTable}"
 
         create_test_table1.call(testTable)
-        load_json_data.call('test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '',
+        load_json_data.call("test_json_load", 'test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '',
                 '', '', '', 'case_sensitive_json.json', false, 2)
         sql "sync"
         qt_select19 "select * from ${testTable} order by id"
@@ -552,7 +552,7 @@ suite("test_json_load", "p0") {
         
         create_test_table1.call(testTable)
 
-        load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json')
+        load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json')
 
         sql "sync"
         qt_select1 "select * from ${testTable} order by id"
@@ -561,6 +561,33 @@ suite("test_json_load", "p0") {
         try_sql("DROP TABLE IF EXISTS ${testTable}")
     } 
 
+    // case21: import json with jsonb field 
+    try {
+        testTable = "with_jsonb" 
+        sql "DROP TABLE IF EXISTS ${testTable}"
+        sql """
+                CREATE TABLE ${testTable} (
+                `name` varchar(29) NOT NULL COMMENT '年月',
+                `age` int,
+                `city` varchar(29),
+                `contact` jsonb
+                ) ENGINE=OLAP
+                DUPLICATE KEY(`name`)
+                COMMENT '流水月结明细表'
+                DISTRIBUTED BY RANDOM BUCKETS auto
+                PROPERTIES (
+                "replication_allocation" = "tag.location.default: 1"
+                ); 
+            """
+        load_json_data.call("with_jsonb", "with_jsonb_load" , 'false', '', 'json', '', '', '', '', '', 'with_jsonb.json')
+
+        sql "sync"
+        qt_select1 "select * from ${testTable}"
+
+    } finally {
+        try_sql("DROP TABLE IF EXISTS ${testTable}")
+    } 
+
     // if 'enableHdfs' in regression-conf.groovy has been set to true,
     // the test will run these case as below.
     if (enableHdfs()) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org