You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/07/20 14:37:22 UTC
[doris] 04/25: [Improve](simdjson) put unescaped string value after parsed (#21866)
This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6bb31b7e1d6f3ce92c3433221ad944090622c583
Author: lihangyu <15...@163.com>
AuthorDate: Thu Jul 20 10:33:17 2023 +0800
[Improve](simdjson) put unescaped string value after parsed (#21866)
In some cases, it is necessary to unescape the original value, such as when converting a string to JSONB.
If the value is not unescaped, the subsequent JSONB parse will fail.
---
be/src/vec/exec/format/json/new_json_reader.cpp | 13 +++-
be/src/vec/exec/format/json/new_json_reader.h | 1 +
.../data/load_p0/stream_load/test_json_load.out | 3 +
.../data/load_p0/stream_load/with_jsonb.json | 1 +
.../load_p0/stream_load/test_json_load.groovy | 87 ++++++++++++++--------
5 files changed, 71 insertions(+), 34 deletions(-)
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index 94c376ae69..02145c3360 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -1095,6 +1095,7 @@ Status NewJsonReader::_simdjson_init_reader() {
_slot_desc_index[_file_slot_descs[i]->col_name()] = i;
}
_simdjson_ondemand_padding_buffer.resize(_padded_size);
+ _simdjson_ondemand_unscape_padding_buffer.resize(_padded_size);
return Status::OK();
}
@@ -1479,15 +1480,18 @@ Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value&
break;
}
default: {
- auto str_view = simdjson::to_json_string(value).value();
if (value.type() == simdjson::ondemand::json_type::string) {
+ uint8_t* unescape_buffer =
+ reinterpret_cast<uint8_t*>(&_simdjson_ondemand_unscape_padding_buffer[0]);
+ std::string_view unescaped_value =
+ _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer);
nullable_column->get_null_map_data().push_back(0);
- // trim
- column_string->insert_data(str_view.data() + 1, str_view.length() - 2);
+ column_string->insert_data(unescaped_value.data(), unescaped_value.length());
break;
}
+ auto value_str = simdjson::to_json_string(value).value();
nullable_column->get_null_map_data().push_back(0);
- column_string->insert_data(str_view.data(), str_view.length());
+ column_string->insert_data(value_str.data(), value_str.length());
}
}
*valid = true;
@@ -1570,6 +1574,7 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
// For efficiency reasons, simdjson requires a string with a few bytes (simdjson::SIMDJSON_PADDING) at the end.
// Hence, a re-allocation is needed if the space is not enough.
_simdjson_ondemand_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING);
+ _simdjson_ondemand_unscape_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING);
_padded_size = *size + simdjson::SIMDJSON_PADDING;
}
// trim BOM since simdjson does not handle UTF-8 Unicode (with BOM)
diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h
index 5dbd5b5c28..99106d100f 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -254,6 +254,7 @@ private:
static constexpr size_t _init_buffer_size = 1024 * 1024 * 8;
size_t _padded_size = _init_buffer_size + simdjson::SIMDJSON_PADDING;
std::string _simdjson_ondemand_padding_buffer;
+ std::string _simdjson_ondemand_unscape_padding_buffer;
// char _simdjson_ondemand_padding_buffer[_padded_size];
simdjson::ondemand::document _original_json_doc;
simdjson::ondemand::value _json_value;
diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out
index f7e7d61607..6296f37099 100644
--- a/regression-test/data/load_p0/stream_load/test_json_load.out
+++ b/regression-test/data/load_p0/stream_load/test_json_load.out
@@ -199,3 +199,6 @@
10 hefei 23456710
200 changsha 3456789
+-- !select1 --
+John 30 New York {"email":"john@example.com","phone":"+1-123-456-7890"}
+
diff --git a/regression-test/data/load_p0/stream_load/with_jsonb.json b/regression-test/data/load_p0/stream_load/with_jsonb.json
new file mode 100644
index 0000000000..97c537fa0e
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/with_jsonb.json
@@ -0,0 +1 @@
+{ "name": "John", "age": 30, "city": "New York", "contact": "{ \"email\": \"john@example.com\", \"phone\": \"+1-123-456-7890\" }" }
diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
index 4e3d2dbf81..7a69114d11 100644
--- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
@@ -114,13 +114,13 @@ suite("test_json_load", "p0") {
assertTrue(result1[0][0] == 0, "Create table should update 0 rows")
}
- def load_json_data = {label, strip_flag, read_flag, format_flag, exprs, json_paths,
+ def load_json_data = {table_name, label, strip_flag, read_flag, format_flag, exprs, json_paths,
json_root, where_expr, fuzzy_flag, file_name, ignore_failure=false,
expected_succ_rows = -1, load_to_single_tablet = 'true' ->
// load the json data
streamLoad {
- table "test_json_load"
+ table "${table_name}"
// set http request header params
set 'label', label + "_" + UUID.randomUUID().toString()
@@ -216,7 +216,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json')
+ load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json.json')
sql "sync"
qt_select1 "select * from ${testTable} order by id"
@@ -231,7 +231,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json')
+ load_json_data.call("test_json_load", 'test_json_load_case2_2', 'true', '', 'json', 'id= id * 10', '', '', '', '', 'simple_json.json')
sql "sync"
qt_select2 "select * from ${testTable} order by id"
@@ -246,7 +246,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case3_2', 'true', '', 'json', '', '[\"$.id\", \"$.code\"]',
'', '', '', 'simple_json.json')
sql "sync"
@@ -262,7 +262,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case4_2', 'true', '', 'json', 'code = id * 10 + 200', '[\"$.id\"]',
'', '', '', 'simple_json.json')
sql "sync"
@@ -278,7 +278,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case5_2', 'true', 'true', 'json', '', '[\"$.id\", \"$.code\"]',
'', '', '', 'multi_line_json.json')
sql "sync"
@@ -294,7 +294,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case6_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
'', '', '', 'multi_line_json.json')
sql "sync"
@@ -310,7 +310,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case7_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
'', 'id > 50', '', 'multi_line_json.json')
sql "sync"
@@ -326,7 +326,7 @@ suite("test_json_load", "p0") {
create_test_table2.call(testTable)
- load_json_data.call('test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case8_2', 'true', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
'', 'id > 50', 'true', 'multi_line_json.json')
sql "sync"
@@ -342,7 +342,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '',
+ load_json_data.call("test_json_load", 'test_json_load_case9_2', '', 'true', 'json', 'id= id * 10', '',
'$.item', '', 'true', 'nest_json.json')
sql "sync"
@@ -358,7 +358,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '',
+ load_json_data.call("test_json_load", 'test_json_load_case10_2', '', 'true', 'json', 'id= id * 10', '',
'$.item', '', 'false', 'invalid_json.json', false, 4)
sql "sync"
@@ -374,7 +374,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10)
+ load_json_data.call("test_json_load", 'test_json_load_case11_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2.json', false, 10)
sql "sync"
qt_select11 "select * from ${testTable} order by id"
@@ -389,7 +389,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json')
+ load_json_data.call("test_json_load", 'test_json_load_case12_2', 'true', '', 'json', '', '', '', '', '', 'simple_json2_lack_one_column.json')
sql "sync"
qt_select12 "select * from ${testTable} order by id"
@@ -441,15 +441,15 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case14_2', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.code\"]',
'$.item', '', 'true', 'nest_json.json')
// invalid nest_json
- load_json_data.call('test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case14_3', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
'$.item', '', 'true', 'invalid_nest_json1.json', true)
- load_json_data.call('test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case14_4', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
'$.item', '', 'true', 'invalid_nest_json2.json', false, 7)
- load_json_data.call('test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ load_json_data.call("test_json_load", 'test_json_load_case14_5', '', 'true', 'json', 'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
'$.item', '', 'true', 'invalid_nest_json3.json', true)
sql "sync"
@@ -465,7 +465,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10',
+ load_json_data.call("test_json_load", 'test_json_load_case15_2', '', 'true', 'json', 'id, code, city,id= id * 10',
'[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'nest_json.json')
sql "sync"
@@ -481,7 +481,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+ load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
'[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'nest_json_array.json', false, 7)
sql "sync"
@@ -496,13 +496,13 @@ suite("test_json_load", "p0") {
sql "DROP TABLE IF EXISTS ${testTable}"
test_invalid_json_array_table.call(testTable)
- load_json_data.call('test_json_load_case17', 'true', '', 'json', '', '',
+ load_json_data.call("test_json_load", 'test_json_load_case17', 'true', '', 'json', '', '',
'', '', '', 'invalid_json_array.json', false, 0, 'false')
- load_json_data.call('test_json_load_case17_1', 'true', '', 'json', '', '',
+ load_json_data.call("test_json_load", 'test_json_load_case17_1', 'true', '', 'json', '', '',
'$.item', '', '', 'invalid_json_array1.json', false, 0, 'false')
- load_json_data.call('test_json_load_case17_2', 'true', '', 'json', '', '',
+ load_json_data.call("test_json_load", 'test_json_load_case17_2', 'true', '', 'json', '', '',
'$.item', '', '', 'invalid_json_array2.json', false, 0, 'false')
- load_json_data.call('test_json_load_case17_3', 'true', '', 'json', '', '',
+ load_json_data.call("test_json_load", 'test_json_load_case17_3', 'true', '', 'json', '', '',
'$.item', '', '', 'invalid_json_array3.json', false, 0, 'false')
sql "sync"
qt_select17 "select * from ${testTable}"
@@ -516,13 +516,13 @@ suite("test_json_load", "p0") {
sql "DROP TABLE IF EXISTS ${testTable}"
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+ load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
'[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array.json', true)
- load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+ load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
'[\"$.id\", \"$.code\", \"$.city[100]\"]', '$.item', '', 'true', 'invalid_nest_json_array1.json', true)
- load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+ load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
'[\"$.id\", \"$.code\", \"$.city\"]', '$.item', '', 'true', 'invalid_nest_json_array2.json', true)
- load_json_data.call('test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
+ load_json_data.call("test_json_load", 'test_json_load_case16_2', 'true', '', 'json', 'id, code, city',
'[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item', '', 'true', 'invalid_nest_json_array3.json', true)
sql "sync"
@@ -537,7 +537,7 @@ suite("test_json_load", "p0") {
sql "DROP TABLE IF EXISTS ${testTable}"
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '',
+ load_json_data.call("test_json_load", 'test_json_load_case19', 'false', 'true', 'json', 'Id, cIty, CodE', '',
'', '', '', 'case_sensitive_json.json', false, 2)
sql "sync"
qt_select19 "select * from ${testTable} order by id"
@@ -552,7 +552,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json')
+ load_json_data.call("test_json_load", 'test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json')
sql "sync"
qt_select1 "select * from ${testTable} order by id"
@@ -561,6 +561,33 @@ suite("test_json_load", "p0") {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
+ // case21: import json with jsonb field
+ try {
+ testTable = "with_jsonb"
+ sql "DROP TABLE IF EXISTS ${testTable}"
+ sql """
+ CREATE TABLE ${testTable} (
+ `name` varchar(29) NOT NULL COMMENT '年月',
+ `age` int,
+ `city` varchar(29),
+ `contact` jsonb
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`name`)
+ COMMENT '流水月结明细表'
+ DISTRIBUTED BY RANDOM BUCKETS auto
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ load_json_data.call("with_jsonb", "with_jsonb_load" , 'false', '', 'json', '', '', '', '', '', 'with_jsonb.json')
+
+ sql "sync"
+ qt_select1 "select * from ${testTable}"
+
+ } finally {
+ try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+
// if 'enableHdfs' in regression-conf.groovy has been set to true,
// the test will run these case as below.
if (enableHdfs()) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org