You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ya...@apache.org on 2020/12/15 01:28:00 UTC

[incubator-doris] branch master updated: [enhancement]improve performance of json load (#5055)

This is an automated email from the ASF dual-hosted git repository.

yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 193db42  [enhancement]improve performance of json load (#5055)
193db42 is described below

commit 193db4207ea395671d4bf5552104e7a089dd2c4e
Author: Zhengguo Yang <ya...@gmail.com>
AuthorDate: Tue Dec 15 09:27:51 2020 +0800

    [enhancement]improve performance of json load (#5055)
    
    * improve performance of json load
---
 be/src/exec/json_scanner.cpp | 69 +++++++++++++++++++++++++-------------------
 be/src/exec/json_scanner.h   |  8 +++--
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/be/src/exec/json_scanner.cpp b/be/src/exec/json_scanner.cpp
index a031645..bf09b09 100644
--- a/be/src/exec/json_scanner.cpp
+++ b/be/src/exec/json_scanner.cpp
@@ -152,7 +152,8 @@ Status JsonScanner::open_next_reader() {
     if (range.__isset.num_as_string) {
         num_as_string = range.num_as_string;
     }
-    _cur_file_reader = new JsonReader(_state, _counter, _profile, file, strip_outer_array, num_as_string);
+    _cur_file_reader =
+            new JsonReader(_state, _counter, _profile, file, strip_outer_array, num_as_string);
     RETURN_IF_ERROR(_cur_file_reader->init(jsonpath, json_root));
 
     return Status::OK();
@@ -183,25 +184,22 @@ rapidjson::Value::ConstValueIterator JsonDataInternal::get_next() {
 }
 
 ////// class JsonReader
-JsonReader::JsonReader(
-        RuntimeState* state, ScannerCounter* counter,
-        RuntimeProfile* profile,
-        FileReader* file_reader,
-        bool strip_outer_array,
-        bool num_as_string) :
-            _handle_json_callback(nullptr),
-            _next_line(0),
-            _total_lines(0),
-            _state(state),
-            _counter(counter),
-            _profile(profile),
-            _file_reader(file_reader),
-            _closed(false),
-            _strip_outer_array(strip_outer_array),
-            _num_as_string(num_as_string),
-            _json_doc(nullptr) {
+JsonReader::JsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile,
+                       FileReader* file_reader, bool strip_outer_array, bool num_as_string)
+        : _handle_json_callback(nullptr),
+          _next_line(0),
+          _total_lines(0),
+          _state(state),
+          _counter(counter),
+          _profile(profile),
+          _file_reader(file_reader),
+          _closed(false),
+          _strip_outer_array(strip_outer_array),
+          _num_as_string(num_as_string),
+          _json_doc(nullptr) {
     _bytes_read_counter = ADD_COUNTER(_profile, "BytesRead", TUnit::BYTES);
-    _read_timer = ADD_TIMER(_profile, "FileReadTime");
+    _read_timer = ADD_TIMER(_profile, "ReadTime");
+    _file_read_timer = ADD_TIMER(_profile, "FileReadTime");
 }
 
 JsonReader::~JsonReader() {
@@ -273,9 +271,11 @@ void JsonReader::_close() {
 // return Status::OK() if parse succeed or reach EOF.
 Status JsonReader::_parse_json_doc(bool* eof) {
     // read a whole message, must be delete json_str by `delete[]`
+    SCOPED_TIMER(_file_read_timer);
     uint8_t* json_str = nullptr;
     size_t length = 0;
     RETURN_IF_ERROR(_file_reader->read_one_message(&json_str, &length));
+    _bytes_read_counter += length;
     if (length == 0) {
         *eof = true;
         return Status::OK();
@@ -286,7 +286,10 @@ Status JsonReader::_parse_json_doc(bool* eof) {
     // As the issue: https://github.com/Tencent/rapidjson/issues/1458
     // Now, rapidjson only support uint64_t, So lagreint load cause bug. We use kParseNumbersAsStringsFlag.
     if (_num_as_string) {
-        has_parse_error = _origin_json_doc.Parse<rapidjson::kParseNumbersAsStringsFlag>((char*)json_str, length).HasParseError();
+        has_parse_error =
+                _origin_json_doc
+                        .Parse<rapidjson::kParseNumbersAsStringsFlag>((char*)json_str, length)
+                        .HasParseError();
     } else {
         has_parse_error = _origin_json_doc.Parse((char*)json_str, length).HasParseError();
     }
@@ -425,7 +428,9 @@ void JsonReader::_write_data_to_tuple(rapidjson::Value::ConstValueIterator value
 // for simple format json
 void JsonReader::_set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple,
                                   const std::vector<SlotDescriptor*>& slot_descs,
+                                  const std::vector<rapidjson::Value>& value_key,
                                   MemPool* tuple_pool, bool* valid) {
+    DCHECK(slot_descs.size() == value_key.size());
     if (!objectValue.IsObject()) {
         // Here we expect the incoming `objectValue` to be a Json Object, such as {"key" : "value"},
         // not other type of Json format.
@@ -437,20 +442,21 @@ void JsonReader::_set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple,
     }
 
     int nullcount = 0;
-    for (auto v : slot_descs) {
-        if (objectValue.HasMember(v->col_name().c_str())) {
-            rapidjson::Value& value = objectValue[v->col_name().c_str()];
-            _write_data_to_tuple(&value, v, tuple, tuple_pool, valid);
+    for (int i = 0; i < slot_descs.size(); ++i) {
+        rapidjson::Value::ConstMemberIterator it = objectValue.FindMember(value_key[i]);
+        if (it != objectValue.MemberEnd()) {
+            const rapidjson::Value& value = it->value;
+            _write_data_to_tuple(&value, slot_descs[i], tuple, tuple_pool, valid);
             if (!(*valid)) {
                 return;
             }
         } else { // not found
-            if (v->is_nullable()) {
-                tuple->set_null(v->null_indicator_offset());
+            if (slot_descs[i]->is_nullable()) {
+                tuple->set_null(slot_descs[i]->null_indicator_offset());
                 nullcount++;
             } else {
                 std::stringstream str_error;
-                str_error << "The column `" << v->col_name()
+                str_error << "The column `" << slot_descs[i]->col_name()
                           << "` is not nullable, but it's not found in jsondata.";
                 _state->append_error_msg_to_file(_print_json_value(objectValue), str_error.str());
                 _counter->num_rows_filtered++;
@@ -481,6 +487,11 @@ void JsonReader::_set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple,
  */
 Status JsonReader::_handle_simple_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs,
                                        MemPool* tuple_pool, bool* eof) {
+    // If you use a string as the key to find the json object, strlen will be called every time, so the key is constructed in advance
+    std::vector<rapidjson::Value> value_key;
+    for (auto v : slot_descs) {
+        value_key.emplace_back(v->col_name().c_str(), v->col_name().size());
+    }
     do {
         bool valid = false;
         if (_next_line >= _total_lines) { // parse json and generic document
@@ -512,9 +523,9 @@ Status JsonReader::_handle_simple_json(Tuple* tuple, const std::vector<SlotDescr
 
         if (_json_doc->IsArray()) {                                   // handle case 1
             rapidjson::Value& objectValue = (*_json_doc)[_next_line]; // json object
-            _set_tuple_value(objectValue, tuple, slot_descs, tuple_pool, &valid);
+            _set_tuple_value(objectValue, tuple, slot_descs, value_key, tuple_pool, &valid);
         } else { // handle case 2
-            _set_tuple_value(*_json_doc, tuple, slot_descs, tuple_pool, &valid);
+            _set_tuple_value(*_json_doc, tuple, slot_descs, value_key, tuple_pool, &valid);
         }
         _next_line++;
         if (!valid) {
diff --git a/be/src/exec/json_scanner.h b/be/src/exec/json_scanner.h
index 0ce2805..5975978 100644
--- a/be/src/exec/json_scanner.h
+++ b/be/src/exec/json_scanner.h
@@ -103,8 +103,8 @@ struct JsonPath;
 // return other error Status if encounter other errors.
 class JsonReader {
 public:
-    JsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile, FileReader* file_reader,
-               bool strip_outer_array, bool num_as_string);
+    JsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile,
+               FileReader* file_reader, bool strip_outer_array, bool num_as_string);
 
     ~JsonReader();
 
@@ -129,7 +129,8 @@ private:
                     const uint8_t* value, int32_t len);
     Status _parse_json_doc(bool* eof);
     void _set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple,
-                          const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool,
+                          const std::vector<SlotDescriptor*>& slot_descs,
+                          const std::vector<rapidjson::Value>& value_key, MemPool* tuple_pool,
                           bool* valid);
     void _write_data_to_tuple(rapidjson::Value::ConstValueIterator value, SlotDescriptor* desc,
                               Tuple* tuple, MemPool* tuple_pool, bool* valid);
@@ -154,6 +155,7 @@ private:
     bool _num_as_string;
     RuntimeProfile::Counter* _bytes_read_counter;
     RuntimeProfile::Counter* _read_timer;
+    RuntimeProfile::Counter* _file_read_timer;
 
     std::vector<std::vector<JsonPath>> _parsed_jsonpaths;
     std::vector<JsonPath> _parsed_json_root;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org