You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2020/07/05 08:10:39 UTC

[GitHub] [incubator-doris] morningman opened a new pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

morningman opened a new pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] worker24h commented on a change in pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
worker24h commented on a change in pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020#discussion_r449954852



##########
File path: be/src/exec/json_scanner.cpp
##########
@@ -471,10 +530,14 @@ Status JsonReader::handle_nest_complex_json(Tuple* tuple, const std::vector<Slot
  *      value1     10
  *      value2     30
  */
-Status JsonReader::handle_flat_array_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
+Status JsonReader::_handle_flat_array_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
     do {
-        if (_next_line >= _total_lines) {//parse json and generic document
-            RETURN_IF_ERROR(parse_json_doc(eof));
+        if (_next_line >= _total_lines) {
+            Status st = _parse_json_doc(eof);
+            if (st.is_data_quality_error()) {
+                continue; // continue to read next
+            }
+            RETURN_IF_ERROR(st); // terminate if encounter other errors

Review comment:
       I think that this code can be deleted




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] worker24h commented on a change in pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
worker24h commented on a change in pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020#discussion_r449959852



##########
File path: be/src/exec/json_scanner.cpp
##########
@@ -524,21 +593,21 @@ Status JsonReader::handle_flat_array_complex_json(Tuple* tuple, const std::vecto
     return Status::OK();
 }
 
-Status JsonReader::handle_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
+// handle json with specified json path
+Status JsonReader::_handle_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
     if (_strip_outer_array) {
-        return handle_flat_array_complex_json(tuple, slot_descs, tuple_pool, eof);
+        return _handle_flat_array_complex_json(tuple, slot_descs, tuple_pool, eof);
     } else {
-        return handle_nest_complex_json(tuple, slot_descs, tuple_pool, eof);
+        return _handle_nested_complex_json(tuple, slot_descs, tuple_pool, eof);
     }
 }
 
+// 

Review comment:
       Delete it




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] worker24h commented on a change in pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
worker24h commented on a change in pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020#discussion_r449954838



##########
File path: be/src/exec/json_scanner.cpp
##########
@@ -406,54 +459,60 @@ Status JsonReader::handle_simple_json(Tuple* tuple, const std::vector<SlotDescri
     return Status::OK();
 }
 
-Status JsonReader::set_tuple_value_from_map(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool *valid) {
+// for complex format json with strip_outer_array = false
+Status JsonReader::_set_tuple_value_from_jmap(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool *valid) {
     std::unordered_map<std::string, JsonDataInternal>::iterator it_map;
     for (auto v : slot_descs) {
         it_map = _jmap.find(v->col_name());
         if (it_map == _jmap.end()) {
-            return Status::RuntimeError("The column name of table is not foud in jsonpath.");
+            return Status::RuntimeError("The column name of table is not foud in jsonpath: " + v->col_name());
         }
-        rapidjson::Value::ConstValueIterator value = it_map->second.get_next();
+        rapidjson::Value* value = it_map->second.get_value();
         if (value == nullptr) {
             if (v->is_nullable()) {
                 tuple->set_null(v->null_indicator_offset());
             } else  {
                 std::stringstream str_error;
                 str_error << "The column `" << it_map->first << "` is not nullable, but it's not found in jsondata.";
-                _state->append_error_msg_to_file("", str_error.str());
+                _state->append_error_msg_to_file(_print_json_value(*value), str_error.str());
                 _counter->num_rows_filtered++;
                 *valid = false; // current row is invalid
                 break;
             }
         } else {
-            RETURN_IF_ERROR(write_data_to_tuple(value, v, tuple, tuple_pool));
+            _write_data_to_tuple(value, v, tuple, tuple_pool, valid);
+            if (!(*valid)) {
+                return Status::OK();
+            }
         }
     }
     *valid = true;
     return Status::OK();
 }
 
-Status JsonReader::handle_nest_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
+// _json_doc should be an object
+Status JsonReader::_handle_nested_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
     do {
         bool valid = false;
         if (_next_line >= _total_lines) {
-            RETURN_IF_ERROR(parse_json_doc(eof));
-            if (*eof) {
-                return Status::OK();
+            Status st = _parse_json_doc(eof);
+            if (st.is_data_quality_error()) {
+                continue; // continue to read next
             }
-            _total_lines = get_data_by_jsonpath(slot_descs);
-            if (_total_lines == -1) {
-                return Status::InternalError("Parse json data is failed.");
-            } else if (_total_lines == 0) {
-                *eof = true;
+            RETURN_IF_ERROR(st); // terminate if encounter other errors

Review comment:
       I think that this code can be deleted




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] worker24h commented on a change in pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
worker24h commented on a change in pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020#discussion_r449951165



##########
File path: be/src/exec/json_scanner.cpp
##########
@@ -360,26 +408,31 @@ Status JsonReader::set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple,
     }
 
     if (nullcount == slot_descs.size()) {
-        _state->append_error_msg_to_file("", "The all fields is null, this is a invalid row.");
+        _state->append_error_msg_to_file(_print_json_value(objectValue), "All fields is null, this is a invalid row.");
         _counter->num_rows_filtered++;
         *valid = false;
-        return Status::OK();
+        return;
     }
     *valid = true;
-    return Status::OK();
+    return;
 }
 
 /**
- * handle input a simple json
+ * handle input a simple json.
+ * A json is a simple json only when user not specifying the json path.
  * For example:
  *  case 1. [{"colunm1":"value1", "colunm2":10}, {"colunm1":"value2", "colunm2":30}]
  *  case 2. {"colunm1":"value1", "colunm2":10}
  */
-Status JsonReader::handle_simple_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
+Status JsonReader::_handle_simple_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
     do {
         bool valid = false;
         if (_next_line >= _total_lines) {//parse json and generic document
-            RETURN_IF_ERROR(parse_json_doc(eof));
+            Status st = _parse_json_doc(eof);
+            if (st.is_data_quality_error()) {
+                continue; // continue to read next
+            }
+            RETURN_IF_ERROR(st); // terminate if encounter other errors

Review comment:
       I think that this code can be deleted




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] morningman commented on a change in pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
morningman commented on a change in pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020#discussion_r450582767



##########
File path: be/src/exec/json_scanner.cpp
##########
@@ -406,54 +459,60 @@ Status JsonReader::handle_simple_json(Tuple* tuple, const std::vector<SlotDescri
     return Status::OK();
 }
 
-Status JsonReader::set_tuple_value_from_map(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool *valid) {
+// for complex format json with strip_outer_array = false
+Status JsonReader::_set_tuple_value_from_jmap(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool *valid) {
     std::unordered_map<std::string, JsonDataInternal>::iterator it_map;
     for (auto v : slot_descs) {
         it_map = _jmap.find(v->col_name());
         if (it_map == _jmap.end()) {
-            return Status::RuntimeError("The column name of table is not foud in jsonpath.");
+            return Status::RuntimeError("The column name of table is not foud in jsonpath: " + v->col_name());
         }
-        rapidjson::Value::ConstValueIterator value = it_map->second.get_next();
+        rapidjson::Value* value = it_map->second.get_value();
         if (value == nullptr) {
             if (v->is_nullable()) {
                 tuple->set_null(v->null_indicator_offset());
             } else  {
                 std::stringstream str_error;
                 str_error << "The column `" << it_map->first << "` is not nullable, but it's not found in jsondata.";
-                _state->append_error_msg_to_file("", str_error.str());
+                _state->append_error_msg_to_file(_print_json_value(*value), str_error.str());
                 _counter->num_rows_filtered++;
                 *valid = false; // current row is invalid
                 break;
             }
         } else {
-            RETURN_IF_ERROR(write_data_to_tuple(value, v, tuple, tuple_pool));
+            _write_data_to_tuple(value, v, tuple, tuple_pool, valid);
+            if (!(*valid)) {
+                return Status::OK();
+            }
         }
     }
     *valid = true;
     return Status::OK();
 }
 
-Status JsonReader::handle_nest_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
+// _json_doc should be an object
+Status JsonReader::_handle_nested_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof) {
     do {
         bool valid = false;
         if (_next_line >= _total_lines) {
-            RETURN_IF_ERROR(parse_json_doc(eof));
-            if (*eof) {
-                return Status::OK();
+            Status st = _parse_json_doc(eof);
+            if (st.is_data_quality_error()) {
+                continue; // continue to read next
             }
-            _total_lines = get_data_by_jsonpath(slot_descs);
-            if (_total_lines == -1) {
-                return Status::InternalError("Parse json data is failed.");
-            } else if (_total_lines == 0) {
-                *eof = true;
+            RETURN_IF_ERROR(st); // terminate if encounter other errors

Review comment:
       file reader may return other error




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[GitHub] [incubator-doris] morningman merged pull request #4020: [Load][Json] Refactor json load logic to make it more reasonable

Posted by GitBox <gi...@apache.org>.
morningman merged pull request #4020:
URL: https://github.com/apache/incubator-doris/pull/4020


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org