You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by "eldenmoon (via GitHub)" <gi...@apache.org> on 2023/06/07 07:40:50 UTC

[GitHub] [doris] eldenmoon commented on a diff in pull request #20078: [fix](load)Support load json data with default value

eldenmoon commented on code in PR #20078:
URL: https://github.com/apache/doris/pull/20078#discussion_r1221037246


##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status NewJsonReader::_simdjson_handle_nested_complex_json(
     return Status::OK();
 }
 
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {
-    /// Optimization by caching the order of fields (which is almost always the same)
-    /// and a quick check to match the next expected field, instead of searching the hash table.
-    if (_prev_positions.size() > key_index && _prev_positions[key_index] &&
-        name == _prev_positions[key_index]->get_first()) {
-        return _prev_positions[key_index]->get_second();
-    } else {
-        auto* it = _slot_desc_index.find(name);
-        if (it) {
-            if (key_index < _prev_positions.size()) {
-                _prev_positions[key_index] = it;
-            }
-            return it->get_second();
-        } else {
-            return size_t(-1);
-        }
-    }
-}
-
 Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* value, Block& block,
                                                  const std::vector<SlotDescriptor*>& slot_descs,
                                                  bool* valid) {
     // set
     _seen_columns.assign(block.columns(), false);
     size_t cur_row_count = block.rows();
     bool has_valid_value = false;
-    // iterate through object, simdjson::ondemond will parsing on the fly
-    size_t key_index = 0;
-    for (auto field : *value) {
-        std::string_view key = field.unescaped_key();
-        StringRef name_ref(key.data(), key.size());
-        const size_t column_index = _column_index(name_ref, key_index++);
-        if (UNLIKELY(ssize_t(column_index) < 0)) {
-            // This key is not exist in slot desc, just ignore
+    for (size_t i = 0; i < slot_descs.size(); ++i) {
+        auto slot_desc = slot_descs[i];
+        if (!slot_desc->is_materialized()) {
             continue;
         }
-        simdjson::ondemand::value val = field.value();
-        auto* column_ptr = block.get_by_position(column_index).column->assume_mutable().get();
-        RETURN_IF_ERROR(
-                _simdjson_write_data_to_column(val, slot_descs[column_index], column_ptr, valid));
-        if (!(*valid)) {
-            return Status::OK();
+        auto* column_ptr = block.get_by_position(i).column->assume_mutable().get();
+        auto field = value->find_field_unordered(slot_desc->col_name());

Review Comment:
   This is much slower than iterating through the object with `for (auto field : *value)`; that loop utilizes SIMD instructions.



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(
+        const std::vector<SlotDescriptor*>& slot_descs,
+        const std::unordered_map<std::string, vectorized::VExprContext*>& col_default_value_ctx) {
+    for (auto slot_desc : slot_descs) {
+        auto it = col_default_value_ctx.find(slot_desc->col_name());
+        if (it != col_default_value_ctx.end() && it->second != nullptr) {
+            auto* ctx = it->second;
+            // empty block to save default value of slot_desc->col_name()
+            Block block;
+            // If block is empty, some functions will produce no result. So we insert a column with
+            // single value here.
+            block.insert({ColumnUInt8::create(1), std::make_shared<DataTypeUInt8>(), ""});
+            int result = -1;
+            RETURN_IF_ERROR(ctx->execute(&block, &result));
+            DCHECK(result != -1);
+            auto column = block.get_by_position(result).column;
+            DCHECK(column->size() == 1);
+            _col_default_value_map.emplace(slot_desc->col_name(),
+                                           column->get_data_at(0).to_string());
+        }
+    }
+    return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+                                           vectorized::IColumn* column_ptr, bool* valid) {
+    if (slot_desc->is_nullable()) {
+        vectorized::ColumnNullable* nullable_column =
+                reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+        column_ptr = &nullable_column->get_nested_column();
+        auto col_value = _col_default_value_map.find(slot_desc->col_name());
+        if (col_value == _col_default_value_map.end()) {
+            nullable_column->insert_default();
+        } else {
+            const std::string& v_str = col_value->second;
+            nullable_column->get_null_map_data().push_back(0);
+            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), v_str.size());

Review Comment:
   What if the default value is `CURRENT_TIMESTAMP`? Will `v_str` still be OK?



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -832,19 +836,19 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
                 return Status::OK();
             }
             has_valid_value = true;
-        } else { // not found
-            // When the entire row has no valid value, this row should be filtered,
-            // so the default value cannot be directly inserted here
-            if (!slot_desc->is_nullable()) {
-                RETURN_IF_ERROR(_append_error_msg(
-                        objectValue,
-                        "The column `{}` is not nullable, but it's not found in jsondata.",
-                        slot_desc->col_name(), valid));
-                break;
+        } else {
+            // not found, filling with default value
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            if (!(*valid)) {
+                return Status::OK();
             }
         }
     }
     if (!has_valid_value) {
+        for (int i = 0; i < block.columns(); ++i) {
+            auto column = block.get_by_position(i).column->assume_mutable();
+            column->pop_back(1);

Review Comment:
   Could you add a comment explaining why we need to `pop_back` here?



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(
+        const std::vector<SlotDescriptor*>& slot_descs,
+        const std::unordered_map<std::string, vectorized::VExprContext*>& col_default_value_ctx) {
+    for (auto slot_desc : slot_descs) {
+        auto it = col_default_value_ctx.find(slot_desc->col_name());
+        if (it != col_default_value_ctx.end() && it->second != nullptr) {
+            auto* ctx = it->second;
+            // empty block to save default value of slot_desc->col_name()
+            Block block;
+            // If block is empty, some functions will produce no result. So we insert a column with
+            // single value here.
+            block.insert({ColumnUInt8::create(1), std::make_shared<DataTypeUInt8>(), ""});
+            int result = -1;
+            RETURN_IF_ERROR(ctx->execute(&block, &result));
+            DCHECK(result != -1);
+            auto column = block.get_by_position(result).column;
+            DCHECK(column->size() == 1);
+            _col_default_value_map.emplace(slot_desc->col_name(),
+                                           column->get_data_at(0).to_string());
+        }
+    }
+    return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+                                           vectorized::IColumn* column_ptr, bool* valid) {
+    if (slot_desc->is_nullable()) {
+        vectorized::ColumnNullable* nullable_column =
+                reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+        column_ptr = &nullable_column->get_nested_column();
+        auto col_value = _col_default_value_map.find(slot_desc->col_name());
+        if (col_value == _col_default_value_map.end()) {
+            nullable_column->insert_default();
+        } else {
+            const std::string& v_str = col_value->second;
+            nullable_column->get_null_map_data().push_back(0);
+            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), v_str.size());

Review Comment:
   we should add test cases



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status NewJsonReader::_simdjson_handle_nested_complex_json(
     return Status::OK();
 }
 
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {

Review Comment:
   This function accelerates parsing; please do not delete it.



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(

Review Comment:
   We should also call `_fill_missing_column` in the `fill missing slot` step:
   
   ```
    // fill missing slot
       int ctx_idx = 0;
       int nullcount = 0;
       for (auto slot_desc : slot_descs) {
           if (!slot_desc->is_materialized()) {
               continue;
           }
           int dest_index = ctx_idx++;
           auto* column_ptr = block.get_by_position(dest_index).column->assume_mutable().get();
           if (column_ptr->size() < cur_row_count + 1) {
              // ..._fill_missing_column.. here
               ++nullcount;
           }
           DCHECK(column_ptr->size() == cur_row_count + 1);
       }
       // There is at least one valid value here
       DCHECK(nullcount < block.columns());
       *valid = true;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org