You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2023/01/17 12:39:22 UTC

[GitHub] [doris] github-actions[bot] commented on a diff in pull request #16024: [Enhancement][icebergv2_parquet_reader] Optimize the position delete file filtering mechanism in iceberg v2 parquet reader.

github-actions[bot] commented on code in PR #16024:
URL: https://github.com/apache/doris/pull/16024#discussion_r1072157760


##########
be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:
##########
@@ -434,6 +438,103 @@
     return Status::OK();
 }
 
+Status RowGroupReader::_build_pos_delete_filter(size_t read_rows) {
+    _filter_ptr.reset(new IColumn::Filter(read_rows, 1));
+    int64_t start_row_id = _position_delete_ctx.current_row_id;
+    int64_t end_row_id = std::min(_position_delete_ctx.current_row_id + (int64_t)read_rows,
+                                  _position_delete_ctx.last_row_id);
+    while (_position_delete_ctx.index < _position_delete_ctx.end_index) {
+        const int64_t& delete_row_id = _position_delete_ctx.delete_rows[_position_delete_ctx.index];
+        if (delete_row_id < start_row_id) {
+            _position_delete_ctx.index++;
+        } else if (delete_row_id < end_row_id) {
+            int64_t index = _position_delete_ctx.delete_rows[_position_delete_ctx.index] -
+                            _position_delete_ctx.current_row_id;
+            (*_filter_ptr)[index] = 0;
+            _position_delete_ctx.index++;
+        } else { // delete_row_id >= end_row_id
+            break;
+        }
+    }
+    _position_delete_ctx.current_row_id = end_row_id;
+    return Status::OK();
+}
+
+Status RowGroupReader::_filter_block(Block* block, const ColumnPtr filter_column,
+                                     int column_to_keep, std::vector<uint32_t> columns_to_filter) {
+    if (auto* nullable_column = check_and_get_column<ColumnNullable>(*filter_column)) {
+        ColumnPtr nested_column = nullable_column->get_nested_column_ptr();
+
+        MutableColumnPtr mutable_holder =
+                nested_column->use_count() == 1
+                        ? nested_column->assume_mutable()
+                        : nested_column->clone_resized(nested_column->size());
+
+        ColumnUInt8* concrete_column = typeid_cast<ColumnUInt8*>(mutable_holder.get());
+        if (!concrete_column) {
+            return Status::InvalidArgument(
+                    "Illegal type {} of column for filter. Must be UInt8 or Nullable(UInt8).",
+                    filter_column->get_name());
+        }
+        auto* __restrict null_map = nullable_column->get_null_map_data().data();
+        IColumn::Filter& filter = concrete_column->get_data();
+        auto* __restrict filter_data = filter.data();
+
+        const size_t size = filter.size();
+        for (size_t i = 0; i < size; ++i) {
+            (*_filter_ptr)[i] &= (!null_map[i]) & filter_data[i];
+        }
+        RETURN_IF_ERROR(_filter_block_internal(block, columns_to_filter));
+    } else if (auto* const_column = check_and_get_column<ColumnConst>(*filter_column)) {
+        bool ret = const_column->get_bool(0);
+        if (!ret) {
+            for (auto& col : columns_to_filter) {
+                std::move(*block->get_by_position(col).column).assume_mutable()->clear();
+            }
+        }
+    } else {
+        const IColumn::Filter& filter =
+                assert_cast<const doris::vectorized::ColumnVector<UInt8>&>(*filter_column)
+                        .get_data();
+
+        auto* __restrict filter_data = filter.data();
+        const size_t size = filter.size();
+        for (size_t i = 0; i < size; ++i) {
+            (*_filter_ptr)[i] &= filter_data[i];
+        }
+        RETURN_IF_ERROR(_filter_block_internal(block, columns_to_filter));
+    }
+    Block::erase_useless_column(block, column_to_keep);
+    return Status::OK();
+}
+
+Status RowGroupReader::_filter_block(Block* block, int column_to_keep,
+                                     const std::vector<uint32_t>& columns_to_filter) {
+    RETURN_IF_ERROR(_filter_block_internal(block, columns_to_filter));
+    Block::erase_useless_column(block, column_to_keep);
+
+    return Status::OK();
+}
+
+Status RowGroupReader::_filter_block_internal(Block* block,
+                                              const std::vector<uint32_t>& columns_to_filter) {
+    size_t count = _filter_ptr->size() -
+                   simd::count_zero_num((int8_t*)_filter_ptr->data(), _filter_ptr->size());
+    if (count == 0) {
+        for (auto& col : columns_to_filter) {
+            std::move(*block->get_by_position(col).column).assume_mutable()->clear();

Review Comment:
   warning: std::move of the const expression has no effect; remove std::move() [performance-move-const-arg]
   
   ```suggestion
               (*block->get_by_position(col).column).assume_mutable()->clear();
   ```
   



##########
be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:
##########
@@ -434,6 +438,103 @@ Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t* read_rows, b
     return Status::OK();
 }
 
+Status RowGroupReader::_build_pos_delete_filter(size_t read_rows) {
+    _filter_ptr.reset(new IColumn::Filter(read_rows, 1));
+    int64_t start_row_id = _position_delete_ctx.current_row_id;
+    int64_t end_row_id = std::min(_position_delete_ctx.current_row_id + (int64_t)read_rows,
+                                  _position_delete_ctx.last_row_id);
+    while (_position_delete_ctx.index < _position_delete_ctx.end_index) {
+        const int64_t& delete_row_id = _position_delete_ctx.delete_rows[_position_delete_ctx.index];
+        if (delete_row_id < start_row_id) {
+            _position_delete_ctx.index++;
+        } else if (delete_row_id < end_row_id) {
+            int64_t index = _position_delete_ctx.delete_rows[_position_delete_ctx.index] -
+                            _position_delete_ctx.current_row_id;
+            (*_filter_ptr)[index] = 0;
+            _position_delete_ctx.index++;
+        } else { // delete_row_id >= end_row_id
+            break;
+        }
+    }
+    _position_delete_ctx.current_row_id = end_row_id;
+    return Status::OK();
+}
+
+Status RowGroupReader::_filter_block(Block* block, const ColumnPtr filter_column,
+                                     int column_to_keep, std::vector<uint32_t> columns_to_filter) {
+    if (auto* nullable_column = check_and_get_column<ColumnNullable>(*filter_column)) {
+        ColumnPtr nested_column = nullable_column->get_nested_column_ptr();
+
+        MutableColumnPtr mutable_holder =
+                nested_column->use_count() == 1
+                        ? nested_column->assume_mutable()
+                        : nested_column->clone_resized(nested_column->size());
+
+        ColumnUInt8* concrete_column = typeid_cast<ColumnUInt8*>(mutable_holder.get());
+        if (!concrete_column) {
+            return Status::InvalidArgument(
+                    "Illegal type {} of column for filter. Must be UInt8 or Nullable(UInt8).",
+                    filter_column->get_name());
+        }
+        auto* __restrict null_map = nullable_column->get_null_map_data().data();
+        IColumn::Filter& filter = concrete_column->get_data();
+        auto* __restrict filter_data = filter.data();
+
+        const size_t size = filter.size();
+        for (size_t i = 0; i < size; ++i) {
+            (*_filter_ptr)[i] &= (!null_map[i]) & filter_data[i];
+        }
+        RETURN_IF_ERROR(_filter_block_internal(block, columns_to_filter));
+    } else if (auto* const_column = check_and_get_column<ColumnConst>(*filter_column)) {
+        bool ret = const_column->get_bool(0);
+        if (!ret) {
+            for (auto& col : columns_to_filter) {
+                std::move(*block->get_by_position(col).column).assume_mutable()->clear();

Review Comment:
   warning: std::move of the const expression has no effect; remove std::move() [performance-move-const-arg]
   
   ```suggestion
                   (*block->get_by_position(col).column).assume_mutable()->clear();
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org