You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/07/28 06:44:47 UTC

[GitHub] [doris] morningman commented on a diff in pull request #11285: [feature-wip] (parquet-reader) add parquet reader impl template

morningman commented on code in PR #11285:
URL: https://github.com/apache/doris/pull/11285#discussion_r931835361


##########
be/src/vec/exec/format/parquet/vparquet_reader.h:
##########
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "io/file_reader.h"
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+#include <parquet/exception.h>
+#include <stdint.h>
+
+#include <vector>
+#include <string>
+
+#include "common/status.h"
+#include "exprs/expr_context.h"
+#include "vec/core/block.h"
+#include "gen_cpp/PaloBrokerService_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/parquet_types.h"
+#include "parquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+//    struct Statistics {
+//        int32_t filtered_row_groups = 0;
+//        int32_t total_groups = 0;
+//        int64_t filtered_rows = 0;
+//        int64_t total_rows = 0;
+//        int64_t filtered_total_bytes = 0;
+//        int64_t total_bytes = 0;
+//    };
+
+    class VParquetReader {

Review Comment:
   No need to add `V` prefix



##########
be/src/vec/exec/format/parquet/vparquet_reader.h:
##########
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "io/file_reader.h"
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+#include <parquet/exception.h>
+#include <stdint.h>
+
+#include <vector>
+#include <string>
+
+#include "common/status.h"
+#include "exprs/expr_context.h"
+#include "vec/core/block.h"
+#include "gen_cpp/PaloBrokerService_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/parquet_types.h"
+#include "parquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+//    struct Statistics {
+//        int32_t filtered_row_groups = 0;
+//        int32_t total_groups = 0;
+//        int64_t filtered_rows = 0;
+//        int64_t total_rows = 0;
+//        int64_t filtered_total_bytes = 0;
+//        int64_t total_bytes = 0;
+//    };
+
+    class VParquetReader {
+    public:
+        VParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
+                          int64_t range_start_offset, int64_t range_size);
+        ~VParquetReader();
+        virtual Status init_reader(const TupleDescriptor* tuple_desc,
+                                   const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                                   const std::vector<ExprContext*>& conjunct_ctxs,
+                                   const std::string& timezone) = 0;
+        virtual Status next_batch(bool* eof) = 0;
+//        std::shared_ptr<Statistics>& statistics() { return _statistics; }
+        void close() {};
+        int64_t size(int64_t* size) { return _file_reader->size(); }
+
+
+    protected:
+        FileReader* _file_reader;
+        std::shared_ptr<FileMetaData> _file_metadata;
+        const int64_t _batch_size;
+        const int32_t _num_of_columns_from_file;
+        int _total_groups;                      // num of groups(stripes) of a parquet(orc) file
+        int _current_group;                     // current group(stripe)
+        std::map<std::string, int> _map_column; // column-name <---> column-index
+        std::vector<int> _include_column_ids;   // columns that need to get from file
+//        std::shared_ptr<Statistics> _statistics;
+    private:
+        // parquet file reader object
+        std::vector<Block*> _batch;
+        std::vector<arrow::Type::type> _parquet_column_type;
+
+        int _rows_of_group; // rows in a group.
+        std::string _timezone;
+        int64_t _range_start_offset;
+        int64_t _range_size;
+
+    private:
+        std::atomic<bool> _closed = false;
+        arrow::Status _status;

Review Comment:
   Why does this still use `arrow`?



##########
be/src/vec/exec/format/parquet/vparquet_reader.cpp:
##########
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_reader.h"
+#include "parquet_thrift_util.h"
+
+namespace doris::vectorized {
+    doris::vectorized::VParquetReader::VParquetReader(doris::FileReader *file_reader, int64_t batch_size,
+                                                      int32_t num_of_columns_from_file, int64_t range_start_offset,
+                                                      int64_t range_size)
+            : _batch_size(batch_size), _num_of_columns_from_file(num_of_columns_from_file) {
+        _file_reader = file_reader;
+        _total_groups = 0;
+        _current_group = 0;
+        parse_thrift_footer(file_reader, _file_metadata);

Review Comment:
   Move `parse_thrift_footer()` to an `open()` or `init()` method.



##########
be/src/vec/exec/format/parquet/parquet_file_metadata.h:
##########
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+#include "gen_cpp/parquet_types.h"
+
+namespace doris::vectorized {
+
+    class FileMetaData {
+    public:
+        Status init(tparquet::FileMetaData& metadata);
+        tparquet::FileMetaData to_thrift_metadata();

Review Comment:
   Add a `debug()` method to print the thrift content of `_metadata`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org