You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/07/29 06:30:38 UTC
[doris] branch master updated: [feature-wip] (parquet-reader) add parquet reader impl template (#11285)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e4bc3f6b6f [feature-wip] (parquet-reader) add parquet reader impl template (#11285)
e4bc3f6b6f is described below
commit e4bc3f6b6f47e352067bf3bd7b95b0359af4baca
Author: slothever <18...@users.noreply.github.com>
AuthorDate: Fri Jul 29 14:30:31 2022 +0800
[feature-wip] (parquet-reader) add parquet reader impl template (#11285)
---
be/src/vec/CMakeLists.txt | 10 ++-
be/src/vec/exec/file_hdfs_scanner.cpp | 20 +++++
be/src/vec/exec/file_hdfs_scanner.h | 26 +++++++
.../vec/exec/format/parquet/parquet_thrift_util.h | 87 ++++++++++++++++++++++
be/src/vec/exec/format/parquet/schema_desc.cpp | 33 ++++++++
be/src/vec/exec/format/parquet/schema_desc.h | 46 ++++++++++++
.../parquet/vparquet_column_chunk_reader.cpp | 36 +++++++++
.../format/parquet/vparquet_column_chunk_reader.h | 33 ++++++++
.../exec/format/parquet/vparquet_file_metadata.cpp | 52 +++++++++++++
.../exec/format/parquet/vparquet_file_metadata.h | 45 +++++++++++
.../exec/format/parquet/vparquet_group_reader.cpp | 20 +++++
.../exec/format/parquet/vparquet_group_reader.h | 24 ++++++
.../exec/format/parquet/vparquet_page_index.cpp | 29 ++++++++
.../vec/exec/format/parquet/vparquet_page_index.h | 35 +++++++++
.../exec/format/parquet/vparquet_page_reader.cpp | 33 ++++++++
.../vec/exec/format/parquet/vparquet_page_reader.h | 34 +++++++++
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 61 +++++++++++++++
be/src/vec/exec/format/parquet/vparquet_reader.h | 87 ++++++++++++++++++++++
be/test/CMakeLists.txt | 1 +
be/test/vec/exec/parquet/parquet_thrift_test.cpp | 66 ++++++++++++++++
20 files changed, 777 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt
index 9ee5a5066f..6be1a9c797 100644
--- a/be/src/vec/CMakeLists.txt
+++ b/be/src/vec/CMakeLists.txt
@@ -221,7 +221,15 @@ set(VEC_FILES
exec/file_arrow_scanner.cpp
exec/file_scanner.cpp
exec/file_scan_node.cpp
- exec/file_text_scanner.cpp)
+ exec/file_text_scanner.cpp
+ exec/file_hdfs_scanner.cpp
+ exec/format/parquet/vparquet_column_chunk_reader.cpp
+ exec/format/parquet/vparquet_group_reader.cpp
+ exec/format/parquet/vparquet_page_index.cpp
+ exec/format/parquet/vparquet_reader.cpp
+ exec/format/parquet/vparquet_file_metadata.cpp
+ exec/format/parquet/vparquet_page_reader.cpp
+ exec/format/parquet/schema_desc.cpp)
add_library(Vec STATIC
${VEC_FILES}
diff --git a/be/src/vec/exec/file_hdfs_scanner.cpp b/be/src/vec/exec/file_hdfs_scanner.cpp
new file mode 100644
index 0000000000..d4c7398913
--- /dev/null
+++ b/be/src/vec/exec/file_hdfs_scanner.cpp
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "file_hdfs_scanner.h"
+
+namespace doris::vectorized {} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/file_hdfs_scanner.h b/be/src/vec/exec/file_hdfs_scanner.h
new file mode 100644
index 0000000000..e88e3a887a
--- /dev/null
+++ b/be/src/vec/exec/file_hdfs_scanner.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace doris::vectorized {
+
+// Placeholder type with no members yet.
+// NOTE(review): presumably the common base for scanners that read from HDFS (per the name) - confirm.
+class HdfsFileScanner {};
+
+// Placeholder type deriving from HdfsFileScanner; it adds no members yet.
+// NOTE(review): presumably the Parquet-specific scanner (per the name) - confirm.
+class ParquetFileHdfsScanner : public HdfsFileScanner {};
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
new file mode 100644
index 0000000000..3e2cbec60e
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "common/logging.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "util/coding.h"
+#include "util/thrift_util.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+// Four-byte magic ('P','A','R','1') that parse_thrift_footer() compares against the last
+// bytes of the file.
+constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
+// Maximum number of trailing file bytes (64 KiB) fetched in the single tail read made by
+// parse_thrift_footer().
+constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024;
+// Size in bytes of the fixed file tail: parse_thrift_footer() decodes a 32-bit little-endian
+// metadata length at this distance from the end of the file; the magic occupies the last 4 bytes.
+constexpr uint32_t PARQUET_FOOTER_SIZE = 8;
+
+// Reads the Parquet footer from the end of `file`, deserializes the thrift FileMetaData
+// and stores a newly created FileMetaData wrapper in `file_metadata`.
+//
+// Tail layout relied on here:
+//   [serialized thrift metadata][4-byte little-endian metadata length][4-byte magic "PAR1"]
+//
+// @param file           reader positioned on the Parquet file; must be non-null.
+// @param file_metadata  output; reset to the parsed metadata on success.
+// @return Status::OK() on success; Status::Corruption if the file is too small, the tail
+//         cannot be read completely, the magic does not match, or the reported metadata
+//         length exceeds the file; otherwise the error returned by the reader, the thrift
+//         deserializer or FileMetaData::init_schema().
+//
+// Declared inline because this definition lives in a header that is included from more
+// than one translation unit.
+inline Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) {
+    const int64_t file_size = file->size();
+    // Without at least the fixed 8-byte tail the pointer arithmetic below would step
+    // outside the buffer.
+    if (file_size < PARQUET_FOOTER_SIZE) {
+        return Status::Corruption("Parquet file is too small to contain a footer");
+    }
+
+    // try with buffer on stack
+    uint8_t buff[PARQUET_FOOTER_READ_SIZE];
+    // read footer bytes
+    const int64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
+    int64_t bytes_read = 0;
+    RETURN_IF_ERROR(
+            file->readat(file_size - footer_read_size, footer_read_size, &bytes_read, buff));
+    // A short read would leave part of the buffer uninitialized.
+    if (bytes_read != footer_read_size) {
+        return Status::Corruption("Failed to read the complete parquet footer");
+    }
+
+    // validate magic
+    const uint8_t* magic_ptr = buff + footer_read_size - sizeof(PARQUET_VERSION_NUMBER);
+    if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
+        return Status::Corruption("Invalid magic number in parquet file");
+    }
+
+    // get metadata_size
+    uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE;
+    uint32_t metadata_size = decode_fixed32_le(footer_buff);
+    if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
+        // The original built this status but never returned it.
+        return Status::Corruption("Parquet file size is ", file_size,
+                                  " bytes, smaller than the size reported by footer's (",
+                                  metadata_size, "bytes)");
+    }
+
+    // Locate the serialized metadata. It is already in the stack buffer when it fits in
+    // the bytes preceding the fixed tail; otherwise it has to be fetched from the file,
+    // because the tail read only covered the last PARQUET_FOOTER_READ_SIZE bytes.
+    uint8_t* metadata_buff = nullptr;
+    std::unique_ptr<uint8_t[]> heap_buff;
+    if (metadata_size <= footer_read_size - PARQUET_FOOTER_SIZE) {
+        metadata_buff = footer_buff - metadata_size;
+    } else {
+        heap_buff.reset(new uint8_t[metadata_size]);
+        int64_t metadata_bytes_read = 0;
+        RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - metadata_size,
+                                     metadata_size, &metadata_bytes_read, heap_buff.get()));
+        if (metadata_bytes_read != metadata_size) {
+            return Status::Corruption("Failed to read the complete parquet metadata");
+        }
+        metadata_buff = heap_buff.get();
+    }
+
+    tparquet::FileMetaData t_metadata;
+    // deserialize footer
+    RETURN_IF_ERROR(deserialize_thrift_msg(metadata_buff, &metadata_size, true, &t_metadata));
+    file_metadata.reset(new FileMetaData(t_metadata));
+    RETURN_IF_ERROR(file_metadata->init_schema());
+    return Status::OK();
+}
+
+// Status parse_page_header() {
+// uint8_t* page_buf;
+//
+// }
+
+// Status parse_page_index() {
+//
+// }
+
+// void deserialize_column_index(int64_t start_offset, tparquet::ColumnIndex) {
+//
+// }
+//
+// void deserialize_offset_index(int64_t start_offset, tparquet::OffsetIndex) {
+//
+// }
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp
new file mode 100644
index 0000000000..21275d2bb4
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+// Destructor. It currently does nothing; the only statement is commented out together
+// with the `fields` member it refers to.
+SchemaDescriptor::~SchemaDescriptor() {
+ // fields.clear();
+}
+
+// Placeholder: no schema details are rendered yet, so the result is always an empty string.
+std::string SchemaDescriptor::debug_string() const {
+    return {};
+}
+
+// Placeholder: no field details are rendered yet, so the result is always an empty string.
+std::string FieldSchema::debug_string() const {
+    return {};
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.h b/be/src/vec/exec/format/parquet/schema_desc.h
new file mode 100644
index 0000000000..678f633e29
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+
+namespace doris::vectorized {
+// Holds the maximum definition and repetition levels of one schema field.
+// NOTE(review): presumably one instance per Parquet schema element - confirm once the
+// commented-out `children` member is enabled.
+class FieldSchema {
+public:
+    // Returns the stored maximum definition level.
+    int16_t max_def_level() const { return _max_def_level; }
+    // Returns the stored maximum repetition level.
+    int16_t max_rep_level() const { return _max_rep_level; }
+    // Returns a textual description of this field (defined in schema_desc.cpp).
+    std::string debug_string() const;
+
+private:
+    // Zero-initialized: nothing in this class assigns the levels yet, so without the
+    // initializers the getters above would read indeterminate values.
+    int16_t _max_def_level = 0;
+    int16_t _max_rep_level = 0;
+    // std::vector<FieldSchema> children;
+};
+
+// Schema container for a Parquet file. It has no data members yet; the intended
+// field list is still commented out.
+class SchemaDescriptor {
+public:
+ SchemaDescriptor() = default;
+ // Defined out of line in schema_desc.cpp.
+ ~SchemaDescriptor();
+
+ // Returns a textual description of the schema (defined in schema_desc.cpp).
+ std::string debug_string() const;
+
+private:
+ // std::vector<FieldSchema> fields;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
new file mode 100644
index 0000000000..d9fc6313f1
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "vparquet_column_chunk_reader.h"
+
+namespace doris::vectorized {
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status ColumnChunkReader::init() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status ColumnChunkReader::read_min_max_stat() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status ColumnChunkReader::decode_dict_page() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status ColumnChunkReader::decode_nested_page() {
+ return Status();
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
new file mode 100644
index 0000000000..efc9880ed2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+// Interface skeleton for reading one column chunk. All four operations are declared here
+// and defined as placeholders in vparquet_column_chunk_reader.cpp; the class has no state yet.
+class ColumnChunkReader {
+public:
+ Status init();
+ Status read_min_max_stat();
+ Status decode_dict_page();
+ Status decode_nested_page();
+
+private:
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
new file mode 100644
index 0000000000..6e6ead39d2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_file_metadata.h"
+
+#include <sstream>
+
+namespace doris::vectorized {
+
+// Wraps a deserialized thrift footer: stores a copy of `metadata` and caches the total row
+// count, the number of row groups and the number of columns. The column count is taken
+// from the first row group and stays 0 when the file has no row groups.
+FileMetaData::FileMetaData(tparquet::FileMetaData& metadata) : _metadata(metadata) {
+    _num_rows = metadata.num_rows;
+    _num_groups = static_cast<int32_t>(metadata.row_groups.size());
+    if (_num_groups != 0) {
+        _num_columns = static_cast<int32_t>(metadata.row_groups[0].columns.size());
+    }
+    // The former empty `if (metadata.schema[0].num_children <= 0) {}` was dropped: it had no
+    // effect and indexed `schema[0]` without checking that the schema list is non-empty.
+}
+
+// Placeholder: schema construction is not implemented yet; returns a default-constructed Status.
+Status FileMetaData::init_schema() {
+ return Status();
+}
+
+// Returns a const reference to the thrift metadata held by this object. The reference is
+// to the `_metadata` member, so it must not be used after this object is destroyed.
+const tparquet::FileMetaData& FileMetaData::to_thrift_metadata() {
+ return _metadata;
+}
+
+// Renders a one-line summary of the file metadata in the form
+// "Parquet Metadata(version=V; num row groups=G; num rows=R)".
+// The first field no longer carries a leading "; " separator directly after "(".
+std::string FileMetaData::debug_string() const {
+    std::stringstream out;
+    out << "Parquet Metadata(";
+    out << "version=" << _metadata.version;
+    out << "; num row groups=" << _num_groups;
+    out << "; num rows=" << _num_rows;
+    out << ")";
+    return out.str();
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.h b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
new file mode 100644
index 0000000000..1ad15edf7f
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include "common/status.h"
+#include "gen_cpp/parquet_types.h"
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+// Wrapper around the thrift FileMetaData of one Parquet file. It owns a copy of the thrift
+// struct and caches a few counts derived from it in the constructor.
+class FileMetaData {
+public:
+    FileMetaData(tparquet::FileMetaData& metadata);
+    ~FileMetaData() = default;
+    // Builds the schema descriptor (defined in vparquet_file_metadata.cpp).
+    Status init_schema();
+    // Returns the owned thrift metadata.
+    const tparquet::FileMetaData& to_thrift_metadata();
+    // Number of row groups recorded in the footer.
+    int32_t num_row_groups() const { return _num_groups; }
+    // Number of columns of the first row group, or 0 when there is no row group.
+    int32_t num_columns() const { return _num_columns; }
+    // Total number of rows. Returned as int64_t to match `_num_rows`; the previous int32_t
+    // return type silently truncated counts above INT32_MAX.
+    int64_t num_rows() const { return _num_rows; }
+    // Returns a copy of the schema descriptor.
+    SchemaDescriptor schema() const { return _schema; }
+    // Returns a one-line textual summary (defined in vparquet_file_metadata.cpp).
+    std::string debug_string() const;
+
+private:
+    tparquet::FileMetaData _metadata;
+    int32_t _num_groups = 0;
+    int32_t _num_columns = 0;
+    int64_t _num_rows = 0;
+    SchemaDescriptor _schema;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
new file mode 100644
index 0000000000..543b1dd450
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_group_reader.h"
+
+namespace doris::vectorized {} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
new file mode 100644
index 0000000000..adb150e696
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+// Placeholder type with no members yet.
+// NOTE(review): presumably the reader for a single Parquet row group (per the name) - confirm.
+class RowGroupReader {};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
new file mode 100644
index 0000000000..fedf544bbd
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_index.h"
+
+namespace doris::vectorized {
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status PageIndex::get_row_range_for_page() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status PageIndex::collect_skipped_page_range() {
+ return Status();
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.h b/be/src/vec/exec/format/parquet/vparquet_page_index.h
new file mode 100644
index 0000000000..801de39b8a
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+// Interface skeleton for page-index handling. Both operations are placeholders defined in
+// vparquet_page_index.cpp; the thrift index members are still commented out.
+class PageIndex {
+public:
+ Status get_row_range_for_page();
+ Status collect_skipped_page_range();
+
+ //private:
+ // // row range define
+ // tparquet::ColumnIndex _column_index;
+ // tparquet::OffsetIndex _offset_index;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
new file mode 100644
index 0000000000..d781159cab
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_reader.h"
+
+namespace doris::vectorized {
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status PageReader::read_page_header() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status PageReader::read_page_data() {
+ return Status();
+}
+
+// Placeholder: performs no work yet and returns a default-constructed Status.
+Status PageReader::init() {
+ return Status();
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
new file mode 100644
index 0000000000..9b66896aaf
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+// Interface skeleton for reading pages. All three operations are placeholders defined in
+// vparquet_page_reader.cpp; the page-header member is still commented out.
+class PageReader {
+public:
+ Status init();
+ Status read_page_header();
+ Status read_page_data();
+
+ //private:
+ // tparquet::PageHeader* _page_header;
+};
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
new file mode 100644
index 0000000000..215dec1dca
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_reader.h"
+
+#include "parquet_thrift_util.h"
+
+namespace doris::vectorized {
+// Stores the caller-supplied file reader and starts with a row-group count of zero.
+// `batch_size`, `num_of_columns_from_file`, `range_start_offset` and `range_size` are
+// accepted but not used yet; the members meant to hold them (and `_current_group`,
+// `_statistics`) are still commented out in the class.
+ParquetReader::ParquetReader(FileReader* file_reader, int64_t batch_size,
+                             int32_t num_of_columns_from_file, int64_t range_start_offset,
+                             int64_t range_size)
+        : _file_reader(file_reader), _total_groups(0) {}
+
+// Destructor. It currently does nothing; the only statement is commented out together
+// with the `_batch` member it refers to. `_file_reader` is not released here.
+doris::vectorized::ParquetReader::~ParquetReader() {
+ // _batch.clear();
+}
+
+// Opens the underlying file, parses the Parquet footer into `_file_metadata` and records
+// the number of row groups in `_total_groups`.
+//
+// `tuple_desc`, `tuple_slot_descs`, `conjunct_ctxs` and `timezone` are not used yet.
+//
+// @return Status::OK() on success; Status::EndOfFile when the file has no row groups;
+//         otherwise the error reported by the file reader or by parse_thrift_footer().
+Status ParquetReader::init_reader(const TupleDescriptor* tuple_desc,
+                                  const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                                  const std::vector<ExprContext*>& conjunct_ctxs,
+                                  const std::string& timezone) {
+    // Propagate open failures instead of going on to read from an unopened file.
+    RETURN_IF_ERROR(_file_reader->open());
+    RETURN_IF_ERROR(parse_thrift_footer(_file_reader, _file_metadata));
+    // Bind by reference: plain `auto` would deep-copy the whole thrift metadata.
+    const auto& metadata = _file_metadata->to_thrift_metadata();
+
+    _total_groups = metadata.row_groups.size();
+    if (_total_groups == 0) {
+        return Status::EndOfFile("Empty Parquet File");
+    }
+
+    return Status::OK();
+}
+
+// Returns the file offset at which `row_group` starts.
+//
+// The explicit `file_offset` is used when the writer set it. Otherwise the start is
+// derived from the first column chunk: its dictionary page offset when one is set and
+// lies before the first data page, else its first data page offset. Dictionary offsets
+// that are zero or not before the data page are ignored as unusable.
+// Returns 0 for a row group that has neither `file_offset` nor any column, instead of
+// indexing an empty column list.
+int64_t ParquetReader::_get_row_group_start_offset(const tparquet::RowGroup& row_group) {
+    if (row_group.__isset.file_offset) {
+        return row_group.file_offset;
+    }
+    if (row_group.columns.empty()) {
+        return 0;
+    }
+    const tparquet::ColumnMetaData& first_column = row_group.columns[0].meta_data;
+    if (first_column.__isset.dictionary_page_offset && first_column.dictionary_page_offset > 0 &&
+        first_column.dictionary_page_offset < first_column.data_page_offset) {
+        return first_column.dictionary_page_offset;
+    }
+    return first_column.data_page_offset;
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
new file mode 100644
index 0000000000..2f5d4aa480
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+#include <parquet/exception.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "common/status.h"
+#include "exprs/expr_context.h"
+#include "gen_cpp/PaloBrokerService_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "vec/core/block.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+// struct Statistics {
+// int32_t filtered_row_groups = 0;
+// int32_t total_groups = 0;
+// int64_t filtered_rows = 0;
+// int64_t total_rows = 0;
+// int64_t filtered_total_bytes = 0;
+// int64_t total_bytes = 0;
+// };
+
+// Reader for one Parquet file accessed through a caller-supplied FileReader.
+// The class is abstract: init_reader() and next_batch() are pure virtual.
+// The FileReader pointer is stored as-is; nothing in this class releases it.
+class ParquetReader {
+public:
+    ParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
+                  int64_t range_start_offset, int64_t range_size);
+    // Virtual because the class declares virtual member functions; destroying a derived
+    // reader through a ParquetReader pointer would otherwise be undefined behavior.
+    virtual ~ParquetReader();
+    // Opens the file and parses its footer. Pure virtual, although a definition is
+    // provided in vparquet_reader.cpp.
+    virtual Status init_reader(const TupleDescriptor* tuple_desc,
+                               const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                               const std::vector<ExprContext*>& conjunct_ctxs,
+                               const std::string& timezone) = 0;
+    virtual Status next_batch(bool* eof) = 0;
+    // std::shared_ptr<Statistics>& statistics() { return _statistics; }
+    // Currently a no-op.
+    void close() {}
+    // Returns the size reported by the underlying file reader.
+    // NOTE(review): the `size` out-parameter is never written - confirm whether callers
+    // expect it to be filled in.
+    int64_t size(int64_t* size) { return _file_reader->size(); }
+
+private:
+    int64_t _get_row_group_start_offset(const tparquet::RowGroup& row_group);
+
+    FileReader* _file_reader;
+    std::shared_ptr<FileMetaData> _file_metadata;
+    // const int64_t _batch_size;
+    // const int32_t _num_of_columns_from_file;
+    int _total_groups; // num of groups(stripes) of a parquet(orc) file
+    // int _current_group; // current group(stripe)
+    // std::map<std::string, int> _map_column; // column-name <---> column-index
+    // std::vector<int> _include_column_ids; // columns that need to get from file
+    // std::shared_ptr<Statistics> _statistics;
+
+    // parquet file reader object
+    // std::vector<Block*> _batch;
+    // std::string _timezone;
+    // int64_t _range_start_offset;
+    // int64_t _range_size;
+
+    std::atomic<bool> _closed = false;
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 0bf06f7b45..d05fe0a976 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -59,6 +59,7 @@ set(EXEC_TEST_FILES
exec/s3_reader_test.cpp
exec/multi_bytes_separator_test.cpp
exec/hdfs_file_reader_test.cpp
+ vec/exec/parquet/parquet_thrift_test.cpp
# exec/new_olap_scan_node_test.cpp
# exec/pre_aggregation_node_test.cpp
# exec/partitioned_hash_table_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
new file mode 100644
index 0000000000..d5ac78264b
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "io/buffered_reader.h"
+#include "io/file_reader.h"
+#include "io/local_file_reader.h"
+#include "util/runtime_profile.h"
+#include "vec/exec/format/parquet/parquet_thrift_util.h"
+#include "vec/exec/format/parquet/vparquet_file_metadata.h"
+
+namespace doris {
+namespace vectorized {
+
+// Test fixture for the thrift footer parsing tests; it holds no state and needs no setup.
+class ParquetThriftReaderTest : public testing::Test {
+public:
+ ParquetThriftReaderTest() {}
+};
+
+// Parses the footer of a local sample Parquet file and logs what was read.
+// Both the open and the parse step are asserted, so a failure stops the test instead of
+// dereferencing a null metadata pointer.
+TEST_F(ParquetThriftReaderTest, normal) {
+    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);
+
+    auto st = reader.open();
+    ASSERT_TRUE(st.ok());
+
+    std::shared_ptr<FileMetaData> metaData;
+    auto footer_st = parse_thrift_footer(&reader, metaData);
+    ASSERT_TRUE(footer_st.ok());
+    ASSERT_TRUE(metaData != nullptr);
+
+    // Bind by reference to avoid copying the whole thrift metadata.
+    const tparquet::FileMetaData& t_metadata = metaData->to_thrift_metadata();
+    LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
+    LOG(WARNING) << "num columns: " << metaData->num_columns();
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.row_groups) {
+        LOG(WARNING) << "row group num_rows: " << value.num_rows;
+    }
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.schema) {
+        LOG(WARNING) << "schema column name: " << value.name;
+        LOG(WARNING) << "schema column type: " << value.type;
+        LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
+        LOG(WARNING) << "schema column num children: " << value.num_children;
+    }
+}
+
+} // namespace vectorized
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org