You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/07/29 06:30:38 UTC

[doris] branch master updated: [feature-wip] (parquet-reader) add parquet reader impl template (#11285)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e4bc3f6b6f [feature-wip] (parquet-reader) add parquet reader impl template (#11285)
e4bc3f6b6f is described below

commit e4bc3f6b6f47e352067bf3bd7b95b0359af4baca
Author: slothever <18...@users.noreply.github.com>
AuthorDate: Fri Jul 29 14:30:31 2022 +0800

    [feature-wip] (parquet-reader) add parquet reader impl template (#11285)
---
 be/src/vec/CMakeLists.txt                          | 10 ++-
 be/src/vec/exec/file_hdfs_scanner.cpp              | 20 +++++
 be/src/vec/exec/file_hdfs_scanner.h                | 26 +++++++
 .../vec/exec/format/parquet/parquet_thrift_util.h  | 87 ++++++++++++++++++++++
 be/src/vec/exec/format/parquet/schema_desc.cpp     | 33 ++++++++
 be/src/vec/exec/format/parquet/schema_desc.h       | 46 ++++++++++++
 .../parquet/vparquet_column_chunk_reader.cpp       | 36 +++++++++
 .../format/parquet/vparquet_column_chunk_reader.h  | 33 ++++++++
 .../exec/format/parquet/vparquet_file_metadata.cpp | 52 +++++++++++++
 .../exec/format/parquet/vparquet_file_metadata.h   | 45 +++++++++++
 .../exec/format/parquet/vparquet_group_reader.cpp  | 20 +++++
 .../exec/format/parquet/vparquet_group_reader.h    | 24 ++++++
 .../exec/format/parquet/vparquet_page_index.cpp    | 29 ++++++++
 .../vec/exec/format/parquet/vparquet_page_index.h  | 35 +++++++++
 .../exec/format/parquet/vparquet_page_reader.cpp   | 33 ++++++++
 .../vec/exec/format/parquet/vparquet_page_reader.h | 34 +++++++++
 be/src/vec/exec/format/parquet/vparquet_reader.cpp | 61 +++++++++++++++
 be/src/vec/exec/format/parquet/vparquet_reader.h   | 87 ++++++++++++++++++++++
 be/test/CMakeLists.txt                             |  1 +
 be/test/vec/exec/parquet/parquet_thrift_test.cpp   | 66 ++++++++++++++++
 20 files changed, 777 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt
index 9ee5a5066f..6be1a9c797 100644
--- a/be/src/vec/CMakeLists.txt
+++ b/be/src/vec/CMakeLists.txt
@@ -221,7 +221,15 @@ set(VEC_FILES
   exec/file_arrow_scanner.cpp
   exec/file_scanner.cpp
   exec/file_scan_node.cpp
-  exec/file_text_scanner.cpp)
+  exec/file_text_scanner.cpp
+  exec/file_hdfs_scanner.cpp
+  exec/format/parquet/vparquet_column_chunk_reader.cpp
+  exec/format/parquet/vparquet_group_reader.cpp
+  exec/format/parquet/vparquet_page_index.cpp
+  exec/format/parquet/vparquet_reader.cpp
+  exec/format/parquet/vparquet_file_metadata.cpp
+  exec/format/parquet/vparquet_page_reader.cpp
+  exec/format/parquet/schema_desc.cpp)
 
 add_library(Vec STATIC
     ${VEC_FILES}
diff --git a/be/src/vec/exec/file_hdfs_scanner.cpp b/be/src/vec/exec/file_hdfs_scanner.cpp
new file mode 100644
index 0000000000..d4c7398913
--- /dev/null
+++ b/be/src/vec/exec/file_hdfs_scanner.cpp
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "file_hdfs_scanner.h"
+
+namespace doris::vectorized {} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/file_hdfs_scanner.h b/be/src/vec/exec/file_hdfs_scanner.h
new file mode 100644
index 0000000000..e88e3a887a
--- /dev/null
+++ b/be/src/vec/exec/file_hdfs_scanner.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace doris::vectorized {
+
+class HdfsFileScanner {}; // Empty placeholder type: no members or behavior are defined yet.
+
+class ParquetFileHdfsScanner : public HdfsFileScanner {}; // Empty placeholder deriving from HdfsFileScanner.
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
new file mode 100644
index 0000000000..3e2cbec60e
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+#include <cstdint>
+
+#include "common/logging.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "util/coding.h"
+#include "util/thrift_util.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'}; // magic compared against the last 4 bytes of the file
+constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024; // upper bound on the bytes read from the file tail in one go
+constexpr uint32_t PARQUET_FOOTER_SIZE = 8; // trailing bytes: 4-byte little-endian metadata size + 4-byte magic
+
+// Reads the parquet footer from the tail of `file`, deserializes the thrift
+// FileMetaData it contains and stores the wrapped result in `file_metadata`.
+//
+// Layout of the file tail: [thrift metadata][4-byte LE metadata size]["PAR1"].
+// Returns Corruption when the file is too short, the magic does not match, a
+// read comes back short, or the recorded metadata size cannot fit in the file;
+// otherwise propagates any error from reading, thrift deserialization or
+// FileMetaData::init_schema().
+//
+// Declared `inline` because this definition lives in a header that is included
+// from more than one translation unit.
+inline Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) {
+    const int64_t file_size = file->size();
+    // Without this guard the pointer arithmetic below would step before `buff`.
+    if (file_size < PARQUET_FOOTER_SIZE) {
+        return Status::Corruption("Parquet file is too small to contain a footer");
+    }
+
+    // try with buffer on stack
+    uint8_t buff[PARQUET_FOOTER_READ_SIZE];
+    // read footer bytes
+    const int64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
+    int64_t bytes_read = 0;
+    RETURN_IF_ERROR(
+            file->readat(file_size - footer_read_size, footer_read_size, &bytes_read, buff));
+    // A short read would leave part of `buff` uninitialized.
+    if (bytes_read != footer_read_size) {
+        return Status::Corruption("Failed to read the footer of parquet file");
+    }
+
+    // validate magic
+    uint8_t* magic_ptr = buff + footer_read_size - sizeof(PARQUET_VERSION_NUMBER);
+    if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
+        return Status::Corruption("Invalid magic number in parquet file");
+    }
+
+    // get metadata_size
+    uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE;
+    uint32_t metadata_size = decode_fixed32_le(footer_buff);
+    if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
+        return Status::Corruption("Parquet file size is ", file_size,
+                                  " bytes, smaller than the size reported by footer's (",
+                                  metadata_size, "bytes)");
+    }
+
+    // The metadata normally sits inside the tail that was already read. When it
+    // is larger than that, fetch it on its own into a heap buffer rather than
+    // reading before the start of `buff`.
+    uint8_t* metadata_buff = nullptr;
+    std::unique_ptr<uint8_t[]> large_buff;
+    if (metadata_size <= footer_read_size - PARQUET_FOOTER_SIZE) {
+        metadata_buff = footer_buff - metadata_size;
+    } else {
+        large_buff.reset(new uint8_t[metadata_size]);
+        RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - metadata_size,
+                                     metadata_size, &bytes_read, large_buff.get()));
+        if (bytes_read != metadata_size) {
+            return Status::Corruption("Failed to read the metadata of parquet file");
+        }
+        metadata_buff = large_buff.get();
+    }
+
+    // deserialize footer
+    tparquet::FileMetaData t_metadata;
+    RETURN_IF_ERROR(deserialize_thrift_msg(metadata_buff, &metadata_size, true, &t_metadata));
+    file_metadata.reset(new FileMetaData(t_metadata));
+    RETURN_IF_ERROR(file_metadata->init_schema());
+    return Status::OK();
+}
+
+//    Status parse_page_header() {
+//        uint8_t* page_buf;
+//
+//    }
+
+//    Status parse_page_index() {
+//
+//    }
+
+//    void deserialize_column_index(int64_t start_offset, tparquet::ColumnIndex) {
+//
+//    }
+//
+//    void deserialize_offset_index(int64_t start_offset, tparquet::OffsetIndex) {
+//
+//    }
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp
new file mode 100644
index 0000000000..21275d2bb4
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+SchemaDescriptor::~SchemaDescriptor() { // currently a no-op: the only statement is commented out
+    //    fields.clear();
+}
+
+std::string SchemaDescriptor::debug_string() const { // stub: always yields an empty string
+    return {};
+}
+
+std::string FieldSchema::debug_string() const { // stub: always yields an empty string
+    return {};
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.h b/be/src/vec/exec/format/parquet/schema_desc.h
new file mode 100644
index 0000000000..678f633e29
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+
+namespace doris::vectorized {
+class FieldSchema { // Holds the max_def_level / max_rep_level pair of one schema field.
+public:
+    int16_t max_def_level() const { return _max_def_level; }
+    int16_t max_rep_level() const { return _max_rep_level; }
+    std::string debug_string() const; // currently returns an empty string
+
+private:
+    int16_t _max_def_level = 0; // zero-initialized so the getters never read an indeterminate value
+    int16_t _max_rep_level = 0; // same reason as above
+    //    std::vector<FieldSchema> children;
+};
+
+class SchemaDescriptor { // Placeholder for the file schema; it has no active data members yet.
+public:
+    SchemaDescriptor() = default;
+    ~SchemaDescriptor(); // declared here, defined out of line
+
+    std::string debug_string() const; // textual dump intended for debugging
+
+private:
+    //    std::vector<FieldSchema> fields;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
new file mode 100644
index 0000000000..d9fc6313f1
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "vparquet_column_chunk_reader.h"
+
+namespace doris::vectorized {
+
+Status ColumnChunkReader::init() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status ColumnChunkReader::read_min_max_stat() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status ColumnChunkReader::decode_dict_page() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status ColumnChunkReader::decode_nested_page() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
new file mode 100644
index 0000000000..efc9880ed2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+class ColumnChunkReader { // Skeleton class: declares four Status-returning operations and holds no state.
+public:
+    Status init();
+    Status read_min_max_stat();
+    Status decode_dict_page();
+    Status decode_nested_page();
+
+private: // no data members yet
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
new file mode 100644
index 0000000000..6e6ead39d2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_file_metadata.h"
+
+#include <sstream>
+
+namespace doris::vectorized {
+
+FileMetaData::FileMetaData(tparquet::FileMetaData& metadata) : _metadata(metadata) {
+    // Cache the footer's counters; the column count comes from the first row group (0 if none).
+    _num_rows = metadata.num_rows;
+    _num_groups = static_cast<int32_t>(metadata.row_groups.size());
+    if (_num_groups != 0) {
+        _num_columns = static_cast<int32_t>(metadata.row_groups[0].columns.size());
+    }
+    // metadata.schema is not indexed here: it can be empty in a malformed footer (TODO: validate it in init_schema()).
+}
+
+Status FileMetaData::init_schema() { // stub: _schema is left untouched; yields a default-constructed Status
+    return Status {};
+}
+
+const tparquet::FileMetaData& FileMetaData::to_thrift_metadata() { // exposes the stored thrift footer by reference
+    return _metadata;
+}
+
+std::string FileMetaData::debug_string() const { // one-line summary: version, row-group count, row count
+    std::ostringstream summary;
+    summary << "Parquet Metadata("
+            << "; version=" << _metadata.version
+            << "; num row groups=" << _num_groups
+            << "; num rows=" << _num_rows
+            << ")";
+    return summary.str();
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.h b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
new file mode 100644
index 0000000000..1ad15edf7f
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include "common/status.h"
+#include "gen_cpp/parquet_types.h"
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+class FileMetaData { // Wraps a deserialized tparquet::FileMetaData and caches its row/group/column counts.
+public:
+    FileMetaData(tparquet::FileMetaData& metadata); // copies `metadata` into this object
+    ~FileMetaData() = default;
+    Status init_schema();
+    const tparquet::FileMetaData& to_thrift_metadata();
+    int32_t num_row_groups() const { return _num_groups; }
+    int32_t num_columns() const { return _num_columns; }
+    int64_t num_rows() const { return _num_rows; } // 64-bit like _num_rows, so large row counts are not truncated
+    SchemaDescriptor schema() const { return _schema; } // returns a copy of the descriptor
+    std::string debug_string() const;
+
+private:
+    tparquet::FileMetaData _metadata;
+    int32_t _num_groups = 0;
+    int32_t _num_columns = 0;
+    int64_t _num_rows = 0;
+    SchemaDescriptor _schema;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
new file mode 100644
index 0000000000..543b1dd450
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_group_reader.h"
+
+namespace doris::vectorized {} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
new file mode 100644
index 0000000000..adb150e696
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+class RowGroupReader {}; // Empty placeholder type: no members or behavior are defined yet.
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
new file mode 100644
index 0000000000..fedf544bbd
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_index.h"
+
+namespace doris::vectorized {
+
+Status PageIndex::get_row_range_for_page() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status PageIndex::collect_skipped_page_range() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.h b/be/src/vec/exec/format/parquet/vparquet_page_index.h
new file mode 100644
index 0000000000..801de39b8a
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+class PageIndex { // Skeleton class: declares two Status-returning operations and holds no active state.
+public:
+    Status get_row_range_for_page();
+    Status collect_skipped_page_range();
+
+    //private:
+    //    // row range define
+    //    tparquet::ColumnIndex _column_index;
+    //    tparquet::OffsetIndex _offset_index;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
new file mode 100644
index 0000000000..d781159cab
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_reader.h"
+
+namespace doris::vectorized {
+
+Status PageReader::read_page_header() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status PageReader::read_page_data() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+
+Status PageReader::init() { // stub: does no work and yields a default-constructed Status
+    return Status {};
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
new file mode 100644
index 0000000000..9b66896aaf
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+class PageReader { // Skeleton class: declares three Status-returning operations and holds no active state.
+public:
+    Status init();
+    Status read_page_header();
+    Status read_page_data();
+
+    //private:
+    //    tparquet::PageHeader* _page_header;
+};
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
new file mode 100644
index 0000000000..215dec1dca
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_reader.h"
+
+#include "parquet_thrift_util.h"
+
+namespace doris::vectorized {
+// Stores the file reader pointer and zeroes the group count; the other arguments are currently unused.
+ParquetReader::ParquetReader(FileReader* file_reader, int64_t batch_size,
+                             int32_t num_of_columns_from_file, int64_t range_start_offset,
+                             int64_t range_size)
+        : _file_reader(file_reader), _total_groups(0) {
+    //        : _batch_size(batch_size), _num_of_columns_from_file(num_of_columns_from_file) {
+    //    _current_group = 0;
+    //        _statistics = std::make_shared<Statistics>();
+}
+
+ParquetReader::~ParquetReader() { // no cleanup is performed; _file_reader is not deleted here
+    //    _batch.clear();
+}
+
+Status ParquetReader::init_reader(const TupleDescriptor* tuple_desc,
+                                  const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                                  const std::vector<ExprContext*>& conjunct_ctxs,
+                                  const std::string& timezone) {
+    // Opens the file and parses its footer; the four arguments are not used yet.
+    RETURN_IF_ERROR(_file_reader->open()); // propagate open failures instead of dropping them
+    RETURN_IF_ERROR(parse_thrift_footer(_file_reader, _file_metadata));
+    // Bind by reference: plain `auto` would deep-copy the whole thrift footer.
+    const auto& metadata = _file_metadata->to_thrift_metadata();
+    _total_groups = metadata.row_groups.size();
+    if (_total_groups == 0) {
+        return Status::EndOfFile("Empty Parquet File"); // no row groups means nothing to scan
+    }
+    return Status::OK();
+}
+
+// Returns the file offset at which `row_group` begins: its file_offset when that
+// field is set, otherwise the start of its first column chunk. `row_group.columns`
+// must be non-empty in the latter case.
+int64_t ParquetReader::_get_row_group_start_offset(const tparquet::RowGroup& row_group) {
+    if (row_group.__isset.file_offset) {
+        return row_group.file_offset;
+    }
+    const tparquet::ColumnMetaData& first_column = row_group.columns[0].meta_data;
+    // A dictionary page, when present, is written before the first data page, so the
+    // chunk starts there. Offsets <= 0 are treated as "no dictionary page".
+    if (first_column.__isset.dictionary_page_offset && first_column.dictionary_page_offset > 0 &&
+        first_column.dictionary_page_offset < first_column.data_page_offset) {
+        return first_column.dictionary_page_offset;
+    }
+    return first_column.data_page_offset;
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
new file mode 100644
index 0000000000..2f5d4aa480
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+#include <parquet/exception.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "common/status.h"
+#include "exprs/expr_context.h"
+#include "gen_cpp/PaloBrokerService_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "vec/core/block.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+//    struct Statistics {
+//        int32_t filtered_row_groups = 0;
+//        int32_t total_groups = 0;
+//        int64_t filtered_rows = 0;
+//        int64_t total_rows = 0;
+//        int64_t filtered_total_bytes = 0;
+//        int64_t total_bytes = 0;
+//    };
+
+class ParquetReader { // Reads parquet footer metadata through a FileReader; batch reading is not implemented yet.
+public:
+    ParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
+                  int64_t range_start_offset, int64_t range_size);
+    virtual ~ParquetReader(); // virtual: the class has virtual methods, so it may be deleted via a base pointer
+    virtual Status init_reader(const TupleDescriptor* tuple_desc,
+                               const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                               const std::vector<ExprContext*>& conjunct_ctxs,
+                               const std::string& timezone) = 0;
+    virtual Status next_batch(bool* eof) = 0;
+    //        std::shared_ptr<Statistics>& statistics() { return _statistics; }
+    void close() {} // currently a no-op
+    int64_t size(int64_t* size) { return _file_reader->size(); } // the `size` argument is ignored
+
+private:
+    int64_t _get_row_group_start_offset(const tparquet::RowGroup& row_group);
+
+private:
+    FileReader* _file_reader; // set by the constructor
+    std::shared_ptr<FileMetaData> _file_metadata; // filled in by init_reader()
+    //    const int64_t _batch_size;
+    //    const int32_t _num_of_columns_from_file;
+    int _total_groups; // num of groups(stripes) of a parquet(orc) file
+    //    int _current_group;                     // current group(stripe)
+    //    std::map<std::string, int> _map_column; // column-name <---> column-index
+    //    std::vector<int> _include_column_ids;   // columns that need to get from file
+    //        std::shared_ptr<Statistics> _statistics;
+
+    // parquet file reader object
+    //    std::vector<Block*> _batch;
+    //    std::string _timezone;
+    //    int64_t _range_start_offset;
+    //    int64_t _range_size;
+
+private:
+    std::atomic<bool> _closed = false;
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 0bf06f7b45..d05fe0a976 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -59,6 +59,7 @@ set(EXEC_TEST_FILES
     exec/s3_reader_test.cpp
     exec/multi_bytes_separator_test.cpp
     exec/hdfs_file_reader_test.cpp
+    vec/exec/parquet/parquet_thrift_test.cpp
     # exec/new_olap_scan_node_test.cpp
     # exec/pre_aggregation_node_test.cpp
     # exec/partitioned_hash_table_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
new file mode 100644
index 0000000000..d5ac78264b
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "io/buffered_reader.h"
+#include "io/file_reader.h"
+#include "io/local_file_reader.h"
+#include "util/runtime_profile.h"
+#include "vec/exec/format/parquet/parquet_thrift_util.h"
+#include "vec/exec/format/parquet/vparquet_file_metadata.h"
+
+namespace doris {
+namespace vectorized {
+
+// Fixture with no state of its own; it exists so the tests below can use TEST_F.
+struct ParquetThriftReaderTest : public testing::Test {
+    ParquetThriftReaderTest() = default;
+};
+
+// Calls parse_thrift_footer on a local parquet file and logs its row-group and schema metadata.
+TEST_F(ParquetThriftReaderTest, normal) {
+    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);
+    // Fatal assertions: continuing after a failed open/parse would dereference a null metaData.
+    ASSERT_TRUE(reader.open().ok());
+
+    std::shared_ptr<FileMetaData> metaData;
+    parse_thrift_footer(&reader, metaData);
+    ASSERT_TRUE(metaData != nullptr);
+    tparquet::FileMetaData t_metadata = metaData->to_thrift_metadata();
+    LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
+    LOG(WARNING) << "num columns: " << metaData->num_columns();
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.row_groups) {
+        LOG(WARNING) << "row group num_rows: " << value.num_rows;
+    }
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.schema) {
+        LOG(WARNING) << "schema column name: " << value.name;
+        LOG(WARNING) << "schema column type: " << value.type;
+        LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
+        LOG(WARNING) << "schema column num children: " << value.num_children;
+    }
+}
+
+} // namespace vectorized
+
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org