You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/05/26 12:22:36 UTC
[arrow] branch master updated: ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f82af62 ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance
f82af62 is described below
commit f82af62dfde364f3eeea2015586ec5846a78ec8f
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun May 26 14:22:27 2019 +0200
ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance
This is an RFC based on the ongoing discussion. If this seems like the accepted approach, I'll add a Python unit test.
Author: Wes McKinney <we...@apache.org>
Closes #4386 from wesm/parquet-metadata-set-file-path and squashes the following commits:
e77cf7a8a <Wes McKinney> Add method to set file path in a parquet::FileMetaData instance
---
cpp/src/parquet/metadata-test.cc | 5 +++++
cpp/src/parquet/metadata.cc | 10 ++++++++++
cpp/src/parquet/metadata.h | 5 +++++
python/pyarrow/_parquet.pxd | 2 ++
python/pyarrow/_parquet.pyx | 9 +++++++++
5 files changed, 31 insertions(+)
diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc
index 7115876..8b6ebc1 100644
--- a/cpp/src/parquet/metadata-test.cc
+++ b/cpp/src/parquet/metadata-test.cc
@@ -163,6 +163,11 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
ASSERT_EQ(10, rg2_column1->data_page_offset());
ASSERT_EQ(26, rg2_column2->data_page_offset());
+
+ // Test FileMetaData::set_file_path
+ ASSERT_TRUE(rg2_column1->file_path().empty());
+ f_accessor->set_file_path("/foo/bar/bar.parquet");
+ ASSERT_EQ("/foo/bar/bar.parquet", rg2_column1->file_path());
}
TEST(Metadata, TestV1Version) {
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index f596061..5d701a7 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -386,6 +386,14 @@ class FileMetaData::FileMetaDataImpl {
return key_value_metadata_;
}
+ void set_file_path(const std::string& path) {
+ for (format::RowGroup& row_group : metadata_->row_groups) {
+ for (format::ColumnChunk& chunk : row_group.columns) {
+ chunk.__set_file_path(path);
+ }
+ }
+ }
+
private:
friend FileMetaDataBuilder;
uint32_t metadata_len_;
@@ -483,6 +491,8 @@ std::shared_ptr<const KeyValueMetadata> FileMetaData::key_value_metadata() const
return impl_->key_value_metadata();
}
+void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
+
void FileMetaData::WriteTo(OutputStream* dst) const { return impl_->WriteTo(dst); }
ApplicationVersion::ApplicationVersion(const std::string& application, int major,
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index cd31a3c..fe67bc0 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -110,8 +110,10 @@ class PARQUET_EXPORT ColumnChunkMetaData {
// column chunk
int64_t file_offset() const;
+
// parameter is only used when a dataset is spread across multiple files
const std::string& file_path() const;
+
// column metadata
Type::type type() const;
int64_t num_values() const;
@@ -190,6 +192,9 @@ class PARQUET_EXPORT FileMetaData {
std::shared_ptr<const KeyValueMetadata> key_value_metadata() const;
+ // Set file_path ColumnChunk fields to a particular value
+ void set_file_path(const std::string& path);
+
private:
friend FileMetaDataBuilder;
explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index f1b44b0..75c0015 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -222,6 +222,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
const c_string created_by()
int num_schema_elements()
+ void set_file_path(const c_string& path)
+
unique_ptr[CRowGroupMetaData] RowGroup(int i)
const SchemaDescriptor* schema()
shared_ptr[const CKeyValueMetadata] key_value_metadata() const
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index f074852..db7f0c4 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -522,6 +522,15 @@ cdef class FileMetaData:
def row_group(self, int i):
return RowGroupMetaData(self, i)
+ def set_file_path(self, path):
+ """
+ Modify the file_path field of each ColumnChunk in the
+ FileMetaData to be a particular value
+ """
+ cdef:
+ c_string c_path = tobytes(path)
+ self._metadata.set_file_path(c_path)
+
cdef class ParquetSchema:
cdef: