You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/05/26 12:22:36 UTC

[arrow] branch master updated: ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f82af62  ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance
f82af62 is described below

commit f82af62dfde364f3eeea2015586ec5846a78ec8f
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun May 26 14:22:27 2019 +0200

    ARROW-5349: [C++][Parquet] Add method to set file path in a parquet::FileMetaData instance
    
    This is an RFC based on the ongoing discussion. If this seems like the accepted approach, I'll add a Python unit test.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #4386 from wesm/parquet-metadata-set-file-path and squashes the following commits:
    
    e77cf7a8a <Wes McKinney> Add method to set file path in a parquet::FileMetaData instance
---
 cpp/src/parquet/metadata-test.cc |  5 +++++
 cpp/src/parquet/metadata.cc      | 10 ++++++++++
 cpp/src/parquet/metadata.h       |  5 +++++
 python/pyarrow/_parquet.pxd      |  2 ++
 python/pyarrow/_parquet.pyx      |  9 +++++++++
 5 files changed, 31 insertions(+)

diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc
index 7115876..8b6ebc1 100644
--- a/cpp/src/parquet/metadata-test.cc
+++ b/cpp/src/parquet/metadata-test.cc
@@ -163,6 +163,11 @@ TEST(Metadata, TestBuildAccess) {
   ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
   ASSERT_EQ(10, rg2_column1->data_page_offset());
   ASSERT_EQ(26, rg2_column2->data_page_offset());
+
+  // Test FileMetaData::set_file_path
+  ASSERT_TRUE(rg2_column1->file_path().empty());
+  f_accessor->set_file_path("/foo/bar/bar.parquet");
+  ASSERT_EQ("/foo/bar/bar.parquet", rg2_column1->file_path());
 }
 
 TEST(Metadata, TestV1Version) {
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index f596061..5d701a7 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -386,6 +386,14 @@ class FileMetaData::FileMetaDataImpl {
     return key_value_metadata_;
   }
 
+  void set_file_path(const std::string& path) {  // gives every column chunk of every row group the same path
+    for (format::RowGroup& row_group : metadata_->row_groups) {  // non-const refs: metadata_ is edited in place
+      for (format::ColumnChunk& chunk : row_group.columns) {
+        chunk.__set_file_path(path);  // NOTE(review): looks like a Thrift-generated setter; confirm it also flags the field as set
+      }
+    }
+  }
+
  private:
   friend FileMetaDataBuilder;
   uint32_t metadata_len_;
@@ -483,6 +491,8 @@ std::shared_ptr<const KeyValueMetadata> FileMetaData::key_value_metadata() const
   return impl_->key_value_metadata();
 }
 
+void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }  // thin forwarder to impl_
+
 void FileMetaData::WriteTo(OutputStream* dst) const { return impl_->WriteTo(dst); }
 
 ApplicationVersion::ApplicationVersion(const std::string& application, int major,
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index cd31a3c..fe67bc0 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -110,8 +110,10 @@ class PARQUET_EXPORT ColumnChunkMetaData {
 
   // column chunk
   int64_t file_offset() const;
+
   // parameter is only used when a dataset is spread across multiple files
   const std::string& file_path() const;
+
   // column metadata
   Type::type type() const;
   int64_t num_values() const;
@@ -190,6 +192,9 @@ class PARQUET_EXPORT FileMetaData {
 
   std::shared_ptr<const KeyValueMetadata> key_value_metadata() const;
 
+  // Set the file_path field of every ColumnChunk in every row group to `path`
+  void set_file_path(const std::string& path);
+
  private:
   friend FileMetaDataBuilder;
   explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index f1b44b0..75c0015 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -222,6 +222,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         const c_string created_by()
         int num_schema_elements()
 
+        void set_file_path(const c_string& path)
+
         unique_ptr[CRowGroupMetaData] RowGroup(int i)
         const SchemaDescriptor* schema()
         shared_ptr[const CKeyValueMetadata] key_value_metadata() const
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index f074852..db7f0c4 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -522,6 +522,15 @@ cdef class FileMetaData:
     def row_group(self, int i):
         return RowGroupMetaData(self, i)
 
+    def set_file_path(self, path):
+        """
+        Modify the file_path field of each ColumnChunk in the
+        FileMetaData to be a particular value
+        """
+        cdef:
+            c_string c_path = tobytes(path)  # tobytes() result is held as a c_string for the C++ call below
+        self._metadata.set_file_path(c_path)  # updates the wrapped metadata object; nothing is returned
+
 
 cdef class ParquetSchema:
     cdef: