You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2022/04/18 23:31:08 UTC

[arrow] branch master updated: ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new bd66c7e8a4 ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata
bd66c7e8a4 is described below

commit bd66c7e8a4a68a1fc9d30e8c60829518db68fded
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Tue Apr 19 08:30:53 2022 +0900

    ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata
    
    Closes #12910 from kou/glib-parquet-file-metadata
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 c_glib/doc/parquet-glib/parquet-glib-docs.xml |  12 ++
 c_glib/parquet-glib/arrow-file-reader.cpp     |  17 ++
 c_glib/parquet-glib/arrow-file-reader.h       |   6 +-
 c_glib/parquet-glib/meson.build               |   3 +
 c_glib/parquet-glib/metadata.cpp              | 257 ++++++++++++++++++++++++++
 c_glib/parquet-glib/metadata.h                |  63 +++++++
 c_glib/parquet-glib/metadata.hpp              |  30 +++
 c_glib/test/parquet/test-file-metadata.rb     |  87 +++++++++
 8 files changed, 474 insertions(+), 1 deletion(-)

diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
index 0d42a7d5b7..ea6f98ad7d 100644
--- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml
+++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
@@ -45,6 +45,14 @@
     </chapter>
   </part>
 
+  <part id="data">
+    <title>Data</title>
+    <chapter id="meta">
+      <title>Meta</title>
+      <xi:include href="xml/metadata.xml"/>
+    </chapter>
+  </part>
+
   <chapter id="object-tree">
     <title>Object Hierarchy</title>
     <xi:include href="xml/tree_index.sgml"/>
@@ -57,6 +65,10 @@
     <title>Index of deprecated API</title>
     <xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
   </index>
+  <index id="api-index-8-0-0" role="8.0.0">
+    <title>Index of new symbols in 8.0.0</title>
+    <xi:include href="xml/api-index-8.0.0.xml"><xi:fallback /></xi:include>
+  </index>
   <index id="api-index-6-0-0" role="6.0.0">
     <title>Index of new symbols in 6.0.0</title>
     <xi:include href="xml/api-index-6.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp
index 2532db2020..fd21a9e9c3 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -21,6 +21,7 @@
 #include <arrow-glib/internal-index.hpp>
 
 #include <parquet-glib/arrow-file-reader.hpp>
+#include <parquet-glib/metadata.hpp>
 
 #include <parquet/file_reader.h>
 
@@ -381,6 +382,22 @@ gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader,
   parquet_arrow_file_reader->set_use_threads(use_threads);
 }
 
+/**
+ * gparquet_arrow_file_reader_get_metadata:
+ * @reader: A #GParquetArrowFileReader.
+ *
+ * Returns: (transfer full): The file-level metadata held by the Parquet reader underlying @reader.
+ *
+ * Since: 8.0.0
+ */
+GParquetFileMetadata *
+gparquet_arrow_file_reader_get_metadata(GParquetArrowFileReader *reader)
+{
+  auto parquet_reader = gparquet_arrow_file_reader_get_raw(reader);
+  auto parquet_metadata = parquet_reader->parquet_reader()->metadata();
+  return gparquet_file_metadata_new_raw(&parquet_metadata);  // address of a local is fine: the "metadata" property setter copies the shared_ptr
+}
+
 G_END_DECLS
 
 GParquetArrowFileReader *
diff --git a/c_glib/parquet-glib/arrow-file-reader.h b/c_glib/parquet-glib/arrow-file-reader.h
index abea06c57f..da234f47c5 100644
--- a/c_glib/parquet-glib/arrow-file-reader.h
+++ b/c_glib/parquet-glib/arrow-file-reader.h
@@ -19,7 +19,7 @@
 
 #pragma once
 
-#include <arrow-glib/arrow-glib.h>
+#include <parquet-glib/metadata.h>
 
 G_BEGIN_DECLS
 
@@ -73,4 +73,8 @@ void
 gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader,
                                            gboolean use_threads);
 
+GARROW_AVAILABLE_IN_8_0
+GParquetFileMetadata *
+gparquet_arrow_file_reader_get_metadata(GParquetArrowFileReader *reader);
+
 G_END_DECLS
diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build
index 73cd9e45c9..f07ae1d66b 100644
--- a/c_glib/parquet-glib/meson.build
+++ b/c_glib/parquet-glib/meson.build
@@ -22,17 +22,20 @@ project_name = 'parquet-glib'
 sources = files(
   'arrow-file-reader.cpp',
   'arrow-file-writer.cpp',
+  'metadata.cpp',
 )
 
 c_headers = files(
   'arrow-file-reader.h',
   'arrow-file-writer.h',
+  'metadata.h',
   'parquet-glib.h',
 )
 
 cpp_headers = files(
   'arrow-file-reader.hpp',
   'arrow-file-writer.hpp',
+  'metadata.hpp',
   'parquet-glib.hpp',
 )
 
diff --git a/c_glib/parquet-glib/metadata.cpp b/c_glib/parquet-glib/metadata.cpp
new file mode 100644
index 0000000000..a4c3227ee5
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.cpp
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow-glib/arrow-glib.hpp>
+
+#include <parquet-glib/metadata.hpp>
+
+G_BEGIN_DECLS
+
+/**
+ * SECTION: metadata
+ * @title: Metadata related classes
+ * @include: parquet-glib/parquet-glib.h
+ *
+ * #GParquetFileMetadata is a class for file-level metadata.
+ */
+
+typedef struct GParquetFileMetadataPrivate_ {
+  std::shared_ptr<parquet::FileMetaData> metadata;
+} GParquetFileMetadataPrivate;
+
+enum {
+  PROP_METADATA = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GParquetFileMetadata,
+                           gparquet_file_metadata,
+                           G_TYPE_OBJECT)
+
+#define GPARQUET_FILE_METADATA_GET_PRIVATE(object)      \
+  static_cast<GParquetFileMetadataPrivate *>(           \
+    gparquet_file_metadata_get_instance_private(        \
+      GPARQUET_FILE_METADATA(object)))
+
+static void
+gparquet_file_metadata_finalize(GObject *object)
+{
+  auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+  priv->metadata.~shared_ptr();  // explicit destructor call; pairs with the placement new in gparquet_file_metadata_init()
+  G_OBJECT_CLASS(gparquet_file_metadata_parent_class)->finalize(object);  // chain up to the parent class
+}
+
+static void
+gparquet_file_metadata_set_property(GObject *object,
+                                    guint prop_id,
+                                    const GValue *value,
+                                    GParamSpec *pspec)
+{
+  auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_METADATA:  // the pointer value is a std::shared_ptr<parquet::FileMetaData> *
+    priv->metadata =
+      *static_cast<std::shared_ptr<parquet::FileMetaData> *>(
+        g_value_get_pointer(value));  // dereference and copy-assign, so the object keeps its own shared_ptr
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gparquet_file_metadata_init(GParquetFileMetadata *object)
+{
+  auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+  new(&priv->metadata) std::shared_ptr<parquet::FileMetaData>;  // placement new of an empty shared_ptr; destroyed explicitly in gparquet_file_metadata_finalize()
+}
+
+static void
+gparquet_file_metadata_class_init(GParquetFileMetadataClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+  gobject_class->finalize = gparquet_file_metadata_finalize;
+  gobject_class->set_property = gparquet_file_metadata_set_property;  // no get_property is installed: "metadata" is write-only
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("metadata",
+                              "Metadata",
+                              "The raw std::shared_ptr<parquet::FileMetaData>",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));  // can only be supplied at g_object_new() time
+  g_object_class_install_property(gobject_class, PROP_METADATA, spec);
+}
+
+/**
+ * gparquet_file_metadata_equal:
+ * @metadata: A #GParquetFileMetadata.
+ * @other_metadata: A #GParquetFileMetadata.
+ *
+ * Returns: %TRUE if both of them have the same data, %FALSE
+ *   otherwise.
+ *
+ * Since: 8.0.0
+ */
+gboolean
+gparquet_file_metadata_equal(GParquetFileMetadata *metadata,
+                             GParquetFileMetadata *other_metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  auto parquet_other_metadata = gparquet_file_metadata_get_raw(other_metadata);
+  return parquet_metadata->Equals(*parquet_other_metadata);
+}
+
+/**
+ * gparquet_file_metadata_get_n_columns:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of leaf columns in the schema.
+ *
+ *   Parquet thrift definition requires that nested schema elements are
+ *   flattened. Each leaf field counts as one column: a string column plus
+ *   a struct column that has two fields gives 3.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->num_columns();
+}
+
+/**
+ * gparquet_file_metadata_get_n_schema_elements:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of flattened schema elements.
+ *
+ *   Parquet thrift definition requires that nested schema elements are
+ *   flattened. This method returns the total number of elements in the
+ *   flattened list.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->num_schema_elements();
+}
+
+/**
+ * gparquet_file_metadata_get_n_rows:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The total number of rows.
+ *
+ * Since: 8.0.0
+ */
+gint64
+gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->num_rows();
+}
+
+/**
+ * gparquet_file_metadata_get_n_row_groups:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of row groups in the file.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->num_row_groups();
+}
+
+/**
+ * gparquet_file_metadata_get_created_by:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The application's user-agent string of the writer.
+ *
+ * Since: 8.0.0
+ */
+const gchar *
+gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->created_by().c_str();  // NOTE(review): only valid if created_by() returns a reference to a string kept alive by @metadata - confirm
+}
+
+/**
+ * gparquet_file_metadata_get_size:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The size of the original thrift encoded metadata footer.
+ *
+ * Since: 8.0.0
+ */
+guint32
+gparquet_file_metadata_get_size(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->size();
+}
+
+/**
+ * gparquet_file_metadata_can_decompress:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: %TRUE if all of the row groups can be decompressed, %FALSE
+ *   otherwise.
+ *
+ *   This returns %FALSE if any page of any row group is compressed
+ *   with a compression format that is not compiled into the current
+ *   Parquet library.
+ *
+ * Since: 8.0.0
+ */
+gboolean
+gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata)
+{
+  auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+  return parquet_metadata->can_decompress();
+}
+
+G_END_DECLS
+
+GParquetFileMetadata *
+gparquet_file_metadata_new_raw(
+  std::shared_ptr<parquet::FileMetaData> *parquet_metadata)
+{
+  auto metadata =
+    GPARQUET_FILE_METADATA(g_object_new(GPARQUET_TYPE_FILE_METADATA,
+                                        "metadata", parquet_metadata,  // borrowed pointer: the property setter copies the pointed-to shared_ptr
+                                        NULL));
+  return metadata;
+}
+
+std::shared_ptr<parquet::FileMetaData>
+gparquet_file_metadata_get_raw(GParquetFileMetadata *metadata)
+{
+  auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(metadata);
+  return priv->metadata;
+}
diff --git a/c_glib/parquet-glib/metadata.h b/c_glib/parquet-glib/metadata.h
new file mode 100644
index 0000000000..6a0533e774
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.h
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow-glib/arrow-glib.h>
+
+G_BEGIN_DECLS
+
+#define GPARQUET_TYPE_FILE_METADATA (gparquet_file_metadata_get_type())
+G_DECLARE_DERIVABLE_TYPE(GParquetFileMetadata,
+                         gparquet_file_metadata,
+                         GPARQUET,
+                         FILE_METADATA,
+                         GObject)
+struct _GParquetFileMetadataClass
+{
+  GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_8_0
+gboolean
+gparquet_file_metadata_equal(GParquetFileMetadata *metadata,
+                             GParquetFileMetadata *other_metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint64
+gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+const gchar *
+gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+guint32
+gparquet_file_metadata_get_size(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gboolean
+gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata);
+
+G_END_DECLS
diff --git a/c_glib/parquet-glib/metadata.hpp b/c_glib/parquet-glib/metadata.hpp
new file mode 100644
index 0000000000..d5a826b8df
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.hpp
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <parquet/metadata.h>
+
+#include <parquet-glib/metadata.h>
+
+GParquetFileMetadata *
+gparquet_file_metadata_new_raw(
+  std::shared_ptr<parquet::FileMetaData> *parquet_metadata);
+std::shared_ptr<parquet::FileMetaData>
+gparquet_file_metadata_get_raw(GParquetFileMetadata *metadata);
diff --git a/c_glib/test/parquet/test-file-metadata.rb b/c_glib/test/parquet/test-file-metadata.rb
new file mode 100644
index 0000000000..b1d34da26a
--- /dev/null
+++ b/c_glib/test/parquet/test-file-metadata.rb
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestParquetFileMetadata < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup  # writes a two-row table (string + struct columns) to a temp Parquet file and reads its metadata back
+    omit("Parquet is required") unless defined?(::Parquet)
+    @file = Tempfile.open(["data", ".parquet"])
+    @string_array = build_string_array([nil, "hello"])
+    fields = [
+      Arrow::Field.new("int8", Arrow::Int8DataType.new),
+      Arrow::Field.new("boolean", Arrow::BooleanDataType.new),
+    ]
+    structs = [
+      {
+        "int8" => -29,
+        "boolean" => true,
+      },
+      nil,
+    ]
+    @struct_array = build_struct_array(fields, structs)
+    @table = build_table("string" => @string_array,
+                         "struct" => @struct_array)
+    writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path)
+    chunk_size = 1  # with the 2 rows above this yields the 2 row groups asserted in #n_row_groups
+    writer.write_table(@table, chunk_size)
+    writer.close
+    reader = Parquet::ArrowFileReader.new(@file.path)
+    @metadata = reader.metadata
+  end
+
+  test("#==") do
+    reader = Parquet::ArrowFileReader.new(@file.path)  # a second, independent reader over the same file
+    other_metadata = reader.metadata
+    assert do
+      @metadata == other_metadata
+    end
+  end
+
+  test("#n_columns") do
+    assert_equal(3, @metadata.n_columns)  # leaf columns: string, struct.int8, struct.boolean
+  end
+
+  test("#n_schema_elements") do
+    assert_equal(5, @metadata.n_schema_elements)  # presumably schema root + string + struct + int8 + boolean
+  end
+
+  test("#n_rows") do
+    assert_equal(2, @metadata.n_rows)
+  end
+
+  test("#n_row_groups") do
+    assert_equal(2, @metadata.n_row_groups)
+  end
+
+  test("#created_by") do
+    assert_equal("parquet-cpp-arrow version 1.0.0",  # the real version is normalised to 1.0.0 by the gsub below
+                 @metadata.created_by.gsub(/ [\d.]+(?:-SNAPSHOT)?\z/, " 1.0.0"))
+  end
+
+  test("#size") do
+    assert do
+      @metadata.size > 0
+    end
+  end
+
+  test("#can_decompress?") do
+    assert do
+      @metadata.can_decompress?
+    end
+  end
+end