You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2022/04/18 23:31:08 UTC
[arrow] branch master updated: ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new bd66c7e8a4 ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata
bd66c7e8a4 is described below
commit bd66c7e8a4a68a1fc9d30e8c60829518db68fded
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Tue Apr 19 08:30:53 2022 +0900
ARROW-16214: [GLib][Parquet] Add GParquetFileMetadata
Closes #12910 from kou/glib-parquet-file-metadata
Authored-by: Sutou Kouhei <ko...@clear-code.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
c_glib/doc/parquet-glib/parquet-glib-docs.xml | 12 ++
c_glib/parquet-glib/arrow-file-reader.cpp | 17 ++
c_glib/parquet-glib/arrow-file-reader.h | 6 +-
c_glib/parquet-glib/meson.build | 3 +
c_glib/parquet-glib/metadata.cpp | 257 ++++++++++++++++++++++++++
c_glib/parquet-glib/metadata.h | 63 +++++++
c_glib/parquet-glib/metadata.hpp | 30 +++
c_glib/test/parquet/test-file-metadata.rb | 87 +++++++++
8 files changed, 474 insertions(+), 1 deletion(-)
diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
index 0d42a7d5b7..ea6f98ad7d 100644
--- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml
+++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
@@ -45,6 +45,14 @@
</chapter>
</part>
+ <part id="data">
+ <title>Data</title>
+ <chapter id="meta">
+ <title>Meta</title>
+ <xi:include href="xml/metadata.xml"/>
+ </chapter>
+ </part>
+
<chapter id="object-tree">
<title>Object Hierarchy</title>
<xi:include href="xml/tree_index.sgml"/>
@@ -57,6 +65,10 @@
<title>Index of deprecated API</title>
<xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
</index>
+ <index id="api-index-8-0-0" role="8.0.0">
+ <title>Index of new symbols in 8.0.0</title>
+ <xi:include href="xml/api-index-8.0.0.xml"><xi:fallback /></xi:include>
+ </index>
<index id="api-index-6-0-0" role="6.0.0">
<title>Index of new symbols in 6.0.0</title>
<xi:include href="xml/api-index-6.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp
index 2532db2020..fd21a9e9c3 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -21,6 +21,7 @@
#include <arrow-glib/internal-index.hpp>
#include <parquet-glib/arrow-file-reader.hpp>
+#include <parquet-glib/metadata.hpp>
#include <parquet/file_reader.h>
@@ -381,6 +382,22 @@ gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader,
parquet_arrow_file_reader->set_use_threads(use_threads);
}
+/**
+ * gparquet_arrow_file_reader_get_metadata:
+ * @reader: A #GParquetArrowFileReader.
+ *
+ * Returns: (transfer full): The file metadata as a new #GParquetFileMetadata.
+ *
+ * Since: 8.0.0
+ */
+GParquetFileMetadata *
+gparquet_arrow_file_reader_get_metadata(GParquetArrowFileReader *reader)
+{
+ auto parquet_reader = gparquet_arrow_file_reader_get_raw(reader);
+ auto parquet_metadata = parquet_reader->parquet_reader()->metadata();
+ return gparquet_file_metadata_new_raw(&parquet_metadata); // address of a local is OK: the "metadata" property setter copies the shared_ptr
+}
+
G_END_DECLS
GParquetArrowFileReader *
diff --git a/c_glib/parquet-glib/arrow-file-reader.h b/c_glib/parquet-glib/arrow-file-reader.h
index abea06c57f..da234f47c5 100644
--- a/c_glib/parquet-glib/arrow-file-reader.h
+++ b/c_glib/parquet-glib/arrow-file-reader.h
@@ -19,7 +19,7 @@
#pragma once
-#include <arrow-glib/arrow-glib.h>
+#include <parquet-glib/metadata.h>
G_BEGIN_DECLS
@@ -73,4 +73,8 @@ void
gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader,
gboolean use_threads);
+GARROW_AVAILABLE_IN_8_0
+GParquetFileMetadata *
+gparquet_arrow_file_reader_get_metadata(GParquetArrowFileReader *reader);
+
G_END_DECLS
diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build
index 73cd9e45c9..f07ae1d66b 100644
--- a/c_glib/parquet-glib/meson.build
+++ b/c_glib/parquet-glib/meson.build
@@ -22,17 +22,20 @@ project_name = 'parquet-glib'
sources = files(
'arrow-file-reader.cpp',
'arrow-file-writer.cpp',
+ 'metadata.cpp',
)
c_headers = files(
'arrow-file-reader.h',
'arrow-file-writer.h',
+ 'metadata.h',
'parquet-glib.h',
)
cpp_headers = files(
'arrow-file-reader.hpp',
'arrow-file-writer.hpp',
+ 'metadata.hpp',
'parquet-glib.hpp',
)
diff --git a/c_glib/parquet-glib/metadata.cpp b/c_glib/parquet-glib/metadata.cpp
new file mode 100644
index 0000000000..a4c3227ee5
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.cpp
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow-glib/arrow-glib.hpp>
+
+#include <parquet-glib/metadata.hpp>
+
+G_BEGIN_DECLS
+
+/**
+ * SECTION: metadata
+ * @title: Metadata related classes
+ * @include: parquet-glib/parquet-glib.h
+ *
+ * #GParquetFileMetadata is a class for file-level metadata.
+ */
+
+typedef struct GParquetFileMetadataPrivate_ {
+ std::shared_ptr<parquet::FileMetaData> metadata; // wrapped C++ object; constructed in _init() and destroyed in _finalize()
+} GParquetFileMetadataPrivate;
+
+enum {
+ PROP_METADATA = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GParquetFileMetadata,
+ gparquet_file_metadata,
+ G_TYPE_OBJECT)
+
+#define GPARQUET_FILE_METADATA_GET_PRIVATE(object) \
+ static_cast<GParquetFileMetadataPrivate *>( \
+ gparquet_file_metadata_get_instance_private( \
+ GPARQUET_FILE_METADATA(object)))
+
+static void
+gparquet_file_metadata_finalize(GObject *object)
+{
+ auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+ priv->metadata.~shared_ptr(); // explicit destructor call; pairs with the placement new in gparquet_file_metadata_init()
+ G_OBJECT_CLASS(gparquet_file_metadata_parent_class)->finalize(object); // chain up to the parent class
+}
+
+static void
+gparquet_file_metadata_set_property(GObject *object,
+ guint prop_id,
+ const GValue *value,
+ GParamSpec *pspec)
+{
+ auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+
+ switch (prop_id) {
+ case PROP_METADATA: // value carries a std::shared_ptr<parquet::FileMetaData> *; the pointee is copied, not the pointer
+ priv->metadata =
+ *static_cast<std::shared_ptr<parquet::FileMetaData> *>(
+ g_value_get_pointer(value));
+ break;
+ default: // unknown property ID: emit the standard GObject warning
+ G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+ break;
+ }
+}
+
+static void
+gparquet_file_metadata_init(GParquetFileMetadata *object)
+{
+ auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
+ new(&priv->metadata) std::shared_ptr<parquet::FileMetaData>; // placement new: construct an empty shared_ptr in place inside the private struct
+}
+
+static void
+gparquet_file_metadata_class_init(GParquetFileMetadataClass *klass)
+{
+ auto gobject_class = G_OBJECT_CLASS(klass);
+ gobject_class->finalize = gparquet_file_metadata_finalize; // destroys the C++ shared_ptr member
+ gobject_class->set_property = gparquet_file_metadata_set_property; // no get_property: "metadata" is write-only
+
+ GParamSpec *spec;
+ spec = g_param_spec_pointer("metadata",
+ "Metadata",
+ "The raw std::shared_ptr<parquet::FileMetaData>",
+ static_cast<GParamFlags>(G_PARAM_WRITABLE |
+ G_PARAM_CONSTRUCT_ONLY)); // writable only at construction time
+ g_object_class_install_property(gobject_class, PROP_METADATA, spec);
+}
+
+/**
+ * gparquet_file_metadata_equal:
+ * @metadata: A #GParquetFileMetadata.
+ * @other_metadata: A #GParquetFileMetadata to be compared.
+ *
+ * Returns: %TRUE if both of them have the same data, %FALSE
+ * otherwise.
+ *
+ * Since: 8.0.0
+ */
+gboolean
+gparquet_file_metadata_equal(GParquetFileMetadata *metadata,
+ GParquetFileMetadata *other_metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ auto parquet_other_metadata = gparquet_file_metadata_get_raw(other_metadata);
+ return parquet_metadata->Equals(*parquet_other_metadata); // delegates to parquet::FileMetaData::Equals()
+}
+
+/**
+ * gparquet_file_metadata_get_n_columns:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of leaf columns in the schema.
+ *
+ * Parquet thrift definition requires that nested schema elements are
+ * flattened. This method returns the number of columns in the flattened
+ * version, e.g. a struct column with two fields counts as two columns.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->num_columns();
+}
+
+/**
+ * gparquet_file_metadata_get_n_schema_elements:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of flattened schema elements.
+ *
+ * Parquet thrift definition requires that nested schema elements are
+ * flattened. This method returns the total number of elements in the
+ * flattened list, counting group elements as well as leaf columns.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->num_schema_elements();
+}
+
+/**
+ * gparquet_file_metadata_get_n_rows:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The total number of rows recorded in @metadata.
+ *
+ * Since: 8.0.0
+ */
+gint64
+gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->num_rows(); // thin wrapper over parquet::FileMetaData::num_rows()
+}
+
+/**
+ * gparquet_file_metadata_get_n_row_groups:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The number of row groups in the file.
+ *
+ * Since: 8.0.0
+ */
+gint
+gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->num_row_groups(); // thin wrapper over parquet::FileMetaData::num_row_groups()
+}
+
+/**
+ * gparquet_file_metadata_get_created_by:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The user-agent string of the application that wrote the file.
+ *
+ * Since: 8.0.0
+ */
+const gchar *
+gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->created_by().c_str(); // NOTE(review): assumes created_by() returns a reference to a string owned by the metadata, so the pointer outlives this call - confirm
+}
+
+/**
+ * gparquet_file_metadata_get_size:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: The size of the original Thrift-encoded metadata footer.
+ *
+ * Since: 8.0.0
+ */
+guint32
+gparquet_file_metadata_get_size(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->size(); // thin wrapper over parquet::FileMetaData::size()
+}
+
+/**
+ * gparquet_file_metadata_can_decompress:
+ * @metadata: A #GParquetFileMetadata.
+ *
+ * Returns: %TRUE if all of the row groups can be decompressed, %FALSE
+ * otherwise.
+ *
+ * This will return %FALSE if any page of any row group is
+ * compressed with a compression format which is not compiled into the
+ * current Parquet library.
+ *
+ * Since: 8.0.0
+ */
+gboolean
+gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata)
+{
+ auto parquet_metadata = gparquet_file_metadata_get_raw(metadata);
+ return parquet_metadata->can_decompress();
+}
+
+G_END_DECLS
+
+GParquetFileMetadata *
+gparquet_file_metadata_new_raw(
+ std::shared_ptr<parquet::FileMetaData> *parquet_metadata)
+{
+ auto metadata =
+ GPARQUET_FILE_METADATA(g_object_new(GPARQUET_TYPE_FILE_METADATA,
+ "metadata", parquet_metadata, // the property setter copies the pointed-to shared_ptr
+ NULL));
+ return metadata; // newly created object, returned to the caller
+}
+
+std::shared_ptr<parquet::FileMetaData>
+gparquet_file_metadata_get_raw(GParquetFileMetadata *metadata)
+{
+ auto priv = GPARQUET_FILE_METADATA_GET_PRIVATE(metadata);
+ return priv->metadata; // returned by value: the caller gets its own shared_ptr copy
+}
diff --git a/c_glib/parquet-glib/metadata.h b/c_glib/parquet-glib/metadata.h
new file mode 100644
index 0000000000..6a0533e774
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.h
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow-glib/arrow-glib.h>
+
+G_BEGIN_DECLS
+
+#define GPARQUET_TYPE_FILE_METADATA (gparquet_file_metadata_get_type())
+G_DECLARE_DERIVABLE_TYPE(GParquetFileMetadata,
+ gparquet_file_metadata,
+ GPARQUET,
+ FILE_METADATA,
+ GObject)
+struct _GParquetFileMetadataClass
+{
+ GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_8_0
+gboolean
+gparquet_file_metadata_equal(GParquetFileMetadata *metadata,
+ GParquetFileMetadata *other_metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint64
+gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gint
+gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+const gchar *
+gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+guint32
+gparquet_file_metadata_get_size(GParquetFileMetadata *metadata);
+GARROW_AVAILABLE_IN_8_0
+gboolean
+gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata);
+
+G_END_DECLS
diff --git a/c_glib/parquet-glib/metadata.hpp b/c_glib/parquet-glib/metadata.hpp
new file mode 100644
index 0000000000..d5a826b8df
--- /dev/null
+++ b/c_glib/parquet-glib/metadata.hpp
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <parquet/metadata.h>
+
+#include <parquet-glib/metadata.h>
+
+GParquetFileMetadata *
+gparquet_file_metadata_new_raw(
+ std::shared_ptr<parquet::FileMetaData> *parquet_metadata);
+std::shared_ptr<parquet::FileMetaData>
+gparquet_file_metadata_get_raw(GParquetFileMetadata *metadata);
diff --git a/c_glib/test/parquet/test-file-metadata.rb b/c_glib/test/parquet/test-file-metadata.rb
new file mode 100644
index 0000000000..b1d34da26a
--- /dev/null
+++ b/c_glib/test/parquet/test-file-metadata.rb
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestParquetFileMetadata < Test::Unit::TestCase
+ include Helper::Buildable
+
+ def setup # writes a two-row table to a temporary Parquet file and reads its metadata back
+ omit("Parquet is required") unless defined?(::Parquet)
+ @file = Tempfile.open(["data", ".parquet"]) # NOTE(review): never closed/unlinked; no teardown is defined in this class
+ @string_array = build_string_array([nil, "hello"])
+ fields = [
+ Arrow::Field.new("int8", Arrow::Int8DataType.new),
+ Arrow::Field.new("boolean", Arrow::BooleanDataType.new),
+ ]
+ structs = [
+ {
+ "int8" => -29,
+ "boolean" => true,
+ },
+ nil,
+ ]
+ @struct_array = build_struct_array(fields, structs)
+ @table = build_table("string" => @string_array,
+ "struct" => @struct_array)
+ writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path)
+ chunk_size = 1 # presumably rows per row group; the two-row table yields the 2 row groups asserted below
+ writer.write_table(@table, chunk_size)
+ writer.close
+ reader = Parquet::ArrowFileReader.new(@file.path)
+ @metadata = reader.metadata
+ end
+
+ test("#==") do # metadata read twice from the same file must compare equal
+ reader = Parquet::ArrowFileReader.new(@file.path)
+ other_metadata = reader.metadata
+ assert do
+ @metadata == other_metadata
+ end
+ end
+
+ test("#n_columns") do # 3 = "string" plus the struct's "int8" and "boolean" fields, i.e. leaf columns
+ assert_equal(3, @metadata.n_columns)
+ end
+
+ test("#n_schema_elements") do # 5 is more than the 3 leaf columns, so non-leaf elements are counted too
+ assert_equal(5, @metadata.n_schema_elements)
+ end
+
+ test("#n_rows") do
+ assert_equal(2, @metadata.n_rows)
+ end
+
+ test("#n_row_groups") do
+ assert_equal(2, @metadata.n_row_groups)
+ end
+
+ test("#created_by") do # the trailing version (optionally -SNAPSHOT) is rewritten to 1.0.0 so the check is version independent
+ assert_equal("parquet-cpp-arrow version 1.0.0",
+ @metadata.created_by.gsub(/ [\d.]+(?:-SNAPSHOT)?\z/, " 1.0.0"))
+ end
+
+ test("#size") do # only checks that the size is positive, not an exact value
+ assert do
+ @metadata.size > 0
+ end
+ end
+
+ test("#can_decompress?") do
+ assert do
+ @metadata.can_decompress?
+ end
+ end
+end