You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/02 04:18:13 UTC

[arrow] branch master updated: GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ad2c8b090 GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)
3ad2c8b090 is described below

commit 3ad2c8b090ff951f2881c4ac8789fa9817e824b6
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Mon Jan 2 13:17:12 2023 +0900

    GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)
    
    
    * Closes: #15146
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 c_glib/arrow-dataset-glib/dataset-factory.cpp      | 231 ++++++++++++++++++++-
 c_glib/arrow-dataset-glib/dataset-factory.h        |  17 ++
 c_glib/arrow-dataset-glib/dataset-factory.hpp      |   5 +
 c_glib/arrow-glib/version.h.in                     |  46 ++++
 .../arrow-dataset-glib/arrow-dataset-glib-docs.xml |   4 +
 c_glib/doc/arrow-glib/arrow-glib-docs.xml          |   8 +
 .../dataset/test-file-system-dataset-factory.rb    |  47 +++++
 7 files changed, 356 insertions(+), 2 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp
index 1e532760a2..97cab55542 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.cpp
+++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp
@@ -19,6 +19,7 @@
 
 #include <arrow-glib/error.hpp>
 #include <arrow-glib/file-system.hpp>
+#include <arrow-glib/schema.hpp>
 
 #include <arrow-dataset-glib/dataset-factory.hpp>
 #include <arrow-dataset-glib/dataset.hpp>
@@ -33,6 +34,8 @@ G_BEGIN_DECLS
  * @title: Dataset factory related classes
  * @include: arrow-dataset-glib/arrow-dataset-glib.h
  *
+ * #GADatasetFinishOptions is a class for gadataset_dataset_factory_finish().
+ *
  * #GADatasetDatasetFactory is a base class for dataset factories.
  *
  * #GADatasetFileSystemDatasetFactory is a class for
@@ -41,6 +44,203 @@ G_BEGIN_DECLS
  * Since: 5.0.0
  */
 
+struct GADatasetFinishOptionsPrivate {
+  arrow::dataset::FinishOptions options;
+  GArrowSchema *schema;
+};
+
+enum {
+  PROP_FINISH_OPTIONS = 1,
+  PROP_SCHEMA,
+  PROP_INSPECT_N_FRAGMENTS,
+  PROP_VALIDATE_FRAGMENTS,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFinishOptions,
+                           gadataset_finish_options,
+                           G_TYPE_OBJECT)
+
+#define GADATASET_FINISH_OPTIONS_GET_PRIVATE(obj)        \
+  static_cast<GADatasetFinishOptionsPrivate *>(          \
+    gadataset_finish_options_get_instance_private(       \
+      GADATASET_FINISH_OPTIONS(obj)))
+
+static void
+gadataset_finish_options_finalize(GObject *object)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  priv->options.~FinishOptions();
+  G_OBJECT_CLASS(gadataset_finish_options_parent_class)->finalize(object);
+}
+
+static void
+gadataset_finish_options_dispose(GObject *object)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  if (priv->schema) {
+    g_object_unref(priv->schema);
+    priv->schema = nullptr;
+  }
+  G_OBJECT_CLASS(gadataset_finish_options_parent_class)->dispose(object);
+}
+
+static void
+gadataset_finish_options_set_property(GObject *object,
+                                      guint prop_id,
+                                      const GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_FINISH_OPTIONS:
+    {
+      auto arrow_finish_options =
+        static_cast<arrow::dataset::FinishOptions *>(g_value_get_pointer(value));
+      if (arrow_finish_options) {
+        priv->options = *arrow_finish_options;
+        if (priv->options.schema) {
+          priv->schema = garrow_schema_new_raw(&(priv->options.schema));
+        }
+      }
+    }
+    break;
+  case PROP_SCHEMA:
+    if (priv->schema != g_value_get_object(value)) {
+      auto schema_previous = priv->schema;
+      auto schema = g_value_dup_object(value);
+      if (schema) {
+        priv->schema = GARROW_SCHEMA(schema);
+        priv->options.schema = garrow_schema_get_raw(priv->schema);
+      } else {
+        priv->schema = nullptr;
+        priv->options.schema = nullptr;
+      }
+      if (schema_previous) {
+        g_object_unref(schema_previous);
+      }
+    }
+    break;
+  case PROP_INSPECT_N_FRAGMENTS:
+    priv->options.inspect_options.fragments = g_value_get_int(value);
+    break;
+  case PROP_VALIDATE_FRAGMENTS:
+    priv->options.validate_fragments = g_value_get_boolean(value);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gadataset_finish_options_get_property(GObject *object,
+                                      guint prop_id,
+                                      GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_SCHEMA:
+    g_value_set_object(value, priv->schema);
+    break;
+  case PROP_INSPECT_N_FRAGMENTS:
+    g_value_set_int(value, priv->options.inspect_options.fragments);
+    break;
+  case PROP_VALIDATE_FRAGMENTS:
+    g_value_set_boolean(value, priv->options.validate_fragments);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gadataset_finish_options_init(GADatasetFinishOptions *object)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  new(&priv->options) arrow::dataset::FinishOptions;
+}
+
+static void
+gadataset_finish_options_class_init(GADatasetFinishOptionsClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+  gobject_class->finalize = gadataset_finish_options_finalize;
+  gobject_class->dispose = gadataset_finish_options_dispose;
+  gobject_class->set_property = gadataset_finish_options_set_property;
+  gobject_class->get_property = gadataset_finish_options_get_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("finish-options",
+                              "Finish options",
+                              "The raw arrow::dataset::FinishOptions *",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_FINISH_OPTIONS, spec);
+
+  /**
+   * GADatasetFinishOptions:schema:
+   *
+   * The schema to finalize the dataset's schema.
+   *
+   * Since: 11.0.0
+   */
+  spec = g_param_spec_object("schema",
+                             "Schema",
+                             "The schema to finalize the dataset's schema",
+                             GARROW_TYPE_SCHEMA,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_SCHEMA, spec);
+
+  arrow::dataset::FinishOptions finish_options;
+  /**
+   * GADatasetFinishOptions:inspect-n-fragments:
+   *
+   * The number of fragments to be used to inspect schema.
+   *
+   * Since: 11.0.0
+   */
+  spec = g_param_spec_int("inspect-n-fragments",
+                          "Inspect N fragments",
+                          "The number of fragments to be used to inspect schema",
+                          arrow::dataset::InspectOptions::kInspectAllFragments,
+                          G_MAXINT,
+                          finish_options.inspect_options.fragments,
+                          static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_INSPECT_N_FRAGMENTS, spec);
+
+  /**
+   * GADatasetFinishOptions:validate-fragments:
+   *
+   * Whether to validate fragments against the given schema or not.
+   *
+   * Since: 11.0.0
+   */
+  spec = g_param_spec_boolean("validate-fragments",
+                              "Validate fragments",
+                              "Whether validate fragments or not",
+                              finish_options.validate_fragments,
+                              static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_VALIDATE_FRAGMENTS, spec);
+}
+
+/**
+ * gadataset_finish_options_new:
+ *
+ * Returns: A newly created #GADatasetFinishOptions.
+ *
+ * Since: 11.0.0
+ */
+GADatasetFinishOptions *
+gadataset_finish_options_new(void)
+{
+  return gadataset_finish_options_new_raw(nullptr);
+}
+
+
 typedef struct GADatasetDatasetFactoryPrivate_ {
   std::shared_ptr<arrow::dataset::DatasetFactory> factory;
 } GADatasetDatasetFactoryPrivate;
@@ -118,6 +318,7 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass)
 /**
  * gadataset_dataset_factory_finish:
  * @factory: A #GADatasetDatasetFactory.
+ * @options: (nullable): A #GADatasetFinishOptions.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Returns: (transfer full) (nullable):
@@ -127,10 +328,15 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass)
  */
 GADatasetDataset *
 gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory,
+                                 GADatasetFinishOptions *options,
                                  GError **error)
 {
   auto arrow_factory = gadataset_dataset_factory_get_raw(factory);
-  auto arrow_dataset_result = arrow_factory->Finish();
+  arrow::dataset::FinishOptions arrow_options;
+  if (options) {
+    arrow_options = *gadataset_finish_options_get_raw(options);
+  }
+  auto arrow_dataset_result = arrow_factory->Finish(arrow_options);
   if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) {
     auto arrow_dataset = *arrow_dataset_result;
     return gadataset_dataset_new_raw(&arrow_dataset);
@@ -474,6 +680,7 @@ gadataset_file_system_dataset_factory_add_path(
 /**
  * gadataset_file_system_dataset_factory_finish:
  * @factory: A #GADatasetFileSystemDatasetFactory.
+ * @options: (nullable): A #GADatasetFinishOptions.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Returns: (transfer full) (nullable):
@@ -484,6 +691,7 @@ gadataset_file_system_dataset_factory_add_path(
 GADatasetFileSystemDataset *
 gadataset_file_system_dataset_factory_finish(
   GADatasetFileSystemDatasetFactory *factory,
+  GADatasetFinishOptions *options,
   GError **error)
 {
   const gchar *context = "[file-system-dataset-factory][finish]";
@@ -527,7 +735,11 @@ gadataset_file_system_dataset_factory_finish(
   if (!garrow::check(error, arrow_factory_result, context)) {
     return NULL;
   }
-  auto arrow_dataset_result = (*arrow_factory_result)->Finish();
+  arrow::dataset::FinishOptions arrow_options;
+  if (options) {
+    arrow_options = *gadataset_finish_options_get_raw(options);
+  }
+  auto arrow_dataset_result = (*arrow_factory_result)->Finish(arrow_options);
   if (!garrow::check(error, arrow_dataset_result, context)) {
     return NULL;
   }
@@ -544,6 +756,21 @@ gadataset_file_system_dataset_factory_finish(
 
 G_END_DECLS
 
+GADatasetFinishOptions *
+gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *options)
+{
+  return GADATASET_FINISH_OPTIONS(g_object_new(GADATASET_TYPE_FINISH_OPTIONS,
+                                               "finish-options", options,
+                                               NULL));
+}
+
+arrow::dataset::FinishOptions *
+gadataset_finish_options_get_raw(GADatasetFinishOptions *options)
+{
+  auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(options);
+  return &(priv->options);
+}
+
 std::shared_ptr<arrow::dataset::DatasetFactory>
 gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory)
 {
diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h
index e2ee3ed980..292a9ca70d 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.h
+++ b/c_glib/arrow-dataset-glib/dataset-factory.h
@@ -23,6 +23,21 @@
 
 G_BEGIN_DECLS
 
+#define GADATASET_TYPE_FINISH_OPTIONS (gadataset_finish_options_get_type())
+G_DECLARE_DERIVABLE_TYPE(GADatasetFinishOptions,
+                         gadataset_finish_options,
+                         GADATASET,
+                         FINISH_OPTIONS,
+                         GObject)
+struct _GADatasetFinishOptionsClass
+{
+  GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_11_0
+GADatasetFinishOptions *
+gadataset_finish_options_new(void);
+
 #define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type())
 G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory,
                          gadataset_dataset_factory,
@@ -37,6 +52,7 @@ struct _GADatasetDatasetFactoryClass
 GARROW_AVAILABLE_IN_5_0
 GADatasetDataset *
 gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory,
+                                 GADatasetFinishOptions *options,
                                  GError **error);
 
 
@@ -92,6 +108,7 @@ GARROW_AVAILABLE_IN_5_0
 GADatasetFileSystemDataset *
 gadataset_file_system_dataset_factory_finish(
   GADatasetFileSystemDatasetFactory *factory,
+  GADatasetFinishOptions *options,
   GError **error);
 
 
diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp
index 114db35bc5..6ff68945ad 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.hpp
+++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp
@@ -23,5 +23,10 @@
 
 #include <arrow-dataset-glib/dataset-factory.h>
 
+GADatasetFinishOptions *
+gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *arrow_options);
+arrow::dataset::FinishOptions *
+gadataset_finish_options_get_raw(GADatasetFinishOptions *options);
+
 std::shared_ptr<arrow::dataset::DatasetFactory>
 gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory);
diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in
index 74c54b998d..bd67ed6b8b 100644
--- a/c_glib/arrow-glib/version.h.in
+++ b/c_glib/arrow-glib/version.h.in
@@ -110,6 +110,24 @@
 #  define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor)
 #endif
 
+/**
+ * GARROW_VERSION_11_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 11.0.0
+ */
+#define GARROW_VERSION_11_0 G_ENCODE_VERSION(11, 0)
+
+/**
+ * GARROW_VERSION_10_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 10.0.0
+ */
+#define GARROW_VERSION_10_0 G_ENCODE_VERSION(10, 0)
+
 /**
  * GARROW_VERSION_9_0:
  *
@@ -301,6 +319,34 @@
 
 #define GARROW_AVAILABLE_IN_ALL
 
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_11_0
+#  define GARROW_DEPRECATED_IN_11_0                GARROW_DEPRECATED
+#  define GARROW_DEPRECATED_IN_11_0_FOR(function)  GARROW_DEPRECATED_FOR(function)
+#else
+#  define GARROW_DEPRECATED_IN_11_0
+#  define GARROW_DEPRECATED_IN_11_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_11_0
+#  define GARROW_AVAILABLE_IN_11_0 GARROW_UNAVAILABLE(11, 0)
+#else
+#  define GARROW_AVAILABLE_IN_11_0
+#endif
+
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_10_0
+#  define GARROW_DEPRECATED_IN_10_0                GARROW_DEPRECATED
+#  define GARROW_DEPRECATED_IN_10_0_FOR(function)  GARROW_DEPRECATED_FOR(function)
+#else
+#  define GARROW_DEPRECATED_IN_10_0
+#  define GARROW_DEPRECATED_IN_10_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_10_0
+#  define GARROW_AVAILABLE_IN_10_0 GARROW_UNAVAILABLE(10, 0)
+#else
+#  define GARROW_AVAILABLE_IN_10_0
+#endif
+
 #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_9_0
 #  define GARROW_DEPRECATED_IN_9_0                GARROW_DEPRECATED
 #  define GARROW_DEPRECATED_IN_9_0_FOR(function)  GARROW_DEPRECATED_FOR(function)
diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
index b13195b070..e6066379ce 100644
--- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
+++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
@@ -68,6 +68,10 @@
     <title>Index of deprecated API</title>
     <xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
   </index>
+  <index id="api-index-11-0-0" role="11.0.0">
+    <title>Index of new symbols in 11.0.0</title>
+    <xi:include href="xml/api-index-11.0.0.xml"><xi:fallback /></xi:include>
+  </index>
   <index id="api-index-6-0-0" role="6.0.0">
     <title>Index of new symbols in 6.0.0</title>
     <xi:include href="xml/api-index-6.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
index 2ad1135bc6..e6990af559 100644
--- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml
+++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
@@ -193,6 +193,14 @@
     <title>Index of deprecated API</title>
     <xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
   </index>
+  <index id="api-index-11-0-0" role="11.0.0">
+    <title>Index of new symbols in 11.0.0</title>
+    <xi:include href="xml/api-index-11.0.0.xml"><xi:fallback /></xi:include>
+  </index>
+  <index id="api-index-10-0-0" role="10.0.0">
+    <title>Index of new symbols in 10.0.0</title>
+    <xi:include href="xml/api-index-10.0.0.xml"><xi:fallback /></xi:include>
+  </index>
   <index id="api-index-9-0-0" role="9.0.0">
     <title>Index of new symbols in 9.0.0</title>
     <xi:include href="xml/api-index-9.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb
index bca9e72418..30944ccd3b 100644
--- a/c_glib/test/dataset/test-file-system-dataset-factory.rb
+++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb
@@ -70,4 +70,51 @@ class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase
     assert_equal(@table1.concatenate([@table2]),
                  dataset.to_table)
   end
+
+  sub_test_case("#finish") do
+    def setup
+      super do
+        @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+        @factory.file_system_uri = build_file_uri(@path1)
+        yield
+      end
+    end
+
+    def test_schema
+      options = ArrowDataset::FinishOptions.new
+      options.schema = build_schema(visible: Arrow::BooleanDataType.new,
+                                    point: Arrow::Int16DataType.new)
+      dataset = @factory.finish(options)
+      assert_equal(build_table(visible: [
+                                 build_boolean_array([true, false, true]),
+                                 build_boolean_array([false, true, false, true]),
+                               ],
+                               point: [
+                                 build_int16_array([1, 2, 3]),
+                                 build_int16_array([-1, -2, -3, -4]),
+                               ]),
+                   dataset.to_table)
+    end
+
+    def test_inspect_n_fragments
+      options = ArrowDataset::FinishOptions.new
+      options.inspect_n_fragments = -1
+      dataset = @factory.finish(options)
+      assert_equal(@table1, dataset.to_table)
+    end
+
+    def test_validate_fragments
+      options = ArrowDataset::FinishOptions.new
+      options.schema = build_schema(visible: Arrow::BooleanDataType.new,
+                                    point: Arrow::Int16DataType.new)
+      options.validate_fragments = true
+      message = "[file-system-dataset-factory][finish]: " +
+                "Invalid: Unable to merge: " +
+                "Field point has incompatible types: int16 vs int32"
+      error = assert_raise(Arrow::Error::Invalid) do
+        @factory.finish(options)
+      end
+      assert_equal(message, error.message.lines(chomp: true).first)
+    end
+  end
 end