You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/02 04:18:13 UTC
[arrow] branch master updated: GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3ad2c8b090 GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)
3ad2c8b090 is described below
commit 3ad2c8b090ff951f2881c4ac8789fa9817e824b6
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Mon Jan 2 13:17:12 2023 +0900
GH-15146: [GLib] Add `GADatasetFinishOptions` (#15147)
* Closes: #15146
Authored-by: Sutou Kouhei <ko...@clear-code.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
c_glib/arrow-dataset-glib/dataset-factory.cpp | 231 ++++++++++++++++++++-
c_glib/arrow-dataset-glib/dataset-factory.h | 17 ++
c_glib/arrow-dataset-glib/dataset-factory.hpp | 5 +
c_glib/arrow-glib/version.h.in | 46 ++++
.../arrow-dataset-glib/arrow-dataset-glib-docs.xml | 4 +
c_glib/doc/arrow-glib/arrow-glib-docs.xml | 8 +
.../dataset/test-file-system-dataset-factory.rb | 47 +++++
7 files changed, 356 insertions(+), 2 deletions(-)
diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp
index 1e532760a2..97cab55542 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.cpp
+++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp
@@ -19,6 +19,7 @@
#include <arrow-glib/error.hpp>
#include <arrow-glib/file-system.hpp>
+#include <arrow-glib/schema.hpp>
#include <arrow-dataset-glib/dataset-factory.hpp>
#include <arrow-dataset-glib/dataset.hpp>
@@ -33,6 +34,8 @@ G_BEGIN_DECLS
* @title: Dataset factory related classes
* @include: arrow-dataset-glib/arrow-dataset-glib.h
*
+ * #GADatasetFinishOptions is a class for gadataset_dataset_factory_finish().
+ *
* #GADatasetDatasetFactory is a base class for dataset factories.
*
* #GADatasetFileSystemDatasetFactory is a class for
@@ -41,6 +44,203 @@ G_BEGIN_DECLS
* Since: 5.0.0
*/
+struct GADatasetFinishOptionsPrivate { /* Per-instance private data of GADatasetFinishOptions. */
+ arrow::dataset::FinishOptions options; /* Arrow options; constructed in place by the instance init function and destroyed in finalize. */
+ GArrowSchema *schema; /* GLib wrapper kept in step with options.schema by the property setter; unreferenced in dispose. */
+};
+
+enum { /* Property IDs of GADatasetFinishOptions. */
+ PROP_FINISH_OPTIONS = 1, /* "finish-options": writable, construct-only raw pointer. */
+ PROP_SCHEMA, /* "schema" */
+ PROP_INSPECT_N_FRAGMENTS, /* "inspect-n-fragments" */
+ PROP_VALIDATE_FRAGMENTS, /* "validate-fragments" */
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFinishOptions,
+ gadataset_finish_options,
+ G_TYPE_OBJECT)
+
+#define GADATASET_FINISH_OPTIONS_GET_PRIVATE(obj) \
+ static_cast<GADatasetFinishOptionsPrivate *>( \
+ gadataset_finish_options_get_instance_private( \
+ GADATASET_FINISH_OPTIONS(obj)))
+
+/*
+ * GObject finalize handler: runs the destructor of the
+ * arrow::dataset::FinishOptions stored in the private data and then
+ * chains up to the parent class.
+ */
+static void
+gadataset_finish_options_finalize(GObject *object)
+{
+  auto *priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  /* The member was created with placement new, so it is destroyed by hand. */
+  priv->options.~FinishOptions();
+
+  auto *parent_class = G_OBJECT_CLASS(gadataset_finish_options_parent_class);
+  parent_class->finalize(object);
+}
+
+/*
+ * GObject dispose handler: drops the reference held on the wrapped
+ * GArrowSchema, if there is one, and then chains up to the parent class.
+ */
+static void
+gadataset_finish_options_dispose(GObject *object)
+{
+  auto *priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  if (priv->schema != nullptr) {
+    g_object_unref(priv->schema);
+    /* With the pointer cleared this branch is skipped if the handler runs again. */
+    priv->schema = nullptr;
+  }
+
+  auto *parent_class = G_OBJECT_CLASS(gadataset_finish_options_parent_class);
+  parent_class->dispose(object);
+}
+
+/*
+ * GObject set_property handler for GADatasetFinishOptions.
+ *
+ * "finish-options" copies the given raw arrow::dataset::FinishOptions,
+ * "schema" replaces the wrapped GArrowSchema, and the remaining
+ * properties are written straight into the Arrow options.
+ */
+static void
+gadataset_finish_options_set_property(GObject *object,
+                                      guint prop_id,
+                                      const GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto *priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_FINISH_OPTIONS:
+    {
+      auto *raw_options =
+        static_cast<arrow::dataset::FinishOptions *>(g_value_get_pointer(value));
+      if (!raw_options) {
+        /* No raw options given: keep the default-constructed ones. */
+        break;
+      }
+      priv->options = *raw_options;
+      if (priv->options.schema) {
+        /* Make the copied Arrow schema reachable as a GArrowSchema too. */
+        priv->schema = garrow_schema_new_raw(&(priv->options.schema));
+      }
+    }
+    break;
+  case PROP_SCHEMA:
+    {
+      if (priv->schema == g_value_get_object(value)) {
+        /* The very same object is already set. */
+        break;
+      }
+      auto *old_schema = priv->schema;
+      auto new_schema = g_value_dup_object(value);
+      if (!new_schema) {
+        priv->schema = nullptr;
+        priv->options.schema = nullptr;
+      } else {
+        priv->schema = GARROW_SCHEMA(new_schema);
+        priv->options.schema = garrow_schema_get_raw(priv->schema);
+      }
+      /* The replaced schema is released only after the new one is in place. */
+      if (old_schema) {
+        g_object_unref(old_schema);
+      }
+    }
+    break;
+  case PROP_INSPECT_N_FRAGMENTS:
+    priv->options.inspect_options.fragments = g_value_get_int(value);
+    break;
+  case PROP_VALIDATE_FRAGMENTS:
+    priv->options.validate_fragments = g_value_get_boolean(value);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+/*
+ * GObject get_property handler for GADatasetFinishOptions.
+ *
+ * "schema" returns the wrapped GArrowSchema; the other readable
+ * properties are read from the Arrow options. Any other ID, including
+ * the write-only "finish-options", triggers the invalid-property warning.
+ */
+static void
+gadataset_finish_options_get_property(GObject *object,
+                                      guint prop_id,
+                                      GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto *priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object);
+  const auto &arrow_options = priv->options;
+
+  switch (prop_id) {
+  case PROP_SCHEMA:
+    g_value_set_object(value, priv->schema);
+    break;
+  case PROP_INSPECT_N_FRAGMENTS:
+    g_value_set_int(value, arrow_options.inspect_options.fragments);
+    break;
+  case PROP_VALIDATE_FRAGMENTS:
+    g_value_set_boolean(value, arrow_options.validate_fragments);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+/*
+ * Instance initializer: constructs the arrow::dataset::FinishOptions
+ * member in place inside the instance private data.
+ */
+static void
+gadataset_finish_options_init(GADatasetFinishOptions *object)
+{
+  auto *options_storage = &(GADATASET_FINISH_OPTIONS_GET_PRIVATE(object)->options);
+  new(options_storage) arrow::dataset::FinishOptions;
+}
+
+/*
+ * Class initializer: hooks up the GObject virtual functions and
+ * registers the properties of GADatasetFinishOptions.
+ */
+static void
+gadataset_finish_options_class_init(GADatasetFinishOptionsClass *klass)
+{
+  /* A default-constructed FinishOptions supplies the default values of
+   * the "inspect-n-fragments" and "validate-fragments" properties. */
+  arrow::dataset::FinishOptions default_options;
+  const auto read_write_flags = static_cast<GParamFlags>(G_PARAM_READWRITE);
+
+  auto *gobject_class = G_OBJECT_CLASS(klass);
+  gobject_class->dispose = gadataset_finish_options_dispose;
+  gobject_class->finalize = gadataset_finish_options_finalize;
+  gobject_class->get_property = gadataset_finish_options_get_property;
+  gobject_class->set_property = gadataset_finish_options_set_property;
+
+  /* Writable, construct-only pointer used to hand raw Arrow options in. */
+  GParamSpec *pspec =
+    g_param_spec_pointer("finish-options",
+                         "Finish options",
+                         "The raw arrow::dataset::FinishOptions *",
+                         static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                  G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_FINISH_OPTIONS, pspec);
+
+  /**
+   * GADatasetFinishOptions:schema:
+   *
+   * The schema to finalize the dataset's schema.
+   *
+   * Since: 11.0.0
+   */
+  pspec = g_param_spec_object("schema",
+                              "Schema",
+                              "The schema to finalize the dataset's schema",
+                              GARROW_TYPE_SCHEMA,
+                              read_write_flags);
+  g_object_class_install_property(gobject_class, PROP_SCHEMA, pspec);
+
+  /**
+   * GADatasetFinishOptions:inspect-n-fragments:
+   *
+   * The number of fragments to be used to inspect schema.
+   *
+   * Since: 11.0.0
+   */
+  pspec = g_param_spec_int("inspect-n-fragments",
+                           "Inspect N fragments",
+                           "The number of fragments to be used to inspect schema",
+                           arrow::dataset::InspectOptions::kInspectAllFragments,
+                           G_MAXINT,
+                           default_options.inspect_options.fragments,
+                           read_write_flags);
+  g_object_class_install_property(gobject_class, PROP_INSPECT_N_FRAGMENTS, pspec);
+
+  /**
+   * GADatasetFinishOptions:validate-fragments:
+   *
+   * Whether to validate fragments against the given schema or not.
+   *
+   * Since: 11.0.0
+   */
+  pspec = g_param_spec_boolean("validate-fragments",
+                               "Validate fragments",
+                               "Whether validate fragments or not",
+                               default_options.validate_fragments,
+                               read_write_flags);
+  g_object_class_install_property(gobject_class, PROP_VALIDATE_FRAGMENTS, pspec);
+}
+
+/**
+ * gadataset_finish_options_new:
+ *
+ * Returns: A newly created #GADatasetFinishOptions.
+ *
+ * Since: 11.0.0
+ */
+GADatasetFinishOptions *
+gadataset_finish_options_new(void)
+{
+ return gadataset_finish_options_new_raw(nullptr);
+}
+
+
typedef struct GADatasetDatasetFactoryPrivate_ {
 std::shared_ptr<arrow::dataset::DatasetFactory> factory; /* Presumably the Arrow factory this GLib object wraps -- confirm against the code outside this hunk. */
} GADatasetDatasetFactoryPrivate;
@@ -118,6 +318,7 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass)
/**
* gadataset_dataset_factory_finish:
* @factory: A #GADatasetDatasetFactory.
+ * @options: (nullable): A #GADatasetFinishOptions.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (transfer full) (nullable):
@@ -127,10 +328,15 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass)
*/
GADatasetDataset *
gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory,
+ GADatasetFinishOptions *options,
GError **error)
{
auto arrow_factory = gadataset_dataset_factory_get_raw(factory);
- auto arrow_dataset_result = arrow_factory->Finish();
+ arrow::dataset::FinishOptions arrow_options;
+ if (options) {
+ arrow_options = *gadataset_finish_options_get_raw(options);
+ }
+ auto arrow_dataset_result = arrow_factory->Finish(arrow_options);
if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) {
auto arrow_dataset = *arrow_dataset_result;
return gadataset_dataset_new_raw(&arrow_dataset);
@@ -474,6 +680,7 @@ gadataset_file_system_dataset_factory_add_path(
/**
* gadataset_file_system_dataset_factory_finish:
* @factory: A #GADatasetFileSystemDatasetFactory.
+ * @options: (nullable): A #GADatasetFinishOptions.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (transfer full) (nullable):
@@ -484,6 +691,7 @@ gadataset_file_system_dataset_factory_add_path(
GADatasetFileSystemDataset *
gadataset_file_system_dataset_factory_finish(
GADatasetFileSystemDatasetFactory *factory,
+ GADatasetFinishOptions *options,
GError **error)
{
const gchar *context = "[file-system-dataset-factory][finish]";
@@ -527,7 +735,11 @@ gadataset_file_system_dataset_factory_finish(
if (!garrow::check(error, arrow_factory_result, context)) {
return NULL;
}
- auto arrow_dataset_result = (*arrow_factory_result)->Finish();
+ arrow::dataset::FinishOptions arrow_options;
+ if (options) {
+ arrow_options = *gadataset_finish_options_get_raw(options);
+ }
+ auto arrow_dataset_result = (*arrow_factory_result)->Finish(arrow_options);
if (!garrow::check(error, arrow_dataset_result, context)) {
return NULL;
}
@@ -544,6 +756,21 @@ gadataset_file_system_dataset_factory_finish(
G_END_DECLS
+/*
+ * Creates a GADatasetFinishOptions from raw Arrow options.
+ *
+ * options is passed through the construct-only "finish-options"
+ * property; the property setter copies it when it is not null.
+ */
+GADatasetFinishOptions *
+gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *options)
+{
+  auto instance = g_object_new(GADATASET_TYPE_FINISH_OPTIONS,
+                               "finish-options", options,
+                               NULL);
+  return GADATASET_FINISH_OPTIONS(instance);
+}
+
+/*
+ * Returns a pointer to the arrow::dataset::FinishOptions stored in the
+ * private data of options. The pointee belongs to the GObject instance.
+ */
+arrow::dataset::FinishOptions *
+gadataset_finish_options_get_raw(GADatasetFinishOptions *options)
+{
+  return &(GADATASET_FINISH_OPTIONS_GET_PRIVATE(options)->options);
+}
+
std::shared_ptr<arrow::dataset::DatasetFactory>
gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory)
{
diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h
index e2ee3ed980..292a9ca70d 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.h
+++ b/c_glib/arrow-dataset-glib/dataset-factory.h
@@ -23,6 +23,21 @@
G_BEGIN_DECLS
+#define GADATASET_TYPE_FINISH_OPTIONS (gadataset_finish_options_get_type())
+G_DECLARE_DERIVABLE_TYPE(GADatasetFinishOptions,
+ gadataset_finish_options,
+ GADATASET,
+ FINISH_OPTIONS,
+ GObject)
+struct _GADatasetFinishOptionsClass
+{
+ GObjectClass parent_class; /* Parent class structure; this class declares no virtual functions of its own. */
+};
+
+GARROW_AVAILABLE_IN_11_0
+GADatasetFinishOptions *
+gadataset_finish_options_new(void);
+
#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type())
G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory,
gadataset_dataset_factory,
@@ -37,6 +52,7 @@ struct _GADatasetDatasetFactoryClass
GARROW_AVAILABLE_IN_5_0
GADatasetDataset *
gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory,
+ GADatasetFinishOptions *options,
GError **error);
@@ -92,6 +108,7 @@ GARROW_AVAILABLE_IN_5_0
GADatasetFileSystemDataset *
gadataset_file_system_dataset_factory_finish(
GADatasetFileSystemDatasetFactory *factory,
+ GADatasetFinishOptions *options,
GError **error);
diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp
index 114db35bc5..6ff68945ad 100644
--- a/c_glib/arrow-dataset-glib/dataset-factory.hpp
+++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp
@@ -23,5 +23,10 @@
#include <arrow-dataset-glib/dataset-factory.h>
+GADatasetFinishOptions *
+gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *arrow_options);
+arrow::dataset::FinishOptions *
+gadataset_finish_options_get_raw(GADatasetFinishOptions *options);
+
std::shared_ptr<arrow::dataset::DatasetFactory>
gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory);
diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in
index 74c54b998d..bd67ed6b8b 100644
--- a/c_glib/arrow-glib/version.h.in
+++ b/c_glib/arrow-glib/version.h.in
@@ -110,6 +110,24 @@
# define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor)
#endif
+/**
+ * GARROW_VERSION_11_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 11.0.0
+ */
+#define GARROW_VERSION_11_0 G_ENCODE_VERSION(11, 0)
+
+/**
+ * GARROW_VERSION_10_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 10.0.0
+ */
+#define GARROW_VERSION_10_0 G_ENCODE_VERSION(10, 0)
+
/**
* GARROW_VERSION_9_0:
*
@@ -301,6 +319,34 @@
#define GARROW_AVAILABLE_IN_ALL
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_11_0
+# define GARROW_DEPRECATED_IN_11_0 GARROW_DEPRECATED
+# define GARROW_DEPRECATED_IN_11_0_FOR(function) GARROW_DEPRECATED_FOR(function)
+#else
+# define GARROW_DEPRECATED_IN_11_0
+# define GARROW_DEPRECATED_IN_11_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_11_0
+# define GARROW_AVAILABLE_IN_11_0 GARROW_UNAVAILABLE(11, 0)
+#else
+# define GARROW_AVAILABLE_IN_11_0
+#endif
+
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_10_0
+# define GARROW_DEPRECATED_IN_10_0 GARROW_DEPRECATED
+# define GARROW_DEPRECATED_IN_10_0_FOR(function) GARROW_DEPRECATED_FOR(function)
+#else
+# define GARROW_DEPRECATED_IN_10_0
+# define GARROW_DEPRECATED_IN_10_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_10_0
+# define GARROW_AVAILABLE_IN_10_0 GARROW_UNAVAILABLE(10, 0)
+#else
+# define GARROW_AVAILABLE_IN_10_0
+#endif
+
#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_9_0
# define GARROW_DEPRECATED_IN_9_0 GARROW_DEPRECATED
# define GARROW_DEPRECATED_IN_9_0_FOR(function) GARROW_DEPRECATED_FOR(function)
diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
index b13195b070..e6066379ce 100644
--- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
+++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml
@@ -68,6 +68,10 @@
<title>Index of deprecated API</title>
<xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
</index>
+ <index id="api-index-11-0-0" role="11.0.0">
+ <title>Index of new symbols in 11.0.0</title>
+ <xi:include href="xml/api-index-11.0.0.xml"><xi:fallback /></xi:include>
+ </index>
<index id="api-index-6-0-0" role="6.0.0">
<title>Index of new symbols in 6.0.0</title>
<xi:include href="xml/api-index-6.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
index 2ad1135bc6..e6990af559 100644
--- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml
+++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
@@ -193,6 +193,14 @@
<title>Index of deprecated API</title>
<xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
</index>
+ <index id="api-index-11-0-0" role="11.0.0">
+ <title>Index of new symbols in 11.0.0</title>
+ <xi:include href="xml/api-index-11.0.0.xml"><xi:fallback /></xi:include>
+ </index>
+ <index id="api-index-10-0-0" role="10.0.0">
+ <title>Index of new symbols in 10.0.0</title>
+ <xi:include href="xml/api-index-10.0.0.xml"><xi:fallback /></xi:include>
+ </index>
<index id="api-index-9-0-0" role="9.0.0">
<title>Index of new symbols in 9.0.0</title>
<xi:include href="xml/api-index-9.0.0.xml"><xi:fallback /></xi:include>
diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb
index bca9e72418..30944ccd3b 100644
--- a/c_glib/test/dataset/test-file-system-dataset-factory.rb
+++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb
@@ -70,4 +70,51 @@ class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase
assert_equal(@table1.concatenate([@table2]),
dataset.to_table)
end
+
+ sub_test_case("#finish") do
+ def setup
+ super do
+ @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ @factory.file_system_uri = build_file_uri(@path1)
+ yield
+ end
+ end
+
+ def test_schema
+ options = ArrowDataset::FinishOptions.new
+ options.schema = build_schema(visible: Arrow::BooleanDataType.new,
+ point: Arrow::Int16DataType.new)
+ dataset = @factory.finish(options)
+ assert_equal(build_table(visible: [
+ build_boolean_array([true, false, true]),
+ build_boolean_array([false, true, false, true]),
+ ],
+ point: [
+ build_int16_array([1, 2, 3]),
+ build_int16_array([-1, -2, -3, -4]),
+ ]),
+ dataset.to_table)
+ end
+
+ def test_inspect_n_fragments
+ options = ArrowDataset::FinishOptions.new
+ options.inspect_n_fragments = -1
+ dataset = @factory.finish(options)
+ assert_equal(@table1, dataset.to_table)
+ end
+
+ def test_validate_fragments
+ options = ArrowDataset::FinishOptions.new
+ options.schema = build_schema(visible: Arrow::BooleanDataType.new,
+ point: Arrow::Int16DataType.new)
+ options.validate_fragments = true
+ message = "[file-system-dataset-factory][finish]: " +
+ "Invalid: Unable to merge: " +
+ "Field point has incompatible types: int16 vs int32"
+ error = assert_raise(Arrow::Error::Invalid) do
+ @factory.finish(options)
+ end
+ assert_equal(message, error.message.lines(chomp: true).first)
+ end
+ end
end