You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/05/21 01:09:41 UTC
[arrow] branch master updated: ARROW-5372: [GLib] Add support for
null/boolean values CSV read option
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 853799a ARROW-5372: [GLib] Add support for null/boolean values CSV read option
853799a is described below
commit 853799ac3c3ae3b0eb10bc5c4907ab4e76027d25
Author: Yosuke Shiro <yo...@gmail.com>
AuthorDate: Tue May 21 10:09:28 2019 +0900
ARROW-5372: [GLib] Add support for null/boolean values CSV read option
Author: Yosuke Shiro <yo...@gmail.com>
Author: Kouhei Sutou <ko...@clear-code.com>
Closes #4344 from shiro615/glib-csv-reader-options and squashes the following commits:
7e4a0215 <Kouhei Sutou> Reuse defined variables
8da8bab0 <Yosuke Shiro> Add garrow_csv_read_options_add_{null,true,false}_value
9da0e1c2 <Yosuke Shiro> Use std::vector in arrow::csv::ConverterOptions
edfdd7c1 <Yosuke Shiro> Add support for null/boolean values CSV read option
---
c_glib/arrow-glib/reader.cpp | 222 ++++++++++++++++++++++++++++++++++++++++-
c_glib/arrow-glib/reader.h | 36 +++++++
c_glib/test/test-csv-reader.rb | 86 ++++++++++++++++
3 files changed, 343 insertions(+), 1 deletion(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index f67623b..88af8c7 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -918,7 +918,8 @@ enum {
PROP_ALLOW_NEWLINES_IN_VALUES,
PROP_IGNORE_EMPTY_LINES,
PROP_N_HEADER_ROWS,
- PROP_CHECK_UTF8
+ PROP_CHECK_UTF8,
+ PROP_ALLOW_NULL_STRINGS
};
G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions,
@@ -975,6 +976,9 @@ garrow_csv_read_options_set_property(GObject *object,
case PROP_CHECK_UTF8:
priv->convert_options.check_utf8 = g_value_get_boolean(value);
break;
+ case PROP_ALLOW_NULL_STRINGS:
+ priv->convert_options.strings_can_be_null = g_value_get_boolean(value);
+ break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
@@ -1026,6 +1030,9 @@ garrow_csv_read_options_get_property(GObject *object,
case PROP_CHECK_UTF8:
g_value_set_boolean(value, priv->convert_options.check_utf8);
break;
+ case PROP_ALLOW_NULL_STRINGS:
+ g_value_set_boolean(value, priv->convert_options.strings_can_be_null);
+ break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
@@ -1253,6 +1260,24 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
convert_options.check_utf8,
static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class, PROP_CHECK_UTF8, spec);
+
+ /**
+ * GArrowCSVReadOptions:allow-null-strings:
+ *
+ * Whether string / binary columns can have null values.
+ * If %TRUE, then strings in "null_values" are considered null for string columns.
+ * If %FALSE, then all strings are valid string values.
+ *
+ * Since: 0.14.0
+ */
+ spec = g_param_spec_boolean("allow-null-strings",
+ "Allow null strings",
+ "Whether string / binary columns can have null values. "
+ "If TRUE, then strings in null_values are considered null for string columns. "
+ "If FALSE, then all strings are valid string values.",
+ convert_options.strings_can_be_null,
+ static_cast<GParamFlags>(G_PARAM_READWRITE));
+ g_object_class_install_property(gobject_class, PROP_ALLOW_NULL_STRINGS, spec);
}
/**
@@ -1336,6 +1361,201 @@ garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options)
return types;
}
+/**
+ * garrow_csv_read_options_set_null_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @null_values: (array length=n_null_values):
+ *   The values to be processed as null.
+ * @n_null_values: The number of the specified null values.
+ *
+ * Replaces the whole list of null values held by @options with the
+ * given @n_null_values strings. Values that were set or added before
+ * this call are discarded. Each string is copied, so the caller keeps
+ * ownership of @null_values.
+ *
+ * Every element of @null_values is expected to be a valid string;
+ * this function doesn't check for %NULL elements.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options,
+ const gchar **null_values,
+ gsize n_null_values)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.null_values.resize(n_null_values);
+ for (gsize i = 0; i < n_null_values; ++i) {
+ priv->convert_options.null_values[i] = null_values[i];
+ }
+}
+
+/**
+ * garrow_csv_read_options_get_null_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   A newly allocated copy of the values to be processed as null.
+ *   It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_null_values(GArrowCSVReadOptions *options)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ const auto &arrow_null_values = priv->convert_options.null_values;
+ if (arrow_null_values.empty()) {
+ return NULL;
+ } else {
+ auto n = arrow_null_values.size();
+ gchar **null_values = g_new(gchar *, n + 1);
+ for (size_t i = 0; i < n; ++i) {
+ null_values[i] = g_strdup(arrow_null_values[i].c_str());
+ }
+ null_values[n] = NULL;
+ return null_values;
+ }
+}
+
+/**
+ * garrow_csv_read_options_add_null_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @null_value: The value to be processed as null.
+ *
+ * Appends @null_value to the end of the list of null values held by
+ * @options. Values that are already in the list are kept.
+ *
+ * This function doesn't check whether @null_value is %NULL or is
+ * already in the list.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_null_value(GArrowCSVReadOptions *options,
+ const gchar *null_value)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.null_values.push_back(null_value);
+}
+
+/**
+ * garrow_csv_read_options_set_true_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @true_values: (array length=n_true_values):
+ *   The values to be processed as true.
+ * @n_true_values: The number of the specified true values.
+ *
+ * Replaces the whole list of true values held by @options with the
+ * given @n_true_values strings. Values that were set or added before
+ * this call are discarded. Each string is copied, so the caller keeps
+ * ownership of @true_values.
+ *
+ * Every element of @true_values is expected to be a valid string;
+ * this function doesn't check for %NULL elements.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_true_values(GArrowCSVReadOptions *options,
+ const gchar **true_values,
+ gsize n_true_values)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.true_values.resize(n_true_values);
+ for (gsize i = 0; i < n_true_values; ++i) {
+ priv->convert_options.true_values[i] = true_values[i];
+ }
+}
+
+/**
+ * garrow_csv_read_options_get_true_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   A newly allocated copy of the values to be processed as true.
+ *   It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_true_values(GArrowCSVReadOptions *options)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ const auto &arrow_true_values = priv->convert_options.true_values;
+ if (arrow_true_values.empty()) {
+ return NULL;
+ } else {
+ auto n = arrow_true_values.size();
+ gchar **true_values = g_new(gchar *, n + 1);
+ for (size_t i = 0; i < n; ++i) {
+ true_values[i] = g_strdup(arrow_true_values[i].c_str());
+ }
+ true_values[n] = NULL;
+ return true_values;
+ }
+}
+
+/**
+ * garrow_csv_read_options_add_true_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @true_value: The value to be processed as true.
+ *
+ * Appends @true_value to the end of the list of true values held by
+ * @options. Values that are already in the list are kept.
+ *
+ * This function doesn't check whether @true_value is %NULL or is
+ * already in the list.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_true_value(GArrowCSVReadOptions *options,
+ const gchar *true_value)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.true_values.push_back(true_value);
+}
+
+/**
+ * garrow_csv_read_options_set_false_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @false_values: (array length=n_false_values):
+ *   The values to be processed as false.
+ * @n_false_values: The number of the specified false values.
+ *
+ * Replaces the whole list of false values held by @options with the
+ * given @n_false_values strings. Values that were set or added before
+ * this call are discarded. Each string is copied, so the caller keeps
+ * ownership of @false_values.
+ *
+ * Every element of @false_values is expected to be a valid string;
+ * this function doesn't check for %NULL elements.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_false_values(GArrowCSVReadOptions *options,
+ const gchar **false_values,
+ gsize n_false_values)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.false_values.resize(n_false_values);
+ for (gsize i = 0; i < n_false_values; ++i) {
+ priv->convert_options.false_values[i] = false_values[i];
+ }
+}
+
+/**
+ * garrow_csv_read_options_get_false_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   A newly allocated copy of the values to be processed as false.
+ *   It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_false_values(GArrowCSVReadOptions *options)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ const auto &arrow_false_values = priv->convert_options.false_values;
+ if (arrow_false_values.empty()) {
+ return NULL;
+ } else {
+ auto n = arrow_false_values.size();
+ gchar **false_values = g_new(gchar *, n + 1);
+ for (size_t i = 0; i < n; ++i) {
+ false_values[i] = g_strdup(arrow_false_values[i].c_str());
+ }
+ false_values[n] = NULL;
+ return false_values;
+ }
+}
+
+/**
+ * garrow_csv_read_options_add_false_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @false_value: The value to be processed as false.
+ *
+ * Appends @false_value to the end of the list of false values held by
+ * @options. Values that are already in the list are kept.
+ *
+ * This function doesn't check whether @false_value is %NULL or is
+ * already in the list.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
+ const gchar *false_value)
+{
+ auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+ priv->convert_options.false_values.push_back(false_value);
+}
+
typedef struct GArrowCSVReaderPrivate_ {
std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index a6a206c..c2cce2d 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -264,6 +264,42 @@ garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
GArrowSchema *schema);
GHashTable *
garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options,
+ const gchar **null_values,
+ gsize n_null_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_null_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_null_value(GArrowCSVReadOptions *options,
+ const gchar *null_value);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_true_values(GArrowCSVReadOptions *options,
+ const gchar **true_values,
+ gsize n_true_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_true_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_true_value(GArrowCSVReadOptions *options,
+ const gchar *true_value);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_false_values(GArrowCSVReadOptions *options,
+ const gchar **false_values,
+ gsize n_false_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_false_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
+ const gchar *false_value);
#define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 3cae103..9695b3a 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -90,6 +90,92 @@ count,valid
},
options.column_types)
end
+
+ def test_null_values # null_values round-trips through the accessor and matching "count" cells are read as nil
+ options = Arrow::CSVReadOptions.new
+ null_values = ["2", "5"]
+ options.null_values = null_values
+ assert_equal(null_values, options.null_values)
+
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+ CSV
+ columns = {
+ "message" => build_string_array(["Start", "Shutdown", "Restart"]),
+ "count" => build_int64_array([nil, 9, nil]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_add_null_value # add_null_value appends to the values assigned via null_values=
+ options = Arrow::CSVReadOptions.new
+ null_values = ["2", "5"]
+ options.null_values = null_values
+ options.add_null_value("9")
+ assert_equal(null_values + ["9"], options.null_values)
+ end
+
+ def test_boolean_values # true_values/false_values round-trip and the "message" column is read as booleans
+ options = Arrow::CSVReadOptions.new
+ true_values = ["Start", "Restart"]
+ options.true_values = true_values
+ assert_equal(true_values, options.true_values)
+
+ false_values = ["Shutdown"]
+ options.false_values = false_values
+ assert_equal(false_values, options.false_values)
+
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+ CSV
+ columns = {
+ "message" => build_boolean_array([true, false, true]),
+ "count" => build_int64_array([2, 9, 5]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_add_true_value # add_true_value appends to the values assigned via true_values=
+ options = Arrow::CSVReadOptions.new
+ true_values = ["Start", "Restart"]
+ options.true_values = true_values
+ options.add_true_value("Shutdown")
+ assert_equal(true_values + ["Shutdown"], options.true_values)
+ end
+
+ def test_add_false_value # add_false_value appends to the values assigned via false_values=
+ options = Arrow::CSVReadOptions.new
+ false_values = ["Start", "Restart"]
+ options.false_values = false_values
+ options.add_false_value("Shutdown")
+ assert_equal(false_values + ["Shutdown"], options.false_values)
+ end
+
+ def test_allow_null_strings # with allow_null_strings enabled, "message" cells listed in null_values are read as nil
+ options = Arrow::CSVReadOptions.new
+ options.null_values = ["Start", "Restart"]
+ options.allow_null_strings = true
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+ CSV
+ columns = {
+ "message" => build_string_array([nil, "Shutdown", nil]),
+ "count" => build_int64_array([2, 9, 5]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
end
end
end