You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/05/21 01:09:41 UTC

[arrow] branch master updated: ARROW-5372: [GLib] Add support for null/boolean values CSV read option

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 853799a  ARROW-5372: [GLib] Add support for null/boolean values CSV read option
853799a is described below

commit 853799ac3c3ae3b0eb10bc5c4907ab4e76027d25
Author: Yosuke Shiro <yo...@gmail.com>
AuthorDate: Tue May 21 10:09:28 2019 +0900

    ARROW-5372: [GLib] Add support for null/boolean values CSV read option
    
    Author: Yosuke Shiro <yo...@gmail.com>
    Author: Kouhei Sutou <ko...@clear-code.com>
    
    Closes #4344 from shiro615/glib-csv-reader-options and squashes the following commits:
    
    7e4a0215 <Kouhei Sutou> Reuse defined variables
    8da8bab0 <Yosuke Shiro> Add garrow_csv_read_options_add_{null,true,false}_value
    9da0e1c2 <Yosuke Shiro> Use std::vector in arrow::csv::ConverterOptions
    edfdd7c1 <Yosuke Shiro>  Add support for null/boolean values CSV read option
---
 c_glib/arrow-glib/reader.cpp   | 222 ++++++++++++++++++++++++++++++++++++++++-
 c_glib/arrow-glib/reader.h     |  36 +++++++
 c_glib/test/test-csv-reader.rb |  86 ++++++++++++++++
 3 files changed, 343 insertions(+), 1 deletion(-)

diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index f67623b..88af8c7 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -918,7 +918,8 @@ enum {
   PROP_ALLOW_NEWLINES_IN_VALUES,
   PROP_IGNORE_EMPTY_LINES,
   PROP_N_HEADER_ROWS,
-  PROP_CHECK_UTF8
+  PROP_CHECK_UTF8,
+  PROP_ALLOW_NULL_STRINGS
 };
 
 G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions,
@@ -975,6 +976,9 @@ garrow_csv_read_options_set_property(GObject *object,
   case PROP_CHECK_UTF8:
     priv->convert_options.check_utf8 = g_value_get_boolean(value);
     break;
+  case PROP_ALLOW_NULL_STRINGS:
+    priv->convert_options.strings_can_be_null = g_value_get_boolean(value);
+    break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -1026,6 +1030,9 @@ garrow_csv_read_options_get_property(GObject *object,
   case PROP_CHECK_UTF8:
     g_value_set_boolean(value, priv->convert_options.check_utf8);
     break;
+  case PROP_ALLOW_NULL_STRINGS:
+    g_value_set_boolean(value, priv->convert_options.strings_can_be_null);
+    break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -1253,6 +1260,24 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
                               convert_options.check_utf8,
                               static_cast<GParamFlags>(G_PARAM_READWRITE));
   g_object_class_install_property(gobject_class, PROP_CHECK_UTF8, spec);
+
+  /**
+   * GArrowCSVReadOptions:allow-null-strings:
+   *
+   * Whether string / binary columns can have null values.
+   * If %TRUE, then strings in "null_values" are considered null for string columns.
+   * If %FALSE, then all strings are valid string values.
+   *
+   * Since: 0.14.0
+   */
+  spec = g_param_spec_boolean("allow-null-strings",
+                              "Allow null strings",
+                              "Whether string / binary columns can have null values. "
+                              "If TRUE, then strings in null_values are considered null for string columns. "
+                              "If FALSE, then all strings are valid string values.",
+                              convert_options.strings_can_be_null,
+                              static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_ALLOW_NULL_STRINGS, spec);
 }
 
 /**
@@ -1336,6 +1361,201 @@ garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options)
   return types;
 }
 
+/**
+ * garrow_csv_read_options_set_null_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @null_values: (array length=n_null_values):
+ *   The values to be processed as null.
+ * @n_null_values: The number of the specified null values.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options,
+                                        const gchar **null_values,
+                                        gsize n_null_values)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.null_values.resize(n_null_values);
+  for (gsize i = 0; i < n_null_values; ++i) {
+    priv->convert_options.null_values[i] = null_values[i];
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_null_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   The values to be processed as null. It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_null_values(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  const auto &arrow_null_values = priv->convert_options.null_values;
+  if (arrow_null_values.empty()) {
+    return NULL;
+  } else {
+    auto n = arrow_null_values.size();
+    gchar **null_values = g_new(gchar *, n + 1);
+    for (size_t i = 0; i < n; ++i) {
+      null_values[i] = g_strdup(arrow_null_values[i].c_str());
+    }
+    null_values[n] = NULL;
+    return null_values;
+  }
+}
+
+/**
+ * garrow_csv_read_options_add_null_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @null_value: The value to be processed as null.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_null_value(GArrowCSVReadOptions *options,
+                                       const gchar *null_value)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.null_values.push_back(null_value);
+}
+
+/**
+ * garrow_csv_read_options_set_true_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @true_values: (array length=n_true_values):
+ *   The values to be processed as true.
+ * @n_true_values: The number of the specified true values.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_true_values(GArrowCSVReadOptions *options,
+                                        const gchar **true_values,
+                                        gsize n_true_values)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.true_values.resize(n_true_values);
+  for (gsize i = 0; i < n_true_values; ++i) {
+    priv->convert_options.true_values[i] = true_values[i];
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_true_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   The values to be processed as true. It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_true_values(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  const auto &arrow_true_values = priv->convert_options.true_values;
+  if (arrow_true_values.empty()) {
+    return NULL;
+  } else {
+    auto n = arrow_true_values.size();
+    gchar **true_values = g_new(gchar *, n + 1);
+    for (size_t i = 0; i < n; ++i) {
+      true_values[i] = g_strdup(arrow_true_values[i].c_str());
+    }
+    true_values[n] = NULL;
+    return true_values;
+  }
+}
+
+/**
+ * garrow_csv_read_options_add_true_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @true_value: The value to be processed as true.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_true_value(GArrowCSVReadOptions *options,
+                                       const gchar *true_value)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.true_values.push_back(true_value);
+}
+
+/**
+ * garrow_csv_read_options_set_false_values:
+ * @options: A #GArrowCSVReadOptions.
+ * @false_values: (array length=n_false_values):
+ *   The values to be processed as false.
+ * @n_false_values: The number of the specified false values.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_set_false_values(GArrowCSVReadOptions *options,
+                                         const gchar **false_values,
+                                         gsize n_false_values)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.false_values.resize(n_false_values);
+  for (gsize i = 0; i < n_false_values; ++i) {
+    priv->convert_options.false_values[i] = false_values[i];
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_false_values:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   The values to be processed as false. It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 0.14.0
+ */
+gchar **
+garrow_csv_read_options_get_false_values(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  const auto &arrow_false_values = priv->convert_options.false_values;
+  if (arrow_false_values.empty()) {
+    return NULL;
+  } else {
+    auto n = arrow_false_values.size();
+    gchar **false_values = g_new(gchar *, n + 1);
+    for (size_t i = 0; i < n; ++i) {
+      false_values[i] = g_strdup(arrow_false_values[i].c_str());
+    }
+    false_values[n] = NULL;
+    return false_values;
+  }
+}
+
+/**
+ * garrow_csv_read_options_add_false_value:
+ * @options: A #GArrowCSVReadOptions.
+ * @false_value: The value to be processed as false.
+ *
+ * Since: 0.14.0
+ */
+void
+garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
+                                        const gchar *false_value)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->convert_options.false_values.push_back(false_value);
+}
+
 
 typedef struct GArrowCSVReaderPrivate_ {
   std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index a6a206c..c2cce2d 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -264,6 +264,42 @@ garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
                                    GArrowSchema *schema);
 GHashTable *
 garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options,
+                                        const gchar **null_values,
+                                        gsize n_null_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_null_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_null_value(GArrowCSVReadOptions *options,
+                                       const gchar *null_value);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_true_values(GArrowCSVReadOptions *options,
+                                        const gchar **true_values,
+                                        gsize n_true_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_true_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_true_value(GArrowCSVReadOptions *options,
+                                       const gchar *true_value);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_set_false_values(GArrowCSVReadOptions *options,
+                                         const gchar **false_values,
+                                         gsize n_false_values);
+GARROW_AVAILABLE_IN_0_14
+gchar **
+garrow_csv_read_options_get_false_values(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_0_14
+void
+garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
+                                        const gchar *false_value);
 
 #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 3cae103..9695b3a 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -90,6 +90,92 @@ count,valid
                      },
                      options.column_types)
       end
+
+      def test_null_values
+        options = Arrow::CSVReadOptions.new
+        null_values = ["2", "5"]
+        options.null_values = null_values
+        assert_equal(null_values, options.null_values)
+
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+        CSV
+        columns = {
+          "message" => build_string_array(["Start", "Shutdown", "Restart"]),
+          "count" => build_int64_array([nil, 9, nil]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_add_null_value
+        options = Arrow::CSVReadOptions.new
+        null_values = ["2", "5"]
+        options.null_values = null_values
+        options.add_null_value("9")
+        assert_equal(null_values + ["9"], options.null_values)
+      end
+
+      def test_boolean_values
+        options = Arrow::CSVReadOptions.new
+        true_values = ["Start", "Restart"]
+        options.true_values = true_values
+        assert_equal(true_values, options.true_values)
+
+        false_values = ["Shutdown"]
+        options.false_values = false_values
+        assert_equal(false_values, options.false_values)
+
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+        CSV
+        columns = {
+          "message" => build_boolean_array([true, false, true]),
+          "count" => build_int64_array([2, 9, 5]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_add_true_value
+        options = Arrow::CSVReadOptions.new
+        true_values = ["Start", "Restart"]
+        options.true_values = true_values
+        options.add_true_value("Shutdown")
+        assert_equal(true_values + ["Shutdown"], options.true_values)
+      end
+
+      def test_add_false_value
+        options = Arrow::CSVReadOptions.new
+        false_values = ["Start", "Restart"]
+        options.false_values = false_values
+        options.add_false_value("Shutdown")
+        assert_equal(false_values + ["Shutdown"], options.false_values)
+      end
+
+      def test_allow_null_strings
+        options = Arrow::CSVReadOptions.new
+        options.null_values = ["Start", "Restart"]
+        options.allow_null_strings = true
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message,count
+"Start",2
+"Shutdown",9
+"Restart",5
+        CSV
+        columns = {
+          "message" => build_string_array([nil, "Shutdown", nil]),
+          "count" => build_int64_array([2, 9, 5]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
     end
   end
 end