You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/25 21:00:09 UTC

[arrow] branch master updated: ARROW-6036: [GLib] Add support for skip rows and column_names CSV read option

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ecb29d4  ARROW-6036: [GLib] Add support for skip rows and column_names CSV read option
ecb29d4 is described below

commit ecb29d432579a44471d5e1455179c3bafaaaf3d6
Author: Yosuke Shiro <yo...@gmail.com>
AuthorDate: Fri Jul 26 05:59:49 2019 +0900

    ARROW-6036: [GLib] Add support for skip rows and column_names CSV read option
    
    Closes #4944 from shiro615/glib-add-csv-reader-options and squashes the following commits:
    
    8aa4c3828 <Yosuke Shiro> Add garrow_csv_read_options_set_column_names()
    5abba2c28 <Yosuke Shiro>  Add n_skip_rows option for GArrowCSVReadOptions
    
    Authored-by: Yosuke Shiro <yo...@gmail.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 c_glib/arrow-glib/reader.cpp   | 90 ++++++++++++++++++++++++++++++++++++++++++
 c_glib/arrow-glib/reader.h     | 12 ++++++
 c_glib/test/test-csv-reader.rb | 44 +++++++++++++++++++++
 3 files changed, 146 insertions(+)

diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index fdc8ffe..cf12220 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -894,6 +894,7 @@ typedef struct GArrowCSVReadOptionsPrivate_ {
 enum {
   PROP_USE_THREADS = 1,
   PROP_BLOCK_SIZE,
+  PROP_N_SKIP_ROWS,
   PROP_DELIMITER,
   PROP_IS_QUOTED,
   PROP_QUOTE_CHARACTER,
@@ -930,6 +931,9 @@ garrow_csv_read_options_set_property(GObject *object,
   case PROP_BLOCK_SIZE:
     priv->read_options.block_size = g_value_get_int(value);
     break;
+  case PROP_N_SKIP_ROWS:
+    priv->read_options.skip_rows = g_value_get_uint(value);
+    break;
   case PROP_DELIMITER:
     priv->parse_options.delimiter = g_value_get_schar(value);
     break;
@@ -981,6 +985,9 @@ garrow_csv_read_options_get_property(GObject *object,
   case PROP_BLOCK_SIZE:
     g_value_set_int(value, priv->read_options.block_size);
     break;
+  case PROP_N_SKIP_ROWS:
+    g_value_set_uint(value, priv->read_options.skip_rows);
+    break;
   case PROP_DELIMITER:
     g_value_set_schar(value, priv->parse_options.delimiter);
     break;
@@ -1071,6 +1078,24 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
                           static_cast<GParamFlags>(G_PARAM_READWRITE));
   g_object_class_install_property(gobject_class, PROP_BLOCK_SIZE, spec);
 
+  /**
+   * GArrowCSVReadOptions:n-skip-rows:
+   *
+   * The number of header rows to skip (not including
+   * the row of column names, if any)
+   *
+   * Since: 1.0.0
+   */
+  spec = g_param_spec_uint("n-skip-rows",
+                           "N skip rows",
+                           "The number of header rows to skip "
+                           "(not including the row of column names, if any)",
+                           0,
+                           G_MAXUINT,
+                           read_options.skip_rows,
+                           static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_N_SKIP_ROWS, spec);
+
 
   auto parse_options = arrow::csv::ParseOptions::Defaults();
 
@@ -1514,6 +1539,71 @@ garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
   priv->convert_options.false_values.push_back(false_value);
 }
 
+/**
+ * garrow_csv_read_options_set_column_names:
+ * @options: A #GArrowCSVReadOptions.
+ * @column_names: (array length=n_column_names):
+ *   The names to use for the columns (if empty, they are
+ *   read from the first row after `skip_rows`).
+ * @n_column_names: The number of entries in @column_names.
+ *
+ * Since: 1.0.0
+ */
+void
+garrow_csv_read_options_set_column_names(GArrowCSVReadOptions *options,
+                                         const gchar **column_names,
+                                         gsize n_column_names)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  // Replace any previously configured names with copies of the given ones.
+  auto &arrow_column_names = priv->read_options.column_names;
+  arrow_column_names.assign(column_names,
+                            column_names + n_column_names);
+}
+
+/**
+ * garrow_csv_read_options_get_column_names:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   The column names. It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 1.0.0
+ */
+gchar **
+garrow_csv_read_options_get_column_names(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  const auto &arrow_column_names = priv->read_options.column_names;
+  if (arrow_column_names.empty()) {
+    return NULL;
+  }
+  const auto n = arrow_column_names.size();
+  // Allocate one extra slot for the terminating NULL expected by g_strfreev().
+  gchar **column_names = g_new(gchar *, n + 1);
+  for (size_t i = 0; i < n; ++i) {
+    column_names[i] = g_strdup(arrow_column_names[i].c_str());
+  }
+  column_names[n] = NULL;
+  return column_names;
+}
+
+/**
+ * garrow_csv_read_options_add_column_name:
+ * @options: A #GArrowCSVReadOptions.
+ * @column_name: The column name to be appended to the current column names.
+ *
+ * Since: 1.0.0
+ */
+void
+garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
+                                        const gchar *column_name)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->read_options.column_names.push_back(column_name);
+}
 
 typedef struct GArrowCSVReaderPrivate_ {
   std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index ff83e24..6241792 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -298,6 +298,18 @@ GARROW_AVAILABLE_IN_0_14
 void
 garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
                                         const gchar *false_value);
+GARROW_AVAILABLE_IN_1_0
+void
+garrow_csv_read_options_set_column_names(GArrowCSVReadOptions *options,
+                                         const gchar **column_names,
+                                         gsize n_column_names);
+GARROW_AVAILABLE_IN_1_0
+gchar **
+garrow_csv_read_options_get_column_names(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_1_0
+void
+garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
+                                        const gchar *column_name);
 
 #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 9695b3a..99299fc 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -176,6 +176,50 @@ message,count
         assert_equal(build_table(columns),
                      table.read)
       end
+
+      def test_n_skip_rows
+        options = Arrow::CSVReadOptions.new
+        options.n_skip_rows = 1
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message1,message2
+"Start1","Start2"
+"Shutdown1","Shutdown2"
+"Reboot1","Reboot2"
+        CSV
+        columns = {
+          "Start1" => build_string_array(["Shutdown1", "Reboot1"]),
+          "Start2" => build_string_array(["Shutdown2", "Reboot2"]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_column_names
+        options = Arrow::CSVReadOptions.new
+        column_names = ["message", "count"]
+        options.column_names = column_names
+        assert_equal(column_names, options.column_names)
+
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+"Start",2
+"Shutdown",9
+"Reboot",5
+        CSV
+        columns = {
+          "message" => build_string_array(["Start", "Shutdown", "Reboot"]),
+          "count" => build_int64_array([2, 9, 5]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_add_column_name
+        options = Arrow::CSVReadOptions.new
+        column_names = ["message", "count"]
+        options.column_names = column_names
+        options.add_column_name("score")
+        assert_equal(column_names + ["score"], options.column_names)
+      end
     end
   end
 end