You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by sh...@apache.org on 2018/11/15 12:21:30 UTC

[arrow] branch master updated: ARROW-3798: [GLib] Add support for column type CSV read option

This is an automated email from the ASF dual-hosted git repository.

shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 281eb22  ARROW-3798: [GLib] Add support for column type CSV read option
281eb22 is described below

commit 281eb22f8cb7f17afcdabf3795177c25063f4888
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Thu Nov 15 21:21:15 2018 +0900

    ARROW-3798: [GLib] Add support for column type CSV read option
    
    Author: Kouhei Sutou <ko...@clear-code.com>
    
    Closes #2973 from kou/glib-csv-type and squashes the following commits:
    
    3cb0d078 <Kouhei Sutou>  Add column type CSV read option
---
 c_glib/arrow-glib/reader.cpp   | 68 ++++++++++++++++++++++++++++++++++++++++++
 c_glib/arrow-glib/reader.h     |  9 ++++++
 c_glib/test/test-csv-reader.rb | 64 ++++++++++++++++++++++++++++++---------
 3 files changed, 127 insertions(+), 14 deletions(-)

diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 5253a45..b4b5c08 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -22,6 +22,7 @@
 #endif
 
 #include <arrow-glib/column.hpp>
+#include <arrow-glib/data-type.hpp>
 #include <arrow-glib/error.hpp>
 #include <arrow-glib/record-batch.hpp>
 #include <arrow-glib/schema.hpp>
@@ -1276,6 +1277,73 @@ garrow_csv_read_options_new(void)
   return GARROW_CSV_READ_OPTIONS(csv_read_options);
 }
 
+/**
+ * garrow_csv_read_options_add_column_type:
+ * @options: A #GArrowCSVReadOptions.
+ * @name: The name of the target column.
+ * @data_type: The #GArrowDataType for the column.
+ *
+ * Registers @data_type as the value type of the column named @name.
+ *
+ * Since: 0.12.0
+ */
+void
+garrow_csv_read_options_add_column_type(GArrowCSVReadOptions *options,
+                                        const gchar *name,
+                                        GArrowDataType *data_type)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  auto arrow_data_type = garrow_data_type_get_raw(data_type);
+  priv->convert_options.column_types[name] = arrow_data_type;
+}
+
+/**
+ * garrow_csv_read_options_add_schema:
+ * @options: A #GArrowCSVReadOptions.
+ * @schema: The #GArrowSchema that specifies columns and their types.
+ *
+ * Adds the value type of every field in @schema, keyed by the field name.
+ *
+ * Since: 0.12.0
+ */
+void
+garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
+                                   GArrowSchema *schema)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  for (const auto &field : arrow_schema->fields()) {  // by reference: no copy
+    priv->convert_options.column_types[field->name()] = field->type();
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_column_types:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (transfer full) (element-type gchar* GArrowDataType):
+ *   A newly created table that maps each column name to its value type.
+ *
+ * Since: 0.12.0
+ */
+GHashTable *
+garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  GHashTable *types = g_hash_table_new_full(g_str_hash,
+                                            g_str_equal,
+                                            g_free,
+                                            g_object_unref);
+  for (const auto &iter : priv->convert_options.column_types) {
+    const auto &arrow_name = iter.first;  // g_strdup() below makes the copy
+    auto arrow_data_type = iter.second;  // non-const copy; its address is passed
+    g_hash_table_insert(types,
+                        g_strdup(arrow_name.c_str()),
+                        garrow_data_type_new_raw(&arrow_data_type));
+  }
+  return types;
+}
+
 
 typedef struct GArrowCSVReaderPrivate_ {
   std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index d1a3947..de33a79 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -255,6 +255,15 @@ struct _GArrowCSVReadOptionsClass
 };
 
 GArrowCSVReadOptions *garrow_csv_read_options_new(void);
+void
+garrow_csv_read_options_add_column_type(GArrowCSVReadOptions *options,
+                                        const gchar *name,
+                                        GArrowDataType *data_type);
+void
+garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
+                                   GArrowSchema *schema);
+GHashTable *
+garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options);
 
 #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 12897a8..3cae103 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -40,20 +40,56 @@ message,count
                    table.read)
     end
 
-    def test_options
-      options = Arrow::CSVReadOptions.new
-      options.quoted = false
-      table = Arrow::CSVReader.new(open_input(<<-CSV), options)
-message,count
-"Start",2
-"Shutdown",9
-      CSV
-      columns = {
-        "message" => build_string_array(["\"Start\"", "\"Shutdown\""]),
-        "count" => build_int64_array([2, 9]),
-      }
-      assert_equal(build_table(columns),
-                   table.read)
+    sub_test_case("options") do  # column type API of Arrow::CSVReadOptions
+      def test_add_column_type  # per-column types are applied when reading
+        options = Arrow::CSVReadOptions.new
+        options.add_column_type("count", Arrow::UInt8DataType.new)
+        options.add_column_type("valid", Arrow::BooleanDataType.new)
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+count,valid
+2,1
+9,0
+        CSV
+        columns = {
+          "count" => build_uint8_array([2, 9]),
+          "valid" => build_boolean_array([true, false]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_add_schema  # same expectation, types supplied via a schema
+        options = Arrow::CSVReadOptions.new
+        fields = [
+          Arrow::Field.new("count", Arrow::UInt8DataType.new),
+          Arrow::Field.new("valid", Arrow::BooleanDataType.new),
+        ]
+        schema = Arrow::Schema.new(fields)
+        options.add_schema(schema)
+        table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+count,valid
+2,1
+9,0
+        CSV
+        columns = {
+          "count" => build_uint8_array([2, 9]),
+          "valid" => build_boolean_array([true, false]),
+        }
+        assert_equal(build_table(columns),
+                     table.read)
+      end
+
+      def test_column_types  # added types are readable back as a name => type hash
+        require_gi_bindings(3, 3, 1)  # NOTE(review): presumably a binding version gate - confirm
+        options = Arrow::CSVReadOptions.new
+        options.add_column_type("count", Arrow::UInt8DataType.new)
+        options.add_column_type("valid", Arrow::BooleanDataType.new)
+        assert_equal({
+                       "count" => Arrow::UInt8DataType.new,
+                       "valid" => Arrow::BooleanDataType.new,
+                     },
+                     options.column_types)
+      end
     end
   end
 end