You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by sh...@apache.org on 2018/11/15 12:21:30 UTC
[arrow] branch master updated: ARROW-3798: [GLib] Add support for
column type CSV read option
This is an automated email from the ASF dual-hosted git repository.
shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 281eb22 ARROW-3798: [GLib] Add support for column type CSV read option
281eb22 is described below
commit 281eb22f8cb7f17afcdabf3795177c25063f4888
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Thu Nov 15 21:21:15 2018 +0900
ARROW-3798: [GLib] Add support for column type CSV read option
Author: Kouhei Sutou <ko...@clear-code.com>
Closes #2973 from kou/glib-csv-type and squashes the following commits:
3cb0d078 <Kouhei Sutou> Add column type CSV read option
---
c_glib/arrow-glib/reader.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++
c_glib/arrow-glib/reader.h | 9 ++++++
c_glib/test/test-csv-reader.rb | 64 ++++++++++++++++++++++++++++++---------
3 files changed, 127 insertions(+), 14 deletions(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 5253a45..b4b5c08 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -22,6 +22,7 @@
#endif
#include <arrow-glib/column.hpp>
+#include <arrow-glib/data-type.hpp>
#include <arrow-glib/error.hpp>
#include <arrow-glib/record-batch.hpp>
#include <arrow-glib/schema.hpp>
@@ -1276,6 +1277,73 @@ garrow_csv_read_options_new(void)
return GARROW_CSV_READ_OPTIONS(csv_read_options);
}
+/**
+ * garrow_csv_read_options_add_column_type:
+ * @options: A #GArrowCSVReadOptions.
+ * @name: The name of the target column.
+ * @data_type: The #GArrowDataType for the column.
+ *
+ * Registers @data_type as the value type of the column named @name
+ * in the column type mapping held by @options. The entry is assigned
+ * by key, so a type registered earlier under the same @name is
+ * overwritten.
+ *
+ * Since: 0.12.0
+ */
+void
+garrow_csv_read_options_add_column_type(GArrowCSVReadOptions *options,
+                                        const gchar *name,
+                                        GArrowDataType *data_type)
+{
+  auto arrow_data_type = garrow_data_type_get_raw(data_type);
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  /* Alias the mapping so that the assignment below reads as a plain
+   * "name -> type" registration. */
+  auto &column_types = priv->convert_options.column_types;
+  column_types[name] = arrow_data_type;
+}
+
+/**
+ * garrow_csv_read_options_add_schema:
+ * @options: A #GArrowCSVReadOptions.
+ * @schema: The #GArrowSchema that specifies columns and their types.
+ *
+ * Adds the value type of every field in @schema to the column type
+ * mapping of @options, using the field name as the column name. Each
+ * entry is assigned by key, so a type registered earlier under the
+ * same column name is overwritten.
+ *
+ * Since: 0.12.0
+ */
+void
+garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
+                                   GArrowSchema *schema)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  /* Bind each field by const reference: iterating by value copies
+   * the field handle on every iteration for no benefit. If fields()
+   * returns a temporary, the range-based for keeps it alive for the
+   * whole loop, so the reference stays valid. */
+  for (const auto &field : arrow_schema->fields()) {
+    priv->convert_options.column_types[field->name()] = field->type();
+  }
+}
+
+/**
+ * garrow_csv_read_options_get_column_types:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Builds a new hash table from the column type mapping of @options.
+ * Keys are copies of the column names and values are newly created
+ * #GArrowDataType objects. The table frees its keys with g_free()
+ * and releases its values with g_object_unref(), so the caller only
+ * has to release the table itself, e.g. with g_hash_table_unref().
+ *
+ * Returns: (transfer full) (element-type gchar* GArrowDataType):
+ *   The column name and value type mapping of the options.
+ *
+ * Since: 0.12.0
+ */
+GHashTable *
+garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  GHashTable *types = g_hash_table_new_full(g_str_hash,
+                                            g_str_equal,
+                                            g_free,
+                                            g_object_unref);
+  /* Iterate by const reference: iterating by value copies the whole
+   * name/type pair (including the name string) for every entry. */
+  for (const auto &iter : priv->convert_options.column_types) {
+    /* The name is only read to duplicate it, so a reference suffices. */
+    const auto &arrow_name = iter.first;
+    /* Keep a non-const local copy of the type handle because its
+     * address is passed to garrow_data_type_new_raw().
+     * NOTE(review): presumably that function needs a pointer to a
+     * mutable handle; confirm against its declaration. */
+    auto arrow_data_type = iter.second;
+    g_hash_table_insert(types,
+                        g_strdup(arrow_name.c_str()),
+                        garrow_data_type_new_raw(&arrow_data_type));
+  }
+  return types;
+}
+
typedef struct GArrowCSVReaderPrivate_ {
std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index d1a3947..de33a79 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -255,6 +255,15 @@ struct _GArrowCSVReadOptionsClass
};
GArrowCSVReadOptions *garrow_csv_read_options_new(void);
+/* Registers data_type as the value type of the column called name. */
+void
+garrow_csv_read_options_add_column_type(GArrowCSVReadOptions *options,
+ const gchar *name,
+ GArrowDataType *data_type);
+/* Registers the type of every field in schema under the field's name. */
+void
+garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options,
+ GArrowSchema *schema);
+/* Returns a newly created table mapping column names to their types. */
+GHashTable *
+garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options);
#define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 12897a8..3cae103 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -40,20 +40,56 @@ message,count
table.read)
end
- def test_options
- options = Arrow::CSVReadOptions.new
- options.quoted = false
- table = Arrow::CSVReader.new(open_input(<<-CSV), options)
-message,count
-"Start",2
-"Shutdown",9
- CSV
- columns = {
- "message" => build_string_array(["\"Start\"", "\"Shutdown\""]),
- "count" => build_int64_array([2, 9]),
- }
- assert_equal(build_table(columns),
- table.read)
+ sub_test_case("options") do
+ def test_add_column_type
+ options = Arrow::CSVReadOptions.new
+ options.add_column_type("count", Arrow::UInt8DataType.new)
+ options.add_column_type("valid", Arrow::BooleanDataType.new)
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+count,valid
+2,1
+9,0
+ CSV
+ columns = {
+ "count" => build_uint8_array([2, 9]),
+ "valid" => build_boolean_array([true, false]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_add_schema
+ options = Arrow::CSVReadOptions.new
+ fields = [
+ Arrow::Field.new("count", Arrow::UInt8DataType.new),
+ Arrow::Field.new("valid", Arrow::BooleanDataType.new),
+ ]
+ schema = Arrow::Schema.new(fields)
+ options.add_schema(schema)
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+count,valid
+2,1
+9,0
+ CSV
+ columns = {
+ "count" => build_uint8_array([2, 9]),
+ "valid" => build_boolean_array([true, false]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_column_types
+ require_gi_bindings(3, 3, 1)
+ options = Arrow::CSVReadOptions.new
+ options.add_column_type("count", Arrow::UInt8DataType.new)
+ options.add_column_type("valid", Arrow::BooleanDataType.new)
+ assert_equal({
+ "count" => Arrow::UInt8DataType.new,
+ "valid" => Arrow::BooleanDataType.new,
+ },
+ options.column_types)
+ end
end
end
end