You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/25 21:00:09 UTC
[arrow] branch master updated: ARROW-6036: [GLib] Add support for
skip rows and column_names CSV read option
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ecb29d4 ARROW-6036: [GLib] Add support for skip rows and column_names CSV read option
ecb29d4 is described below
commit ecb29d432579a44471d5e1455179c3bafaaaf3d6
Author: Yosuke Shiro <yo...@gmail.com>
AuthorDate: Fri Jul 26 05:59:49 2019 +0900
ARROW-6036: [GLib] Add support for skip rows and column_names CSV read option
Closes #4944 from shiro615/glib-add-csv-reader-options and squashes the following commits:
8aa4c3828 <Yosuke Shiro> Add garrow_csv_read_options_set_column_names()
5abba2c28 <Yosuke Shiro> Add n_skip_rows option for GArrowCSVReadOptions
Authored-by: Yosuke Shiro <yo...@gmail.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
c_glib/arrow-glib/reader.cpp | 90 ++++++++++++++++++++++++++++++++++++++++++
c_glib/arrow-glib/reader.h | 12 ++++++
c_glib/test/test-csv-reader.rb | 44 +++++++++++++++++++++
3 files changed, 146 insertions(+)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index fdc8ffe..cf12220 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -894,6 +894,7 @@ typedef struct GArrowCSVReadOptionsPrivate_ {
enum {
PROP_USE_THREADS = 1,
PROP_BLOCK_SIZE,
+ PROP_N_SKIP_ROWS,
PROP_DELIMITER,
PROP_IS_QUOTED,
PROP_QUOTE_CHARACTER,
@@ -930,6 +931,9 @@ garrow_csv_read_options_set_property(GObject *object,
case PROP_BLOCK_SIZE:
priv->read_options.block_size = g_value_get_int(value);
break;
+ case PROP_N_SKIP_ROWS:
+ priv->read_options.skip_rows = g_value_get_uint(value);
+ break;
case PROP_DELIMITER:
priv->parse_options.delimiter = g_value_get_schar(value);
break;
@@ -981,6 +985,9 @@ garrow_csv_read_options_get_property(GObject *object,
case PROP_BLOCK_SIZE:
g_value_set_int(value, priv->read_options.block_size);
break;
+ case PROP_N_SKIP_ROWS:
+ g_value_set_uint(value, priv->read_options.skip_rows);
+ break;
case PROP_DELIMITER:
g_value_set_schar(value, priv->parse_options.delimiter);
break;
@@ -1071,6 +1078,24 @@ garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class, PROP_BLOCK_SIZE, spec);
+ /**
+ * GArrowCSVReadOptions:n-skip-rows:
+ *
+ * The number of header rows to skip (not including
+ * the row of column names, if any)
+ *
+ * Since: 1.0.0
+ */
+ spec = g_param_spec_uint("n-skip-rows",
+ "N skip rows",
+ "The number of header rows to skip "
+ "(not including the row of column names, if any)",
+ 0,
+ G_MAXUINT,
+ read_options.skip_rows,
+ static_cast<GParamFlags>(G_PARAM_READWRITE));
+ g_object_class_install_property(gobject_class, PROP_N_SKIP_ROWS, spec);
+
auto parse_options = arrow::csv::ParseOptions::Defaults();
@@ -1514,6 +1539,71 @@ garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
priv->convert_options.false_values.push_back(false_value);
}
+/**
+ * garrow_csv_read_options_set_column_names:
+ * @options: A #GArrowCSVReadOptions.
+ * @column_names: (array length=n_column_names):
+ *   The column names to use (if empty, they will be read from the
+ *   first row after the skipped rows, see
+ *   #GArrowCSVReadOptions:n-skip-rows).
+ * @n_column_names: The number of elements in @column_names.
+ *
+ * Replaces the column names stored in @options with copies of the
+ * given strings.
+ *
+ * Since: 1.0.0
+ */
+void
+garrow_csv_read_options_set_column_names(GArrowCSVReadOptions *options,
+                                         const gchar **column_names,
+                                         gsize n_column_names)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  auto &arrow_column_names = priv->read_options.column_names;
+  // Drop the previous names and copy each C string into its own element.
+  arrow_column_names.assign(column_names, column_names + n_column_names);
+}
+
+/**
+ * garrow_csv_read_options_get_column_names:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full):
+ *   The column names. It's a %NULL-terminated string array.
+ *   If the number of values is zero, this returns %NULL.
+ *   It must be freed with g_strfreev() when no longer needed.
+ *
+ * Since: 1.0.0
+ */
+gchar **
+garrow_csv_read_options_get_column_names(GArrowCSVReadOptions *options)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  const auto &arrow_column_names = priv->read_options.column_names;
+  if (arrow_column_names.empty()) {
+    return NULL;
+  }
+
+  const auto n = arrow_column_names.size();
+  // One extra slot holds the terminating NULL that g_strfreev() relies on.
+  gchar **column_names = g_new(gchar *, n + 1);
+  for (size_t i = 0; i < n; ++i) {
+    // Each element is duplicated so that the caller owns the whole array.
+    column_names[i] = g_strdup(arrow_column_names[i].c_str());
+  }
+  column_names[n] = NULL;
+  return column_names;
+}
+
+/**
+ * garrow_csv_read_options_add_column_name:
+ * @options: A #GArrowCSVReadOptions.
+ * @column_name: The column name to be added.
+ *
+ * Appends a copy of @column_name to the end of the column names
+ * stored in @options.
+ *
+ * Since: 1.0.0
+ */
+void
+garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
+                                        const gchar *column_name)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  priv->read_options.column_names.push_back(column_name);
+}
typedef struct GArrowCSVReaderPrivate_ {
std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index ff83e24..6241792 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -298,6 +298,18 @@ GARROW_AVAILABLE_IN_0_14
void
garrow_csv_read_options_add_false_value(GArrowCSVReadOptions *options,
const gchar *false_value);
+GARROW_AVAILABLE_IN_1_0
+void
+garrow_csv_read_options_set_column_names(GArrowCSVReadOptions *options,
+ const gchar **column_names,
+ gsize n_column_names);
+GARROW_AVAILABLE_IN_1_0
+gchar **
+garrow_csv_read_options_get_column_names(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_1_0
+void
+garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
+ const gchar *column_name);
#define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader,
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 9695b3a..99299fc 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -176,6 +176,50 @@ message,count
assert_equal(build_table(columns),
table.read)
end
+
+ def test_n_skip_rows
+ options = Arrow::CSVReadOptions.new
+ options.n_skip_rows = 1
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+message1,message2
+"Start1","Start2"
+"Shutdown1","Shutdown2"
+"Reboot1","Reboot2"
+ CSV
+ columns = {
+ "Start1" => build_string_array(["Shutdown1", "Reboot1"]),
+ "Start2" => build_string_array(["Shutdown2", "Reboot2"]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_column_names
+ options = Arrow::CSVReadOptions.new
+ column_names = ["message", "count"]
+ options.column_names = column_names
+ assert_equal(column_names, options.column_names)
+
+ table = Arrow::CSVReader.new(open_input(<<-CSV), options)
+"Start",2
+"Shutdown",9
+"Reboot",5
+ CSV
+ columns = {
+ "message" => build_string_array(["Start", "Shutdown", "Reboot"]),
+ "count" => build_int64_array([2, 9, 5]),
+ }
+ assert_equal(build_table(columns),
+ table.read)
+ end
+
+ def test_add_column_name
+ options = Arrow::CSVReadOptions.new
+ column_names = ["message", "count"]
+ options.column_names = column_names
+ options.add_column_name("score")
+ assert_equal(column_names + ["score"], options.column_names)
+ end
end
end
end