You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/04/23 15:20:00 UTC
[arrow] branch master updated: ARROW-5195: [C++] Detect null
strings in CSV string columns
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 277307d ARROW-5195: [C++] Detect null strings in CSV string columns
277307d is described below
commit 277307de4462bb38b177df501f5f949654965a77
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Apr 23 17:19:52 2019 +0200
ARROW-5195: [C++] Detect null strings in CSV string columns
By default, when converting CSV to a string column, all CSV values are considered valid.
Add an option so that strings such as "N/A" etc. are considered null.
Author: Antoine Pitrou <an...@python.org>
Closes #4188 from pitrou/ARROW-5195-csv-allow-null-strings and squashes the following commits:
9dfff4de4 <Antoine Pitrou> ARROW-5195: Detect null strings in CSV string columns
---
cpp/src/arrow/csv/converter-test.cc | 24 ++++++++++++++++++++++++
cpp/src/arrow/csv/converter.cc | 21 +++++++++++++++++----
cpp/src/arrow/csv/options.h | 4 ++++
python/pyarrow/_csv.pyx | 21 ++++++++++++++++++++-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/tests/test_csv.py | 17 ++++++++++++++++-
6 files changed, 82 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc
index 97cee58..e3612e7 100644
--- a/cpp/src/arrow/csv/converter-test.cc
+++ b/cpp/src/arrow/csv/converter-test.cc
@@ -121,6 +121,18 @@ TEST(BinaryConversion, Basics) {
{{"ab", ""}, {"cdé", "\xffgh"}});
}
+TEST(BinaryConversion, Nulls) {
+ AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
+ {{"ab", "NULL"}, {"N/A", ""}},
+ {{true, true}, {true, true}});
+
+ auto options = ConvertOptions::Defaults();
+ options.strings_can_be_null = true;
+ AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
+ {{"ab", ""}, {"", ""}},
+ {{true, false}, {false, true}}, options);
+}
+
TEST(StringConversion, Basics) {
AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
{{"ab", ""}, {"cdé", "gh"}});
@@ -131,6 +143,18 @@ TEST(StringConversion, Basics) {
{{"ab", ""}, {"cdé", "\xffgh"}}, options);
}
+TEST(StringConversion, Nulls) {
+ AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
+ {{"ab", "NULL"}, {"N/A", ""}},
+ {{true, true}, {true, true}});
+
+ auto options = ConvertOptions::Defaults();
+ options.strings_can_be_null = true;
+ AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
+ {{"ab", ""}, {"", ""}},
+ {{true, false}, {false, true}}, options);
+}
+
TEST(StringConversion, Errors) {
// Invalid UTF8 in column 0
AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index c7a5c6f..d1ff8d7 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -130,9 +130,7 @@ class VarSizeBinaryConverter : public ConcreteConverter {
using BuilderType = typename TypeTraits<T>::BuilderType;
BuilderType builder(pool_);
- // TODO do we accept nulls here?
-
- auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ auto visit_non_null = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": invalid UTF8 data");
@@ -140,9 +138,24 @@ class VarSizeBinaryConverter : public ConcreteConverter {
builder.UnsafeAppend(data, size);
return Status::OK();
};
+
RETURN_NOT_OK(builder.Resize(parser.num_rows()));
RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
- RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+
+ if (options_.strings_can_be_null) {
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ if (size > 0 && IsNull(data, size, false /* quoted */)) {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ } else {
+ return visit_non_null(data, size, quoted);
+ }
+ };
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ } else {
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit_non_null));
+ }
+
RETURN_NOT_OK(builder.Finish(out));
return Status::OK();
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 2014620..9cd312a 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -72,6 +72,10 @@ struct ARROW_EXPORT ConvertOptions {
// Recognized spellings for boolean values
std::vector<std::string> true_values;
std::vector<std::string> false_values;
+ // Whether string / binary columns can have null values.
+ // If true, then strings in "null_values" are considered null for string columns.
+ // If false, then all strings are valid string values.
+ bool strings_can_be_null = false;
static ConvertOptions Defaults();
};
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index b61c461..2932a2d 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -261,6 +261,11 @@ cdef class ConvertOptions:
false_values: list, optional
A sequence of strings that denote false booleans in the data
(defaults are appropriate in most cases).
+ strings_can_be_null: bool, optional (default False)
+ Whether string / binary columns can have null values.
+ If true, then strings in null_values are considered null for
+ string columns.
+ If false, then all strings are valid string values.
"""
cdef:
CCSVConvertOptions options
@@ -269,7 +274,8 @@ cdef class ConvertOptions:
__slots__ = ()
def __init__(self, check_utf8=None, column_types=None, null_values=None,
- true_values=None, false_values=None):
+ true_values=None, false_values=None,
+ strings_can_be_null=None):
self.options = CCSVConvertOptions.Defaults()
if check_utf8 is not None:
self.check_utf8 = check_utf8
@@ -281,6 +287,8 @@ cdef class ConvertOptions:
self.true_values = true_values
if false_values is not None:
self.false_values = false_values
+ if strings_can_be_null is not None:
+ self.strings_can_be_null = strings_can_be_null
@property
def check_utf8(self):
@@ -294,6 +302,17 @@ cdef class ConvertOptions:
self.options.check_utf8 = value
@property
+ def strings_can_be_null(self):
+ """
+ Whether string / binary columns can have null values.
+ """
+ return self.options.strings_can_be_null
+
+ @strings_can_be_null.setter
+ def strings_can_be_null(self, value):
+ self.options.strings_can_be_null = value
+
+ @property
def column_types(self):
"""
Map column names to column types
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1649ee6..ba212a0 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1014,6 +1014,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
vector[c_string] null_values
vector[c_string] true_values
vector[c_string] false_values
+ c_bool strings_can_be_null
@staticmethod
CCSVConvertOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 40a059f..b79d536 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -132,6 +132,10 @@ def test_convert_options():
opts.check_utf8 = False
assert opts.check_utf8 is False
+ assert opts.strings_can_be_null is False
+ opts.strings_can_be_null = True
+ assert opts.strings_can_be_null is True
+
assert opts.column_types == {}
# Pass column_types as mapping
opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
@@ -167,12 +171,13 @@ def test_convert_options():
opts = cls(check_utf8=False, column_types={'a': pa.null()},
null_values=['N', 'nn'], true_values=['T', 'tt'],
- false_values=['F', 'ff'])
+ false_values=['F', 'ff'], strings_can_be_null=True)
assert opts.check_utf8 is False
assert opts.column_types == {'a': pa.null()}
assert opts.null_values == ['N', 'nn']
assert opts.false_values == ['F', 'ff']
assert opts.true_values == ['T', 'tt']
+ assert opts.strings_can_be_null is True
class BaseTestCSVRead:
@@ -284,6 +289,16 @@ class BaseTestCSVRead:
'd': [2, None],
}
+ opts = ConvertOptions(null_values=['Xxx', 'Zzz'],
+ strings_can_be_null=True)
+ table = self.read_bytes(rows, convert_options=opts)
+ assert table.to_pydict() == {
+ 'a': [None, None],
+ 'b': [None, u"#N/A"],
+ 'c': [u"1", u""],
+ 'd': [2, None],
+ }
+
opts = ConvertOptions(null_values=[])
rows = b"a,b\n#N/A,\n"
table = self.read_bytes(rows, convert_options=opts)