You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/04/23 15:20:00 UTC

[arrow] branch master updated: ARROW-5195: [C++] Detect null strings in CSV string columns

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 277307d  ARROW-5195: [C++] Detect null strings in CSV string columns
277307d is described below

commit 277307de4462bb38b177df501f5f949654965a77
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Apr 23 17:19:52 2019 +0200

    ARROW-5195: [C++] Detect null strings in CSV string columns
    
    By default, when converting CSV to a string column, all CSV values are considered valid.
    
    Add an option so that strings such as "N/A" etc. are considered null.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #4188 from pitrou/ARROW-5195-csv-allow-null-strings and squashes the following commits:
    
    9dfff4de4 <Antoine Pitrou> ARROW-5195:  Detect null strings in CSV string columns
---
 cpp/src/arrow/csv/converter-test.cc  | 24 ++++++++++++++++++++++++
 cpp/src/arrow/csv/converter.cc       | 21 +++++++++++++++++----
 cpp/src/arrow/csv/options.h          |  4 ++++
 python/pyarrow/_csv.pyx              | 21 ++++++++++++++++++++-
 python/pyarrow/includes/libarrow.pxd |  1 +
 python/pyarrow/tests/test_csv.py     | 17 ++++++++++++++++-
 6 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc
index 97cee58..e3612e7 100644
--- a/cpp/src/arrow/csv/converter-test.cc
+++ b/cpp/src/arrow/csv/converter-test.cc
@@ -121,6 +121,18 @@ TEST(BinaryConversion, Basics) {
                                             {{"ab", ""}, {"cdé", "\xffgh"}});
 }
 
+TEST(BinaryConversion, Nulls) {
+  AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
+                                            {{"ab", "NULL"}, {"N/A", ""}},
+                                            {{true, true}, {true, true}});
+
+  auto options = ConvertOptions::Defaults();
+  options.strings_can_be_null = true;
+  AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
+                                            {{"ab", ""}, {"", ""}},
+                                            {{true, false}, {false, true}}, options);
+}
+
 TEST(StringConversion, Basics) {
   AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
                                             {{"ab", ""}, {"cdé", "gh"}});
@@ -131,6 +143,18 @@ TEST(StringConversion, Basics) {
                                             {{"ab", ""}, {"cdé", "\xffgh"}}, options);
 }
 
+TEST(StringConversion, Nulls) {
+  AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
+                                            {{"ab", "NULL"}, {"N/A", ""}},
+                                            {{true, true}, {true, true}});
+
+  auto options = ConvertOptions::Defaults();
+  options.strings_can_be_null = true;
+  AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
+                                            {{"ab", ""}, {"", ""}},
+                                            {{true, false}, {false, true}}, options);
+}
+
 TEST(StringConversion, Errors) {
   // Invalid UTF8 in column 0
   AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index c7a5c6f..d1ff8d7 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -130,9 +130,7 @@ class VarSizeBinaryConverter : public ConcreteConverter {
     using BuilderType = typename TypeTraits<T>::BuilderType;
     BuilderType builder(pool_);
 
-    // TODO do we accept nulls here?
-
-    auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+    auto visit_non_null = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
       if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
         return Status::Invalid("CSV conversion error to ", type_->ToString(),
                                ": invalid UTF8 data");
@@ -140,9 +138,24 @@ class VarSizeBinaryConverter : public ConcreteConverter {
       builder.UnsafeAppend(data, size);
       return Status::OK();
     };
+
     RETURN_NOT_OK(builder.Resize(parser.num_rows()));
     RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
-    RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+
+    if (options_.strings_can_be_null) {
+      auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+        if (size > 0 && IsNull(data, size, false /* quoted */)) {
+          builder.UnsafeAppendNull();
+          return Status::OK();
+        } else {
+          return visit_non_null(data, size, quoted);
+        }
+      };
+      RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+    } else {
+      RETURN_NOT_OK(parser.VisitColumn(col_index, visit_non_null));
+    }
+
     RETURN_NOT_OK(builder.Finish(out));
 
     return Status::OK();
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 2014620..9cd312a 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -72,6 +72,10 @@ struct ARROW_EXPORT ConvertOptions {
   // Recognized spellings for boolean values
   std::vector<std::string> true_values;
   std::vector<std::string> false_values;
+  // Whether string / binary columns can have null values.
+  // If true, then strings in "null_values" are considered null for string columns.
+  // If false, then all strings are valid string values.
+  bool strings_can_be_null = false;
 
   static ConvertOptions Defaults();
 };
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index b61c461..2932a2d 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -261,6 +261,11 @@ cdef class ConvertOptions:
     false_values: list, optional
         A sequence of strings that denote false booleans in the data
         (defaults are appropriate in most cases).
+    strings_can_be_null: bool, optional (default False)
+        Whether string / binary columns can have null values.
+        If true, then strings in null_values are considered null for
+        string columns.
+        If false, then all strings are valid string values.
     """
     cdef:
         CCSVConvertOptions options
@@ -269,7 +274,8 @@ cdef class ConvertOptions:
     __slots__ = ()
 
     def __init__(self, check_utf8=None, column_types=None, null_values=None,
-                 true_values=None, false_values=None):
+                 true_values=None, false_values=None,
+                 strings_can_be_null=None):
         self.options = CCSVConvertOptions.Defaults()
         if check_utf8 is not None:
             self.check_utf8 = check_utf8
@@ -281,6 +287,8 @@ cdef class ConvertOptions:
             self.true_values = true_values
         if false_values is not None:
             self.false_values = false_values
+        if strings_can_be_null is not None:
+            self.strings_can_be_null = strings_can_be_null
 
     @property
     def check_utf8(self):
@@ -294,6 +302,17 @@ cdef class ConvertOptions:
         self.options.check_utf8 = value
 
     @property
+    def strings_can_be_null(self):
+        """
+        Whether string / binary columns can have null values.
+        """
+        return self.options.strings_can_be_null
+
+    @strings_can_be_null.setter
+    def strings_can_be_null(self, value):
+        self.options.strings_can_be_null = value
+
+    @property
     def column_types(self):
         """
         Map column names to column types
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1649ee6..ba212a0 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1014,6 +1014,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
         vector[c_string] null_values
         vector[c_string] true_values
         vector[c_string] false_values
+        c_bool strings_can_be_null
 
         @staticmethod
         CCSVConvertOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 40a059f..b79d536 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -132,6 +132,10 @@ def test_convert_options():
     opts.check_utf8 = False
     assert opts.check_utf8 is False
 
+    assert opts.strings_can_be_null is False
+    opts.strings_can_be_null = True
+    assert opts.strings_can_be_null is True
+
     assert opts.column_types == {}
     # Pass column_types as mapping
     opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
@@ -167,12 +171,13 @@ def test_convert_options():
 
     opts = cls(check_utf8=False, column_types={'a': pa.null()},
                null_values=['N', 'nn'], true_values=['T', 'tt'],
-               false_values=['F', 'ff'])
+               false_values=['F', 'ff'], strings_can_be_null=True)
     assert opts.check_utf8 is False
     assert opts.column_types == {'a': pa.null()}
     assert opts.null_values == ['N', 'nn']
     assert opts.false_values == ['F', 'ff']
     assert opts.true_values == ['T', 'tt']
+    assert opts.strings_can_be_null is True
 
 
 class BaseTestCSVRead:
@@ -284,6 +289,16 @@ class BaseTestCSVRead:
             'd': [2, None],
             }
 
+        opts = ConvertOptions(null_values=['Xxx', 'Zzz'],
+                              strings_can_be_null=True)
+        table = self.read_bytes(rows, convert_options=opts)
+        assert table.to_pydict() == {
+            'a': [None, None],
+            'b': [None, u"#N/A"],
+            'c': [u"1", u""],
+            'd': [2, None],
+            }
+
         opts = ConvertOptions(null_values=[])
         rows = b"a,b\n#N/A,\n"
         table = self.read_bytes(rows, convert_options=opts)