You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/04/02 17:50:20 UTC
[arrow] branch master updated: ARROW-3791: [C++ / Python] Add boolean type inference to the CSV parser

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f7ef65e  ARROW-3791: [C++ / Python] Add boolean type inference to the CSV parser
f7ef65e is described below

commit f7ef65e5fc367f1f5649dfcea0754e413fcca394
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Apr 2 12:50:11 2019 -0500

    ARROW-3791: [C++ / Python] Add boolean type inference to the CSV parser
    
    The set of recognized values can be customized using arrow::csv::ConvertOptions.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #4055 from pitrou/ARROW-3791-csv-bool-type-inference and squashes the following commits:
    
    7941188ec <Antoine Pitrou> ARROW-3791: Add boolean type inference to the CSV parser
---
 cpp/src/arrow/csv/column-builder-test.cc | 31 ++++++++++++++
 cpp/src/arrow/csv/column-builder.cc      |  9 +++-
 cpp/src/arrow/csv/converter.cc           | 71 ++++++++++++++++++++++++++++----
 cpp/src/arrow/csv/options.cc             |  4 +-
 cpp/src/arrow/csv/options.h              |  3 ++
 python/pyarrow/_csv.pyx                  | 35 +++++++++++++++-
 python/pyarrow/includes/libarrow.pxd     |  2 +
 python/pyarrow/tests/test_csv.py         | 54 ++++++++++++++++++++----
 8 files changed, 189 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/csv/column-builder-test.cc b/cpp/src/arrow/csv/column-builder-test.cc
index 5035f83..f2c39aa 100644
--- a/cpp/src/arrow/csv/column-builder-test.cc
+++ b/cpp/src/arrow/csv/column-builder-test.cc
@@ -189,6 +189,34 @@ TEST(InferringColumnBuilder, MultipleChunkInteger) {
   AssertChunkedEqual(*expected, *actual);
 }
 
+TEST(InferringColumnBuilder, SingleChunkBoolean) {
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+
+  std::shared_ptr<ChunkedArray> actual;
+  AssertBuilding(builder, {{"", "0", "FALSE"}}, &actual);
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<BooleanType, bool>({{false, true, true}},
+                                            {{false, false, false}}, &expected);
+  AssertChunkedEqual(*expected, *actual);
+}
+
+TEST(InferringColumnBuilder, MultipleChunkBoolean) {
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
+  ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+
+  std::shared_ptr<ChunkedArray> actual;
+  AssertBuilding(builder, {{""}, {"1", "True", "0"}}, &actual);
+
+  std::shared_ptr<ChunkedArray> expected;
+  ChunkedArrayFromVector<BooleanType, bool>({{false}, {true, true, true}},
+                                            {{false}, {true, true, false}}, &expected);
+  AssertChunkedEqual(*expected, *actual);
+}
+
 TEST(InferringColumnBuilder, SingleChunkReal) {
   auto tg = TaskGroup::MakeSerial();
   std::shared_ptr<ColumnBuilder> builder;
@@ -316,6 +344,9 @@ TEST(InferringColumnBuilder, MultipleChunkBinary) {
   AssertChunkedEqual(*expected, *actual);
 }
 
+// Parallel parsing is tested more comprehensively on the Python side
+// (see python/pyarrow/tests/test_csv.py)
+
 TEST(InferringColumnBuilder, MultipleChunkIntegerParallel) {
   auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
   std::shared_ptr<ColumnBuilder> builder;
diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc
index 1f37046..657aa6f 100644
--- a/cpp/src/arrow/csv/column-builder.cc
+++ b/cpp/src/arrow/csv/column-builder.cc
@@ -167,7 +167,7 @@ class InferringColumnBuilder : public ColumnBuilder {
   std::shared_ptr<Converter> converter_;
 
   // Current inference status
-  enum class InferKind { Null, Integer, Real, Timestamp, Text, Binary };
+  enum class InferKind { Null, Integer, Boolean, Real, Timestamp, Text, Binary };
 
   std::shared_ptr<DataType> infer_type_;
   InferKind infer_kind_;
@@ -191,6 +191,9 @@ Status InferringColumnBuilder::LoosenType() {
       infer_kind_ = InferKind::Integer;
       break;
     case InferKind::Integer:
+      infer_kind_ = InferKind::Boolean;
+      break;
+    case InferKind::Boolean:
       infer_kind_ = InferKind::Timestamp;
       break;
     case InferKind::Timestamp:
@@ -220,6 +223,10 @@ Status InferringColumnBuilder::UpdateType() {
       infer_type_ = int64();
       can_loosen_type_ = true;
       break;
+    case InferKind::Boolean:
+      infer_type_ = boolean();
+      can_loosen_type_ = true;
+      break;
     case InferKind::Timestamp:
       // We don't support parsing second fractions for now
       infer_type_ = timestamp(TimeUnit::SECOND);
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 22be7d6..c7a5c6f 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -56,6 +56,15 @@ inline bool IsWhitespace(uint8_t c) {
   return c == ' ' || c == '\t';
 }
 
+Status InitializeTrie(const std::vector<std::string>& inputs, Trie* trie) {
+  TrieBuilder builder;
+  for (const auto& s : inputs) {
+    RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
+  }
+  *trie = builder.Finish();
+  return Status::OK();
+}
+
 class ConcreteConverter : public Converter {
  public:
   using Converter::Converter;
@@ -69,12 +78,7 @@ class ConcreteConverter : public Converter {
 
 Status ConcreteConverter::Initialize() {
   // TODO no need to build a separate Trie for each Converter instance
-  TrieBuilder builder;
-  for (const auto& s : options_.null_values) {
-    RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
-  }
-  null_trie_ = builder.Finish();
-  return Status::OK();
+  return InitializeTrie(options_.null_values, &null_trie_);
 }
 
 bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) {
@@ -147,7 +151,7 @@ class VarSizeBinaryConverter : public ConcreteConverter {
  protected:
   Status Initialize() override {
     util::InitializeUTF8();
-    return Status::OK();
+    return ConcreteConverter::Initialize();
   }
 };
 
@@ -184,6 +188,57 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_
 }
 
 /////////////////////////////////////////////////////////////////////////
+// Concrete Converter for booleans
+
+class BooleanConverter : public ConcreteConverter {
+ public:
+  using ConcreteConverter::ConcreteConverter;
+
+  Status Convert(const BlockParser& parser, int32_t col_index,
+                 std::shared_ptr<Array>* out) override;
+
+ protected:
+  Status Initialize() override {
+    // TODO no need to build separate Tries for each BooleanConverter instance
+    RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
+    RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
+    return ConcreteConverter::Initialize();
+  }
+
+  Trie true_trie_;
+  Trie false_trie_;
+};
+
+Status BooleanConverter::Convert(const BlockParser& parser, int32_t col_index,
+                                 std::shared_ptr<Array>* out) {
+  BooleanBuilder builder(type_, pool_);
+
+  auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+    // XXX should quoted values be allowed at all?
+    if (IsNull(data, size, quoted)) {
+      builder.UnsafeAppendNull();
+      return Status::OK();
+    }
+    if (false_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
+        0) {
+      builder.UnsafeAppend(false);
+      return Status::OK();
+    }
+    if (true_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
+        0) {
+      builder.UnsafeAppend(true);
+      return Status::OK();
+    }
+    return GenericConversionError(type_, data, size);
+  };
+  RETURN_NOT_OK(builder.Resize(parser.num_rows()));
+  RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+  RETURN_NOT_OK(builder.Finish(out));
+
+  return Status::OK();
+}
+
+/////////////////////////////////////////////////////////////////////////
 // Concrete Converter for numbers
 
 template <typename T>
@@ -309,7 +364,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type,
     CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>)
     CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>)
     CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>)
-    CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>)
+    CONVERTER_CASE(Type::BOOL, BooleanConverter)
     CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter)
     CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
     CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc
index 01e687b..b6f1346 100644
--- a/cpp/src/arrow/csv/options.cc
+++ b/cpp/src/arrow/csv/options.cc
@@ -24,10 +24,12 @@ ParseOptions ParseOptions::Defaults() { return ParseOptions(); }
 
 ConvertOptions ConvertOptions::Defaults() {
   auto options = ConvertOptions();
-  // The default list of possible null spellings is taken from Pandas' read_csv().
+  // Same default null / true / false spellings as in Pandas.
   options.null_values = {"",     "#N/A", "#N/A N/A", "#NA",     "-1.#IND", "-1.#QNAN",
                          "-NaN", "-nan", "1.#IND",   "1.#QNAN", "N/A",     "NA",
                          "NULL", "NaN",  "n/a",      "nan",     "null"};
+  options.true_values = {"1", "True", "TRUE", "true"};
+  options.false_values = {"0", "False", "FALSE", "false"};
   return options;
 }
 
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 2b4653c..2014620 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -69,6 +69,9 @@ struct ARROW_EXPORT ConvertOptions {
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
   // Recognized spellings for null values
   std::vector<std::string> null_values;
+  // Recognized spellings for boolean values
+  std::vector<std::string> true_values;
+  std::vector<std::string> false_values;
 
   static ConvertOptions Defaults();
 };
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index db81046..b61c461 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -255,6 +255,12 @@ cdef class ConvertOptions:
     null_values: list, optional
         A sequence of strings that denote nulls in the data
         (defaults are appropriate in most cases).
+    true_values: list, optional
+        A sequence of strings that denote true booleans in the data
+        (defaults are appropriate in most cases).
+    false_values: list, optional
+        A sequence of strings that denote false booleans in the data
+        (defaults are appropriate in most cases).
     """
     cdef:
         CCSVConvertOptions options
@@ -262,7 +268,8 @@ cdef class ConvertOptions:
     # Avoid mistakingly creating attributes
     __slots__ = ()
 
-    def __init__(self, check_utf8=None, column_types=None, null_values=None):
+    def __init__(self, check_utf8=None, column_types=None, null_values=None,
+                 true_values=None, false_values=None):
         self.options = CCSVConvertOptions.Defaults()
         if check_utf8 is not None:
             self.check_utf8 = check_utf8
@@ -270,6 +277,10 @@ cdef class ConvertOptions:
             self.column_types = column_types
         if null_values is not None:
             self.null_values = null_values
+        if true_values is not None:
+            self.true_values = true_values
+        if false_values is not None:
+            self.false_values = false_values
 
     @property
     def check_utf8(self):
@@ -322,6 +333,28 @@ cdef class ConvertOptions:
     def null_values(self, value):
         self.options.null_values = [tobytes(x) for x in value]
 
+    @property
+    def true_values(self):
+        """
+        A sequence of strings that denote true booleans in the data.
+        """
+        return [frombytes(x) for x in self.options.true_values]
+
+    @true_values.setter
+    def true_values(self, value):
+        self.options.true_values = [tobytes(x) for x in value]
+
+    @property
+    def false_values(self):
+        """
+        A sequence of strings that denote false booleans in the data.
+        """
+        return [frombytes(x) for x in self.options.false_values]
+
+    @false_values.setter
+    def false_values(self, value):
+        self.options.false_values = [tobytes(x) for x in value]
+
 
 cdef _get_reader(input_file, shared_ptr[InputStream]* out):
     use_memory_map = False
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 06e7d43..e27f033 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1008,6 +1008,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
         c_bool check_utf8
         unordered_map[c_string, shared_ptr[CDataType]] column_types
         vector[c_string] null_values
+        vector[c_string] true_values
+        vector[c_string] false_values
 
         @staticmethod
         CCSVConvertOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 14ba999..6beebf4 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -157,11 +157,22 @@ def test_convert_options():
     opts.null_values = ['xxx', 'yyy']
     assert opts.null_values == ['xxx', 'yyy']
 
+    assert isinstance(opts.true_values, list)
+    opts.true_values = ['xxx', 'yyy']
+    assert opts.true_values == ['xxx', 'yyy']
+
+    assert isinstance(opts.false_values, list)
+    opts.false_values = ['xxx', 'yyy']
+    assert opts.false_values == ['xxx', 'yyy']
+
     opts = cls(check_utf8=False, column_types={'a': pa.null()},
-               null_values=['xxx', 'yyy'])
+               null_values=['N', 'nn'], true_values=['T', 'tt'],
+               false_values=['F', 'ff'])
     assert opts.check_utf8 is False
     assert opts.column_types == {'a': pa.null()}
-    assert opts.null_values == ['xxx', 'yyy']
+    assert opts.null_values == ['N', 'nn']
+    assert opts.false_values == ['F', 'ff']
+    assert opts.true_values == ['T', 'tt']
 
 
 class BaseTestCSVRead:
@@ -196,30 +207,33 @@ class BaseTestCSVRead:
 
     def test_simple_varied(self):
         # Infer various kinds of data
-        rows = b"a,b,c\n1,2,3\n4.0,-5,foo\n"
+        rows = b"a,b,c,d\n1,2,3,0\n4.0,-5,foo,True\n"
         table = self.read_bytes(rows)
         schema = pa.schema([('a', pa.float64()),
                             ('b', pa.int64()),
-                            ('c', pa.string())])
+                            ('c', pa.string()),
+                            ('d', pa.bool_())])
         assert table.schema == schema
         assert table.to_pydict() == {
             'a': [1.0, 4.0],
             'b': [2, -5],
             'c': [u"3", u"foo"],
+            'd': [False, True],
             }
 
     def test_simple_nulls(self):
         # Infer various kinds of data, with nulls
-        rows = (b"a,b,c,d,e\n"
-                b"1,2,,,3\n"
-                b"nan,-5,foo,,nan\n"
-                b"4.5,#N/A,nan,,\xff\n")
+        rows = (b"a,b,c,d,e,f\n"
+                b"1,2,,,3,N/A\n"
+                b"nan,-5,foo,,nan,TRUE\n"
+                b"4.5,#N/A,nan,,\xff,false\n")
         table = self.read_bytes(rows)
         schema = pa.schema([('a', pa.float64()),
                             ('b', pa.int64()),
                             ('c', pa.string()),
                             ('d', pa.null()),
-                            ('e', pa.binary())])
+                            ('e', pa.binary()),
+                            ('f', pa.bool_())])
         assert table.schema == schema
         assert table.to_pydict() == {
             'a': [1.0, None, 4.5],
@@ -227,6 +241,7 @@ class BaseTestCSVRead:
             'c': [u"", u"foo", u"nan"],
             'd': [None, None, None],
             'e': [b"3", b"nan", b"\xff"],
+            'f': [None, True, False],
             }
 
     def test_simple_timestamps(self):
@@ -269,6 +284,27 @@ class BaseTestCSVRead:
             'b': [u""],
             }
 
+    def test_custom_bools(self):
+        # Infer booleans with custom values
+        opts = ConvertOptions(true_values=['T', 'yes'],
+                              false_values=['F', 'no'])
+        rows = (b"a,b,c\n"
+                b"True,T,t\n"
+                b"False,F,f\n"
+                b"True,yes,yes\n"
+                b"False,no,no\n"
+                b"N/A,N/A,N/A\n")
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()),
+                            ('b', pa.bool_()),
+                            ('c', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {
+            'a': ["True", "False", "True", "False", "N/A"],
+            'b': [True, False, True, False, None],
+            'c': ["t", "f", "yes", "no", "N/A"],
+            }
+
     def test_column_types(self):
         # Ask for specific column types in ConvertOptions
         opts = ConvertOptions(column_types={'b': 'float32',