You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/04/02 17:50:20 UTC
[arrow] branch master updated: ARROW-3791: [C++ / Python] Add
boolean type inference to the CSV parser
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f7ef65e ARROW-3791: [C++ / Python] Add boolean type inference to the CSV parser
f7ef65e is described below
commit f7ef65e5fc367f1f5649dfcea0754e413fcca394
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Apr 2 12:50:11 2019 -0500
ARROW-3791: [C++ / Python] Add boolean type inference to the CSV parser
The set of recognized values can be customized using arrow::csv::ConvertOptions.
Author: Antoine Pitrou <an...@python.org>
Closes #4055 from pitrou/ARROW-3791-csv-bool-type-inference and squashes the following commits:
7941188ec <Antoine Pitrou> ARROW-3791: Add boolean type inference to the CSV parser
---
cpp/src/arrow/csv/column-builder-test.cc | 31 ++++++++++++++
cpp/src/arrow/csv/column-builder.cc | 9 +++-
cpp/src/arrow/csv/converter.cc | 71 ++++++++++++++++++++++++++++----
cpp/src/arrow/csv/options.cc | 4 +-
cpp/src/arrow/csv/options.h | 3 ++
python/pyarrow/_csv.pyx | 35 +++++++++++++++-
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/tests/test_csv.py | 54 ++++++++++++++++++++----
8 files changed, 189 insertions(+), 20 deletions(-)
diff --git a/cpp/src/arrow/csv/column-builder-test.cc b/cpp/src/arrow/csv/column-builder-test.cc
index 5035f83..f2c39aa 100644
--- a/cpp/src/arrow/csv/column-builder-test.cc
+++ b/cpp/src/arrow/csv/column-builder-test.cc
@@ -189,6 +189,34 @@ TEST(InferringColumnBuilder, MultipleChunkInteger) {
AssertChunkedEqual(*expected, *actual);
}
+TEST(InferringColumnBuilder, SingleChunkBoolean) {
+ auto tg = TaskGroup::MakeSerial();
+ std::shared_ptr<ColumnBuilder> builder;
+ ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+
+ std::shared_ptr<ChunkedArray> actual;
+ AssertBuilding(builder, {{"", "0", "FALSE"}}, &actual);
+
+ std::shared_ptr<ChunkedArray> expected;
+ ChunkedArrayFromVector<BooleanType, bool>({{false, true, true}},
+ {{false, false, false}}, &expected);
+ AssertChunkedEqual(*expected, *actual);
+}
+
+TEST(InferringColumnBuilder, MultipleChunkBoolean) {
+ auto tg = TaskGroup::MakeSerial();
+ std::shared_ptr<ColumnBuilder> builder;
+ ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+
+ std::shared_ptr<ChunkedArray> actual;
+ AssertBuilding(builder, {{""}, {"1", "True", "0"}}, &actual);
+
+ std::shared_ptr<ChunkedArray> expected;
+ ChunkedArrayFromVector<BooleanType, bool>({{false}, {true, true, true}},
+ {{false}, {true, true, false}}, &expected);
+ AssertChunkedEqual(*expected, *actual);
+}
+
TEST(InferringColumnBuilder, SingleChunkReal) {
auto tg = TaskGroup::MakeSerial();
std::shared_ptr<ColumnBuilder> builder;
@@ -316,6 +344,9 @@ TEST(InferringColumnBuilder, MultipleChunkBinary) {
AssertChunkedEqual(*expected, *actual);
}
+// Parallel parsing is tested more comprehensively on the Python side
+// (see python/pyarrow/tests/test_csv.py)
+
TEST(InferringColumnBuilder, MultipleChunkIntegerParallel) {
auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
std::shared_ptr<ColumnBuilder> builder;
diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc
index 1f37046..657aa6f 100644
--- a/cpp/src/arrow/csv/column-builder.cc
+++ b/cpp/src/arrow/csv/column-builder.cc
@@ -167,7 +167,7 @@ class InferringColumnBuilder : public ColumnBuilder {
std::shared_ptr<Converter> converter_;
// Current inference status
- enum class InferKind { Null, Integer, Real, Timestamp, Text, Binary };
+ enum class InferKind { Null, Integer, Boolean, Real, Timestamp, Text, Binary };
std::shared_ptr<DataType> infer_type_;
InferKind infer_kind_;
@@ -191,6 +191,9 @@ Status InferringColumnBuilder::LoosenType() {
infer_kind_ = InferKind::Integer;
break;
case InferKind::Integer:
+ infer_kind_ = InferKind::Boolean;
+ break;
+ case InferKind::Boolean:
infer_kind_ = InferKind::Timestamp;
break;
case InferKind::Timestamp:
@@ -220,6 +223,10 @@ Status InferringColumnBuilder::UpdateType() {
infer_type_ = int64();
can_loosen_type_ = true;
break;
+ case InferKind::Boolean:
+ infer_type_ = boolean();
+ can_loosen_type_ = true;
+ break;
case InferKind::Timestamp:
// We don't support parsing second fractions for now
infer_type_ = timestamp(TimeUnit::SECOND);
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 22be7d6..c7a5c6f 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -56,6 +56,15 @@ inline bool IsWhitespace(uint8_t c) {
return c == ' ' || c == '\t';
}
+Status InitializeTrie(const std::vector<std::string>& inputs, Trie* trie) {
+ TrieBuilder builder;
+ for (const auto& s : inputs) {
+ RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
+ }
+ *trie = builder.Finish();
+ return Status::OK();
+}
+
class ConcreteConverter : public Converter {
public:
using Converter::Converter;
@@ -69,12 +78,7 @@ class ConcreteConverter : public Converter {
Status ConcreteConverter::Initialize() {
// TODO no need to build a separate Trie for each Converter instance
- TrieBuilder builder;
- for (const auto& s : options_.null_values) {
- RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
- }
- null_trie_ = builder.Finish();
- return Status::OK();
+ return InitializeTrie(options_.null_values, &null_trie_);
}
bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) {
@@ -147,7 +151,7 @@ class VarSizeBinaryConverter : public ConcreteConverter {
protected:
Status Initialize() override {
util::InitializeUTF8();
- return Status::OK();
+ return ConcreteConverter::Initialize();
}
};
@@ -184,6 +188,57 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_
}
/////////////////////////////////////////////////////////////////////////
+// Concrete Converter for booleans
+
+class BooleanConverter : public ConcreteConverter {
+ public:
+ using ConcreteConverter::ConcreteConverter;
+
+ Status Convert(const BlockParser& parser, int32_t col_index,
+ std::shared_ptr<Array>* out) override;
+
+ protected:
+ Status Initialize() override {
+ // TODO no need to build separate Tries for each BooleanConverter instance
+ RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
+ RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
+ return ConcreteConverter::Initialize();
+ }
+
+ Trie true_trie_;
+ Trie false_trie_;
+};
+
+Status BooleanConverter::Convert(const BlockParser& parser, int32_t col_index,
+ std::shared_ptr<Array>* out) {
+ BooleanBuilder builder(type_, pool_);
+
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ // XXX should quoted values be allowed at all?
+ if (IsNull(data, size, quoted)) {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }
+ if (false_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
+ 0) {
+ builder.UnsafeAppend(false);
+ return Status::OK();
+ }
+ if (true_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
+ 0) {
+ builder.UnsafeAppend(true);
+ return Status::OK();
+ }
+ return GenericConversionError(type_, data, size);
+ };
+ RETURN_NOT_OK(builder.Resize(parser.num_rows()));
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ RETURN_NOT_OK(builder.Finish(out));
+
+ return Status::OK();
+}
+
+/////////////////////////////////////////////////////////////////////////
// Concrete Converter for numbers
template <typename T>
@@ -309,7 +364,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type,
CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>)
CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>)
CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>)
- CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>)
+ CONVERTER_CASE(Type::BOOL, BooleanConverter)
CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter)
CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc
index 01e687b..b6f1346 100644
--- a/cpp/src/arrow/csv/options.cc
+++ b/cpp/src/arrow/csv/options.cc
@@ -24,10 +24,12 @@ ParseOptions ParseOptions::Defaults() { return ParseOptions(); }
ConvertOptions ConvertOptions::Defaults() {
auto options = ConvertOptions();
- // The default list of possible null spellings is taken from Pandas' read_csv().
+ // Same default null / true / false spellings as in Pandas.
options.null_values = {"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN",
"-NaN", "-nan", "1.#IND", "1.#QNAN", "N/A", "NA",
"NULL", "NaN", "n/a", "nan", "null"};
+ options.true_values = {"1", "True", "TRUE", "true"};
+ options.false_values = {"0", "False", "FALSE", "false"};
return options;
}
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 2b4653c..2014620 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -69,6 +69,9 @@ struct ARROW_EXPORT ConvertOptions {
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
// Recognized spellings for null values
std::vector<std::string> null_values;
+ // Recognized spellings for boolean values
+ std::vector<std::string> true_values;
+ std::vector<std::string> false_values;
static ConvertOptions Defaults();
};
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index db81046..b61c461 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -255,6 +255,12 @@ cdef class ConvertOptions:
null_values: list, optional
A sequence of strings that denote nulls in the data
(defaults are appropriate in most cases).
+ true_values: list, optional
+ A sequence of strings that denote true booleans in the data
+ (defaults are appropriate in most cases).
+ false_values: list, optional
+ A sequence of strings that denote false booleans in the data
+ (defaults are appropriate in most cases).
"""
cdef:
CCSVConvertOptions options
@@ -262,7 +268,8 @@ cdef class ConvertOptions:
# Avoid mistakingly creating attributes
__slots__ = ()
- def __init__(self, check_utf8=None, column_types=None, null_values=None):
+ def __init__(self, check_utf8=None, column_types=None, null_values=None,
+ true_values=None, false_values=None):
self.options = CCSVConvertOptions.Defaults()
if check_utf8 is not None:
self.check_utf8 = check_utf8
@@ -270,6 +277,10 @@ cdef class ConvertOptions:
self.column_types = column_types
if null_values is not None:
self.null_values = null_values
+ if true_values is not None:
+ self.true_values = true_values
+ if false_values is not None:
+ self.false_values = false_values
@property
def check_utf8(self):
@@ -322,6 +333,28 @@ cdef class ConvertOptions:
def null_values(self, value):
self.options.null_values = [tobytes(x) for x in value]
+ @property
+ def true_values(self):
+ """
+ A sequence of strings that denote true booleans in the data.
+ """
+ return [frombytes(x) for x in self.options.true_values]
+
+ @true_values.setter
+ def true_values(self, value):
+ self.options.true_values = [tobytes(x) for x in value]
+
+ @property
+ def false_values(self):
+ """
+ A sequence of strings that denote false booleans in the data.
+ """
+ return [frombytes(x) for x in self.options.false_values]
+
+ @false_values.setter
+ def false_values(self, value):
+ self.options.false_values = [tobytes(x) for x in value]
+
cdef _get_reader(input_file, shared_ptr[InputStream]* out):
use_memory_map = False
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 06e7d43..e27f033 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1008,6 +1008,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
c_bool check_utf8
unordered_map[c_string, shared_ptr[CDataType]] column_types
vector[c_string] null_values
+ vector[c_string] true_values
+ vector[c_string] false_values
@staticmethod
CCSVConvertOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 14ba999..6beebf4 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -157,11 +157,22 @@ def test_convert_options():
opts.null_values = ['xxx', 'yyy']
assert opts.null_values == ['xxx', 'yyy']
+ assert isinstance(opts.true_values, list)
+ opts.true_values = ['xxx', 'yyy']
+ assert opts.true_values == ['xxx', 'yyy']
+
+ assert isinstance(opts.false_values, list)
+ opts.false_values = ['xxx', 'yyy']
+ assert opts.false_values == ['xxx', 'yyy']
+
opts = cls(check_utf8=False, column_types={'a': pa.null()},
- null_values=['xxx', 'yyy'])
+ null_values=['N', 'nn'], true_values=['T', 'tt'],
+ false_values=['F', 'ff'])
assert opts.check_utf8 is False
assert opts.column_types == {'a': pa.null()}
- assert opts.null_values == ['xxx', 'yyy']
+ assert opts.null_values == ['N', 'nn']
+ assert opts.false_values == ['F', 'ff']
+ assert opts.true_values == ['T', 'tt']
class BaseTestCSVRead:
@@ -196,30 +207,33 @@ class BaseTestCSVRead:
def test_simple_varied(self):
# Infer various kinds of data
- rows = b"a,b,c\n1,2,3\n4.0,-5,foo\n"
+ rows = b"a,b,c,d\n1,2,3,0\n4.0,-5,foo,True\n"
table = self.read_bytes(rows)
schema = pa.schema([('a', pa.float64()),
('b', pa.int64()),
- ('c', pa.string())])
+ ('c', pa.string()),
+ ('d', pa.bool_())])
assert table.schema == schema
assert table.to_pydict() == {
'a': [1.0, 4.0],
'b': [2, -5],
'c': [u"3", u"foo"],
+ 'd': [False, True],
}
def test_simple_nulls(self):
# Infer various kinds of data, with nulls
- rows = (b"a,b,c,d,e\n"
- b"1,2,,,3\n"
- b"nan,-5,foo,,nan\n"
- b"4.5,#N/A,nan,,\xff\n")
+ rows = (b"a,b,c,d,e,f\n"
+ b"1,2,,,3,N/A\n"
+ b"nan,-5,foo,,nan,TRUE\n"
+ b"4.5,#N/A,nan,,\xff,false\n")
table = self.read_bytes(rows)
schema = pa.schema([('a', pa.float64()),
('b', pa.int64()),
('c', pa.string()),
('d', pa.null()),
- ('e', pa.binary())])
+ ('e', pa.binary()),
+ ('f', pa.bool_())])
assert table.schema == schema
assert table.to_pydict() == {
'a': [1.0, None, 4.5],
@@ -227,6 +241,7 @@ class BaseTestCSVRead:
'c': [u"", u"foo", u"nan"],
'd': [None, None, None],
'e': [b"3", b"nan", b"\xff"],
+ 'f': [None, True, False],
}
def test_simple_timestamps(self):
@@ -269,6 +284,27 @@ class BaseTestCSVRead:
'b': [u""],
}
+ def test_custom_bools(self):
+ # Infer booleans with custom values
+ opts = ConvertOptions(true_values=['T', 'yes'],
+ false_values=['F', 'no'])
+ rows = (b"a,b,c\n"
+ b"True,T,t\n"
+ b"False,F,f\n"
+ b"True,yes,yes\n"
+ b"False,no,no\n"
+ b"N/A,N/A,N/A\n")
+ table = self.read_bytes(rows, convert_options=opts)
+ schema = pa.schema([('a', pa.string()),
+ ('b', pa.bool_()),
+ ('c', pa.string())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': ["True", "False", "True", "False", "N/A"],
+ 'b': [True, False, True, False, None],
+ 'c': ["t", "f", "yes", "no", "N/A"],
+ }
+
def test_column_types(self):
# Ask for specific column types in ConvertOptions
opts = ConvertOptions(column_types={'b': 'float32',