You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/09 22:43:59 UTC
[arrow] branch master updated: ARROW-3407: [C++] Add UTF8 handling
to CSV conversion
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1f79faf ARROW-3407: [C++] Add UTF8 handling to CSV conversion
1f79faf is described below
commit 1f79fafaec01431072e3af74156997d1e00020dd
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Fri Nov 9 17:43:28 2018 -0500
ARROW-3407: [C++] Add UTF8 handling to CSV conversion
CSV conversion now has distinct paths for string and binary columns. String columns are UTF8-validated by default, but it can be disabled by setting the `check_utf8` option in `ConvertOptions`.
CSV type inference now first attempts string conversion and falls back on binary if UTF8 validation fails (if it's not disabled).
As for performance, on pure ASCII columns single-threaded reading slows down by ~10% (which can be avoided by setting `check_utf8` to false). Multi-threaded reading does not seem affected here.
Based on PR #2916.
Author: Antoine Pitrou <an...@python.org>
Closes #2924 from pitrou/ARROW-3407-csv-utf8-conversion and squashes the following commits:
26a812c5c <Antoine Pitrou> ARROW-3407: Add UTF8 handling to CSV conversion
---
cpp/src/arrow/csv/column-builder.cc | 9 +++-
cpp/src/arrow/csv/converter.cc | 62 +++++++++++++++++-----------
cpp/src/arrow/csv/converter.h | 2 +
cpp/src/arrow/csv/csv-column-builder-test.cc | 51 ++++++++++++++++++++---
cpp/src/arrow/csv/csv-converter-test.cc | 61 +++++++++++++++++++--------
cpp/src/arrow/csv/options.h | 5 +++
python/doc/source/api.rst | 1 +
python/doc/source/csv.rst | 30 ++++++++------
python/pyarrow/_csv.pyx | 42 +++++++++++++++++--
python/pyarrow/csv.py | 2 +-
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/tests/test_csv.py | 44 +++++++++++++-------
12 files changed, 232 insertions(+), 79 deletions(-)
diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc
index b7610a5..4895f20 100644
--- a/cpp/src/arrow/csv/column-builder.cc
+++ b/cpp/src/arrow/csv/column-builder.cc
@@ -155,7 +155,7 @@ class InferringColumnBuilder : public ColumnBuilder {
std::shared_ptr<Converter> converter_;
// Current inference status
- enum class InferKind { Null, Integer, Real, Text };
+ enum class InferKind { Null, Integer, Real, Text, Binary };
std::shared_ptr<DataType> infer_type_;
InferKind infer_kind_;
@@ -185,6 +185,9 @@ Status InferringColumnBuilder::LoosenType() {
infer_kind_ = InferKind::Text;
break;
case InferKind::Text:
+ infer_kind_ = InferKind::Binary;
+ break;
+ case InferKind::Binary:
return Status::UnknownError("Shouldn't come here");
}
return UpdateType();
@@ -207,6 +210,10 @@ Status InferringColumnBuilder::UpdateType() {
can_loosen_type_ = true;
break;
case InferKind::Text:
+ infer_type_ = utf8();
+ can_loosen_type_ = true;
+ break;
+ case InferKind::Binary:
infer_type_ = binary();
can_loosen_type_ = false;
break;
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 81015c1..729c934 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -28,6 +28,7 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/parsing.h" // IWYU pragma: keep
+#include "arrow/util/utf8.h"
namespace arrow {
namespace csv {
@@ -56,6 +57,7 @@ class ConcreteConverter : public Converter {
using Converter::Converter;
protected:
+ Status Initialize() override { return Status::OK(); }
inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted);
};
@@ -197,34 +199,41 @@ Status NullConverter::Convert(const BlockParser& parser, int32_t col_index,
/////////////////////////////////////////////////////////////////////////
// Concrete Converter for var-sized binary strings
-template <typename T>
+template <typename T, bool CheckUTF8>
class VarSizeBinaryConverter : public ConcreteConverter {
public:
using ConcreteConverter::ConcreteConverter;
Status Convert(const BlockParser& parser, int32_t col_index,
- std::shared_ptr<Array>* out) override;
-};
+ std::shared_ptr<Array>* out) override {
+ using BuilderType = typename TypeTraits<T>::BuilderType;
+ BuilderType builder(pool_);
-template <typename T>
-Status VarSizeBinaryConverter<T>::Convert(const BlockParser& parser, int32_t col_index,
- std::shared_ptr<Array>* out) {
- using BuilderType = typename TypeTraits<T>::BuilderType;
- BuilderType builder(pool_);
+ // TODO do we accept nulls here?
- // TODO handle nulls
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
+ std::stringstream ss;
+ ss << "CSV conversion error to " << type_->ToString() << ": invalid UTF8 data";
+ return Status::Invalid(ss.str());
+ }
+ builder.UnsafeAppend(data, size);
+ return Status::OK();
+ };
+ RETURN_NOT_OK(builder.Resize(parser.num_rows()));
+ RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
+ RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ RETURN_NOT_OK(builder.Finish(out));
- auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
- builder.UnsafeAppend(data, size);
return Status::OK();
- };
- RETURN_NOT_OK(builder.Resize(parser.num_rows()));
- RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
- RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
- RETURN_NOT_OK(builder.Finish(out));
+ }
- return Status::OK();
-}
+ protected:
+ Status Initialize() override {
+ util::InitializeUTF8();
+ return Status::OK();
+ }
+};
/////////////////////////////////////////////////////////////////////////
// Concrete Converter for fixed-sized binary strings
@@ -242,7 +251,7 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_
FixedSizeBinaryBuilder builder(type_, pool_);
const uint32_t byte_width = static_cast<uint32_t>(builder.byte_width());
- // TODO handle nulls
+ // TODO do we accept nulls here?
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
if (ARROW_PREDICT_FALSE(size != byte_width)) {
@@ -340,9 +349,6 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
break;
CONVERTER_CASE(Type::NA, NullConverter)
- CONVERTER_CASE(Type::BINARY, VarSizeBinaryConverter<BinaryType>)
- CONVERTER_CASE(Type::STRING, VarSizeBinaryConverter<StringType>)
- CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>)
CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>)
CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>)
@@ -354,6 +360,16 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>)
CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>)
CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>)
+ CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
+ CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
+
+ case Type::STRING:
+ if (options.check_utf8) {
+ result = new VarSizeBinaryConverter<StringType, true>(type, options, pool);
+ } else {
+ result = new VarSizeBinaryConverter<StringType, false>(type, options, pool);
+ }
+ break;
default: {
std::stringstream ss;
@@ -364,7 +380,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
#undef CONVERTER_CASE
}
out->reset(result);
- return Status::OK();
+ return result->Initialize();
}
Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions options,
diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h
index 52f8934..3fc3ac4 100644
--- a/cpp/src/arrow/csv/converter.h
+++ b/cpp/src/arrow/csv/converter.h
@@ -55,6 +55,8 @@ class ARROW_EXPORT Converter {
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
+ virtual Status Initialize() = 0;
+
ConvertOptions options_;
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
diff --git a/cpp/src/arrow/csv/csv-column-builder-test.cc b/cpp/src/arrow/csv/csv-column-builder-test.cc
index a4dc236..7488ad8 100644
--- a/cpp/src/arrow/csv/csv-column-builder-test.cc
+++ b/cpp/src/arrow/csv/csv-column-builder-test.cc
@@ -214,17 +214,58 @@ TEST(InferringColumnBuilder, MultipleChunkReal) {
AssertChunkedEqual(*expected, *actual);
}
+TEST(InferringColumnBuilder, SingleChunkString) {
+ auto tg = TaskGroup::MakeSerial();
+ std::shared_ptr<ColumnBuilder> builder;
+ std::shared_ptr<ChunkedArray> actual;
+ std::shared_ptr<ChunkedArray> expected;
+
+ // With valid UTF8
+ ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+ AssertBuilding(builder, {{"", "foo", "baré"}}, &actual);
+
+ ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
+ {{"", "foo", "baré"}}, &expected);
+ AssertChunkedEqual(*expected, *actual);
+
+ // With invalid UTF8, non-checking
+ auto options = ConvertOptions::Defaults();
+ options.check_utf8 = false;
+ tg = TaskGroup::MakeSerial();
+ ASSERT_OK(ColumnBuilder::Make(0, options, tg, &builder));
+ AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
+
+ ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
+ {{"", "foo\xff", "baré"}}, &expected);
+ AssertChunkedEqual(*expected, *actual);
+}
+
TEST(InferringColumnBuilder, SingleChunkBinary) {
auto tg = TaskGroup::MakeSerial();
std::shared_ptr<ColumnBuilder> builder;
+ std::shared_ptr<ChunkedArray> actual;
+ std::shared_ptr<ChunkedArray> expected;
+
+ // With invalid UTF8, checking
+ ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+ AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
+
+ ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
+ {{"", "foo\xff", "baré"}}, &expected);
+ AssertChunkedEqual(*expected, *actual);
+}
+
+TEST(InferringColumnBuilder, MultipleChunkString) {
+ auto tg = TaskGroup::MakeSerial();
+ std::shared_ptr<ColumnBuilder> builder;
ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
std::shared_ptr<ChunkedArray> actual;
- AssertBuilding(builder, {{"", "foo", "bar"}}, &actual);
+ AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré"}}, &actual);
std::shared_ptr<ChunkedArray> expected;
- ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
- {{"", "foo", "bar"}}, &expected);
+ ChunkedArrayFromVector<StringType, std::string>(
+ {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré"}}, &expected);
AssertChunkedEqual(*expected, *actual);
}
@@ -234,11 +275,11 @@ TEST(InferringColumnBuilder, MultipleChunkBinary) {
ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
std::shared_ptr<ChunkedArray> actual;
- AssertBuilding(builder, {{""}, {"008"}, {"NaN", "bar"}}, &actual);
+ AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré\xff"}}, &actual);
std::shared_ptr<ChunkedArray> expected;
ChunkedArrayFromVector<BinaryType, std::string>(
- {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "bar"}}, &expected);
+ {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré\xff"}}, &expected);
AssertChunkedEqual(*expected, *actual);
}
diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/csv-converter-test.cc
index dd3dba6..024a54c 100644
--- a/cpp/src/arrow/csv/csv-converter-test.cc
+++ b/cpp/src/arrow/csv/csv-converter-test.cc
@@ -17,6 +17,7 @@
#include <cstdint>
#include <memory>
+#include <set>
#include <string>
#include <vector>
@@ -45,12 +46,13 @@ std::vector<std::string> AllNulls() {
template <typename DATA_TYPE, typename C_TYPE>
void AssertConversion(const std::shared_ptr<DataType>& type,
const std::vector<std::string>& csv_string,
- const std::vector<std::vector<C_TYPE>>& expected) {
+ const std::vector<std::vector<C_TYPE>>& expected,
+ ConvertOptions options = ConvertOptions::Defaults()) {
std::shared_ptr<BlockParser> parser;
std::shared_ptr<Converter> converter;
std::shared_ptr<Array> array, expected_array;
- ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
+ ASSERT_OK(Converter::Make(type, options, &converter));
MakeCSVParser(csv_string, &parser);
for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
@@ -65,12 +67,13 @@ template <typename DATA_TYPE, typename C_TYPE>
void AssertConversion(const std::shared_ptr<DataType>& type,
const std::vector<std::string>& csv_string,
const std::vector<std::vector<C_TYPE>>& expected,
- const std::vector<std::vector<bool>>& is_valid) {
+ const std::vector<std::vector<bool>>& is_valid,
+ ConvertOptions options = ConvertOptions::Defaults()) {
std::shared_ptr<BlockParser> parser;
std::shared_ptr<Converter> converter;
std::shared_ptr<Array> array, expected_array;
- ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
+ ASSERT_OK(Converter::Make(type, options, &converter));
MakeCSVParser(csv_string, &parser);
for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
@@ -90,17 +93,47 @@ void AssertConversionAllNulls(const std::shared_ptr<DataType>& type) {
AssertConversion<DATA_TYPE, C_TYPE>(type, nulls, {values}, {is_valid});
}
+void AssertConversionError(const std::shared_ptr<DataType>& type,
+ const std::vector<std::string>& csv_string,
+ const std::set<int32_t>& invalid_columns,
+ ConvertOptions options = ConvertOptions::Defaults()) {
+ std::shared_ptr<BlockParser> parser;
+ std::shared_ptr<Converter> converter;
+ std::shared_ptr<Array> array;
+
+ ASSERT_OK(Converter::Make(type, options, &converter));
+
+ MakeCSVParser(csv_string, &parser);
+ for (int32_t i = 0; i < parser->num_cols(); ++i) {
+ if (invalid_columns.find(i) == invalid_columns.end()) {
+ ASSERT_OK(converter->Convert(*parser, i, &array));
+ } else {
+ ASSERT_RAISES(Invalid, converter->Convert(*parser, i, &array));
+ }
+ }
+}
+
//////////////////////////////////////////////////////////////////////////
// Test functions begin here
TEST(BinaryConversion, Basics) {
- AssertConversion<BinaryType, std::string>(binary(), {"ab,cde\n", ",gh\n"},
- {{"ab", ""}, {"cde", "gh"}});
+ AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n", ",\xffgh\n"},
+ {{"ab", ""}, {"cdé", "\xffgh"}});
}
TEST(StringConversion, Basics) {
- AssertConversion<StringType, std::string>(utf8(), {"ab,cde\n", ",gh\n"},
- {{"ab", ""}, {"cde", "gh"}});
+ AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
+ {{"ab", ""}, {"cdé", "gh"}});
+
+ auto options = ConvertOptions::Defaults();
+ options.check_utf8 = false;
+ AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",\xffgh\n"},
+ {{"ab", ""}, {"cdé", "\xffgh"}}, options);
+}
+
+TEST(StringConversion, Errors) {
+ // Invalid UTF8 in column 0
+ AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
}
TEST(FixedSizeBinaryConversion, Basics) {
@@ -109,16 +142,8 @@ TEST(FixedSizeBinaryConversion, Basics) {
}
TEST(FixedSizeBinaryConversion, Errors) {
- std::shared_ptr<BlockParser> parser;
- std::shared_ptr<Converter> converter;
- std::shared_ptr<Array> array;
- std::shared_ptr<DataType> type = fixed_size_binary(2);
-
- ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
-
- MakeCSVParser({"ab,cd\n", "g,ij\n"}, &parser);
- ASSERT_RAISES(Invalid, converter->Convert(*parser, 0, &array));
- ASSERT_OK(converter->Convert(*parser, 1, &array));
+ // Wrong-sized string in column 0
+ AssertConversionError(fixed_size_binary(2), {"ab,cd\n", "g,ij\n"}, {0});
}
TEST(NullConversion, Basics) {
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 6119786..0e42341 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -51,6 +51,11 @@ struct ARROW_EXPORT ParseOptions {
};
struct ARROW_EXPORT ConvertOptions {
+ // Conversion options
+
+ // Whether to check UTF8 validity of string columns
+ bool check_utf8 = true;
+
static ConvertOptions Defaults();
};
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index caa2d65..4ecd7d6 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -350,6 +350,7 @@ CSV Files
ReadOptions
ParseOptions
+ ConvertOptions
read_csv
.. currentmodule:: pyarrow.parquet
diff --git a/python/doc/source/csv.rst b/python/doc/source/csv.rst
index 5f9b46f..3fff8a8 100644
--- a/python/doc/source/csv.rst
+++ b/python/doc/source/csv.rst
@@ -29,7 +29,7 @@ The features currently offered are the following:
such as ``my_data.csv.gz``)
* fetching column names from the first row in the CSV file
* column-wise type inference and conversion to one of ``null``, ``int64``,
- ``float64`` or ``binary`` data
+ ``float64``, ``string`` or ``binary`` data
* detecting various spellings of null values such as ``NaN`` or ``#N/A``
Usage
@@ -46,21 +46,21 @@ with the file path you want to read from::
pyarrow.Table
total_bill: double
tip: double
- sex: binary
- smoker: binary
- day: binary
- time: binary
+ sex: string
+ smoker: string
+ day: string
+ time: string
size: int64
>>> len(table)
244
>>> df = table.to_pandas()
>>> df.head()
- total_bill tip sex smoker day time size
- 0 16.99 1.01 b'Female' b'No' b'Sun' b'Dinner' 2
- 1 10.34 1.66 b'Male' b'No' b'Sun' b'Dinner' 3
- 2 21.01 3.50 b'Male' b'No' b'Sun' b'Dinner' 3
- 3 23.68 3.31 b'Male' b'No' b'Sun' b'Dinner' 2
- 4 24.59 3.61 b'Female' b'No' b'Sun' b'Dinner' 4
+ total_bill tip sex smoker day time size
+ 0 16.99 1.01 Female No Sun Dinner 2
+ 1 10.34 1.66 Male No Sun Dinner 3
+ 2 21.01 3.50 Male No Sun Dinner 3
+ 3 23.68 3.31 Male No Sun Dinner 2
+ 4 24.59 3.61 Female No Sun Dinner 4
Customized parsing
------------------
@@ -69,11 +69,17 @@ To alter the default parsing settings in case of reading CSV files with an
unusual structure, you should create a :class:`ParseOptions` instance
and pass it to :func:`read_csv`.
+Customized conversion
+---------------------
+
+To alter how CSV data is converted to Arrow types and data, you should create
+a :class:`ConvertOptions` instance and pass it to :func:`read_csv`.
+
Limitations
-----------
Arrow is not able to detect or convert other data types (such as dates
-and times) than the four mentioned above. It is also not possible to
+and times) than the five mentioned above. It is also not possible to
choose the data types of columns explicitly.
Performance
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 90da157..e6488d0 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -215,6 +215,38 @@ cdef class ParseOptions:
self.options.newlines_in_values = value
+cdef class ConvertOptions:
+ """
+ Options for converting CSV data.
+
+ Parameters
+ ----------
+ check_utf8 : bool, optional (default True)
+ Whether to check UTF8 validity of string columns.
+ """
+ cdef:
+ CCSVConvertOptions options
+
+ # Avoid mistakenly creating attributes
+ __slots__ = ()
+
+ def __init__(self, check_utf8=None):
+ self.options = CCSVConvertOptions.Defaults()
+ if check_utf8 is not None:
+ self.check_utf8 = check_utf8
+
+ @property
+ def check_utf8(self):
+ """
+ Whether to check UTF8 validity of string columns.
+ """
+ return self.options.check_utf8
+
+ @check_utf8.setter
+ def check_utf8(self, value):
+ self.options.check_utf8 = value
+
+
cdef _get_reader(input_file, shared_ptr[InputStream]* out):
use_memory_map = False
get_input_stream(input_file, use_memory_map, out)
@@ -234,11 +266,12 @@ cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out):
out[0] = parse_options.options
-cdef _get_convert_options(convert_options, CCSVConvertOptions* out):
+cdef _get_convert_options(ConvertOptions convert_options,
+ CCSVConvertOptions* out):
if convert_options is None:
out[0] = CCSVConvertOptions.Defaults()
else:
- raise NotImplementedError("non-default convert options not supported")
+ out[0] = convert_options.options
def read_csv(input_file, read_options=None, parse_options=None,
@@ -257,8 +290,9 @@ def read_csv(input_file, read_options=None, parse_options=None,
parse_options: ParseOptions, optional
Options for the CSV parser
(see ParseOptions constructor for defaults)
- convert_options: None
- Currently unused
+ convert_options: ConvertOptions, optional
+ Options for converting CSV data
+ (see ConvertOptions constructor for defaults)
memory_pool: MemoryPool, optional
Pool to allocate Table memory from
diff --git a/python/pyarrow/csv.py b/python/pyarrow/csv.py
index d6830a0..8375ad4 100644
--- a/python/pyarrow/csv.py
+++ b/python/pyarrow/csv.py
@@ -15,4 +15,4 @@
# specific language governing permissions and limitations
# under the License.
-from pyarrow._csv import ReadOptions, ParseOptions, read_csv # noqa
+from pyarrow._csv import ReadOptions, ParseOptions, ConvertOptions, read_csv # noqa
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1f5f4cf..a3d356e 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -947,6 +947,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
CCSVParseOptions Defaults()
cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
+ c_bool check_utf8
+
@staticmethod
CCSVConvertOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index dbfa4f3..c204fdf 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -30,7 +30,7 @@ import pytest
import numpy as np
import pyarrow as pa
-from pyarrow.csv import read_csv, ReadOptions, ParseOptions
+from pyarrow.csv import read_csv, ReadOptions, ParseOptions, ConvertOptions
def generate_col_names():
@@ -117,6 +117,18 @@ def test_parse_options():
assert opts.newlines_in_values is True
+def test_convert_options():
+ cls = ConvertOptions
+ opts = cls()
+
+ assert opts.check_utf8 is True
+ opts.check_utf8 = False
+ assert opts.check_utf8 is False
+
+ opts = cls(check_utf8=False)
+ assert opts.check_utf8 is False
+
+
class BaseTestCSVRead:
def read_bytes(self, b, **kwargs):
@@ -153,31 +165,33 @@ class BaseTestCSVRead:
table = self.read_bytes(rows)
schema = pa.schema([('a', pa.float64()),
('b', pa.int64()),
- ('c', pa.binary())])
+ ('c', pa.string())])
assert table.schema == schema
assert table.to_pydict() == {
'a': [1.0, 4.0],
'b': [2, -5],
- 'c': [b"3", b"foo"],
+ 'c': [u"3", u"foo"],
}
def test_simple_nulls(self):
# Infer various kinds of data, with nulls
- rows = (b"a,b,c,d\n"
- b"1,2,,\n"
- b"nan,-5,foo,\n"
- b"4.5,#N/A,nan,\n")
+ rows = (b"a,b,c,d,e\n"
+ b"1,2,,,3\n"
+ b"nan,-5,foo,,nan\n"
+ b"4.5,#N/A,nan,,\xff\n")
table = self.read_bytes(rows)
schema = pa.schema([('a', pa.float64()),
('b', pa.int64()),
- ('c', pa.binary()),
- ('d', pa.null())])
+ ('c', pa.string()),
+ ('d', pa.null()),
+ ('e', pa.binary())])
assert table.schema == schema
assert table.to_pydict() == {
'a': [1.0, None, 4.5],
'b': [2, -5, None],
- 'c': [b"", b"foo", b"nan"],
- 'd': [None, None, None]
+ 'c': [u"", u"foo", u"nan"],
+ 'd': [None, None, None],
+ 'e': [b"3", b"nan", b"\xff"],
}
def test_no_ending_newline(self):
@@ -212,14 +226,14 @@ class BaseTestCSVRead:
rows = b"a;b,c\nde,fg;eh\n"
table = self.read_bytes(rows)
assert table.to_pydict() == {
- 'a;b': [b'de'],
- 'c': [b'fg;eh'],
+ 'a;b': [u'de'],
+ 'c': [u'fg;eh'],
}
opts = ParseOptions(delimiter=';')
table = self.read_bytes(rows, parse_options=opts)
assert table.to_pydict() == {
- 'a': [b'de,fg'],
- 'b,c': [b'eh'],
+ 'a': [u'de,fg'],
+ 'b,c': [u'eh'],
}
def test_small_random_csv(self):