You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/09 22:43:59 UTC

[arrow] branch master updated: ARROW-3407: [C++] Add UTF8 handling to CSV conversion

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1f79faf  ARROW-3407: [C++] Add UTF8 handling to CSV conversion
1f79faf is described below

commit 1f79fafaec01431072e3af74156997d1e00020dd
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Fri Nov 9 17:43:28 2018 -0500

    ARROW-3407: [C++] Add UTF8 handling to CSV conversion
    
    CSV conversion now has distinct paths for string and binary columns. String columns are UTF8-validated by default, but this validation can be disabled by setting the `check_utf8` option in `ConvertOptions`.
    
    CSV type inference now first attempts string conversion and falls back to binary if UTF8 validation fails (unless validation is disabled).
    
    As for performance, on pure-ASCII columns, single-threaded reading slows down by ~10% (which can be avoided by setting `check_utf8` to false). Multi-threaded reading does not seem affected here.
    
    Based on PR #2916.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2924 from pitrou/ARROW-3407-csv-utf8-conversion and squashes the following commits:
    
    26a812c5c <Antoine Pitrou> ARROW-3407:  Add UTF8 handling to CSV conversion
---
 cpp/src/arrow/csv/column-builder.cc          |  9 +++-
 cpp/src/arrow/csv/converter.cc               | 62 +++++++++++++++++-----------
 cpp/src/arrow/csv/converter.h                |  2 +
 cpp/src/arrow/csv/csv-column-builder-test.cc | 51 ++++++++++++++++++++---
 cpp/src/arrow/csv/csv-converter-test.cc      | 61 +++++++++++++++++++--------
 cpp/src/arrow/csv/options.h                  |  5 +++
 python/doc/source/api.rst                    |  1 +
 python/doc/source/csv.rst                    | 30 ++++++++------
 python/pyarrow/_csv.pyx                      | 42 +++++++++++++++++--
 python/pyarrow/csv.py                        |  2 +-
 python/pyarrow/includes/libarrow.pxd         |  2 +
 python/pyarrow/tests/test_csv.py             | 44 +++++++++++++-------
 12 files changed, 232 insertions(+), 79 deletions(-)

diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc
index b7610a5..4895f20 100644
--- a/cpp/src/arrow/csv/column-builder.cc
+++ b/cpp/src/arrow/csv/column-builder.cc
@@ -155,7 +155,7 @@ class InferringColumnBuilder : public ColumnBuilder {
   std::shared_ptr<Converter> converter_;
 
   // Current inference status
-  enum class InferKind { Null, Integer, Real, Text };
+  enum class InferKind { Null, Integer, Real, Text, Binary };
 
   std::shared_ptr<DataType> infer_type_;
   InferKind infer_kind_;
@@ -185,6 +185,9 @@ Status InferringColumnBuilder::LoosenType() {
       infer_kind_ = InferKind::Text;
       break;
     case InferKind::Text:
+      infer_kind_ = InferKind::Binary;
+      break;
+    case InferKind::Binary:
       return Status::UnknownError("Shouldn't come here");
   }
   return UpdateType();
@@ -207,6 +210,10 @@ Status InferringColumnBuilder::UpdateType() {
       can_loosen_type_ = true;
       break;
     case InferKind::Text:
+      infer_type_ = utf8();
+      can_loosen_type_ = true;
+      break;
+    case InferKind::Binary:
       infer_type_ = binary();
       can_loosen_type_ = false;
       break;
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 81015c1..729c934 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -28,6 +28,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/parsing.h"  // IWYU pragma: keep
+#include "arrow/util/utf8.h"
 
 namespace arrow {
 namespace csv {
@@ -56,6 +57,7 @@ class ConcreteConverter : public Converter {
   using Converter::Converter;
 
  protected:
+  Status Initialize() override { return Status::OK(); }
   inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted);
 };
 
@@ -197,34 +199,41 @@ Status NullConverter::Convert(const BlockParser& parser, int32_t col_index,
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for var-sized binary strings
 
-template <typename T>
+template <typename T, bool CheckUTF8>
 class VarSizeBinaryConverter : public ConcreteConverter {
  public:
   using ConcreteConverter::ConcreteConverter;
 
   Status Convert(const BlockParser& parser, int32_t col_index,
-                 std::shared_ptr<Array>* out) override;
-};
+                 std::shared_ptr<Array>* out) override {
+    using BuilderType = typename TypeTraits<T>::BuilderType;
+    BuilderType builder(pool_);
 
-template <typename T>
-Status VarSizeBinaryConverter<T>::Convert(const BlockParser& parser, int32_t col_index,
-                                          std::shared_ptr<Array>* out) {
-  using BuilderType = typename TypeTraits<T>::BuilderType;
-  BuilderType builder(pool_);
+    // TODO do we accept nulls here?
 
-  // TODO handle nulls
+    auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+      if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
+        std::stringstream ss;
+        ss << "CSV conversion error to " << type_->ToString() << ": invalid UTF8 data";
+        return Status::Invalid(ss.str());
+      }
+      builder.UnsafeAppend(data, size);
+      return Status::OK();
+    };
+    RETURN_NOT_OK(builder.Resize(parser.num_rows()));
+    RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
+    RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+    RETURN_NOT_OK(builder.Finish(out));
 
-  auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
-    builder.UnsafeAppend(data, size);
     return Status::OK();
-  };
-  RETURN_NOT_OK(builder.Resize(parser.num_rows()));
-  RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
-  RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
-  RETURN_NOT_OK(builder.Finish(out));
+  }
 
-  return Status::OK();
-}
+ protected:
+  Status Initialize() override {
+    util::InitializeUTF8();
+    return Status::OK();
+  }
+};
 
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter for fixed-sized binary strings
@@ -242,7 +251,7 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_
   FixedSizeBinaryBuilder builder(type_, pool_);
   const uint32_t byte_width = static_cast<uint32_t>(builder.byte_width());
 
-  // TODO handle nulls
+  // TODO do we accept nulls here?
 
   auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
     if (ARROW_PREDICT_FALSE(size != byte_width)) {
@@ -340,9 +349,6 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
     break;
 
     CONVERTER_CASE(Type::NA, NullConverter)
-    CONVERTER_CASE(Type::BINARY, VarSizeBinaryConverter<BinaryType>)
-    CONVERTER_CASE(Type::STRING, VarSizeBinaryConverter<StringType>)
-    CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
     CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>)
     CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>)
     CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>)
@@ -354,6 +360,16 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
     CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>)
     CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>)
     CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>)
+    CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
+    CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
+
+    case Type::STRING:
+      if (options.check_utf8) {
+        result = new VarSizeBinaryConverter<StringType, true>(type, options, pool);
+      } else {
+        result = new VarSizeBinaryConverter<StringType, false>(type, options, pool);
+      }
+      break;
 
     default: {
       std::stringstream ss;
@@ -364,7 +380,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions opt
 #undef CONVERTER_CASE
   }
   out->reset(result);
-  return Status::OK();
+  return result->Initialize();
 }
 
 Status Converter::Make(const std::shared_ptr<DataType>& type, ConvertOptions options,
diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h
index 52f8934..3fc3ac4 100644
--- a/cpp/src/arrow/csv/converter.h
+++ b/cpp/src/arrow/csv/converter.h
@@ -55,6 +55,8 @@ class ARROW_EXPORT Converter {
  protected:
   ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
 
+  virtual Status Initialize() = 0;
+
   ConvertOptions options_;
   MemoryPool* pool_;
   std::shared_ptr<DataType> type_;
diff --git a/cpp/src/arrow/csv/csv-column-builder-test.cc b/cpp/src/arrow/csv/csv-column-builder-test.cc
index a4dc236..7488ad8 100644
--- a/cpp/src/arrow/csv/csv-column-builder-test.cc
+++ b/cpp/src/arrow/csv/csv-column-builder-test.cc
@@ -214,17 +214,58 @@ TEST(InferringColumnBuilder, MultipleChunkReal) {
   AssertChunkedEqual(*expected, *actual);
 }
 
+TEST(InferringColumnBuilder, SingleChunkString) {
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
+  std::shared_ptr<ChunkedArray> actual;
+  std::shared_ptr<ChunkedArray> expected;
+
+  // With valid UTF8
+  ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+  AssertBuilding(builder, {{"", "foo", "baré"}}, &actual);
+
+  ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
+                                                  {{"", "foo", "baré"}}, &expected);
+  AssertChunkedEqual(*expected, *actual);
+
+  // With invalid UTF8, non-checking
+  auto options = ConvertOptions::Defaults();
+  options.check_utf8 = false;
+  tg = TaskGroup::MakeSerial();
+  ASSERT_OK(ColumnBuilder::Make(0, options, tg, &builder));
+  AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
+
+  ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
+                                                  {{"", "foo\xff", "baré"}}, &expected);
+  AssertChunkedEqual(*expected, *actual);
+}
+
 TEST(InferringColumnBuilder, SingleChunkBinary) {
   auto tg = TaskGroup::MakeSerial();
   std::shared_ptr<ColumnBuilder> builder;
+  std::shared_ptr<ChunkedArray> actual;
+  std::shared_ptr<ChunkedArray> expected;
+
+  // With invalid UTF8, checking
+  ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
+  AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
+
+  ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
+                                                  {{"", "foo\xff", "baré"}}, &expected);
+  AssertChunkedEqual(*expected, *actual);
+}
+
+TEST(InferringColumnBuilder, MultipleChunkString) {
+  auto tg = TaskGroup::MakeSerial();
+  std::shared_ptr<ColumnBuilder> builder;
   ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
 
   std::shared_ptr<ChunkedArray> actual;
-  AssertBuilding(builder, {{"", "foo", "bar"}}, &actual);
+  AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré"}}, &actual);
 
   std::shared_ptr<ChunkedArray> expected;
-  ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
-                                                  {{"", "foo", "bar"}}, &expected);
+  ChunkedArrayFromVector<StringType, std::string>(
+      {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré"}}, &expected);
   AssertChunkedEqual(*expected, *actual);
 }
 
@@ -234,11 +275,11 @@ TEST(InferringColumnBuilder, MultipleChunkBinary) {
   ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
 
   std::shared_ptr<ChunkedArray> actual;
-  AssertBuilding(builder, {{""}, {"008"}, {"NaN", "bar"}}, &actual);
+  AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré\xff"}}, &actual);
 
   std::shared_ptr<ChunkedArray> expected;
   ChunkedArrayFromVector<BinaryType, std::string>(
-      {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "bar"}}, &expected);
+      {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré\xff"}}, &expected);
   AssertChunkedEqual(*expected, *actual);
 }
 
diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/csv-converter-test.cc
index dd3dba6..024a54c 100644
--- a/cpp/src/arrow/csv/csv-converter-test.cc
+++ b/cpp/src/arrow/csv/csv-converter-test.cc
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -45,12 +46,13 @@ std::vector<std::string> AllNulls() {
 template <typename DATA_TYPE, typename C_TYPE>
 void AssertConversion(const std::shared_ptr<DataType>& type,
                       const std::vector<std::string>& csv_string,
-                      const std::vector<std::vector<C_TYPE>>& expected) {
+                      const std::vector<std::vector<C_TYPE>>& expected,
+                      ConvertOptions options = ConvertOptions::Defaults()) {
   std::shared_ptr<BlockParser> parser;
   std::shared_ptr<Converter> converter;
   std::shared_ptr<Array> array, expected_array;
 
-  ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
+  ASSERT_OK(Converter::Make(type, options, &converter));
 
   MakeCSVParser(csv_string, &parser);
   for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
@@ -65,12 +67,13 @@ template <typename DATA_TYPE, typename C_TYPE>
 void AssertConversion(const std::shared_ptr<DataType>& type,
                       const std::vector<std::string>& csv_string,
                       const std::vector<std::vector<C_TYPE>>& expected,
-                      const std::vector<std::vector<bool>>& is_valid) {
+                      const std::vector<std::vector<bool>>& is_valid,
+                      ConvertOptions options = ConvertOptions::Defaults()) {
   std::shared_ptr<BlockParser> parser;
   std::shared_ptr<Converter> converter;
   std::shared_ptr<Array> array, expected_array;
 
-  ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
+  ASSERT_OK(Converter::Make(type, options, &converter));
 
   MakeCSVParser(csv_string, &parser);
   for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
@@ -90,17 +93,47 @@ void AssertConversionAllNulls(const std::shared_ptr<DataType>& type) {
   AssertConversion<DATA_TYPE, C_TYPE>(type, nulls, {values}, {is_valid});
 }
 
+void AssertConversionError(const std::shared_ptr<DataType>& type,
+                           const std::vector<std::string>& csv_string,
+                           const std::set<int32_t>& invalid_columns,
+                           ConvertOptions options = ConvertOptions::Defaults()) {
+  std::shared_ptr<BlockParser> parser;
+  std::shared_ptr<Converter> converter;
+  std::shared_ptr<Array> array;
+
+  ASSERT_OK(Converter::Make(type, options, &converter));
+
+  MakeCSVParser(csv_string, &parser);
+  for (int32_t i = 0; i < parser->num_cols(); ++i) {
+    if (invalid_columns.find(i) == invalid_columns.end()) {
+      ASSERT_OK(converter->Convert(*parser, i, &array));
+    } else {
+      ASSERT_RAISES(Invalid, converter->Convert(*parser, i, &array));
+    }
+  }
+}
+
 //////////////////////////////////////////////////////////////////////////
 // Test functions begin here
 
 TEST(BinaryConversion, Basics) {
-  AssertConversion<BinaryType, std::string>(binary(), {"ab,cde\n", ",gh\n"},
-                                            {{"ab", ""}, {"cde", "gh"}});
+  AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n", ",\xffgh\n"},
+                                            {{"ab", ""}, {"cdé", "\xffgh"}});
 }
 
 TEST(StringConversion, Basics) {
-  AssertConversion<StringType, std::string>(utf8(), {"ab,cde\n", ",gh\n"},
-                                            {{"ab", ""}, {"cde", "gh"}});
+  AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
+                                            {{"ab", ""}, {"cdé", "gh"}});
+
+  auto options = ConvertOptions::Defaults();
+  options.check_utf8 = false;
+  AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",\xffgh\n"},
+                                            {{"ab", ""}, {"cdé", "\xffgh"}}, options);
+}
+
+TEST(StringConversion, Errors) {
+  // Invalid UTF8 in column 0
+  AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
 }
 
 TEST(FixedSizeBinaryConversion, Basics) {
@@ -109,16 +142,8 @@ TEST(FixedSizeBinaryConversion, Basics) {
 }
 
 TEST(FixedSizeBinaryConversion, Errors) {
-  std::shared_ptr<BlockParser> parser;
-  std::shared_ptr<Converter> converter;
-  std::shared_ptr<Array> array;
-  std::shared_ptr<DataType> type = fixed_size_binary(2);
-
-  ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
-
-  MakeCSVParser({"ab,cd\n", "g,ij\n"}, &parser);
-  ASSERT_RAISES(Invalid, converter->Convert(*parser, 0, &array));
-  ASSERT_OK(converter->Convert(*parser, 1, &array));
+  // Wrong-sized string in column 0
+  AssertConversionError(fixed_size_binary(2), {"ab,cd\n", "g,ij\n"}, {0});
 }
 
 TEST(NullConversion, Basics) {
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 6119786..0e42341 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -51,6 +51,11 @@ struct ARROW_EXPORT ParseOptions {
 };
 
 struct ARROW_EXPORT ConvertOptions {
+  // Conversion options
+
+  // Whether to check UTF8 validity of string columns
+  bool check_utf8 = true;
+
   static ConvertOptions Defaults();
 };
 
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index caa2d65..4ecd7d6 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -350,6 +350,7 @@ CSV Files
 
    ReadOptions
    ParseOptions
+   ConvertOptions
    read_csv
 
 .. currentmodule:: pyarrow.parquet
diff --git a/python/doc/source/csv.rst b/python/doc/source/csv.rst
index 5f9b46f..3fff8a8 100644
--- a/python/doc/source/csv.rst
+++ b/python/doc/source/csv.rst
@@ -29,7 +29,7 @@ The features currently offered are the following:
   such as ``my_data.csv.gz``)
 * fetching column names from the first row in the CSV file
 * column-wise type inference and conversion to one of ``null``, ``int64``,
-  ``float64`` or ``binary`` data
+  ``float64``, ``string`` or ``binary`` data
 * detecting various spellings of null values such as ``NaN`` or ``#N/A``
 
 Usage
@@ -46,21 +46,21 @@ with the file path you want to read from::
    pyarrow.Table
    total_bill: double
    tip: double
-   sex: binary
-   smoker: binary
-   day: binary
-   time: binary
+   sex: string
+   smoker: string
+   day: string
+   time: string
    size: int64
    >>> len(table)
    244
    >>> df = table.to_pandas()
    >>> df.head()
-      total_bill   tip        sex smoker     day       time  size
-   0       16.99  1.01  b'Female'  b'No'  b'Sun'  b'Dinner'     2
-   1       10.34  1.66    b'Male'  b'No'  b'Sun'  b'Dinner'     3
-   2       21.01  3.50    b'Male'  b'No'  b'Sun'  b'Dinner'     3
-   3       23.68  3.31    b'Male'  b'No'  b'Sun'  b'Dinner'     2
-   4       24.59  3.61  b'Female'  b'No'  b'Sun'  b'Dinner'     4
+      total_bill   tip     sex smoker  day    time  size
+   0       16.99  1.01  Female     No  Sun  Dinner     2
+   1       10.34  1.66    Male     No  Sun  Dinner     3
+   2       21.01  3.50    Male     No  Sun  Dinner     3
+   3       23.68  3.31    Male     No  Sun  Dinner     2
+   4       24.59  3.61  Female     No  Sun  Dinner     4
 
 Customized parsing
 ------------------
@@ -69,11 +69,17 @@ To alter the default parsing settings in case of reading CSV files with an
 unusual structure, you should create a :class:`ParseOptions` instance
 and pass it to :func:`read_csv`.
 
+Customized conversion
+---------------------
+
+To alter how CSV data is converted to Arrow types and data, you should create
+a :class:`ConvertOptions` instance and pass it to :func:`read_csv`.
+
 Limitations
 -----------
 
 Arrow is not able to detect or convert other data types (such as dates
-and times) than the four mentioned above.  It is also not possible to
+and times) than the five mentioned above.  It is also not possible to
 choose the data types of columns explicitly.
 
 Performance
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 90da157..e6488d0 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -215,6 +215,38 @@ cdef class ParseOptions:
         self.options.newlines_in_values = value
 
 
+cdef class ConvertOptions:
+    """
+    Options for converting CSV data.
+
+    Parameters
+    ----------
+    check_utf8 : bool, optional (default True)
+        Whether to check UTF8 validity of string columns.
+    """
+    cdef:
+        CCSVConvertOptions options
+
+    # Avoid mistakingly creating attributes
+    __slots__ = ()
+
+    def __init__(self, check_utf8=None):
+        self.options = CCSVConvertOptions.Defaults()
+        if check_utf8 is not None:
+            self.check_utf8 = check_utf8
+
+    @property
+    def check_utf8(self):
+        """
+        Whether to check UTF8 validity of string columns.
+        """
+        return self.options.check_utf8
+
+    @check_utf8.setter
+    def check_utf8(self, value):
+        self.options.check_utf8 = value
+
+
 cdef _get_reader(input_file, shared_ptr[InputStream]* out):
     use_memory_map = False
     get_input_stream(input_file, use_memory_map, out)
@@ -234,11 +266,12 @@ cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out):
         out[0] = parse_options.options
 
 
-cdef _get_convert_options(convert_options, CCSVConvertOptions* out):
+cdef _get_convert_options(ConvertOptions convert_options,
+                          CCSVConvertOptions* out):
     if convert_options is None:
         out[0] = CCSVConvertOptions.Defaults()
     else:
-        raise NotImplementedError("non-default convert options not supported")
+        out[0] = convert_options.options
 
 
 def read_csv(input_file, read_options=None, parse_options=None,
@@ -257,8 +290,9 @@ def read_csv(input_file, read_options=None, parse_options=None,
     parse_options: ParseOptions, optional
         Options for the CSV parser
         (see ParseOptions constructor for defaults)
-    convert_options: None
-        Currently unused
+    convert_options: ConvertOptions, optional
+        Options for converting CSV data
+        (see ConvertOptions constructor for defaults)
     memory_pool: MemoryPool, optional
         Pool to allocate Table memory from
 
diff --git a/python/pyarrow/csv.py b/python/pyarrow/csv.py
index d6830a0..8375ad4 100644
--- a/python/pyarrow/csv.py
+++ b/python/pyarrow/csv.py
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow._csv import ReadOptions, ParseOptions, read_csv  # noqa
+from pyarrow._csv import ReadOptions, ParseOptions, ConvertOptions, read_csv  # noqa
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1f5f4cf..a3d356e 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -947,6 +947,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
         CCSVParseOptions Defaults()
 
     cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
+        c_bool check_utf8
+
         @staticmethod
         CCSVConvertOptions Defaults()
 
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index dbfa4f3..c204fdf 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -30,7 +30,7 @@ import pytest
 import numpy as np
 
 import pyarrow as pa
-from pyarrow.csv import read_csv, ReadOptions, ParseOptions
+from pyarrow.csv import read_csv, ReadOptions, ParseOptions, ConvertOptions
 
 
 def generate_col_names():
@@ -117,6 +117,18 @@ def test_parse_options():
     assert opts.newlines_in_values is True
 
 
+def test_convert_options():
+    cls = ConvertOptions
+    opts = cls()
+
+    assert opts.check_utf8 is True
+    opts.check_utf8 = False
+    assert opts.check_utf8 is False
+
+    opts = cls(check_utf8=False)
+    assert opts.check_utf8 is False
+
+
 class BaseTestCSVRead:
 
     def read_bytes(self, b, **kwargs):
@@ -153,31 +165,33 @@ class BaseTestCSVRead:
         table = self.read_bytes(rows)
         schema = pa.schema([('a', pa.float64()),
                             ('b', pa.int64()),
-                            ('c', pa.binary())])
+                            ('c', pa.string())])
         assert table.schema == schema
         assert table.to_pydict() == {
             'a': [1.0, 4.0],
             'b': [2, -5],
-            'c': [b"3", b"foo"],
+            'c': [u"3", u"foo"],
             }
 
     def test_simple_nulls(self):
         # Infer various kinds of data, with nulls
-        rows = (b"a,b,c,d\n"
-                b"1,2,,\n"
-                b"nan,-5,foo,\n"
-                b"4.5,#N/A,nan,\n")
+        rows = (b"a,b,c,d,e\n"
+                b"1,2,,,3\n"
+                b"nan,-5,foo,,nan\n"
+                b"4.5,#N/A,nan,,\xff\n")
         table = self.read_bytes(rows)
         schema = pa.schema([('a', pa.float64()),
                             ('b', pa.int64()),
-                            ('c', pa.binary()),
-                            ('d', pa.null())])
+                            ('c', pa.string()),
+                            ('d', pa.null()),
+                            ('e', pa.binary())])
         assert table.schema == schema
         assert table.to_pydict() == {
             'a': [1.0, None, 4.5],
             'b': [2, -5, None],
-            'c': [b"", b"foo", b"nan"],
-            'd': [None, None, None]
+            'c': [u"", u"foo", u"nan"],
+            'd': [None, None, None],
+            'e': [b"3", b"nan", b"\xff"],
             }
 
     def test_no_ending_newline(self):
@@ -212,14 +226,14 @@ class BaseTestCSVRead:
         rows = b"a;b,c\nde,fg;eh\n"
         table = self.read_bytes(rows)
         assert table.to_pydict() == {
-            'a;b': [b'de'],
-            'c': [b'fg;eh'],
+            'a;b': [u'de'],
+            'c': [u'fg;eh'],
             }
         opts = ParseOptions(delimiter=';')
         table = self.read_bytes(rows, parse_options=opts)
         assert table.to_pydict() == {
-            'a': [b'de,fg'],
-            'b,c': [b'eh'],
+            'a': [u'de,fg'],
+            'b,c': [u'eh'],
             }
 
     def test_small_random_csv(self):