You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/11/05 20:59:49 UTC

[arrow] branch master updated: ARROW-3656: [C++] Allow whitespace in numeric CSV fields

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7011ae0  ARROW-3656: [C++] Allow whitespace in numeric CSV fields
7011ae0 is described below

commit 7011ae062d27662101ade55472063dda79899521
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Nov 5 21:59:41 2018 +0100

    ARROW-3656: [C++] Allow whitespace in numeric CSV fields
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2875 from pitrou/ARROW-3656-csv-number-whitespace and squashes the following commits:
    
    547b9365 <Antoine Pitrou> Address review comments
    d84b93f5 <Antoine Pitrou> ARROW-3656:  Allow whitespace in numeric CSV fields
---
 cpp/src/arrow/builder.h                 |  2 ++
 cpp/src/arrow/csv/converter.cc          | 29 ++++++++++++++++++++++++++++-
 cpp/src/arrow/csv/csv-converter-test.cc | 10 ++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 185b506..fa9776f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -174,6 +174,8 @@ class ARROW_EXPORT ArrayBuilder {
     length_ += std::distance(begin, end);
   }
 
+  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
+
  protected:
   ArrayBuilder() {}
 
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 86f3ab0..81015c1 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -44,6 +44,13 @@ Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8
   return Status::Invalid(ss.str());
 }
 
+inline bool IsWhitespace(uint8_t c) {
+  if (ARROW_PREDICT_TRUE(c > ' ')) {
+    return false;
+  }
+  return c == ' ' || c == '\t';
+}
+
 class ConcreteConverter : public Converter {
  public:
   using Converter::Converter;
@@ -275,9 +282,29 @@ Status NumericConverter<T>::Convert(const BlockParser& parser, int32_t col_index
   StringConverter<T> converter;
 
   auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+    // XXX should quoted values be allowed at all?
     value_type value;
     if (IsNull(data, size, quoted)) {
-      return builder.AppendNull();
+      builder.UnsafeAppendNull();
+      return Status::OK();
+    }
+    if (!std::is_same<BooleanType, T>::value) {
+      // Skip trailing whitespace
+      if (ARROW_PREDICT_TRUE(size > 0) &&
+          ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) {
+        const uint8_t* p = data + size - 1;
+        while (size > 0 && IsWhitespace(*p)) {
+          --size;
+          --p;
+        }
+      }
+      // Skip leading whitespace
+      if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) {
+        while (size > 0 && IsWhitespace(*data)) {
+          --size;
+          ++data;
+        }
+      }
     }
     if (ARROW_PREDICT_FALSE(
             !converter(reinterpret_cast<const char*>(data), size, &value))) {
diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/csv-converter-test.cc
index 2958615..dd3dba6 100644
--- a/cpp/src/arrow/csv/csv-converter-test.cc
+++ b/cpp/src/arrow/csv/csv-converter-test.cc
@@ -158,6 +158,11 @@ TEST(IntegerConversion, Nulls) {
   AssertConversionAllNulls<Int8Type, int8_t>(int8());
 }
 
+TEST(IntegerConversion, Whitespace) {
+  AssertConversion<Int32Type, int32_t>(int32(), {" 12,34 \n", " 56 ,78\n"},
+                                       {{12, 56}, {34, 78}});
+}
+
 TEST(FloatingPointConversion, Basics) {
   AssertConversion<FloatType, float>(float32(), {"12,34.5\n", "0,-1e30\n"},
                                      {{12., 0.}, {34.5, -1e30f}});
@@ -173,6 +178,11 @@ TEST(FloatingPointConversion, Nulls) {
   AssertConversionAllNulls<DoubleType, double>(float64());
 }
 
+TEST(FloatingPointConversion, Whitespace) {
+  AssertConversion<DoubleType, double>(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"},
+                                       {{12., 0.}, {34.5, -1e100}});
+}
+
 TEST(BooleanConversion, Basics) {
   // XXX we may want to accept more bool-like values
   AssertConversion<BooleanType, bool>(boolean(), {"true,false\n", "1,0\n"},