You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/11/05 20:59:49 UTC
[arrow] branch master updated: ARROW-3656: [C++] Allow whitespace
in numeric CSV fields
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7011ae0 ARROW-3656: [C++] Allow whitespace in numeric CSV fields
7011ae0 is described below
commit 7011ae062d27662101ade55472063dda79899521
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Nov 5 21:59:41 2018 +0100
ARROW-3656: [C++] Allow whitespace in numeric CSV fields
Author: Antoine Pitrou <an...@python.org>
Closes #2875 from pitrou/ARROW-3656-csv-number-whitespace and squashes the following commits:
547b9365 <Antoine Pitrou> Address review comments
d84b93f5 <Antoine Pitrou> ARROW-3656: Allow whitespace in numeric CSV fields
---
cpp/src/arrow/builder.h | 2 ++
cpp/src/arrow/csv/converter.cc | 29 ++++++++++++++++++++++++++++-
cpp/src/arrow/csv/csv-converter-test.cc | 10 ++++++++++
3 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 185b506..fa9776f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -174,6 +174,8 @@ class ARROW_EXPORT ArrayBuilder {
length_ += std::distance(begin, end);
}
+ void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
+
protected:
ArrayBuilder() {}
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 86f3ab0..81015c1 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -44,6 +44,13 @@ Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8
return Status::Invalid(ss.str());
}
+inline bool IsWhitespace(uint8_t c) {
+ if (ARROW_PREDICT_TRUE(c > ' ')) {
+ return false;
+ }
+ return c == ' ' || c == '\t';
+}
+
class ConcreteConverter : public Converter {
public:
using Converter::Converter;
@@ -275,9 +282,29 @@ Status NumericConverter<T>::Convert(const BlockParser& parser, int32_t col_index
StringConverter<T> converter;
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ // XXX should quoted values be allowed at all?
value_type value;
if (IsNull(data, size, quoted)) {
- return builder.AppendNull();
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }
+ if (!std::is_same<BooleanType, T>::value) {
+ // Skip trailing whitespace
+ if (ARROW_PREDICT_TRUE(size > 0) &&
+ ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) {
+ const uint8_t* p = data + size - 1;
+ while (size > 0 && IsWhitespace(*p)) {
+ --size;
+ --p;
+ }
+ }
+ // Skip leading whitespace
+ if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) {
+ while (size > 0 && IsWhitespace(*data)) {
+ --size;
+ ++data;
+ }
+ }
}
if (ARROW_PREDICT_FALSE(
!converter(reinterpret_cast<const char*>(data), size, &value))) {
diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/csv-converter-test.cc
index 2958615..dd3dba6 100644
--- a/cpp/src/arrow/csv/csv-converter-test.cc
+++ b/cpp/src/arrow/csv/csv-converter-test.cc
@@ -158,6 +158,11 @@ TEST(IntegerConversion, Nulls) {
AssertConversionAllNulls<Int8Type, int8_t>(int8());
}
+TEST(IntegerConversion, Whitespace) {
+ AssertConversion<Int32Type, int32_t>(int32(), {" 12,34 \n", " 56 ,78\n"},
+ {{12, 56}, {34, 78}});
+}
+
TEST(FloatingPointConversion, Basics) {
AssertConversion<FloatType, float>(float32(), {"12,34.5\n", "0,-1e30\n"},
{{12., 0.}, {34.5, -1e30f}});
@@ -173,6 +178,11 @@ TEST(FloatingPointConversion, Nulls) {
AssertConversionAllNulls<DoubleType, double>(float64());
}
+TEST(FloatingPointConversion, Whitespace) {
+ AssertConversion<DoubleType, double>(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"},
+ {{12., 0.}, {34.5, -1e100}});
+}
+
TEST(BooleanConversion, Basics) {
// XXX we may want to accept more bool-like values
AssertConversion<BooleanType, bool>(boolean(), {"true,false\n", "1,0\n"},