You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/10 17:56:46 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #11849: ARROW-14905: [C++] Enable CSV Writer to handle quoting

lidavidm commented on a change in pull request #11849:
URL: https://github.com/apache/arrow/pull/11849#discussion_r766875020



##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -155,27 +161,64 @@ class UnquotedColumnPopulator : public ColumnPopulator {
     return Status::OK();
   }
 
-  void PopulateColumns(char* output, int32_t* offsets) const override {
-    VisitArrayDataInline<StringType>(
-        *casted_array_->data(),
-        [&](arrow::util::string_view s) {
-          int32_t next_column_offset = static_cast<int32_t>(s.length()) + /*end_char*/ 1;
-          memcpy((output + *offsets - next_column_offset), s.data(), s.length());
-          *(output + *offsets - 1) = end_char_;
-          *offsets -= next_column_offset;
-          offsets++;
-        },
-        [&]() {
-          // For nulls, the configured null value string is copied into the output.
-          int32_t next_column_offset =
-              static_cast<int32_t>(null_string_->size()) + /*end_char*/ 1;
-          memcpy((output + *offsets - next_column_offset), null_string_->data(),
-                 null_string_->size());
-          *(output + *offsets - 1) = end_char_;
-          *offsets -= next_column_offset;
-          offsets++;
-        });
+  Status PopulateColumns(char* output, int32_t* offsets) const override {
+    // Function applied to valid values cast to string.
+    auto valid_function = [&](arrow::util::string_view s) {
+      int32_t next_column_offset = static_cast<int32_t>(s.length()) + /*end_char*/ 1;
+      memcpy((output + *offsets - next_column_offset), s.data(), s.length());
+      *(output + *offsets - 1) = end_char_;
+      *offsets -= next_column_offset;
+      offsets++;
+      return Status::OK();
+    };
+
+    // Function applied to null values cast to string.
+    auto null_function = [&]() {
+      // For nulls, the configured null value string is copied into the output.
+      int32_t next_column_offset =
+          static_cast<int32_t>(null_string_->size()) + /*end_char*/ 1;
+      memcpy((output + *offsets - next_column_offset), null_string_->data(),
+             null_string_->size());
+      *(output + *offsets - 1) = end_char_;
+      *offsets -= next_column_offset;
+      offsets++;
+      return Status::OK();
+    };
+
+    if (reject_values_with_quotes_) {
+      // When using this UnquotedColumnPopulator on values that, after casting, could
+      // produce quotes, we need to return an error in accord with RFC4180. We need to
+      // precede valid_func with a check.
+      return VisitArrayDataInline<StringType>(
+          *casted_array_->data(),
+          [&](arrow::util::string_view s) {
+            ARROW_RETURN_NOT_OK(CheckStringHasNoStructuralChars(s));
+            return valid_function(s);
+          },
+          null_function);
+    } else {
+      // Populate without checking and rejecting values with quotes.
+      return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function,
+                                              null_function);
+    }
+  }
+
+ private:
+  // Returns an error status if s has any structural characters.
+  static Status CheckStringHasNoStructuralChars(const util::string_view& s) {
+    if (std::any_of(s.begin(), s.end(), [](const char& c) {

Review comment:
       I filed ARROW-15064 so we don't forget this, though we need writer benchmarks (ARROW-14993).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org