You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/06 14:41:43 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #11849: ARROW-14905: [C++] Enable CSV Writer to handle quoting

pitrou commented on a change in pull request #11849:
URL: https://github.com/apache/arrow/pull/11849#discussion_r763066233



##########
File path: cpp/src/arrow/csv/options.h
##########
@@ -186,6 +198,9 @@ struct ARROW_EXPORT WriteOptions {
   /// \brief IO context for writing.
   io::IOContext io_context;
 
+  /// \brief Quote style
+  QuoteStyle quote_style = QuoteStyle::Needed;

Review comment:
       Nit, but should it be "quoting style" instead?

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -155,27 +161,61 @@ class UnquotedColumnPopulator : public ColumnPopulator {
     return Status::OK();
   }
 
-  void PopulateColumns(char* output, int32_t* offsets) const override {
-    VisitArrayDataInline<StringType>(
-        *casted_array_->data(),
-        [&](arrow::util::string_view s) {
-          int32_t next_column_offset = static_cast<int32_t>(s.length()) + /*end_char*/ 1;
-          memcpy((output + *offsets - next_column_offset), s.data(), s.length());
-          *(output + *offsets - 1) = end_char_;
-          *offsets -= next_column_offset;
-          offsets++;
-        },
-        [&]() {
-          // For nulls, the configured null value string is copied into the output.
-          int32_t next_column_offset =
-              static_cast<int32_t>(null_string_->size()) + /*end_char*/ 1;
-          memcpy((output + *offsets - next_column_offset), null_string_->data(),
-                 null_string_->size());
-          *(output + *offsets - 1) = end_char_;
-          *offsets -= next_column_offset;
-          offsets++;
-        });
+  Status PopulateColumns(char* output, int32_t* offsets) const override {
+    // Function applied to valid values cast to string.
+    auto valid_function = [&](arrow::util::string_view s) {
+      int32_t next_column_offset = static_cast<int32_t>(s.length()) + /*end_char*/ 1;
+      memcpy((output + *offsets - next_column_offset), s.data(), s.length());
+      *(output + *offsets - 1) = end_char_;
+      *offsets -= next_column_offset;
+      offsets++;
+      return Status::OK();
+    };
+
+    // Function applied to null values cast to string.
+    auto null_function = [&]() {
+      // For nulls, the configured null value string is copied into the output.
+      int32_t next_column_offset =
+          static_cast<int32_t>(null_string_->size()) + /*end_char*/ 1;
+      memcpy((output + *offsets - next_column_offset), null_string_->data(),
+             null_string_->size());
+      *(output + *offsets - 1) = end_char_;
+      *offsets -= next_column_offset;
+      offsets++;
+      return Status::OK();
+    };
+
+    if (reject_values_with_quotes_) {
+      // When using this UnquotedColumnPopulator on values that, after casting, could
+      // produce quotes, we need to return an error in accord with RFC4180. We need to
+      // precede valid_func with a check.
+      return VisitArrayDataInline<StringType>(
+          *casted_array_->data(),
+          [&](arrow::util::string_view s) {
+            ARROW_RETURN_NOT_OK(CheckStringHasNoQuotes(s));
+            return valid_function(s);
+          },
+          null_function);
+    } else {
+      // Populate without checking and rejecting values with quotes.
+      return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function,
+                                              null_function);
+    }
+  }
+
+ private:
+  static Status CheckStringHasNoQuotes(const util::string_view& s) {
+    if (CountQuotes(s) > 0) {

Review comment:
       This sounds incomplete. Other characters need quoting or escaping: the cell delimiter `,`, the line endings `\r` and `\n`.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org