You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/07 16:14:40 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #12574: ARROW-15854: [C++] Refine CSV writer code

pitrou commented on a change in pull request #12574:
URL: https://github.com/apache/arrow/pull/12574#discussion_r820852990



##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -136,15 +134,14 @@ class ColumnPopulator {
   MemoryPool* const pool_;
 };
 
-// Copies the contents of to out properly escaping any necessary characters.
-// Returns the position prior to last copied character (out_end is decremented).
-char* EscapeReverse(arrow::util::string_view s, char* out_end) {
-  for (const char* val = s.data() + s.length() - 1; val >= s.data(); val--, out_end--) {
-    if (*val == '"') {
-      *out_end = *val;
-      out_end--;
+// Copies the contents of s to out properly escaping any necessary characters.
+// Returns the position next to last copied character.
+char* Escape(arrow::util::string_view s, char* out_end) {

Review comment:
       Change variable name to match new meaning?
   
   ```suggestion
   char* Escape(arrow::util::string_view s, char* out) {
   ```

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -184,28 +181,23 @@ class UnquotedColumnPopulator : public ColumnPopulator {
     return Status::OK();
   }
 
-  Status PopulateColumns(char* output, int32_t* offsets) const override {
+  Status PopulateRows(char* output, int64_t* offsets) const override {
     // Function applied to valid values cast to string.
     auto valid_function = [&](arrow::util::string_view s) {
-      int32_t next_column_offset = static_cast<int32_t>(s.length() + end_chars_.size());
-      memcpy((output + *offsets - next_column_offset), s.data(), s.length());
-      memcpy((output + *offsets - end_chars_.size()), end_chars_.c_str(),
-             end_chars_.size());
-      *offsets -= next_column_offset;
+      memcpy(output + *offsets, s.data(), s.length());
+      memcpy(output + *offsets + s.length(), end_chars_.c_str(), end_chars_.size());
+      *offsets += static_cast<int64_t>(s.length() + end_chars_.size());
       offsets++;
       return Status::OK();
     };
 
     // Function applied to null values cast to string.
     auto null_function = [&]() {
       // For nulls, the configured null value string is copied into the output.
-      int32_t next_column_offset =
-          static_cast<int32_t>(null_string_->size() + end_chars_.size());
-      memcpy((output + *offsets - next_column_offset), null_string_->data(),
-             null_string_->size());
-      memcpy((output + *offsets - end_chars_.size()), end_chars_.c_str(),
+      memcpy(output + *offsets, null_string_->data(), null_string_->size());
+      memcpy(output + *offsets + null_string_->size(), end_chars_.c_str(),
              end_chars_.size());
-      *offsets -= next_column_offset;
+      *offsets += static_cast<int32_t>(null_string_->size() + end_chars_.size());

Review comment:
       Why not `int64_t`?

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -309,40 +300,33 @@ class QuotedColumnPopulator : public ColumnPopulator {
     return Status::OK();
   }
 
-  Status PopulateColumns(char* output, int32_t* offsets) const override {
+  Status PopulateRows(char* output, int64_t* offsets) const override {
     auto needs_escaping = row_needs_escaping_.begin();
     VisitArrayDataInline<StringType>(
         *(casted_array_->data()),
         [&](arrow::util::string_view s) {
           // still needs string content length to be added
-          char* row_end = output + *offsets;
-          int32_t next_column_offset = 0;
+          char* row = output + *offsets;
+          *row++ = '"';
           if (!*needs_escaping) {
-            next_column_offset = static_cast<int32_t>(s.length() + kQuoteDelimiterCount);
-            memcpy(row_end - next_column_offset + /*quote_offset=*/1, s.data(),
-                   s.length());
+            memcpy(row, s.data(), s.length());
+            row += s.length();
           } else {
-            // Adjust row_end by 2 + end_chars_.size(): 1 quote char, end_chars_.size()
-            // and 1 to position at the first position to write to.
-            next_column_offset = static_cast<int32_t>(
-                row_end - EscapeReverse(s, row_end - 2 - end_chars_.size()));
+            row = Escape(s, row);
           }
-          *(row_end - next_column_offset) = '"';
-          *(row_end - end_chars_.size() - 1) = '"';
-          memcpy(row_end - end_chars_.size(), end_chars_.data(), end_chars_.length());
-          *offsets -= next_column_offset;
+          *row++ = '"';
+          memcpy(row, end_chars_.data(), end_chars_.length());
+          row += end_chars_.length();
+          *offsets += static_cast<int64_t>(row - (output + *offsets));
           offsets++;
           needs_escaping++;
         },
         [&]() {
           // For nulls, the configured null value string is copied into the output.
-          int32_t next_column_offset =
-              static_cast<int32_t>(null_string_->size() + end_chars_.size());
-          memcpy((output + *offsets - next_column_offset), null_string_->data(),
-                 null_string_->size());
-          memcpy((output + *offsets - end_chars_.size()), end_chars_.c_str(),
+          memcpy(output + *offsets, null_string_->data(), null_string_->size());
+          memcpy(output + *offsets + null_string_->size(), end_chars_.c_str(),
                  end_chars_.size());
-          *offsets -= next_column_offset;
+          *offsets += static_cast<int32_t>(null_string_->size() + end_chars_.size());

Review comment:
       `int64_t` perhaps?

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -572,33 +558,34 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
     }
     // Calculate cumulative offsets for each row (including delimiters).
     // ',' * num_columns - 1(last column doesn't have ,) + eol
-    int32_t delimiters_length =
+    const int32_t delimiters_length =
         static_cast<int32_t>(batch.num_columns() - 1 + options_.eol.size());
-    offsets_[0] += delimiters_length;
-    for (int64_t row = 1; row < batch.num_rows(); row++) {
-      offsets_[row] += offsets_[row - 1] + delimiters_length;
+    int64_t last_row_length = offsets_[0] + delimiters_length;
+    offsets_[0] = 0;
+    for (size_t row = 1; row < offsets_.size(); ++row) {
+      const int64_t this_row_length = offsets_[row] + delimiters_length;
+      offsets_[row] = offsets_[row - 1] + last_row_length;
+      last_row_length = this_row_length;

Review comment:
       Can you add a comment with this?

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -309,40 +300,33 @@ class QuotedColumnPopulator : public ColumnPopulator {
     return Status::OK();
   }
 
-  Status PopulateColumns(char* output, int32_t* offsets) const override {
+  Status PopulateRows(char* output, int64_t* offsets) const override {
     auto needs_escaping = row_needs_escaping_.begin();
     VisitArrayDataInline<StringType>(
         *(casted_array_->data()),
         [&](arrow::util::string_view s) {
           // still needs string content length to be added
-          char* row_end = output + *offsets;
-          int32_t next_column_offset = 0;
+          char* row = output + *offsets;
+          *row++ = '"';
           if (!*needs_escaping) {
-            next_column_offset = static_cast<int32_t>(s.length() + kQuoteDelimiterCount);
-            memcpy(row_end - next_column_offset + /*quote_offset=*/1, s.data(),
-                   s.length());
+            memcpy(row, s.data(), s.length());
+            row += s.length();
           } else {
-            // Adjust row_end by 2 + end_chars_.size(): 1 quote char, end_chars_.size()
-            // and 1 to position at the first position to write to.
-            next_column_offset = static_cast<int32_t>(
-                row_end - EscapeReverse(s, row_end - 2 - end_chars_.size()));
+            row = Escape(s, row);
           }
-          *(row_end - next_column_offset) = '"';
-          *(row_end - end_chars_.size() - 1) = '"';
-          memcpy(row_end - end_chars_.size(), end_chars_.data(), end_chars_.length());
-          *offsets -= next_column_offset;
+          *row++ = '"';
+          memcpy(row, end_chars_.data(), end_chars_.length());
+          row += end_chars_.length();
+          *offsets += static_cast<int64_t>(row - (output + *offsets));

Review comment:
       Wouldn't it be simpler as:
   ```suggestion
             *offsets = static_cast<int64_t>(row - output);
   ```

##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -572,33 +558,34 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
     }
     // Calculate cumulative offsets for each row (including delimiters).
     // ',' * num_columns - 1(last column doesn't have ,) + eol
-    int32_t delimiters_length =
+    const int32_t delimiters_length =
         static_cast<int32_t>(batch.num_columns() - 1 + options_.eol.size());
-    offsets_[0] += delimiters_length;
-    for (int64_t row = 1; row < batch.num_rows(); row++) {
-      offsets_[row] += offsets_[row - 1] + delimiters_length;
+    int64_t last_row_length = offsets_[0] + delimiters_length;
+    offsets_[0] = 0;
+    for (size_t row = 1; row < offsets_.size(); ++row) {
+      const int64_t this_row_length = offsets_[row] + delimiters_length;
+      offsets_[row] = offsets_[row - 1] + last_row_length;
+      last_row_length = this_row_length;

Review comment:
       Also, you probably mean: "`offsets_[i]` was the length of row `i-1` before the conversion"...?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org