You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/06/29 09:11:31 UTC
[arrow] branch master updated: ARROW-16850: [C++] Copy CSV data field and end chars separately (#13394)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b0422f8df0 ARROW-16850: [C++] Copy CSV data field and end chars separately (#13394)
b0422f8df0 is described below

commit b0422f8df0c7dd02e2982dddcfd2047e067cc0a0
Author: Yibo Cai <yi...@arm.com>
AuthorDate: Wed Jun 29 17:11:20 2022 +0800

    ARROW-16850: [C++] Copy CSV data field and end chars separately (#13394)
    
    It's a finding when evaluating Arm Statistical Profiling Extension(SPE)
    on Neoverse-N1 with Arrow CSV writer benchmark. With SPE, we are able to
    locate precisely the machine code causing heavy cache miss or the exact
    branch suffering from high misprediction rate.
    
    CSV writer benchmark reveals relatively high branch misprediction rate
    inside glibc `memcpy` function. The branch is comparing the buffer size
    against 8, and to run different copying code per comparison result.
    
    Arrow CSV writer populates one column at once. In the inner loop, it
    does two `memcpy`. First one is to copy the field value(string) from
    Arrow string array. Then it does another copy to append the delimiter
    or end-of-line(if last column), which is at most 2 chars. Data in same
    column is normally of similar size, but as we are copying a very short
    delimiter after each data field, looks it makes CPU harder to predict
    the correct code path in `memcpy` as input data size varies quickly.
    
    This PR adds a trivial `CopyEndChars` routine to copy only the
    delimiter and end-of-line. It improves performance significantly on
    Arm Neoverse-N1. Skylake also sees mild improvement.
    
    Lead-authored-by: Yibo Cai <yi...@arm.com>
    Co-authored-by: Antoine Pitrou <pi...@free.fr>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/csv/writer.cc | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc
index a9dbec542d..95c2e03a10 100644
--- a/cpp/src/arrow/csv/writer.cc
+++ b/cpp/src/arrow/csv/writer.cc
@@ -59,6 +59,25 @@ namespace csv {
 
 namespace {
 
+// This function is to improve performance. It copies CSV delimiter and eol
+// without calling `memcpy`.
+// Each CSV field is followed by a delimiter or eol, which is often only one
+// or two chars. If copying both the field and delimiter with `memcpy`, CPU
+// may suffer from high branch misprediction as we are tripping `memcpy` with
+// interleaved (normal/tiny/normal/tiny/...) buffer sizes, which are handled
+// separately inside `memcpy`. This function goes fast path if the buffer
+// size is one or two chars to leave `memcpy` only for copying CSV fields.
+void CopyEndChars(char* dest, const char* src, size_t size) {
+  if (size == 1) {
+    // for fixed size memcpy, compiler will generate direct load/store opcode
+    memcpy(dest, src, 1);
+  } else if (size == 2) {
+    memcpy(dest, src, 2);
+  } else {
+    memcpy(dest, src, size);
+  }
+}
+
 struct SliceIteratorFunctor {
   Result<std::shared_ptr<RecordBatch>> Next() {
     if (current_offset < batch->num_rows()) {
@@ -185,7 +204,7 @@ class UnquotedColumnPopulator : public ColumnPopulator {
     // Function applied to valid values cast to string.
     auto valid_function = [&](arrow::util::string_view s) {
       memcpy(output + *offsets, s.data(), s.length());
-      memcpy(output + *offsets + s.length(), end_chars_.c_str(), end_chars_.size());
+      CopyEndChars(output + *offsets + s.length(), end_chars_.c_str(), end_chars_.size());
       *offsets += static_cast<int64_t>(s.length() + end_chars_.size());
       offsets++;
       return Status::OK();
@@ -195,8 +214,8 @@ class UnquotedColumnPopulator : public ColumnPopulator {
     auto null_function = [&]() {
       // For nulls, the configured null value string is copied into the output.
       memcpy(output + *offsets, null_string_->data(), null_string_->size());
-      memcpy(output + *offsets + null_string_->size(), end_chars_.c_str(),
-             end_chars_.size());
+      CopyEndChars(output + *offsets + null_string_->size(), end_chars_.c_str(),
+                   end_chars_.size());
       *offsets += static_cast<int64_t>(null_string_->size() + end_chars_.size());
       offsets++;
       return Status::OK();
@@ -314,7 +333,7 @@ class QuotedColumnPopulator : public ColumnPopulator {
             row = Escape(s, row);
           }
           *row++ = '"';
-          memcpy(row, end_chars_.data(), end_chars_.length());
+          CopyEndChars(row, end_chars_.data(), end_chars_.length());
           row += end_chars_.length();
           *offsets = static_cast<int64_t>(row - output);
           offsets++;
@@ -323,8 +342,8 @@ class QuotedColumnPopulator : public ColumnPopulator {
         [&]() {
           // For nulls, the configured null value string is copied into the output.
           memcpy(output + *offsets, null_string_->data(), null_string_->size());
-          memcpy(output + *offsets + null_string_->size(), end_chars_.c_str(),
-                 end_chars_.size());
+          CopyEndChars(output + *offsets + null_string_->size(), end_chars_.c_str(),
+                       end_chars_.size());
           *offsets += static_cast<int64_t>(null_string_->size() + end_chars_.size());
           offsets++;
           needs_escaping++;