You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/02/16 02:17:41 UTC

[GitHub] [arrow] cyb70289 commented on a change in pull request #12399: ARROW-14993: [C++] Benchmark CSV writer

cyb70289 commented on a change in pull request #12399:
URL: https://github.com/apache/arrow/pull/12399#discussion_r807476472



##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -194,34 +204,49 @@ class UnquotedColumnPopulator : public ColumnPopulator {
       return Status::OK();
     };
 
-    if (reject_values_with_quotes_) {
-      // When using this UnquotedColumnPopulator on values that, after casting, could
-      // produce quotes, we need to return an error in accord with RFC4180. We need to
-      // precede valid_func with a check.
-      return VisitArrayDataInline<StringType>(
-          *casted_array_->data(),
-          [&](arrow::util::string_view s) {
-            RETURN_NOT_OK(CheckStringHasNoStructuralChars(s));
-            return valid_function(s);
-          },
-          null_function);
-    } else {
-      // Populate without checking and rejecting values with quotes.
-      return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function,
-                                              null_function);
-    }
+    return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function,
+                                            null_function);
   }
 
  private:
-  // Returns an error status if s has any structural characters.
-  static Status CheckStringHasNoStructuralChars(const util::string_view& s) {
-    if (std::any_of(s.begin(), s.end(), [](const char& c) {
-          return c == '\n' || c == '\r' || c == ',' || c == '"';
-        })) {
-      return Status::Invalid(
-          "CSV values may not contain structural characters if quoting style is "
-          "\"None\". See RFC4180. Invalid value: ",
-          s);
+  // Returns an error status if string array has any structural characters.
+  static Status CheckStringArrayHasNoStructuralChars(const StringArray& array) {
+    // scan the underlying string array buffer as a single big string
+    const uint8_t* const data = array.raw_data() + array.value_offset(0);
+    const int64_t buffer_size = array.total_values_length();
+    int64_t offset = 0;
+#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
+#if defined(ARROW_HAVE_SSE4_2)
+    // _mm_cmpistrc gives slightly better performance than the naive approach,
+    // probably doesn't deserve the effort
+    using simd_batch = xsimd::batch<uint8_t, xsimd::sse4_2>;
+#else
+    using simd_batch = xsimd::batch<uint8_t, xsimd::neon64>;

Review comment:
       `xsimd::neon` implements neon instructions before armv8 (64bit arch), the simd register size is 128bits. `xsimd::neon64` adds neon instructions for armv8, the simd register size is still 128bits.
   E.g., reduce function `vmaxvq_u8` is only implemented on armv8, `ximd::any` can use it only if `xsimd::neon64` is specified [1], otherwise it has to use lower efficient code [2].
   
   [1] https://github.com/xtensor-stack/xsimd/blob/master/include/xsimd/arch/xsimd_neon64.hpp#L65
   [2] https://github.com/xtensor-stack/xsimd/blob/master/include/xsimd/arch/xsimd_neon.hpp#L2121




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org