You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/04 19:55:42 UTC

[GitHub] [arrow] nealrichardson commented on a change in pull request #8365: ARROW-6582: [R] Arrow to R fails with embedded nuls in strings

nealrichardson commented on a change in pull request #8365:
URL: https://github.com/apache/arrow/pull/8365#discussion_r551519433



##########
File path: r/src/array_to_vector.cpp
##########
@@ -288,36 +290,104 @@ struct Converter_String : public Converter {
     }
 
     StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
-    if (array->null_count()) {
-      // need to watch for nulls
-      arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
-                                                array->offset(), n);
+
+    const bool all_valid = array->null_count() == 0;
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+
+    bool nul_was_stripped = false;
+
+    if (all_valid) {
+      // no need to watch for missing strings
       cpp11::unwind_protect([&] {
-        for (int i = 0; i < n; i++, null_reader.Next()) {
-          if (null_reader.IsSet()) {
-            SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
-          } else {
-            SET_STRING_ELT(data, start + i, NA_STRING);
+        if (strip_out_nuls) {
+          for (int i = 0; i < n; i++) {
+            SET_STRING_ELT(data, start + i,
+                           r_string_from_view_strip_nul(string_array->GetView(i),
+                                                        &nul_was_stripped));
           }
+          return;

Review comment:
       The main function return (L347) is `return Status::OK();` -- should these do the same? Or do I misunderstand what this return does?

##########
File path: r/src/array_to_vector.cpp
##########
@@ -288,36 +290,104 @@ struct Converter_String : public Converter {
     }
 
     StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
-    if (array->null_count()) {
-      // need to watch for nulls
-      arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
-                                                array->offset(), n);
+
+    const bool all_valid = array->null_count() == 0;
+    const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+
+    bool nul_was_stripped = false;
+
+    if (all_valid) {
+      // no need to watch for missing strings
       cpp11::unwind_protect([&] {
-        for (int i = 0; i < n; i++, null_reader.Next()) {
-          if (null_reader.IsSet()) {
-            SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
-          } else {
-            SET_STRING_ELT(data, start + i, NA_STRING);
+        if (strip_out_nuls) {
+          for (int i = 0; i < n; i++) {
+            SET_STRING_ELT(data, start + i,
+                           r_string_from_view_strip_nul(string_array->GetView(i),
+                                                        &nul_was_stripped));
           }
+          return;
+        }
+
+        for (int i = 0; i < n; i++) {
+          SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
         }
       });
     } else {
       cpp11::unwind_protect([&] {
-        for (int i = 0; i < n; i++) {
-          SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
+        arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(),
+                                                      array->offset(), n);
+
+        if (strip_out_nuls) {
+          for (int i = 0; i < n; i++, validity_reader.Next()) {
+            if (validity_reader.IsSet()) {
+              SET_STRING_ELT(data, start + i,
+                             r_string_from_view_strip_nul(string_array->GetView(i),
+                                                          &nul_was_stripped));
+            } else {
+              SET_STRING_ELT(data, start + i, NA_STRING);
+            }
+          }
+          return;
+        }
+
+        for (int i = 0; i < n; i++, validity_reader.Next()) {
+          if (validity_reader.IsSet()) {
+            SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
+          } else {
+            SET_STRING_ELT(data, start + i, NA_STRING);
+          }
         }
       });
     }
 
+    if (nul_was_stripped) {
+      cpp11::warning("Stripping '\\0' (nul) from character vector");
+    }
+
     return Status::OK();
   }
 
   bool Parallel() const { return false; }
 
  private:
-  static SEXP r_string_from_view(const arrow::util::string_view& view) {
+  static SEXP r_string_from_view(arrow::util::string_view view) {

Review comment:
       Why this change?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org