You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/06/17 08:08:50 UTC

[arrow] branch master updated: ARROW-12882: [C++][Gandiva] Fix behavior of the convert replace function on gandiva

This is an automated email from the ASF dual-hosted git repository.

praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new caee207  ARROW-12882: [C++][Gandiva] Fix behavior of the convert replace function on gandiva
caee207 is described below

commit caee207ef9be60b00307146260dac8626c79fc18
Author: João Pedro <jo...@simbioseventures.com>
AuthorDate: Thu Jun 17 13:37:46 2021 +0530

    ARROW-12882: [C++][Gandiva] Fix behavior of the convert replace function on gandiva
    
    The convert_replace function on Gandiva, when defining an empty replacement char, should be able to replace the invalid chars with an empty string.
    
    Closes #10406 from jpedroantunes/bugfix/convert-replace-empty-char and squashes the following commits:
    
    0e1ec000f <João Pedro> Fix behavior of the convert replace function on gandiva
    
    Authored-by: João Pedro <jo...@simbioseventures.com>
    Signed-off-by: Praveen <pr...@dremio.com>
---
 cpp/src/gandiva/precompiled/string_ops.cc      | 22 ++++++++++++++--------
 cpp/src/gandiva/precompiled/string_ops_test.cc | 12 ++++++++++--
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 738ec36..1cd566d 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1243,10 +1243,7 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
                                                     const char* char_to_replace,
                                                     int32_t char_to_replace_len,
                                                     int32_t* out_len) {
-  if (char_to_replace_len == 0) {
-    *out_len = text_len;
-    return text_in;
-  } else if (char_to_replace_len != 1) {
+  if (char_to_replace_len > 1) {
     gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
     *out_len = 0;
     return "";
@@ -1262,6 +1259,7 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
   }
   int32_t valid_bytes_to_cpy = 0;
   int32_t out_byte_counter = 0;
+  int32_t in_byte_counter = 0;
   int32_t char_len;
   // scan the base text from left to right and increment the start pointer till
   // looking for invalid chars to substitute
@@ -1273,9 +1271,15 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
       // define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
       char_len = 1;
       // first copy the valid bytes until now and then replace the invalid character
-      memcpy(ret + out_byte_counter, text_in + out_byte_counter, valid_bytes_to_cpy);
-      ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
-      out_byte_counter += valid_bytes_to_cpy + char_len;
+      memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
+      // if the replacement char is empty, the invalid char should be ignored
+      if (char_to_replace_len == 0) {
+        out_byte_counter += valid_bytes_to_cpy;
+      } else {
+        ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
+        out_byte_counter += valid_bytes_to_cpy + char_len;
+      }
+      in_byte_counter += valid_bytes_to_cpy + char_len;
       valid_bytes_to_cpy = 0;
       continue;
     }
@@ -1285,8 +1289,10 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
   if (out_byte_counter == 0) return text_in;
   // if there are still valid bytes to copy, do it
   if (valid_bytes_to_cpy != 0) {
-    memcpy(ret + out_byte_counter, text_in + out_byte_counter, valid_bytes_to_cpy);
+    memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
   }
+  // the out length will be the out bytes copied + the missing end bytes copied
+  *out_len = valid_bytes_to_cpy + out_byte_counter;
   return ret;
 }
 
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 8ffaace..2460633 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -175,14 +175,22 @@ TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
   EXPECT_TRUE(ctx.has_error());
   ctx.Reset();
 
-  // full valid utf8, but invalid replacement char length
+  // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
   std::string f("ok-\xa0\xa1-valid");
   auto f_in_out_len = static_cast<int>(f.length());
   const char* f_str = convert_replace_invalid_fromUTF8_binary(
       ctx_ptr, f.data(), f_in_out_len, "", 0, &f_in_out_len);
-  EXPECT_EQ(std::string(f_str, f_in_out_len), "ok-\xa0\xa1-valid");
+  EXPECT_EQ(std::string(f_str, f_in_out_len), "ok--valid");
   EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
 
+  // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
+  std::string g("\xa0\xa1-ok-\xa0\xa1-valid-\xa0\xa1");
+  auto g_in_out_len = static_cast<int>(g.length());
+  const char* g_str = convert_replace_invalid_fromUTF8_binary(
+      ctx_ptr, g.data(), g_in_out_len, "", 0, &g_in_out_len);
+  EXPECT_EQ(std::string(g_str, g_in_out_len), "-ok--valid-");
+  EXPECT_FALSE(ctx.has_error());
   ctx.Reset();
 }