You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/06/17 08:08:50 UTC
[arrow] branch master updated: ARROW-12882: [C++][Gandiva] Fix
behavior of the convert replace function on gandiva
This is an automated email from the ASF dual-hosted git repository.
praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new caee207 ARROW-12882: [C++][Gandiva] Fix behavior of the convert replace function on gandiva
caee207 is described below
commit caee207ef9be60b00307146260dac8626c79fc18
Author: João Pedro <jo...@simbioseventures.com>
AuthorDate: Thu Jun 17 13:37:46 2021 +0530
ARROW-12882: [C++][Gandiva] Fix behavior of the convert replace function on gandiva
The convert_replace function on Gandiva, when defining an empty replacement char, should be able to replace the invalid chars with an empty string.
Closes #10406 from jpedroantunes/bugfix/convert-replace-empty-char and squashes the following commits:
0e1ec000f <João Pedro> Fix behavior of the convert replace function on gandiva
Authored-by: João Pedro <jo...@simbioseventures.com>
Signed-off-by: Praveen <pr...@dremio.com>
---
cpp/src/gandiva/precompiled/string_ops.cc | 22 ++++++++++++++--------
cpp/src/gandiva/precompiled/string_ops_test.cc | 12 ++++++++++--
2 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 738ec36..1cd566d 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1243,10 +1243,7 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
const char* char_to_replace,
int32_t char_to_replace_len,
int32_t* out_len) {
- if (char_to_replace_len == 0) {
- *out_len = text_len;
- return text_in;
- } else if (char_to_replace_len != 1) {
+ if (char_to_replace_len > 1) {
gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
*out_len = 0;
return "";
@@ -1262,6 +1259,7 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
}
int32_t valid_bytes_to_cpy = 0;
int32_t out_byte_counter = 0;
+ int32_t in_byte_counter = 0;
int32_t char_len;
// scan the base text from left to right and increment the start pointer till
// looking for invalid chars to substitute
@@ -1273,9 +1271,15 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
// define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
char_len = 1;
// first copy the valid bytes until now and then replace the invalid character
- memcpy(ret + out_byte_counter, text_in + out_byte_counter, valid_bytes_to_cpy);
- ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
- out_byte_counter += valid_bytes_to_cpy + char_len;
+ memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
+ // if the replacement char is empty, the invalid char should be ignored
+ if (char_to_replace_len == 0) {
+ out_byte_counter += valid_bytes_to_cpy;
+ } else {
+ ret[out_byte_counter + valid_bytes_to_cpy] = char_to_replace[0];
+ out_byte_counter += valid_bytes_to_cpy + char_len;
+ }
+ in_byte_counter += valid_bytes_to_cpy + char_len;
valid_bytes_to_cpy = 0;
continue;
}
@@ -1285,8 +1289,10 @@ const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char*
if (out_byte_counter == 0) return text_in;
// if there are still valid bytes to copy, do it
if (valid_bytes_to_cpy != 0) {
- memcpy(ret + out_byte_counter, text_in + out_byte_counter, valid_bytes_to_cpy);
+ memcpy(ret + out_byte_counter, text_in + in_byte_counter, valid_bytes_to_cpy);
}
+ // the out length will be the out bytes copied + the missing end bytes copied
+ *out_len = valid_bytes_to_cpy + out_byte_counter;
return ret;
}
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 8ffaace..2460633 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -175,14 +175,22 @@ TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
EXPECT_TRUE(ctx.has_error());
ctx.Reset();
- // full valid utf8, but invalid replacement char length
+ // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
std::string f("ok-\xa0\xa1-valid");
auto f_in_out_len = static_cast<int>(f.length());
const char* f_str = convert_replace_invalid_fromUTF8_binary(
ctx_ptr, f.data(), f_in_out_len, "", 0, &f_in_out_len);
- EXPECT_EQ(std::string(f_str, f_in_out_len), "ok-\xa0\xa1-valid");
+ EXPECT_EQ(std::string(f_str, f_in_out_len), "ok--valid");
EXPECT_FALSE(ctx.has_error());
+ ctx.Reset();
+ // invalid utf8 (xa0 and xa1 are invalid) with empty replacement char length
+ std::string g("\xa0\xa1-ok-\xa0\xa1-valid-\xa0\xa1");
+ auto g_in_out_len = static_cast<int>(g.length());
+ const char* g_str = convert_replace_invalid_fromUTF8_binary(
+ ctx_ptr, g.data(), g_in_out_len, "", 0, &g_in_out_len);
+ EXPECT_EQ(std::string(g_str, g_in_out_len), "-ok--valid-");
+ EXPECT_FALSE(ctx.has_error());
ctx.Reset();
}