You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2022/04/27 11:49:13 UTC
[arrow] branch master updated: ARROW-16031: [C++][Gandiva] Fix Soundex errors generate
This is an automated email from the ASF dual-hosted git repository.
ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0933facc8e ARROW-16031: [C++][Gandiva] Fix Soundex errors generate
0933facc8e is described below
commit 0933facc8effd4984f28349ff85765a44bb171bf
Author: ViniciusSouzaRoque <vi...@dremio.com>
AuthorDate: Wed Apr 27 17:18:57 2022 +0530
ARROW-16031: [C++][Gandiva] Fix Soundex errors generate
Current Soundex Function presents errors in generating results for some specific cases, when we have a sequence of 2 equal numbers separated by 0 it ends up ignoring it, as it deletes the 0 before generating the entire Soundex
**Example with errors:**
Alice Ichabod -> Correct return is **A422** but the Soundex returned **A421**
Luke Garcia -> Correct return is **L226** but the Soundex returned **L262**
Closes #12716 from ViniciusSouzaRoque/fixbug/fix-soundex-return
Authored-by: ViniciusSouzaRoque <vi...@dremio.com>
Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
cpp/src/gandiva/function_registry_string.cc | 2 +-
cpp/src/gandiva/precompiled/string_ops.cc | 73 ++++++++++++------
cpp/src/gandiva/precompiled/string_ops_test.cc | 101 ++++++++++++++++++++++---
cpp/src/gandiva/precompiled/types.h | 4 +-
cpp/src/gandiva/tests/projector_test.cc | 14 ++--
5 files changed, 152 insertions(+), 42 deletions(-)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index a56d8f07d1..4890ec8838 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -84,7 +84,7 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "repeat_utf8_int32",
NativeFunction::kNeedsContext),
- NativeFunction("soundex", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+ NativeFunction("soundex", {}, DataTypeVector{utf8()}, utf8(), kResultNullInternal,
"soundex_utf8", NativeFunction::kNeedsContext),
NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 0ff3847cb7..628e6ba2de 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -2862,9 +2862,9 @@ static char mappings[] = {'0', '1', '2', '3', '0', '1', '2', '0', '0',
// the input string followed by a phonetic code. Characters that are not alphabetic are
// ignored. If expression evaluates to the null value, null is returned.
//
-// The soundex algorith works with the following steps:
+// The soundex algorithm works with the following steps:
// 1. Retain the first letter of the string and drop all other occurrences of a, e, i,
-// o, u, y, h, w.
+// o, u, y, h, w. (let's call them special letters)
// 2. Replace consonants with digits as follows (after the first letter):
// b, f, p, v → 1
// c, g, j, k, q, s, x, z → 2
@@ -2872,61 +2872,90 @@ static char mappings[] = {'0', '1', '2', '3', '0', '1', '2', '0', '0',
// l → 4
// m, n → 5
// r → 6
-// 3. If two or more letters with the same number are adjacent in the original name
-// (before step 1), only retain the first letter; also two letters with the same number
-// separated by 'h' or 'w' are coded as a single number, whereas such letters separated
-// by a vowel are coded twice. This rule also applies to the first letter.
+// 3. If two or more letters with the same number were adjacent in the original name
+// (before step 1), then omit all but the first. This rule also applies to the first
+// letter.
// 4. If the string have too few letters in the word that you can't assign three
// numbers, append with zeros until there are three numbers. If you have four or more
// numbers, retain only the first three.
FORCE_INLINE
-const char* soundex_utf8(gdv_int64 ctx, const char* in, gdv_int32 in_len,
- int32_t* out_len) {
+const char* soundex_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ bool in_validity, bool* out_valid, int32_t* out_len) {
if (in_len <= 0) {
+ *out_valid = true;
*out_len = 0;
return "";
}
// The soundex code is composed by one letter and three numbers
- char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, 4));
- if (ret == nullptr) {
- gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string");
+ char* soundex = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 4));
+
+ if (soundex == nullptr || ret == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+ *out_valid = false;
*out_len = 0;
return "";
}
int si = 1;
+ int ret_len = 1;
unsigned char c;
int start_idx = 0;
for (int i = 0; i < in_len; ++i) {
if (isalpha(in[i]) > 0) {
+ // Retain the first letter
ret[0] = toupper(in[i]);
start_idx = i + 1;
break;
}
}
- for (int i = start_idx, l = in_len; i < l; i++) {
+ // If ret[0] is not initialised, return validity false
+ if (start_idx == 0) {
+ *out_valid = false;
+ *out_len = 0;
+ return "";
+ }
+
+ soundex[0] = '\0';
+ // Replace consonants with digits and special letters with 0
+ for (int i = start_idx; i < in_len; i++) {
if (isalpha(in[i]) > 0) {
c = toupper(in[i]) - 65;
- if (mappings[c] != '0') {
- if (mappings[c] != ret[si - 1]) {
- ret[si] = mappings[c];
- si++;
- }
+ if (mappings[c] != soundex[si - 1]) {
+ soundex[si] = mappings[c];
+ si++;
+ }
+ }
+ }
+
+ int i = 1;
+ // If the saved letter's digit is the same as the resulting first digit, skip it
+ if (si > 1) {
+ if (soundex[1] == mappings[ret[0] - 65]) {
+ i = 2;
+ }
- if (si > 3) break;
+ for (; i < si; i++) {
+ // If it is a special letter, we ignore, because it has been dropped in first step
+ if (soundex[i] != '0') {
+ ret[ret_len] = soundex[i];
+ ret_len++;
}
+ if (ret_len > 3) break;
}
}
- if (si <= 3) {
- while (si <= 3) {
- ret[si] = '0';
- si++;
+ // If the return have too few numbers, append with zeros until there are three
+ if (ret_len <= 3) {
+ while (ret_len <= 3) {
+ ret[ret_len] = '0';
+ ret_len++;
}
}
+ *out_valid = true;
*out_len = 4;
return ret;
}
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index acc570d571..08a6ca9951 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -2383,45 +2383,122 @@ TEST(TestStringOps, TestFromHex) {
EXPECT_EQ(output, "");
EXPECT_EQ(out_valid, false);
}
+
TEST(TestStringOps, TestSoundex) {
gandiva::ExecutionContext ctx;
auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
int32_t out_len = 0;
+ bool validity = false;
const char* out;
- out = soundex_utf8(ctx_ptr, "Miller", 6, &out_len);
+ out = soundex_utf8(ctx_ptr, "123456789", 9, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "");
+ EXPECT_EQ(validity, false);
+
+ out = soundex_utf8(ctx_ptr, "a", 1, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A000");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "123456789a", 10, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A000");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "a123456789", 10, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A000");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "robert", 6, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "R163");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "r-O-b-E-r-T", 11, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "R163");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Robert", 6, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "R163");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Rupert", 6, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "R163");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Honeyman", 8, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "H555");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Tymczak", 7, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "T522");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Ashcraft", 8, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A226");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Ashcroft", 8, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A226");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Jjjice", 6, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "J200");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Luke Garcia", 11, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "L226");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "123 321 Luke 987 Gar4cia", 24, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "L226");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Alice Ichabod", 13, true, &validity, &out_len);
+ EXPECT_EQ(std::string(out, out_len), "A422");
+ EXPECT_EQ(validity, true);
+
+ out = soundex_utf8(ctx_ptr, "Miller", 6, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "M460");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "3Miller", 7, &out_len);
+ out = soundex_utf8(ctx_ptr, "3Miller", 7, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "M460");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "Mill3r", 6, &out_len);
+ out = soundex_utf8(ctx_ptr, "Mill3r", 6, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "M460");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "abc", 3, &out_len);
+ out = soundex_utf8(ctx_ptr, "abc", 3, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "A120");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "123abc", 6, &out_len);
+ out = soundex_utf8(ctx_ptr, "123abc", 6, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "A120");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "test", 4, &out_len);
+ out = soundex_utf8(ctx_ptr, "test", 4, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "T230");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "", 0, &out_len);
+ out = soundex_utf8(ctx_ptr, "", 0, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "Elvis", 5, &out_len);
+ out = soundex_utf8(ctx_ptr, "Elvis", 5, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "E412");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "waterloo", 8, &out_len);
+ out = soundex_utf8(ctx_ptr, "waterloo", 8, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "W364");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "eowolf", 6, &out_len);
+ out = soundex_utf8(ctx_ptr, "eowolf", 6, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), "E410");
+ EXPECT_EQ(validity, true);
- out = soundex_utf8(ctx_ptr, "Smith", 5, &out_len);
- auto out2 = soundex_utf8(ctx_ptr, "Smythe", 6, &out_len);
+ out = soundex_utf8(ctx_ptr, "Smith", 5, true, &validity, &out_len);
+ auto out2 = soundex_utf8(ctx_ptr, "Smythe", 6, true, &validity, &out_len);
EXPECT_EQ(std::string(out, out_len), std::string(out2, out_len));
+ EXPECT_EQ(validity, true);
}
TEST(TestStringOps, TestInstr) {
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index f3bd51ae80..fde23cdd02 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -686,8 +686,8 @@ const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
gdv_int32 text_len, gdv_int32 offset,
gdv_int32 length, gdv_int32* out_len);
-const char* soundex_utf8(gdv_int64 ctx, const char* in, gdv_int32 in_len,
- int32_t* out_len);
+const char* soundex_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ bool in_validity, bool* out_valid, int32_t* out_len);
const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
gdv_int64 out_len, gdv_int32* out_length);
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index 98b422d7fa..86c35af4bf 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -835,12 +835,16 @@ TEST_F(TestProjector, TestSoundex) {
EXPECT_TRUE(status.ok()) << status.message();
// Create a row-batch with some sample data
- int num_records = 4;
- auto array0 =
- MakeArrowArrayUtf8({"test", "", "Miller", "abc"}, {true, true, true, true});
+ int num_records = 11;
+ auto array0 = MakeArrowArrayUtf8(
+ {"test", "", "Miller", "abc", "democrat", "luke garcia", "alice ichabod", "Jjjice",
+ "SACHS", "路-大学b路%$大", "a"},
+ {true, true, true, true, true, true, true, true, true, true, true});
// expected output
- auto exp_soundex =
- MakeArrowArrayUtf8({"T230", "", "M460", "A120"}, {true, true, true, true});
+ auto exp_soundex = MakeArrowArrayUtf8(
+ {"T230", "", "M460", "A120", "D526", "L226", "A422", "J200", "S220", "B000",
+ "A000"},
+ {true, true, true, true, true, true, true, true, true, true, true});
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});