You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2022/04/27 11:49:13 UTC

[arrow] branch master updated: ARROW-16031: [C++][Gandiva] Fix Soundex errors generate

This is an automated email from the ASF dual-hosted git repository.

ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0933facc8e ARROW-16031: [C++][Gandiva] Fix Soundex errors generate
0933facc8e is described below

commit 0933facc8effd4984f28349ff85765a44bb171bf
Author: ViniciusSouzaRoque <vi...@dremio.com>
AuthorDate: Wed Apr 27 17:18:57 2022 +0530

    ARROW-16031: [C++][Gandiva] Fix Soundex errors generate
    
    Current Soundex Function presents errors in generating results for some specific cases, when we have a sequence of 2 equal numbers separated by 0 it ends up ignoring it, as it deletes the 0 before generating the entire Soundex
    
    **Example with errors:**
    
    Alice Ichabod   -> Correct return is **A422** but the Soundex returned **A421**
    
    Luke Garcia       -> Correct return is **L226** but the Soundex returned  **L262**
    
    Closes #12716 from ViniciusSouzaRoque/fixbug/fix-soundex-return
    
    Authored-by: ViniciusSouzaRoque <vi...@dremio.com>
    Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
 cpp/src/gandiva/function_registry_string.cc    |   2 +-
 cpp/src/gandiva/precompiled/string_ops.cc      |  73 ++++++++++++------
 cpp/src/gandiva/precompiled/string_ops_test.cc | 101 ++++++++++++++++++++++---
 cpp/src/gandiva/precompiled/types.h            |   4 +-
 cpp/src/gandiva/tests/projector_test.cc        |  14 ++--
 5 files changed, 152 insertions(+), 42 deletions(-)

diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index a56d8f07d1..4890ec8838 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -84,7 +84,7 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      kResultNullIfNull, "repeat_utf8_int32",
                      NativeFunction::kNeedsContext),
 
-      NativeFunction("soundex", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+      NativeFunction("soundex", {}, DataTypeVector{utf8()}, utf8(), kResultNullInternal,
                      "soundex_utf8", NativeFunction::kNeedsContext),
 
       NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 0ff3847cb7..628e6ba2de 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -2862,9 +2862,9 @@ static char mappings[] = {'0', '1', '2', '3', '0', '1', '2', '0', '0',
 // the input string followed by a phonetic code. Characters that are not alphabetic are
 // ignored. If expression evaluates to the null value, null is returned.
 //
-// The soundex algorith works with the following steps:
+// The soundex algorithm works with the following steps:
 //    1. Retain the first letter of the string and drop all other occurrences of a, e, i,
-//    o, u, y, h, w.
+//    o, u, y, h, w. (let's call them special letters)
 //    2. Replace consonants with digits as follows (after the first letter):
 //        b, f, p, v → 1
 //        c, g, j, k, q, s, x, z → 2
@@ -2872,61 +2872,90 @@ static char mappings[] = {'0', '1', '2', '3', '0', '1', '2', '0', '0',
 //        l → 4
 //        m, n → 5
 //        r → 6
-//    3. If two or more letters with the same number are adjacent in the original name
-//    (before step 1), only retain the first letter; also two letters with the same number
-//    separated by 'h' or 'w' are coded as a single number, whereas such letters separated
-//    by a vowel are coded twice. This rule also applies to the first letter.
+//    3. If two or more letters with the same number were adjacent in the original name
+//    (before step 1), then omit all but the first. This rule also applies to the first
+//    letter.
 //    4. If the string have too few letters in the word that you can't assign three
 //    numbers, append with zeros until there are three numbers. If you have four or more
 //    numbers, retain only the first three.
 FORCE_INLINE
-const char* soundex_utf8(gdv_int64 ctx, const char* in, gdv_int32 in_len,
-                         int32_t* out_len) {
+const char* soundex_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+                         bool in_validity, bool* out_valid, int32_t* out_len) {
   if (in_len <= 0) {
+    *out_valid = true;
     *out_len = 0;
     return "";
   }
 
   // The soundex code is composed by one letter and three numbers
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, 4));
-  if (ret == nullptr) {
-    gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string");
+  char* soundex = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 4));
+
+  if (soundex == nullptr || ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_valid = false;
     *out_len = 0;
     return "";
   }
 
   int si = 1;
+  int ret_len = 1;
   unsigned char c;
 
   int start_idx = 0;
   for (int i = 0; i < in_len; ++i) {
     if (isalpha(in[i]) > 0) {
+      // Retain the first letter
       ret[0] = toupper(in[i]);
       start_idx = i + 1;
       break;
     }
   }
 
-  for (int i = start_idx, l = in_len; i < l; i++) {
+  // If ret[0] is not initialised, return validity false
+  if (start_idx == 0) {
+    *out_valid = false;
+    *out_len = 0;
+    return "";
+  }
+
+  soundex[0] = '\0';
+  // Replace consonants with digits and special letters with 0
+  for (int i = start_idx; i < in_len; i++) {
     if (isalpha(in[i]) > 0) {
       c = toupper(in[i]) - 65;
-      if (mappings[c] != '0') {
-        if (mappings[c] != ret[si - 1]) {
-          ret[si] = mappings[c];
-          si++;
-        }
+      if (mappings[c] != soundex[si - 1]) {
+        soundex[si] = mappings[c];
+        si++;
+      }
+    }
+  }
+
+  int i = 1;
+  // If the saved letter's digit is the same as the resulting first digit, skip it
+  if (si > 1) {
+    if (soundex[1] == mappings[ret[0] - 65]) {
+      i = 2;
+    }
 
-        if (si > 3) break;
+    for (; i < si; i++) {
+      // If it is a special letter, we ignore, because it has been dropped in first step
+      if (soundex[i] != '0') {
+        ret[ret_len] = soundex[i];
+        ret_len++;
       }
+      if (ret_len > 3) break;
     }
   }
 
-  if (si <= 3) {
-    while (si <= 3) {
-      ret[si] = '0';
-      si++;
+  // If the return have too few numbers, append with zeros until there are three
+  if (ret_len <= 3) {
+    while (ret_len <= 3) {
+      ret[ret_len] = '0';
+      ret_len++;
     }
   }
+  *out_valid = true;
   *out_len = 4;
   return ret;
 }
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index acc570d571..08a6ca9951 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -2383,45 +2383,122 @@ TEST(TestStringOps, TestFromHex) {
   EXPECT_EQ(output, "");
   EXPECT_EQ(out_valid, false);
 }
+
 TEST(TestStringOps, TestSoundex) {
   gandiva::ExecutionContext ctx;
   auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
   int32_t out_len = 0;
+  bool validity = false;
   const char* out;
 
-  out = soundex_utf8(ctx_ptr, "Miller", 6, &out_len);
+  out = soundex_utf8(ctx_ptr, "123456789", 9, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "");
+  EXPECT_EQ(validity, false);
+
+  out = soundex_utf8(ctx_ptr, "a", 1, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A000");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "123456789a", 10, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A000");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "a123456789", 10, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A000");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "robert", 6, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "R163");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "r-O-b-E-r-T", 11, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "R163");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Robert", 6, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "R163");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Rupert", 6, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "R163");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Honeyman", 8, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "H555");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Tymczak", 7, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "T522");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Ashcraft", 8, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A226");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Ashcroft", 8, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A226");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Jjjice", 6, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "J200");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Luke Garcia", 11, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "L226");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "123 321 Luke 987 Gar4cia", 24, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "L226");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Alice Ichabod", 13, true, &validity, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "A422");
+  EXPECT_EQ(validity, true);
+
+  out = soundex_utf8(ctx_ptr, "Miller", 6, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "M460");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "3Miller", 7, &out_len);
+  out = soundex_utf8(ctx_ptr, "3Miller", 7, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "M460");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "Mill3r", 6, &out_len);
+  out = soundex_utf8(ctx_ptr, "Mill3r", 6, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "M460");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "abc", 3, &out_len);
+  out = soundex_utf8(ctx_ptr, "abc", 3, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "A120");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "123abc", 6, &out_len);
+  out = soundex_utf8(ctx_ptr, "123abc", 6, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "A120");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "test", 4, &out_len);
+  out = soundex_utf8(ctx_ptr, "test", 4, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "T230");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "", 0, &out_len);
+  out = soundex_utf8(ctx_ptr, "", 0, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "Elvis", 5, &out_len);
+  out = soundex_utf8(ctx_ptr, "Elvis", 5, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "E412");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "waterloo", 8, &out_len);
+  out = soundex_utf8(ctx_ptr, "waterloo", 8, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "W364");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "eowolf", 6, &out_len);
+  out = soundex_utf8(ctx_ptr, "eowolf", 6, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), "E410");
+  EXPECT_EQ(validity, true);
 
-  out = soundex_utf8(ctx_ptr, "Smith", 5, &out_len);
-  auto out2 = soundex_utf8(ctx_ptr, "Smythe", 6, &out_len);
+  out = soundex_utf8(ctx_ptr, "Smith", 5, true, &validity, &out_len);
+  auto out2 = soundex_utf8(ctx_ptr, "Smythe", 6, true, &validity, &out_len);
   EXPECT_EQ(std::string(out, out_len), std::string(out2, out_len));
+  EXPECT_EQ(validity, true);
 }
 
 TEST(TestStringOps, TestInstr) {
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index f3bd51ae80..fde23cdd02 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -686,8 +686,8 @@ const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
                                            gdv_int32 text_len, gdv_int32 offset,
                                            gdv_int32 length, gdv_int32* out_len);
 
-const char* soundex_utf8(gdv_int64 ctx, const char* in, gdv_int32 in_len,
-                         int32_t* out_len);
+const char* soundex_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+                         bool in_validity, bool* out_valid, int32_t* out_len);
 
 const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
                                    gdv_int64 out_len, gdv_int32* out_length);
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index 98b422d7fa..86c35af4bf 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -835,12 +835,16 @@ TEST_F(TestProjector, TestSoundex) {
   EXPECT_TRUE(status.ok()) << status.message();
 
   // Create a row-batch with some sample data
-  int num_records = 4;
-  auto array0 =
-      MakeArrowArrayUtf8({"test", "", "Miller", "abc"}, {true, true, true, true});
+  int num_records = 11;
+  auto array0 = MakeArrowArrayUtf8(
+      {"test", "", "Miller", "abc", "democrat", "luke garcia", "alice ichabod", "Jjjice",
+       "SACHS", "路-大学b路%$大", "a"},
+      {true, true, true, true, true, true, true, true, true, true, true});
   // expected output
-  auto exp_soundex =
-      MakeArrowArrayUtf8({"T230", "", "M460", "A120"}, {true, true, true, true});
+  auto exp_soundex = MakeArrowArrayUtf8(
+      {"T230", "", "M460", "A120", "D526", "L226", "A422", "J200", "S220", "B000",
+       "A000"},
+      {true, true, true, true, true, true, true, true, true, true, true});
 
   // prepare input record batch
   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});