You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/13 18:37:18 UTC

[impala] 02/02: IMPALA-8752: Added Jaro-Winkler edit distance and similarity built-in function

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8db7f27ddde226f3efd3bddcc00665d0d9b99ef0
Author: luksan47 <no...@gmail.com>
AuthorDate: Wed Jul 17 02:17:20 2019 -0700

    IMPALA-8752: Added Jaro-Winkler edit distance and similarity built-in function
    
    The added functions return the Jaro/Jaro-Winkler similarity/distance
    of two strings. The algorithm calculates the Jaro similarity of the
    strings, then (for Jaro-Winkler) adds more weight to the result if
    the strings share a common prefix.
    For more detail, see:
    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    
    Extended the algorithm with another optional parameter: boost threshold.
    The prefix weight will only be applied if the Jaro similarity
    exceeds the given threshold. By default, its value is 0.7.
    
    The new built-in functions are:
     * jaro_distance, jaro_dst
     * jaro_similarity, jaro_sim
     * jaro_winkler_distance, jw_dst
     * jaro_winkler_similarity, jw_sim
    
    Testing:
     * Added unit tests to expr-test.cc
     * Manual testing over 1400 word pairs from
       http://marvin.cs.uidaho.edu/misspell.html
       Results match Apache commons
    
    Change-Id: I64d7f461516c5e66cc27d62612bc8cc0e8f0178c
    Reviewed-on: http://gerrit.cloudera.org:8080/13870
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/expr-test.cc                    | 108 +++++++++++++++++
 be/src/exprs/string-functions-ir.cc          | 173 +++++++++++++++++++++++++++
 be/src/exprs/string-functions.h              |  26 ++++
 common/function-registry/impala_functions.py |  16 +++
 4 files changed, 323 insertions(+)

diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 9cb4899..be80c2b 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -4013,6 +4013,114 @@ TEST_P(ExprTest, StringFunctions) {
   TestErrorString("le_dst(repeat('x', 256), 'z')",
       "levenshtein argument exceeds maximum length of 255 characters\n");
 
+  for (const string fn_name: { "jaro_dst", "jaro_distance" }) {
+    TestIsNull(fn_name + "('foo', NULL)", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, 'foo')", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, NULL)", TYPE_DOUBLE);
+    TestValue(fn_name + "('foo', 'foo')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('foo', 'bar')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('', '')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('', 'jaro')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('jaro', '')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('crate', 'trace')", TYPE_DOUBLE, 0.2666666666666666);
+    TestValue(fn_name + "('dwayne', 'duane')", TYPE_DOUBLE, 0.1777777777777778);
+    TestValue(fn_name + "('martha', 'marhta')", TYPE_DOUBLE, 0.05555555555555558);
+    TestValue(fn_name + "('frog', 'fog')", TYPE_DOUBLE, 0.08333333333333337);
+    TestValue(fn_name + "('hello', 'haloa')", TYPE_DOUBLE, 0.2666666666666666);
+    TestValue(fn_name + "('atcg', 'tagc')", TYPE_DOUBLE, 0.1666666666666667);
+    TestErrorString(fn_name + "('z', repeat('x', 256))",
+        "jaro argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "(repeat('x', 256), 'z')",
+        "jaro argument exceeds maximum length of 255 characters\n");
+  }
+
+  for (const string fn_name: { "jaro_sim", "jaro_similarity" }) {
+    TestIsNull(fn_name + "('foo', NULL)", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, 'foo')", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, NULL)", TYPE_DOUBLE);
+    TestValue(fn_name + "('foo', 'foo')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('foo', 'bar')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('', '')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('', 'jaro')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('jaro', '')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('crate', 'trace')", TYPE_DOUBLE, 0.7333333333333334);
+    TestValue(fn_name + "('dwayne', 'duane')", TYPE_DOUBLE, 0.82222222222222222);
+    TestValue(fn_name + "('martha', 'marhta')", TYPE_DOUBLE, 0.944444444444444444);
+    TestValue(fn_name + "('frog', 'fog')", TYPE_DOUBLE, 0.9166666666666666);
+    TestValue(fn_name + "('hello', 'haloa')", TYPE_DOUBLE, 0.73333333333333334);
+    TestValue(fn_name + "('atcg', 'tagc')", TYPE_DOUBLE, 0.8333333333333333);
+    TestErrorString(fn_name + "('z', repeat('x', 256))",
+        "jaro argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "(repeat('x', 256), 'z')",
+        "jaro argument exceeds maximum length of 255 characters\n");
+  }
+
+  for (const string fn_name: { "jaro_winkler_distance", "jw_dst" }) {
+    TestIsNull(fn_name + "('foo', NULL)", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, 'foo')", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, NULL)", TYPE_DOUBLE);
+    TestValue(fn_name + "('foo', 'foo')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('foo', 'bar')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('', '')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('', 'jaro')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('jaro', '')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('crate', 'trace')", TYPE_DOUBLE, 0.2666666666666666);
+    TestValue(fn_name + "('crate', 'trace', 0.2)", TYPE_DOUBLE, 0.2666666666666666);
+    TestValue(fn_name + "('dwayne', 'duane')", TYPE_DOUBLE, 0.16);
+    TestValue(fn_name + "('martha', 'marhta', 0.0)", TYPE_DOUBLE, 0.05555555555555558);
+    TestValue(fn_name + "('martha', 'marhta')", TYPE_DOUBLE, 0.03888888888888886);
+    TestValue(fn_name + "('martha', 'marhta', 0.2)", TYPE_DOUBLE, 0.02222222222222225);
+    TestValue(fn_name + "('atcg', 'tagc')", TYPE_DOUBLE, 0.1666666666666667);
+    TestValue(fn_name + "('martha', 'marhta', 0.1, 0.99)", TYPE_DOUBLE,
+        0.05555555555555558);
+    TestValue(fn_name + "('dwayne', 'duane', 0.1, 0.9)", TYPE_DOUBLE, 0.1777777777777778);
+    TestErrorString(fn_name + "('z', repeat('x', 256))",
+        "jaro-winkler argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "(repeat('x', 256), 'z')",
+        "jaro-winkler argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.26)",
+        "jaro-winkler scaling factor values can range between 0.0 and 0.25\n");
+    TestErrorString(fn_name + "('foo', 'bar', -0.01)",
+        "jaro-winkler scaling factor values can range between 0.0 and 0.25\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.1, -0.01)",
+        "jaro-winkler boost threshold values can range between 0.0 and 1.0\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.1, 1.01)",
+        "jaro-winkler boost threshold values can range between 0.0 and 1.0\n");
+  }
+  for (const string fn_name: { "jaro_winkler_similarity", "jw_sim"}) {
+    TestIsNull(fn_name + "('foo', NULL)", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, 'foo')", TYPE_DOUBLE);
+    TestIsNull(fn_name + "(NULL, NULL)", TYPE_DOUBLE);
+    TestValue(fn_name + "('foo', 'foo')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('foo', 'bar')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('', '')", TYPE_DOUBLE, 1.0);
+    TestValue(fn_name + "('', 'jaro')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('jaro', '')", TYPE_DOUBLE, 0.0);
+    TestValue(fn_name + "('crate', 'trace')", TYPE_DOUBLE, 0.7333333333333334);
+    TestValue(fn_name + "('crate', 'trace', 0.2)", TYPE_DOUBLE, 0.7333333333333334);
+    TestValue(fn_name + "('dwayne', 'duane')", TYPE_DOUBLE, 0.84);
+    TestValue(fn_name + "('martha', 'marhta', 0.0)", TYPE_DOUBLE, 0.94444444444444442);
+    TestValue(fn_name + "('martha', 'marhta', 0.1)", TYPE_DOUBLE, 0.96111111111111111);
+    TestValue(fn_name + "('martha', 'marhta', 0.2)", TYPE_DOUBLE, 0.97777777777777777);
+    TestValue(fn_name + "('atcg', 'tagc')", TYPE_DOUBLE, 0.8333333333333333);;
+    TestValue(fn_name + "('martha', 'marhta', 0.1, 0.99)", TYPE_DOUBLE,
+        0.94444444444444442);
+    TestValue(fn_name + "('dwayne', 'duane', 0.1, 0.9)", TYPE_DOUBLE,
+        0.82222222222222222);
+    TestErrorString(fn_name + "('z', repeat('x', 256))",
+        "jaro-winkler argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "(repeat('x', 256), 'z')",
+        "jaro-winkler argument exceeds maximum length of 255 characters\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.26)",
+        "jaro-winkler scaling factor values can range between 0.0 and 0.25\n");
+    TestErrorString(fn_name + "('foo', 'bar', -0.01)",
+        "jaro-winkler scaling factor values can range between 0.0 and 0.25\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.1, -0.01)",
+        "jaro-winkler boost threshold values can range between 0.0 and 1.0\n");
+    TestErrorString(fn_name + "('foo', 'bar', 0.1, 1.01)",
+        "jaro-winkler boost threshold values can range between 0.0 and 1.0\n");
+  }
+
   TestStringValue("substring('Hello', 1)", "Hello");
   TestStringValue("substring('Hello', -2)", "lo");
   TestStringValue("substring('Hello', cast(0 as bigint))", "");
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 67b7ce1..5606fcb 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -1167,4 +1167,177 @@ IntVal StringFunctions::Levenshtein(
 
   return IntVal(result);
 }
+
+// Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+// Implements Jaro similarity
+DoubleVal StringFunctions::JaroSimilarity(
+    FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
+
+  int s1len = s1.len;
+  int s2len = s2.len;
+
+  // error if either input exceeds 255 characters
+  if (s1len > 255 || s2len > 255) {
+    ctx->SetError("jaro argument exceeds maximum length of 255 characters");
+    return DoubleVal(-1.0);
+  }
+
+  // short cut cases:
+  // - null strings
+  // - zero length strings
+  // - identical length and value strings
+  if (s1.is_null || s2.is_null) return DoubleVal::null();
+  if (s1len == 0 && s2len == 0) return DoubleVal(1.0);
+  if (s1len == 0 || s2len == 0) return DoubleVal(0.0);
+  if (s1len == s2len && memcmp(s1.ptr, s2.ptr, s1len) == 0) return DoubleVal(1.0);
+
+  // the window size to search for matches in the other string
+  int max_range = std::max(0, std::max(s1len, s2len) / 2 - 1);
+
+  int s1_matching[s1len];
+  int s2_matching[s2len];
+  std::fill_n(s1_matching, s1len, -1);
+  std::fill_n(s2_matching, s2len, -1);
+
+  // calculate matching characters
+  int matching_characters = 0;
+  for (int i = 0; i < s1len; i++) {
+    // matching window
+    int min_index = std::max(i - max_range, 0);
+    int max_index = std::min(i + max_range + 1, s2len);
+    if (min_index >= max_index) break;
+
+    for (int j = min_index; j < max_index; j++) {
+      if (s2_matching[j] == -1 && s1.ptr[i] == s2.ptr[j]) {
+        s1_matching[i] = i;
+        s2_matching[j] = j;
+        matching_characters++;
+        break;
+      }
+    }
+  }
+
+  if (matching_characters == 0) return DoubleVal(0.0);
+
+  // transpositions (one-way only)
+  double transpositions = 0.0;
+  for (int i = 0, s1i = 0, s2i = 0; i < matching_characters; i++) {
+    while (s1_matching[s1i] == -1) {
+      s1i++;
+    }
+    while (s2_matching[s2i] == -1) {
+      s2i++;
+    }
+    if (s1.ptr[s1i] != s2.ptr[s2i]) {
+      transpositions += 0.5;
+    }
+    s1i++;
+    s2i++;
+  }
+  double m = static_cast<double>(matching_characters);
+  double jaro_similarity = 1.0 / 3.0  * ( m / static_cast<double>(s1len)
+                                        + m / static_cast<double>(s2len)
+                                        + (m - transpositions) / m );
+
+  return DoubleVal(jaro_similarity);
+}
+
+DoubleVal StringFunctions::JaroDistance(
+    FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
+
+  DoubleVal jaro_similarity = StringFunctions::JaroSimilarity(ctx, s1, s2);
+  if (jaro_similarity.is_null) return DoubleVal::null();
+  if (jaro_similarity.val == -1.0) return DoubleVal(-1.0);
+  return DoubleVal(1.0 - jaro_similarity.val);
+}
+
+DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2) {
+  return StringFunctions::JaroWinklerDistance(ctx, s1, s2,
+    DoubleVal(0.1), DoubleVal(0.7));
+}
+
+DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2,
+      const DoubleVal& scaling_factor) {
+  return StringFunctions::JaroWinklerDistance(ctx, s1, s2,
+    scaling_factor, DoubleVal(0.7));
+}
+
+// Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+// Implements Jaro-Winkler distance
+// Extended with boost_threshold: Winkler's modification only applies if Jaro exceeds it
+DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2,
+      const DoubleVal& scaling_factor, const DoubleVal& boost_threshold) {
+
+  DoubleVal jaro_winkler_similarity = StringFunctions::JaroWinklerSimilarity(
+    ctx, s1, s2, scaling_factor, boost_threshold);
+
+  if (jaro_winkler_similarity.is_null) return DoubleVal::null();
+  if (jaro_winkler_similarity.val == -1.0) return DoubleVal(-1.0);
+  return DoubleVal(1.0 - jaro_winkler_similarity.val);
+}
+
+DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2) {
+  return StringFunctions::JaroWinklerSimilarity(ctx, s1, s2,
+    DoubleVal(0.1), DoubleVal(0.7));
+}
+
+DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2,
+      const DoubleVal& scaling_factor) {
+  return StringFunctions::JaroWinklerSimilarity(ctx, s1, s2,
+    scaling_factor, DoubleVal(0.7));
+}
+
+// Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+// Implements Jaro-Winkler similarity
+// Extended with boost_threshold: Winkler's modification only applies if Jaro exceeds it
+DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
+      const StringVal& s1, const StringVal& s2,
+      const DoubleVal& scaling_factor, const DoubleVal& boost_threshold) {
+
+  constexpr int MAX_PREFIX_LENGTH = 4;
+  int s1len = s1.len;
+  int s2len = s2.len;
+
+  // error if either input exceeds 255 characters
+  if (s1len > 255 || s2len > 255) {
+    ctx->SetError("jaro-winkler argument exceeds maximum length of 255 characters");
+    return DoubleVal(-1.0);
+  }
+  // scaling factor has to be between 0.0 and 0.25
+  if (scaling_factor.val < 0.0 || scaling_factor.val > 0.25) {
+    ctx->SetError("jaro-winkler scaling factor values can range between 0.0 and 0.25");
+    return DoubleVal(-1.0);
+  }
+  // error if boost threshold is out of range 0.0..1.0
+  if (boost_threshold.val < 0.0 || boost_threshold.val > 1.0) {
+    ctx->SetError("jaro-winkler boost threshold values can range between 0.0 and 1.0");
+    return DoubleVal(-1.0);
+  }
+
+  if (s1.is_null || s2.is_null) return DoubleVal::null();
+
+  DoubleVal jaro_similarity = StringFunctions::JaroSimilarity(ctx, s1, s2);
+  if (jaro_similarity.is_null) return DoubleVal::null();
+  if (jaro_similarity.val == -1.0) return DoubleVal(-1.0);
+
+  double jaro_winkler_similarity = jaro_similarity.val;
+
+  if (jaro_similarity.val > boost_threshold.val) {
+    int common_length = std::min(MAX_PREFIX_LENGTH, std::min(s1len, s2len));
+    int common_prefix = 0;
+    while (common_prefix < common_length &&
+           s1.ptr[common_prefix] == s2.ptr[common_prefix]) {
+      common_prefix++;
+    }
+
+    jaro_winkler_similarity += common_prefix * scaling_factor.val *
+      (1.0 - jaro_similarity.val);
+  }
+  return DoubleVal(jaro_winkler_similarity);
+}
 }
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 84ee595..8386461 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -158,6 +158,32 @@ class StringFunctions {
   static IntVal Levenshtein(
       FunctionContext* context, const StringVal& s1, const StringVal& s2);
 
+  static DoubleVal JaroDistance(
+      FunctionContext* ctx, const StringVal& s1, const StringVal& s2);
+
+  static DoubleVal JaroSimilarity(
+      FunctionContext* ctx, const StringVal& s1, const StringVal& s2);
+
+  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2);
+
+  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2, const DoubleVal& scaling_factor);
+
+  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2, const DoubleVal& scaling_factor,
+      const DoubleVal& boost_threshold);
+
+  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2);
+
+  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2, const DoubleVal& scaling_factor);
+
+  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
+      const StringVal& s2, const DoubleVal& scaling_factor,
+      const DoubleVal& boost_threshold);
+
  private:
   /// Templatized implementation of the actual string trimming function.
   /// The first parameter, 'D', is one of StringFunctions::TrimPosition values.
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 06bf8ce..d7d1ceb 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -586,6 +586,22 @@ visible_functions = [
    'impala::StringFunctions::GetJsonObject'],
   [['levenshtein', 'le_dst'], 'INT', ['STRING', 'STRING'],
    '_ZN6impala15StringFunctions11LevenshteinEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
+  [['jaro_distance', 'jaro_dst'], 'DOUBLE', ['STRING', 'STRING'],
+   '_ZN6impala15StringFunctions12JaroDistanceEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
+  [['jaro_similarity', 'jaro_sim'], 'DOUBLE', ['STRING', 'STRING'],
+   '_ZN6impala15StringFunctions14JaroSimilarityEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
+  [['jaro_winkler_distance', 'jw_dst'], 'DOUBLE', ['STRING', 'STRING'],
+   '_ZN6impala15StringFunctions19JaroWinklerDistanceEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
+  [['jaro_winkler_distance', 'jw_dst'], 'DOUBLE', ['STRING', 'STRING', 'DOUBLE'],
+   '_ZN6impala15StringFunctions19JaroWinklerDistanceEPN10impala_udf15FunctionContextERKNS1_9StringValES6_RKNS1_9DoubleValE'],
+  [['jaro_winkler_distance', 'jw_dst'], 'DOUBLE', ['STRING', 'STRING', 'DOUBLE', 'DOUBLE'],
+   '_ZN6impala15StringFunctions19JaroWinklerDistanceEPN10impala_udf15FunctionContextERKNS1_9StringValES6_RKNS1_9DoubleValES9_'],
+  [['jaro_winkler_similarity', 'jw_sim'], 'DOUBLE', ['STRING', 'STRING'],
+   '_ZN6impala15StringFunctions21JaroWinklerSimilarityEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
+  [['jaro_winkler_similarity', 'jw_sim'], 'DOUBLE', ['STRING', 'STRING', 'DOUBLE'],
+   '_ZN6impala15StringFunctions21JaroWinklerSimilarityEPN10impala_udf15FunctionContextERKNS1_9StringValES6_RKNS1_9DoubleValE'],
+  [['jaro_winkler_similarity', 'jw_sim'], 'DOUBLE', ['STRING', 'STRING', 'DOUBLE', 'DOUBLE'],
+   '_ZN6impala15StringFunctions21JaroWinklerSimilarityEPN10impala_udf15FunctionContextERKNS1_9StringValES6_RKNS1_9DoubleValES9_'],
 
   # Conditional Functions
   # Some of these have empty symbols because the BE special-cases them based on the