You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@calcite.apache.org by ch...@apache.org on 2019/05/17 04:26:48 UTC

[calcite] branch master updated: [CALCITE-2965] Implement string functions: REPEAT, SPACE, SOUNDEX, DIFFERENCE

This is an automated email from the ASF dual-hosted git repository.

chunwei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/master by this push:
     new 4d04773  [CALCITE-2965] Implement string functions: REPEAT, SPACE, SOUNDEX, DIFFERENCE
4d04773 is described below

commit 4d04773054c5bda3b43c3dbeb2be01df19cbe3e3
Author: Chunwei Lei <ch...@alibaba-inc.com>
AuthorDate: Fri Mar 15 21:55:40 2019 +0800

    [CALCITE-2965] Implement string functions: REPEAT, SPACE, SOUNDEX, DIFFERENCE
---
 core/pom.xml                                       |  4 ++
 .../calcite/adapter/enumerable/RexImpTable.java    |  8 +++
 .../org/apache/calcite/runtime/SqlFunctions.java   | 38 ++++++++++++
 .../calcite/sql/fun/SqlLibraryOperators.java       | 40 +++++++++++++
 .../org/apache/calcite/sql/type/OperandTypes.java  |  3 +
 .../org/apache/calcite/sql/type/ReturnTypes.java   | 21 +++++++
 .../org/apache/calcite/util/BuiltInMethod.java     |  4 ++
 .../calcite/sql/test/SqlOperatorBaseTest.java      | 69 ++++++++++++++++++++++
 pom.xml                                            |  6 ++
 site/_docs/reference.md                            | 10 ++--
 10 files changed, 198 insertions(+), 5 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index a8fbb69..0502bbb 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -38,6 +38,10 @@ limitations under the License.
     <!-- Sorted by groupId, artifactId; calcite dependencies first. Put versions
          in dependencyManagement in the root POM, not here. -->
     <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
+    <dependency>
       <groupId>org.apache.calcite.avatica</groupId>
       <artifactId>avatica-core</artifactId>
     </dependency>
diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index f1fe432..8159dc5 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -91,12 +91,16 @@ import static org.apache.calcite.linq4j.tree.ExpressionType.NotEqual;
 import static org.apache.calcite.linq4j.tree.ExpressionType.OrElse;
 import static org.apache.calcite.linq4j.tree.ExpressionType.Subtract;
 import static org.apache.calcite.linq4j.tree.ExpressionType.UnaryPlus;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.DIFFERENCE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_DEPTH;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_KEYS;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_LENGTH;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_PRETTY;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_REMOVE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_TYPE;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.SOUNDEX;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.SPACE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.TRANSLATE3;
 import static org.apache.calcite.sql.fun.SqlStdOperatorTable.ABS;
 import static org.apache.calcite.sql.fun.SqlStdOperatorTable.ACOS;
@@ -286,6 +290,10 @@ public class RexImpTable {
     defineMethod(OVERLAY, BuiltInMethod.OVERLAY.method, NullPolicy.STRICT);
     defineMethod(POSITION, BuiltInMethod.POSITION.method, NullPolicy.STRICT);
     defineMethod(ASCII, BuiltInMethod.ASCII.method, NullPolicy.STRICT);
+    defineMethod(REPEAT, BuiltInMethod.REPEAT.method, NullPolicy.STRICT);
+    defineMethod(SPACE, BuiltInMethod.SPACE.method, NullPolicy.STRICT);
+    defineMethod(SOUNDEX, BuiltInMethod.SOUNDEX.method, NullPolicy.STRICT);
+    defineMethod(DIFFERENCE, BuiltInMethod.DIFFERENCE.method, NullPolicy.STRICT);
 
     final TrimImplementor trimImplementor = new TrimImplementor();
     defineImplementor(TRIM, NullPolicy.STRICT, trimImplementor, false);
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 99fd3ff..24b02fc 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -39,6 +39,10 @@ import org.apache.calcite.util.TimeWithTimeZoneString;
 import org.apache.calcite.util.TimestampWithTimeZoneString;
 import org.apache.calcite.util.Util;
 
+import org.apache.commons.codec.language.Soundex;
+
+import com.google.common.base.Strings;
+
 import java.lang.reflect.Field;
 import java.math.BigDecimal;
 import java.math.BigInteger;
@@ -84,6 +88,10 @@ public class SqlFunctions {
 
   private static final TimeZone LOCAL_TZ = TimeZone.getDefault();
 
+  private static final Soundex SOUNDEX = new Soundex();
+
+  private static final int SOUNDEX_LENGTH = 4;
+
   private static final Function1<List<Object>, Enumerable<Object>> LIST_AS_ENUMERABLE =
       Linq4j::asEnumerable;
 
@@ -212,6 +220,36 @@ public class SqlFunctions {
         ? 0 : s.codePointAt(0);
   }
 
+  /** SQL REPEAT(string, int) function. */
+  public static String repeat(String s, int n) {
+    if (n < 1) {
+      return "";
+    }
+    return Strings.repeat(s, n);
+  }
+
+  /** SQL SPACE(int) function. */
+  public static String space(int n) {
+    return repeat(" ", n);
+  }
+
+  /** SQL SOUNDEX(string) function. */
+  public static String soundex(String s) {
+    return SOUNDEX.soundex(s);
+  }
+
+  /** SQL DIFFERENCE(string, string) function. */
+  public static int difference(String s0, String s1) {
+    String result0 = soundex(s0);
+    String result1 = soundex(s1);
+    for (int i = 0; i < SOUNDEX_LENGTH; i++) {
+      if (result0.charAt(i) != result1.charAt(i)) {
+        return i;
+      }
+    }
+    return SOUNDEX_LENGTH;
+  }
+
   /** SQL CHARACTER_LENGTH(string) function. */
   public static int charLength(String s) {
     return s.length();
diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index c48387a..1afbc24 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -149,6 +149,46 @@ public abstract class SqlLibraryOperators {
 
   @LibraryOperator(libraries = {MYSQL})
   public static final SqlFunction JSON_REMOVE = new SqlJsonRemoveFunction();
+
+  @LibraryOperator(libraries = {MYSQL, POSTGRESQL})
+  public static final SqlFunction REPEAT =
+      new SqlFunction(
+          "REPEAT",
+          SqlKind.OTHER_FUNCTION,
+          ReturnTypes.ARG0_NULLABLE_VARYING,
+          null,
+          OperandTypes.STRING_INTEGER,
+          SqlFunctionCategory.STRING);
+
+  @LibraryOperator(libraries = {MYSQL})
+  public static final SqlFunction SPACE =
+      new SqlFunction(
+          "SPACE",
+          SqlKind.OTHER_FUNCTION,
+          ReturnTypes.VARCHAR_2000_NULLABLE,
+          null,
+          OperandTypes.INTEGER,
+          SqlFunctionCategory.STRING);
+
+  @LibraryOperator(libraries = {MYSQL, POSTGRESQL, ORACLE})
+  public static final SqlFunction SOUNDEX =
+      new SqlFunction(
+          "SOUNDEX",
+          SqlKind.OTHER_FUNCTION,
+          ReturnTypes.VARCHAR_4_NULLABLE,
+          null,
+          OperandTypes.CHARACTER,
+          SqlFunctionCategory.STRING);
+
+  @LibraryOperator(libraries = {POSTGRESQL})
+  public static final SqlFunction DIFFERENCE =
+      new SqlFunction(
+          "DIFFERENCE",
+          SqlKind.OTHER_FUNCTION,
+          ReturnTypes.INTEGER_NULLABLE,
+          null,
+          OperandTypes.STRING_STRING,
+          SqlFunctionCategory.STRING);
 }
 
 // End SqlLibraryOperators.java
diff --git a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
index 91f2246..5d12e12 100644
--- a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
+++ b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
@@ -405,6 +405,9 @@ public abstract class OperandTypes {
       family(SqlTypeFamily.STRING, SqlTypeFamily.STRING,
           SqlTypeFamily.INTEGER, SqlTypeFamily.INTEGER);
 
+  public static final SqlSingleOperandTypeChecker STRING_INTEGER =
+      family(SqlTypeFamily.STRING, SqlTypeFamily.INTEGER);
+
   /**
    * Operand type-checking strategy where two operands must both be in the
    * same string type family and last type is INTEGER.
diff --git a/core/src/main/java/org/apache/calcite/sql/type/ReturnTypes.java b/core/src/main/java/org/apache/calcite/sql/type/ReturnTypes.java
index a700a11..e05623f 100644
--- a/core/src/main/java/org/apache/calcite/sql/type/ReturnTypes.java
+++ b/core/src/main/java/org/apache/calcite/sql/type/ReturnTypes.java
@@ -283,10 +283,31 @@ public abstract class ReturnTypes {
       cascade(BIGINT, SqlTypeTransforms.TO_NULLABLE);
 
   /**
+   * Type-inference strategy that always returns "VARCHAR(4)".
+   */
+  public static final SqlReturnTypeInference VARCHAR_4 =
+      explicit(SqlTypeName.VARCHAR, 4);
+
+  /**
+   * Type-inference strategy that always returns "VARCHAR(4)" with nulls
+   * allowed if any of the operands allow nulls.
+   */
+  public static final SqlReturnTypeInference VARCHAR_4_NULLABLE =
+      cascade(VARCHAR_4, SqlTypeTransforms.TO_NULLABLE);
+
+  /**
    * Type-inference strategy that always returns "VARCHAR(2000)".
    */
   public static final SqlReturnTypeInference VARCHAR_2000 =
       explicit(SqlTypeName.VARCHAR, 2000);
+
+  /**
+   * Type-inference strategy that always returns "VARCHAR(2000)" with nulls
+   * allowed if any of the operands allow nulls.
+   */
+  public static final SqlReturnTypeInference VARCHAR_2000_NULLABLE =
+      cascade(VARCHAR_2000, SqlTypeTransforms.TO_NULLABLE);
+
   /**
    * Type-inference strategy for Histogram agg support
    */
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index ac5f9bd..7646d4b 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -268,6 +268,10 @@ public enum BuiltInMethod {
   UPPER(SqlFunctions.class, "upper", String.class),
   LOWER(SqlFunctions.class, "lower", String.class),
   ASCII(SqlFunctions.class, "ascii", String.class),
+  REPEAT(SqlFunctions.class, "repeat", String.class, int.class),
+  SPACE(SqlFunctions.class, "space", int.class),
+  SOUNDEX(SqlFunctions.class, "soundex", String.class),
+  DIFFERENCE(SqlFunctions.class, "difference", String.class, String.class),
   JSONIZE(JsonFunctions.class, "jsonize", Object.class),
   DEJSONIZE(JsonFunctions.class, "dejsonize", String.class),
   JSON_VALUE_EXPRESSION(JsonFunctions.class, "jsonValueExpression",
diff --git a/core/src/test/java/org/apache/calcite/sql/test/SqlOperatorBaseTest.java b/core/src/test/java/org/apache/calcite/sql/test/SqlOperatorBaseTest.java
index dedb9a9..7705f58 100644
--- a/core/src/test/java/org/apache/calcite/sql/test/SqlOperatorBaseTest.java
+++ b/core/src/test/java/org/apache/calcite/sql/test/SqlOperatorBaseTest.java
@@ -335,6 +335,19 @@ public abstract class SqlOperatorBaseTest {
                 .with("conformance", conformance));
   }
 
+  /** Creates a tester with special sql library. */
+  protected SqlTester tester(SqlLibrary library) {
+    return tester.withOperatorTable(
+        SqlLibraryOperatorTableFactory.INSTANCE
+            .getOperatorTable(SqlLibrary.STANDARD, library))
+        .withConnectionFactory(
+            CalciteAssert.EMPTY_CONNECTION_FACTORY
+                .with(new CalciteAssert
+                    .AddSchemaSpecPostProcessor(CalciteAssert.SchemaSpec.HR))
+                .with("fun", library.name())
+        );
+  }
+
   //--- Tests -----------------------------------------------------------
 
   /**
@@ -4255,6 +4268,62 @@ public abstract class SqlOperatorBaseTest {
     tester.checkNull("ASCII(cast(null as varchar(1)))");
   }
 
+  @Test public void testRepeatFunc() {
+    final SqlTester tester1 = tester(SqlLibrary.MYSQL);
+    tester1.setFor(SqlLibraryOperators.REPEAT);
+    tester1.checkString("REPEAT('a', -100)", "", "VARCHAR(1) NOT NULL");
+    tester1.checkString("REPEAT('a', -1)", "", "VARCHAR(1) NOT NULL");
+    tester1.checkString("REPEAT('a', 0)", "", "VARCHAR(1) NOT NULL");
+    tester1.checkString("REPEAT('a', 2)", "aa", "VARCHAR(1) NOT NULL");
+    tester1.checkString("REPEAT('abc', 3)", "abcabcabc", "VARCHAR(3) NOT NULL");
+    tester1.checkNull("REPEAT(cast(null as varchar(1)), -1)");
+    tester1.checkNull("REPEAT(cast(null as varchar(1)), 2)");
+    tester1.checkNull("REPEAT('abc', cast(null as integer))");
+    tester1.checkNull("REPEAT(cast(null as varchar(1)), cast(null as integer))");
+
+  }
+
+  @Test public void testSpaceFunc() {
+    final SqlTester tester1 = tester(SqlLibrary.MYSQL);
+    tester1.setFor(SqlLibraryOperators.SPACE);
+    tester1.checkString("SPACE(-100)", "", "VARCHAR(2000) NOT NULL");
+    tester1.checkString("SPACE(-1)", "", "VARCHAR(2000) NOT NULL");
+    tester1.checkString("SPACE(0)", "", "VARCHAR(2000) NOT NULL");
+    tester1.checkString("SPACE(2)", "  ", "VARCHAR(2000) NOT NULL");
+    tester1.checkString("SPACE(5)", "     ", "VARCHAR(2000) NOT NULL");
+    tester1.checkNull("SPACE(cast(null as integer))");
+  }
+
+  @Test public void testSoundexFunc() {
+    final SqlTester tester1 = oracleTester();
+    tester1.setFor(SqlLibraryOperators.SOUNDEX);
+    tester1.checkString("SOUNDEX('TECH ON THE NET')", "T253", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('Miller')", "M460", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('miler')", "M460", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('myller')", "M460", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('muller')", "M460", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('m')", "M000", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('mu')", "M000", "VARCHAR(4) NOT NULL");
+    tester1.checkString("SOUNDEX('mile')", "M400", "VARCHAR(4) NOT NULL");
+    tester1.checkNull("SOUNDEX(cast(null as varchar(1)))");
+    tester1.checkFails("SOUNDEX(_UTF8'\u5B57\u5B57')", "The character is not mapped.*", true);
+  }
+
+  @Test public void testDifferenceFunc() {
+    final SqlTester tester1 = tester(SqlLibrary.POSTGRESQL);
+    tester1.setFor(SqlLibraryOperators.DIFFERENCE);
+    tester1.checkScalarExact("DIFFERENCE('Miller', 'miller')", "4");
+    tester1.checkScalarExact("DIFFERENCE('Miller', 'myller')", "4");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'miller')", "4");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'miller')", "4");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'milk')", "2");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'mile')", "2");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'm')", "1");
+    tester1.checkScalarExact("DIFFERENCE('muller', 'lee')", "0");
+    tester1.checkNull("DIFFERENCE('muller', cast(null as varchar(1)))");
+    tester1.checkNull("DIFFERENCE(cast(null as varchar(1)), 'muller')");
+  }
+
   @Test public void testUpperFunc() {
     tester.setFor(SqlStdOperatorTable.UPPER);
     tester.checkString("upper('a')", "A", "CHAR(1) NOT NULL");
diff --git a/pom.xml b/pom.xml
index b5f6161..e48816c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -72,6 +72,7 @@ limitations under the License.
     <cassandra.version>3.11.2</cassandra.version>
     <checksum-maven-plugin.version>1.2</checksum-maven-plugin.version>
     <chinook-data-hsqldb.version>0.1</chinook-data-hsqldb.version>
+    <commons-codec.version>1.12</commons-codec.version>
     <commons-dbcp2.version>2.5.0</commons-dbcp2.version>
     <commons-lang3.version>3.8</commons-lang3.version>
     <commons-pool2.version>2.6.0</commons-pool2.version>
@@ -237,6 +238,11 @@ limitations under the License.
 
       <!-- Now third-party dependencies, sorted by groupId and artifactId. -->
       <dependency>
+        <groupId>commons-codec</groupId>
+        <artifactId>commons-codec</artifactId>
+        <version>${commons-codec.version}</version>
+      </dependency>
+      <dependency>
         <groupId>org.apache.commons</groupId>
         <artifactId>commons-dbcp2</artifactId>
         <version>${commons-dbcp2.version}</version>
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 17c9588..f36d5fd 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -1487,12 +1487,8 @@ period:
 Not implemented:
 
 * {fn CHAR(string)}
-* {fn DIFFERENCE(string, string)}
 * {fn LEFT(string, integer)}
-* {fn REPEAT(string, integer)}
 * {fn RIGHT(string, integer)}
-* {fn SOUNDEX(string)}
-* {fn SPACE(integer)}
 
 #### Date/time
 
@@ -2108,7 +2104,11 @@ semantics.
 | o | SUBSTR(string, position [, substring_length ]) | Returns a portion of *string*, beginning at character *position*, *substring_length* characters long. SUBSTR calculates lengths using characters as defined by the input character set
 | o | GREATEST(expr [, expr ]*)                      | Returns the greatest of the expressions
 | o | LEAST(expr [, expr ]* )                        | Returns the least of the expressions
-| o p | TRANSLATE(expr, fromString, toString)          | Returns *expr* with all occurrences of each character in *fromString* replaced by its corresponding character in *toString*. Characters in *expr* that are not in *fromString* are not replaced
+| o p | TRANSLATE(expr, fromString, toString)        | Returns *expr* with all occurrences of each character in *fromString* replaced by its corresponding character in *toString*. Characters in *expr* that are not in *fromString* are not replaced
+| m o p | SOUNDEX(string)                            | Returns the phonetic representation of *string*. Throws exception when *string* is encoded with multi-byte encoding such as UTF-8
+| p | DIFFERENCE(string, string)                     | Returns the difference between the SOUNDEX values of two character expressions as an integer. For example, returns 4 if the SOUNDEX values are same and returns 0 if the SOUNDEX values are totally different.
+| m p | REPEAT(string, integer)                      | Returns a string of *integer* times *string*; Returns an empty string if *integer* is less than 1
+| m | SPACE(integer)                                 | Returns a string of *integer* spaces; Returns an empty string if *integer* is less than 1
 
 Note: