You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ap...@apache.org on 2015/05/31 07:30:45 UTC

hive git commit: HIVE-686 add UDF substring_index (Alexander Pivovarov, reviewed by Sergio Peña)

Repository: hive
Updated Branches:
  refs/heads/master a4a41830a -> cfab025a1


HIVE-686 add UDF substring_index (Alexander Pivovarov, reviewed by Sergio Peña)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cfab025a
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cfab025a
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cfab025a

Branch: refs/heads/master
Commit: cfab025a1199197bbdf3e9a21267022fd40e1649
Parents: a4a4183
Author: Alexander Pivovarov <ap...@gmail.com>
Authored: Sun May 24 23:40:38 2015 -0700
Committer: Alexander Pivovarov <ap...@gmail.com>
Committed: Sat May 30 22:28:28 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/FunctionRegistry.java   |   1 +
 .../udf/generic/GenericUDFSubstringIndex.java   | 159 +++++++++++++++++++
 .../generic/TestGenericUDFSubstringIndex.java   |  97 +++++++++++
 .../clientpositive/udf_substring_index.q        |  32 ++++
 .../results/clientpositive/show_functions.q.out |   1 +
 .../clientpositive/udf_substring_index.q.out    | 107 +++++++++++++
 6 files changed, 397 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index 94a3b17..fabc21e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -180,6 +180,7 @@ public final class FunctionRegistry {
     system.registerGenericUDF("concat", GenericUDFConcat.class);
     system.registerUDF("substr", UDFSubstr.class, false);
     system.registerUDF("substring", UDFSubstr.class, false);
+    system.registerGenericUDF("substring_index", GenericUDFSubstringIndex.class);
     system.registerUDF("space", UDFSpace.class, false);
     system.registerUDF("repeat", UDFRepeat.class, false);
     system.registerUDF("ascii", UDFAscii.class, false);

http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSubstringIndex.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSubstringIndex.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSubstringIndex.java
new file mode 100644
index 0000000..d0e0ea7
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSubstringIndex.java
@@ -0,0 +1,159 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.NUMERIC_GROUP;
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * GenericUDFSubstringIndex.
+ *
+ */
+@Description(name = "substring_index",
+    value = "_FUNC_(str, delim, count) - Returns the substring from string str before count occurrences "
+        + "of the delimiter delim.",
+    extended = "If count is positive, everything to the left of the final delimiter (counting from the left) "
+        + "is returned. If count is negative, everything to the right of the final delimiter "
+        + "(counting from the right) is returned. Substring_index performs a case-sensitive match when searching "
+        + "for delim.\n"
+        + "Example:\n > SELECT _FUNC_('www.apache.org', '.', 2);\n 'www.apache'")
+public class GenericUDFSubstringIndex extends GenericUDF {
+  private transient Converter[] converters = new Converter[3];
+  private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[3];
+  private final Text output = new Text();
+  private transient String delimConst;
+  private transient boolean isDelimConst;
+  private transient Integer countConst;
+  private transient boolean isCountConst;
+
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    checkArgsSize(arguments, 3, 3);
+
+    checkArgPrimitive(arguments, 0);
+    checkArgPrimitive(arguments, 1);
+    checkArgPrimitive(arguments, 2);
+
+    checkArgGroups(arguments, 0, inputTypes, STRING_GROUP);
+    checkArgGroups(arguments, 1, inputTypes, STRING_GROUP);
+    checkArgGroups(arguments, 2, inputTypes, NUMERIC_GROUP);
+
+    obtainStringConverter(arguments, 0, inputTypes, converters);
+    obtainStringConverter(arguments, 1, inputTypes, converters);
+    obtainIntConverter(arguments, 2, inputTypes, converters);
+
+    if (arguments[1] instanceof ConstantObjectInspector) {
+      delimConst = getConstantStringValue(arguments, 1);
+      isDelimConst = true;
+    }
+
+    if (arguments[2] instanceof ConstantObjectInspector) {
+      countConst = getConstantIntValue(arguments, 2);
+      isCountConst = true;
+    }
+
+    ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    return outputOI;
+  }
+
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    // str
+    String str = getStringValue(arguments, 0, converters);
+    if (str == null) {
+      return null;
+    }
+    if (str.length() == 0) {
+      output.set("");
+      return output;
+    }
+
+    // delim
+    String delim;
+    if (isDelimConst) {
+      delim = delimConst;
+    } else {
+      delim = getStringValue(arguments, 1, converters);
+    }
+    if (delim == null) {
+      return null;
+    }
+    if (delim.length() == 0) {
+      output.set("");
+      return output;
+    }
+
+    // count
+    Integer countV;
+    if (isCountConst) {
+      countV = countConst;
+    } else {
+      countV = getIntValue(arguments, 2, converters);
+    }
+    if (countV == null) {
+      return null;
+    }
+    int count = countV.intValue();
+    if (count == 0) {
+      output.set("");
+      return output;
+    }
+
+    // get substring
+    String res;
+    if (count > 0) {
+      int idx = StringUtils.ordinalIndexOf(str, delim, count);
+      if (idx != -1) {
+        res = str.substring(0, idx);
+      } else {
+        res = str;
+      }
+    } else {
+      int idx = StringUtils.lastOrdinalIndexOf(str, delim, -count);
+      if (idx != -1) {
+        res = str.substring(idx + 1);
+      } else {
+        res = str;
+      }
+    }
+
+    output.set(res);
+    return output;
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    return getStandardDisplayString(getFuncName(), children);
+  }
+
+  @Override
+  protected String getFuncName() {
+    return "substring_index";
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFSubstringIndex.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFSubstringIndex.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFSubstringIndex.java
new file mode 100644
index 0000000..bcc98d7
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFSubstringIndex.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+
+public class TestGenericUDFSubstringIndex extends TestCase {
+  /** Delimiter and count are supplied per row, next to the input string. */
+  public void testSubstringIndex() throws HiveException {
+    GenericUDFSubstringIndex udf = new GenericUDFSubstringIndex();
+    ObjectInspector stringOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    ObjectInspector intOI = PrimitiveObjectInspectorFactory.writableIntObjectInspector;
+    udf.initialize(new ObjectInspector[] { stringOI, stringOI, intOI });
+    runAndVerify("www.apache.org", ".", 3, "www.apache.org", udf);
+    runAndVerify("www.apache.org", ".", 2, "www.apache", udf);
+    runAndVerify("www.apache.org", ".", 1, "www", udf);
+    runAndVerify("www.apache.org", ".", 0, "", udf);
+    runAndVerify("www.apache.org", ".", -1, "org", udf);
+    runAndVerify("www.apache.org", ".", -2, "apache.org", udf);
+    runAndVerify("www.apache.org", ".", -3, "www.apache.org", udf);
+
+    // str is empty string
+    runAndVerify("", ".", 1, "", udf);
+    // empty string delim
+    runAndVerify("www.apache.org", "", 1, "", udf);
+    // delim does not exist in str
+    runAndVerify("www.apache.org", "-", 2, "www.apache.org", udf);
+    // delim is 2 chars
+    runAndVerify("www||apache||org", "||", 2, "www||apache", udf);
+
+    // null
+    runAndVerify(null, ".", 2, null, udf);
+    runAndVerify("www.apache.org", null, 2, null, udf);
+    runAndVerify("www.apache.org", ".", null, null, udf);
+  }
+
+  /** Delimiter and count are constant inspectors; only str is passed to evaluate(). */
+  public void testSubstringIndexConst() throws HiveException {
+    GenericUDFSubstringIndex udf = new GenericUDFSubstringIndex();
+    ObjectInspector strOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    Text delimValue = new Text(".");
+    ObjectInspector delimOI = PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, delimValue);
+    IntWritable countValue = new IntWritable(2);
+    ObjectInspector countOI = PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.intTypeInfo, countValue);
+    udf.initialize(new ObjectInspector[] { strOI, delimOI, countOI });
+    runAndVerifyConst("www.apache.org", "www.apache", udf);
+  }
+
+  private void runAndVerify(String str, String delim, Integer count, String expResult,
+      GenericUDF udf) throws HiveException {
+    DeferredObject[] args = {
+        new DeferredJavaObject(toText(str)),
+        new DeferredJavaObject(toText(delim)),
+        new DeferredJavaObject(count == null ? null : new IntWritable(count))
+    };
+    verify(expResult, (Text) udf.evaluate(args));
+  }
+
+  private void runAndVerifyConst(String str, String expResult, GenericUDF udf) throws HiveException {
+    DeferredObject[] args = { new DeferredJavaObject(toText(str)) };
+    verify(expResult, (Text) udf.evaluate(args));
+  }
+
+  private Text toText(String value) {
+    return value == null ? null : new Text(value);
+  }
+
+  private void verify(String expResult, Text actual) {
+    assertEquals("substring_index() test ", expResult, actual == null ? null : actual.toString());
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/test/queries/clientpositive/udf_substring_index.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_substring_index.q b/ql/src/test/queries/clientpositive/udf_substring_index.q
new file mode 100644
index 0000000..d55e636
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_substring_index.q
@@ -0,0 +1,32 @@
+DESCRIBE FUNCTION substring_index;
+DESCRIBE FUNCTION EXTENDED substring_index;
+
+explain select substring_index('www.apache.org', '.', 2);
+
+select
+substring_index('www.apache.org', '.', 3),
+substring_index('www.apache.org', '.', 2),
+substring_index('www.apache.org', '.', 1),
+substring_index('www.apache.org', '.', 0),
+substring_index('www.apache.org', '.', -1),
+substring_index('www.apache.org', '.', -2),
+substring_index('www.apache.org', '.', -3);
+
+select
+--str is empty string
+substring_index('', '.', 2),
+--delim is empty string
+substring_index('www.apache.org', '', 1),
+--delim does not exist in str
+substring_index('www.apache.org', '-', 2),
+--delim is two chars
+substring_index('www||apache||org', '||', 2),
+--null
+substring_index(cast(null as string), '.', 2),
+substring_index('www.apache.org', cast(null as string), 2),
+substring_index('www.apache.org', '.', cast(null as int));
+
+--varchar and char
+select
+substring_index(cast('www.apache.org' as varchar(20)), '.', 2),
+substring_index(cast('www.apache.org' as char(20)), '.', 2);

http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/test/results/clientpositive/show_functions.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out
index 16820ca..5de4ffc 100644
--- a/ql/src/test/results/clientpositive/show_functions.q.out
+++ b/ql/src/test/results/clientpositive/show_functions.q.out
@@ -194,6 +194,7 @@ str_to_map
 struct
 substr
 substring
+substring_index
 sum
 tan
 to_date

http://git-wip-us.apache.org/repos/asf/hive/blob/cfab025a/ql/src/test/results/clientpositive/udf_substring_index.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_substring_index.q.out b/ql/src/test/results/clientpositive/udf_substring_index.q.out
new file mode 100644
index 0000000..29dc988
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_substring_index.q.out
@@ -0,0 +1,107 @@
+PREHOOK: query: DESCRIBE FUNCTION substring_index
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION substring_index
+POSTHOOK: type: DESCFUNCTION
+substring_index(str, delim, count) - Returns the substring from string str before count occurrences of the delimiter delim.
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED substring_index
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED substring_index
+POSTHOOK: type: DESCFUNCTION
+substring_index(str, delim, count) - Returns the substring from string str before count occurrences of the delimiter delim.
+If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. Substring_index performs a case-sensitive match when searching for delim.
+Example:
+ > SELECT substring_index('www.apache.org', '.', 2);
+ 'www.apache'
+PREHOOK: query: explain select substring_index('www.apache.org', '.', 2)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select substring_index('www.apache.org', '.', 2)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        TableScan
+          alias: _dummy_table
+          Row Limit Per Split: 1
+          Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE
+          Select Operator
+            expressions: 'www.apache' (type: string)
+            outputColumnNames: _col0
+            Statistics: Num rows: 1 Data size: 94 Basic stats: COMPLETE Column stats: COMPLETE
+            ListSink
+
+PREHOOK: query: select
+substring_index('www.apache.org', '.', 3),
+substring_index('www.apache.org', '.', 2),
+substring_index('www.apache.org', '.', 1),
+substring_index('www.apache.org', '.', 0),
+substring_index('www.apache.org', '.', -1),
+substring_index('www.apache.org', '.', -2),
+substring_index('www.apache.org', '.', -3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+substring_index('www.apache.org', '.', 3),
+substring_index('www.apache.org', '.', 2),
+substring_index('www.apache.org', '.', 1),
+substring_index('www.apache.org', '.', 0),
+substring_index('www.apache.org', '.', -1),
+substring_index('www.apache.org', '.', -2),
+substring_index('www.apache.org', '.', -3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+www.apache.org	www.apache	www		org	apache.org	www.apache.org
+PREHOOK: query: select
+--str is empty string
+substring_index('', '.', 2),
+--delim is empty string
+substring_index('www.apache.org', '', 1),
+--delim does not exist in str
+substring_index('www.apache.org', '-', 2),
+--delim is two chars
+substring_index('www||apache||org', '||', 2),
+--null
+substring_index(cast(null as string), '.', 2),
+substring_index('www.apache.org', cast(null as string), 2),
+substring_index('www.apache.org', '.', cast(null as int))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+--str is empty string
+substring_index('', '.', 2),
+--delim is empty string
+substring_index('www.apache.org', '', 1),
+--delim does not exist in str
+substring_index('www.apache.org', '-', 2),
+--delim is two chars
+substring_index('www||apache||org', '||', 2),
+--null
+substring_index(cast(null as string), '.', 2),
+substring_index('www.apache.org', cast(null as string), 2),
+substring_index('www.apache.org', '.', cast(null as int))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+		www.apache.org	www||apache	NULL	NULL	NULL
+PREHOOK: query: --varchar and char
+select
+substring_index(cast('www.apache.org' as varchar(20)), '.', 2),
+substring_index(cast('www.apache.org' as char(20)), '.', 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: --varchar and char
+select
+substring_index(cast('www.apache.org' as varchar(20)), '.', 2),
+substring_index(cast('www.apache.org' as char(20)), '.', 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+www.apache	www.apache