You are viewing a plain text version of this content. The canonical link for it is here.
Posted to gitbox@hive.apache.org by GitBox <gi...@apache.org> on 2022/05/06 08:18:42 UTC

[GitHub] [hive] stiga-huang commented on a diff in pull request #2378: HIVE-25230: Add position and occurrence to INSTR()

stiga-huang commented on code in PR #2378:
URL: https://github.com/apache/hive/pull/2378#discussion_r866596517


##########
ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInstr.java:
##########
@@ -20,79 +20,158 @@
 
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
 /**
- * Generic UDF for string function <code>INSTR(str,substr)</code>. This mimcs
- * the function from MySQL
+ * Generic UDF for string function <code>INSTR(str,substr[,pos[,occurrence]])</code>.
+ * This extends the function from MySQL
  * http://dev.mysql.com/doc/refman/5.1/en/string-functions.html#function_instr
+ * and mimics the function from Oracle
+ * https://docs.oracle.com/database/121/SQLRF/functions089.htm#SQLRF00651
  *
  * <pre>
  * usage:
- * INSTR(str, substr)
+ * INSTR(str, substr[, pos[, occurrence]])
  * </pre>
  * <p>
  */
 @Description(name = "instr",
-    value = "_FUNC_(str, substr) - Returns the index of the first occurance of substr in str",
-    extended = "Example:\n"
-    + "  > SELECT _FUNC_('Facebook', 'boo') FROM src LIMIT 1;\n" + "  5")
+    value = "_FUNC_(str, substr[, pos[, occurrence]]) " +
+        "- Returns the index of the given occurrence of substr in str after position pos",
+    extended = "pos is a 1-based index. If pos < 0, the starting position is\n" +
+        "determined by counting backwards from the end of str and then Hive\n" +
+        "searches backward from the resulting position.\n" +
+        "occurrence is also a 1-based index. The value must be positive.\n" +
+        "If occurrence is greater than the number of matching occurrences,\n" +
+        "the function returns 0.\n" +
+        "If either of the optional arguments, pos or occurrence, is NULL,\n" +
+        "the function also returns NULL.\n" +
+        "Example:\n" +
+        "  > SELECT _FUNC_('Facebook', 'boo') FROM src LIMIT 1;\n" +
+        "  5\n" +
+        "  > SELECT _FUNC_('CORPORATE FLOOR','OR', 3, 2) FROM src LIMIT 1;\n" +
+        "  14\n" +
+        "  > SELECT _FUNC_('CORPORATE FLOOR','OR', -3, 2) FROM src LIMIT 1;\n" +
+        "  2")
 public class GenericUDFInstr extends GenericUDF {
 
   private transient ObjectInspectorConverters.Converter[] converters;
+  private transient PrimitiveObjectInspector.PrimitiveCategory[] inputTypes;
+  private transient Integer posConst;
+  private transient boolean isPosConst = false;
+  private transient Integer occurrenceConst;
+  private transient boolean isOccurrenceConst = false;
 
   @Override
   public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
-    if (arguments.length != 2) {
-      throw new UDFArgumentLengthException(
-          "The function INSTR accepts exactly 2 arguments.");
-    }
+    checkArgsSize(arguments, 2, 4);
 
+    converters = new ObjectInspectorConverters.Converter[arguments.length];
+    inputTypes = new PrimitiveObjectInspector.PrimitiveCategory[arguments.length];
     for (int i = 0; i < arguments.length; i++) {
-      Category category = arguments[i].getCategory();
-      if (category != Category.PRIMITIVE) {
-        throw new UDFArgumentTypeException(i, "The "
-            + GenericUDFUtils.getOrdinal(i + 1)
-            + " argument of function INSTR is expected to a "
-            + Category.PRIMITIVE.toString().toLowerCase() + " type, but "
-            + category.toString().toLowerCase() + " is found");
+      checkArgPrimitive(arguments, i);
+      if (i < 2) {
+        obtainStringConverter(arguments, i, inputTypes, converters);
+      } else {
+        obtainIntConverter(arguments, i, inputTypes, converters);
       }
     }
 
-    converters = new ObjectInspectorConverters.Converter[arguments.length];
-    for (int i = 0; i < arguments.length; i++) {
-      converters[i] = ObjectInspectorConverters.getConverter(arguments[i],
-          PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+    if (arguments.length > 2 && arguments[2] instanceof ConstantObjectInspector) {

Review Comment:
   I think this boosts performance, since `initialize()` is called only once per instance while `evaluate()` is called per row. Extracting the constants here saves some work in `evaluate()`.
   
   We do the same in other functions, e.g. in substring_index(): 
   https://github.com/apache/hive/blob/037eacea46371015a7f9894c5a9ccfb9708d5c56/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSubstringIndex.java#L71-L79



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org
For additional commands, e-mail: gitbox-help@hive.apache.org