You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/06/19 02:50:16 UTC

svn commit: r786345 - in /hadoop/hive/trunk: CHANGES.txt ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java ql/src/test/queries/clientpositive/udf_like.q ql/src/test/results/clientpositive/udf_like.q.out

Author: zshao
Date: Fri Jun 19 00:50:16 2009
New Revision: 786345

URL: http://svn.apache.org/viewvc?rev=786345&view=rev
Log:
HIVE-542. UDF: Faster String Like. (Yuntao Jia via zshao)

Added:
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=786345&r1=786344&r2=786345&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Fri Jun 19 00:50:16 2009
@@ -73,6 +73,8 @@
     HIVE-528. Map Join followup: split MapJoinObject into MapJoinObjectKey and
     MapJoinObjectValue. (Namit Jain via zshao)
 
+    HIVE-542. UDF: Faster String Like. (Yuntao Jia via zshao)
+
   OPTIMIZATIONS
 
     HIVE-279. Predicate Pushdown support (Prasad Chakka via athusoo).

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java?rev=786345&r1=786344&r2=786345&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java Fri Jun 19 00:50:16 2009
@@ -32,7 +32,19 @@
   private static Log LOG = LogFactory.getLog(UDFLike.class.getName());
   private Text lastLikePattern = new Text();
   private Pattern p = null;
-
+  
+  // Doing characters comparison directly instead of regular expression 
+  // matching for simple patterns like "%abc%".
+  private enum PatternType {  
+    NONE,         // "abc"
+    BEGIN,        // "abc%"
+    END,          // "%abc"
+    MIDDLE,       // "%abc%"
+    COMPLEX,      // all other cases, such as "ab%c_de"
+  }
+  private PatternType type = PatternType.COMPLEX;
+  private Text simplePattern = new Text();
+  
   private BooleanWritable result = new BooleanWritable();
   public UDFLike() {
   }
@@ -63,16 +75,133 @@
     return sb.toString();
   }
   
+  /**
+   * Parses the likePattern. Based on it is a simple pattern 
+   * or not, the function might change two member variables. 
+   * {@link #type} will be changed to the corresponding 
+   * pattern type; {@link #simplePattern} will record 
+   * the string in it for later pattern matching if it 
+   * is a simple pattern.
+   * <p>
+   * Examples: 
+   * <blockquote><pre>
+   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
+   * and changes {@link #simplePattern} to "abc"
+   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
+   * and does not change {@link #simplePattern}
+   * </pre></blockquote>
+   * 
+   * @param likePattern the input LIKE query pattern
+   */
+  private void parseSimplePattern(String likePattern) {
+    int length     = likePattern.length();
+    int beginIndex = 0;
+    int endIndex   = length;
+    char lastChar  = 'a';
+    String strPattern = new String();
+    type = PatternType.NONE;
+    
+    for (int i=0; i<length; i++) {
+      char n = likePattern.charAt(i);
+      if (n == '_') {     // such as "a_b"  
+        if(lastChar != '\\') {                // such as "a%bc"
+          type = PatternType.COMPLEX;
+          return;            
+        } else {                              // such as "abc\%de%"                 
+          strPattern += likePattern.substring(beginIndex, i-1);                         
+          beginIndex = i;
+        }
+      } else if(n == '%') {
+        if (i==0) {                           // such as "%abc"
+          type = PatternType.END;
+          beginIndex = 1;
+        } else if(i<length-1) {
+          if(lastChar != '\\') {              // such as "a%bc"
+            type = PatternType.COMPLEX;
+            return;            
+          } else {                            // such as "abc\%de%"                 
+            strPattern += likePattern.substring(beginIndex, i-1);                         
+            beginIndex = i;
+          }
+        } else {
+          if(lastChar != '\\') {              
+            endIndex = length-1;
+            if (type == PatternType.END) {    // such as "%abc%"
+              type = PatternType.MIDDLE;
+            } else {
+              type = PatternType.BEGIN;       // such as "abc%"
+            }
+          } else {                            // such as "abc\%"
+            strPattern += likePattern.substring(beginIndex, i-1);                         
+            beginIndex = i;
+            endIndex = length;
+          }            
+        }
+      }
+      lastChar = n;
+    }
+    
+    strPattern += likePattern.substring(beginIndex, endIndex); 
+    simplePattern.set(strPattern);
+  }
+  
+  private static boolean find(Text s, Text sub, int startS, int endS) {
+    byte[] byteS   = s.getBytes();
+    byte[] byteSub = sub.getBytes();
+    int    lenSub  = sub.getLength();    
+    boolean match  = false;
+    for (int i=startS; (i<endS-lenSub+1) && (!match); i++) {
+      match = true;
+      for (int j=0; j<lenSub; j++) {
+        if (byteS[j+i] != byteSub[j]) {
+          match = false;
+          break;
+        }
+      }     
+    }    
+    return match;
+  }
+  
   public BooleanWritable evaluate(Text s, Text likePattern) {
     if (s == null || likePattern == null) {
       return null;
     }
     if (!likePattern.equals(lastLikePattern)) {
       lastLikePattern.set(likePattern);
-      p = Pattern.compile(likePatternToRegExp(likePattern.toString()));
+      String strLikePattern = likePattern.toString();
+  
+      parseSimplePattern(strLikePattern);
+      if (type == PatternType.COMPLEX)
+        p = Pattern.compile(likePatternToRegExp(strLikePattern));
+    }
+
+    if (type == PatternType.COMPLEX) {
+      Matcher m = p.matcher(s.toString());
+      result.set(m.matches());
+    } else {            
+      int startS = 0;
+      int endS = s.getLength();
+      // if s is shorter than the required pattern
+      if(endS < simplePattern.getLength()) {
+        result.set(false);
+        return result;
+      }        
+      switch (type) {
+      case BEGIN: 
+        endS = simplePattern.getLength();
+        break;
+      case END:
+        startS = endS-simplePattern.getLength();
+        break;
+      case NONE:
+        if (simplePattern.getLength() != s.getLength()) {          
+          result.set(false);
+          return result;
+        }          
+        break;        
+      }      
+      result.set(find(s, simplePattern, startS, endS));
     }
-    Matcher m = p.matcher(s.toString());
-    result.set(m.matches());
     return result;
   }
   

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q?rev=786345&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q Fri Jun 19 00:50:16 2009
@@ -0,0 +1,10 @@
+EXPLAIN
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+  '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+  '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86;
+
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+  '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+  '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86;

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out?rev=786345&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out Fri Jun 19 00:50:16 2009
@@ -0,0 +1,66 @@
+query: EXPLAIN
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+  '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+  '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (LIKE '_%_' '%\_\%\_%')) (TOK_SELEXPR (LIKE '__' '%\_\%\_%')) (TOK_SELEXPR (LIKE '%%_%_' '%\_\%\_%')) (TOK_SELEXPR (LIKE '%_%_%' '%\%\_\%')) (TOK_SELEXPR (LIKE '_%_' '\%\_%')) (TOK_SELEXPR (LIKE '%__' '__\%%')) (TOK_SELEXPR (LIKE '_%' '\_\%\_\%%')) (TOK_SELEXPR (LIKE '_%' '\_\%_%')) (TOK_SELEXPR (LIKE '%_' '\%\_')) (TOK_SELEXPR (LIKE 'ab' '\%\_')) (TOK_SELEXPR (LIKE 'ab' '_a%')) (TOK_SELEXPR (LIKE 'ab' 'a'))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) key) 86))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src 
+            Filter Operator
+              predicate:
+                  expr: (UDFToDouble(key) = UDFToDouble(86))
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: ('_%_' like '%\_\%\_%')
+                      type: boolean
+                      expr: ('__' like '%\_\%\_%')
+                      type: boolean
+                      expr: ('%%_%_' like '%\_\%\_%')
+                      type: boolean
+                      expr: ('%_%_%' like '%\%\_\%')
+                      type: boolean
+                      expr: ('_%_' like '\%\_%')
+                      type: boolean
+                      expr: ('%__' like '__\%%')
+                      type: boolean
+                      expr: ('_%' like '\_\%\_\%%')
+                      type: boolean
+                      expr: ('_%' like '\_\%_%')
+                      type: boolean
+                      expr: ('%_' like '\%\_')
+                      type: boolean
+                      expr: ('ab' like '\%\_')
+                      type: boolean
+                      expr: ('ab' like '_a%')
+                      type: boolean
+                      expr: ('ab' like 'a')
+                      type: boolean
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+query: SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+  '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+  '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86
+Input: default/src
+Output: file:/home/yjia/hive/build/ql/tmp/839410699/10000
+true	false	true	true	false	false	false	false	true	false	false	false