You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/06/19 02:50:16 UTC
svn commit: r786345 - in /hadoop/hive/trunk: CHANGES.txt
ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
ql/src/test/queries/clientpositive/udf_like.q
ql/src/test/results/clientpositive/udf_like.q.out
Author: zshao
Date: Fri Jun 19 00:50:16 2009
New Revision: 786345
URL: http://svn.apache.org/viewvc?rev=786345&view=rev
Log:
HIVE-542. UDF: Faster String Like. (Yuntao Jia via zshao)
Added:
hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q
hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out
Modified:
hadoop/hive/trunk/CHANGES.txt
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=786345&r1=786344&r2=786345&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Fri Jun 19 00:50:16 2009
@@ -73,6 +73,8 @@
HIVE-528. Map Join followup: split MapJoinObject into MapJoinObjectKey and
MapJoinObjectValue. (Namit Jain via zshao)
+ HIVE-542. UDF: Faster String Like. (Yuntao Jia via zshao)
+
OPTIMIZATIONS
HIVE-279. Predicate Pushdown support (Prasad Chakka via athusoo).
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java?rev=786345&r1=786344&r2=786345&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java Fri Jun 19 00:50:16 2009
@@ -32,7 +32,19 @@
private static Log LOG = LogFactory.getLog(UDFLike.class.getName());
private Text lastLikePattern = new Text();
private Pattern p = null;
-
+
+ // Doing characters comparison directly instead of regular expression
+ // matching for simple patterns like "%abc%".
+ private enum PatternType {
+ NONE, // "abc"
+ BEGIN, // "abc%"
+ END, // "%abc"
+ MIDDLE, // "%abc%"
+ COMPLEX, // all other cases, such as "ab%c_de"
+ }
+ private PatternType type = PatternType.COMPLEX;
+ private Text simplePattern = new Text();
+
private BooleanWritable result = new BooleanWritable();
public UDFLike() {
}
@@ -63,16 +75,133 @@
return sb.toString();
}
+ /**
+ * Parses the likePattern. Based on it is a simple pattern
+ * or not, the function might change two member variables.
+ * {@link #type} will be changed to the corresponding
+ * pattern type; {@link #simplePattern} will record
+ * the string in it for later pattern matching if it
+ * is a simple pattern.
+ * <p>
+ * Examples:
+ * <blockquote><pre>
+ * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
+ * and changes {@link #simplePattern} to "abc"
+ * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
+ * and does not change {@link #simplePattern}
+ * </pre></blockquote>
+ *
+ * @param likePattern the input LIKE query pattern
+ */
+ private void parseSimplePattern(String likePattern) {
+ int length = likePattern.length();
+ int beginIndex = 0;
+ int endIndex = length;
+ char lastChar = 'a';
+ String strPattern = new String();
+ type = PatternType.NONE;
+
+ for (int i=0; i<length; i++) {
+ char n = likePattern.charAt(i);
+ if (n == '_') { // such as "a_b"
+ if(lastChar != '\\') { // such as "a%bc"
+ type = PatternType.COMPLEX;
+ return;
+ } else { // such as "abc\%de%"
+ strPattern += likePattern.substring(beginIndex, i-1);
+ beginIndex = i;
+ }
+ } else if(n == '%') {
+ if (i==0) { // such as "%abc"
+ type = PatternType.END;
+ beginIndex = 1;
+ } else if(i<length-1) {
+ if(lastChar != '\\') { // such as "a%bc"
+ type = PatternType.COMPLEX;
+ return;
+ } else { // such as "abc\%de%"
+ strPattern += likePattern.substring(beginIndex, i-1);
+ beginIndex = i;
+ }
+ } else {
+ if(lastChar != '\\') {
+ endIndex = length-1;
+ if (type == PatternType.END) { // such as "%abc%"
+ type = PatternType.MIDDLE;
+ } else {
+ type = PatternType.BEGIN; // such as "abc%"
+ }
+ } else { // such as "abc\%"
+ strPattern += likePattern.substring(beginIndex, i-1);
+ beginIndex = i;
+ endIndex = length;
+ }
+ }
+ }
+ lastChar = n;
+ }
+
+ strPattern += likePattern.substring(beginIndex, endIndex);
+ simplePattern.set(strPattern);
+ }
+
+ private static boolean find(Text s, Text sub, int startS, int endS) {
+ byte[] byteS = s.getBytes();
+ byte[] byteSub = sub.getBytes();
+ int lenSub = sub.getLength();
+ boolean match = false;
+ for (int i=startS; (i<endS-lenSub+1) && (!match); i++) {
+ match = true;
+ for (int j=0; j<lenSub; j++) {
+ if (byteS[j+i] != byteSub[j]) {
+ match = false;
+ break;
+ }
+ }
+ }
+ return match;
+ }
+
public BooleanWritable evaluate(Text s, Text likePattern) {
if (s == null || likePattern == null) {
return null;
}
if (!likePattern.equals(lastLikePattern)) {
lastLikePattern.set(likePattern);
- p = Pattern.compile(likePatternToRegExp(likePattern.toString()));
+ String strLikePattern = likePattern.toString();
+
+ parseSimplePattern(strLikePattern);
+ if (type == PatternType.COMPLEX)
+ p = Pattern.compile(likePatternToRegExp(strLikePattern));
+ }
+
+ if (type == PatternType.COMPLEX) {
+ Matcher m = p.matcher(s.toString());
+ result.set(m.matches());
+ } else {
+ int startS = 0;
+ int endS = s.getLength();
+ // if s is shorter than the required pattern
+ if(endS < simplePattern.getLength()) {
+ result.set(false);
+ return result;
+ }
+ switch (type) {
+ case BEGIN:
+ endS = simplePattern.getLength();
+ break;
+ case END:
+ startS = endS-simplePattern.getLength();
+ break;
+ case NONE:
+ if (simplePattern.getLength() != s.getLength()) {
+ result.set(false);
+ return result;
+ }
+ break;
+ }
+ result.set(find(s, simplePattern, startS, endS));
}
- Matcher m = p.matcher(s.toString());
- result.set(m.matches());
return result;
}
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q?rev=786345&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_like.q Fri Jun 19 00:50:16 2009
@@ -0,0 +1,10 @@
+EXPLAIN
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+ '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+ '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86;
+
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+ '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+ '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86;
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out?rev=786345&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_like.q.out Fri Jun 19 00:50:16 2009
@@ -0,0 +1,66 @@
+query: EXPLAIN
+SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+ '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+ '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (LIKE '_%_' '%\_\%\_%')) (TOK_SELEXPR (LIKE '__' '%\_\%\_%')) (TOK_SELEXPR (LIKE '%%_%_' '%\_\%\_%')) (TOK_SELEXPR (LIKE '%_%_%' '%\%\_\%')) (TOK_SELEXPR (LIKE '_%_' '\%\_%')) (TOK_SELEXPR (LIKE '%__' '__\%%')) (TOK_SELEXPR (LIKE '_%' '\_\%\_\%%')) (TOK_SELEXPR (LIKE '_%' '\_\%_%')) (TOK_SELEXPR (LIKE '%_' '\%\_')) (TOK_SELEXPR (LIKE 'ab' '\%\_')) (TOK_SELEXPR (LIKE 'ab' '_a%')) (TOK_SELEXPR (LIKE 'ab' 'a'))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) key) 86))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ Filter Operator
+ predicate:
+ expr: (UDFToDouble(key) = UDFToDouble(86))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: ('_%_' like '%\_\%\_%')
+ type: boolean
+ expr: ('__' like '%\_\%\_%')
+ type: boolean
+ expr: ('%%_%_' like '%\_\%\_%')
+ type: boolean
+ expr: ('%_%_%' like '%\%\_\%')
+ type: boolean
+ expr: ('_%_' like '\%\_%')
+ type: boolean
+ expr: ('%__' like '__\%%')
+ type: boolean
+ expr: ('_%' like '\_\%\_\%%')
+ type: boolean
+ expr: ('_%' like '\_\%_%')
+ type: boolean
+ expr: ('%_' like '\%\_')
+ type: boolean
+ expr: ('ab' like '\%\_')
+ type: boolean
+ expr: ('ab' like '_a%')
+ type: boolean
+ expr: ('ab' like 'a')
+ type: boolean
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+query: SELECT '_%_' LIKE '%\_\%\_%', '__' LIKE '%\_\%\_%', '%%_%_' LIKE '%\_\%\_%', '%_%_%' LIKE '%\%\_\%',
+ '_%_' LIKE '\%\_%', '%__' LIKE '__\%%', '_%' LIKE '\_\%\_\%%', '_%' LIKE '\_\%_%',
+ '%_' LIKE '\%\_', 'ab' LIKE '\%\_', 'ab' LIKE '_a%', 'ab' LIKE 'a'
+FROM src WHERE src.key = 86
+Input: default/src
+Output: file:/home/yjia/hive/build/ql/tmp/839410699/10000
+true false true true false false false false true false false false