You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/06/19 01:05:08 UTC

svn commit: r786332 - in /hadoop/hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/udf/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/

Author: zshao
Date: Thu Jun 18 23:05:08 2009
New Revision: 786332

URL: http://svn.apache.org/viewvc?rev=786332&view=rev
Log:
HIVE-563. UDF for parsing the URL: parse_url. (Suresh Antony via zshao)

Added:
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=786332&r1=786331&r2=786332&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Thu Jun 18 23:05:08 2009
@@ -45,6 +45,8 @@
 
     HIVE-460. Improve Column Pruning. (He Yongqiang via namit)
 
+    HIVE-563. UDF for parsing the URL: parse_url. (Suresh Antony via zshao)
+
   IMPROVEMENTS
     HIVE-389. Option to build without ivy (jssarma)
 

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java?rev=786332&r1=786331&r2=786332&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java Thu Jun 18 23:05:08 2009
@@ -83,6 +83,7 @@
     registerUDF("regexp", UDFRegExp.class, OperatorType.INFIX, true);
     registerUDF("regexp_replace", UDFRegExpReplace.class, OperatorType.PREFIX, false);
     registerUDF("regexp_extract", UDFRegExpExtract.class, OperatorType.PREFIX, false);
+    registerUDF("parse_url", UDFParseUrl.class, OperatorType.PREFIX, false);
 
     registerUDF("positive", UDFOPPositive.class, OperatorType.PREFIX, true, "+");
     registerUDF("negative", UDFOPNegative.class, OperatorType.PREFIX, true, "-");

Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java Thu Jun 18 23:05:08 2009
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+
+
+
+
+/**
+ * UDF to extract specfic parts from URL For example,
+ * parse_url('http://facebook.com/path/p1.php?query=1', 'HOST') will return
+ * 'facebook.com' For example,
+ * parse_url('http://facebook.com/path/p1.php?query=1', 'PATH') will return
+ * '/path/p1.php' parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY')
+ * will return 'query=1'
+ * parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'REF') will return
+ * 'Ref' parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'PROTOCOL')
+ * will return 'http' Possible values are
+ * HOST,PATH,QUERY,REF,PROTOCOL,AUTHORITY,FILE,USERINFO Also you can get a value
+ * of particular key in QUERY, using syntax QUERY:<KEY_NAME> eg: QUERY:k1.
+ */
+public class UDFParseUrl extends UDF {
+  private static Log LOG = LogFactory.getLog(UDFParseUrl.class.getName());
+
+  private String lastUrlStr = null;
+  private URL url = null;
+  private Pattern p = null;
+  private String lastKey = null;
+  
+
+  public UDFParseUrl() {
+  }
+
+  public String evaluate(String urlStr, String partToExtract) {
+    if (urlStr == null || partToExtract == null) {
+      return null;
+    }
+
+    if (lastUrlStr == null || !urlStr.equals(lastUrlStr)) {
+      try {
+        url = new URL(urlStr);
+      } catch (Exception e) {
+        return null;
+      }
+    }
+    lastUrlStr = urlStr;
+
+    if (partToExtract.equals("HOST"))
+      return url.getHost();
+    if (partToExtract.equals("PATH"))
+      return url.getPath();
+    if (partToExtract.equals("QUERY"))
+      return url.getQuery();
+    if (partToExtract.equals("REF"))
+      return url.getRef();
+    if (partToExtract.equals("PROTOCOL"))
+      return url.getProtocol();
+    if (partToExtract.equals("FILE"))
+      return url.getFile();
+    if (partToExtract.equals("AUTHORITY"))
+      return url.getAuthority();
+    if (partToExtract.equals("USERINFO"))
+      return url.getUserInfo();
+
+    return null;
+  }
+
+  public String evaluate(String urlStr, String partToExtract, String key) {
+    if (!partToExtract.equals("QUERY"))
+      return null;
+
+    String query = this.evaluate(urlStr, partToExtract);
+    if (query == null)
+      return null;
+
+   
+    if (!key.equals(lastKey)){
+      p = Pattern.compile("(&|^)"+key+"=([^&]*)");
+    }
+     
+    lastKey = key;
+    Matcher m = p.matcher(query);
+    if (m.find()){
+      return m.group(2);
+    }
+    return null;
+  }
+}

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q Thu Jun 18 23:05:08 2009
@@ -0,0 +1,26 @@
+EXPLAIN
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY') 
+  FROM src WHERE key = 86;
+
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY') 
+  FROM src WHERE key = 86;
\ No newline at end of file

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out Thu Jun 18 23:05:08 2009
@@ -0,0 +1,80 @@
+query: EXPLAIN
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY') 
+  FROM src WHERE key = 86
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'HOST')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'PATH')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'REF')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k2')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k1')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k3')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'FILE')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' '
 PROTOCOL')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'USERINFO')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'AUTHORITY'))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src 
+            Filter Operator
+              predicate:
+                  expr: (UDFToDouble(key) = UDFToDouble(86))
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO')
+                      type: string
+                      expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+                      type: string
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+query: SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'), 
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY') 
+  FROM src WHERE key = 86
+Input: default/src
+Output: file:/mnt/vol/devrs008.snc1/suresh/hive_external/build/ql/tmp/384013469/10000
+facebook.com	/path1/p.php	k1=v1&k2=v2	Ref1	v2	v1	NULL	/path1/p.php?k1=v1&k2=v2	http	NULL	facebook.com