You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/06/19 01:05:08 UTC
svn commit: r786332 - in /hadoop/hive/trunk: ./
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/udf/
ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/
Author: zshao
Date: Thu Jun 18 23:05:08 2009
New Revision: 786332
URL: http://svn.apache.org/viewvc?rev=786332&view=rev
Log:
HIVE-563. UDF for parsing the URL: parse_url. (Suresh Antony via zshao)
Added:
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java
hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q
hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out
Modified:
hadoop/hive/trunk/CHANGES.txt
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=786332&r1=786331&r2=786332&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Thu Jun 18 23:05:08 2009
@@ -45,6 +45,8 @@
HIVE-460. Improve Column Pruning. (He Yongqiang via namit)
+ HIVE-563. UDF for parsing the URL: parse_url. (Suresh Antony via zshao)
+
IMPROVEMENTS
HIVE-389. Option to build without ivy (jssarma)
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java?rev=786332&r1=786331&r2=786332&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java Thu Jun 18 23:05:08 2009
@@ -83,6 +83,7 @@
registerUDF("regexp", UDFRegExp.class, OperatorType.INFIX, true);
registerUDF("regexp_replace", UDFRegExpReplace.class, OperatorType.PREFIX, false);
registerUDF("regexp_extract", UDFRegExpExtract.class, OperatorType.PREFIX, false);
+ registerUDF("parse_url", UDFParseUrl.class, OperatorType.PREFIX, false);
registerUDF("positive", UDFOPPositive.class, OperatorType.PREFIX, true, "+");
registerUDF("negative", UDFOPNegative.class, OperatorType.PREFIX, true, "-");
Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFParseUrl.java Thu Jun 18 23:05:08 2009
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+
+
+
+
+/**
+ * UDF to extract specfic parts from URL For example,
+ * parse_url('http://facebook.com/path/p1.php?query=1', 'HOST') will return
+ * 'facebook.com' For example,
+ * parse_url('http://facebook.com/path/p1.php?query=1', 'PATH') will return
+ * '/path/p1.php' parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY')
+ * will return 'query=1'
+ * parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'REF') will return
+ * 'Ref' parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'PROTOCOL')
+ * will return 'http' Possible values are
+ * HOST,PATH,QUERY,REF,PROTOCOL,AUTHORITY,FILE,USERINFO Also you can get a value
+ * of particular key in QUERY, using syntax QUERY:<KEY_NAME> eg: QUERY:k1.
+ */
+public class UDFParseUrl extends UDF {
+ private static Log LOG = LogFactory.getLog(UDFParseUrl.class.getName());
+
+ private String lastUrlStr = null;
+ private URL url = null;
+ private Pattern p = null;
+ private String lastKey = null;
+
+
+ public UDFParseUrl() {
+ }
+
+ public String evaluate(String urlStr, String partToExtract) {
+ if (urlStr == null || partToExtract == null) {
+ return null;
+ }
+
+ if (lastUrlStr == null || !urlStr.equals(lastUrlStr)) {
+ try {
+ url = new URL(urlStr);
+ } catch (Exception e) {
+ return null;
+ }
+ }
+ lastUrlStr = urlStr;
+
+ if (partToExtract.equals("HOST"))
+ return url.getHost();
+ if (partToExtract.equals("PATH"))
+ return url.getPath();
+ if (partToExtract.equals("QUERY"))
+ return url.getQuery();
+ if (partToExtract.equals("REF"))
+ return url.getRef();
+ if (partToExtract.equals("PROTOCOL"))
+ return url.getProtocol();
+ if (partToExtract.equals("FILE"))
+ return url.getFile();
+ if (partToExtract.equals("AUTHORITY"))
+ return url.getAuthority();
+ if (partToExtract.equals("USERINFO"))
+ return url.getUserInfo();
+
+ return null;
+ }
+
+ public String evaluate(String urlStr, String partToExtract, String key) {
+ if (!partToExtract.equals("QUERY"))
+ return null;
+
+ String query = this.evaluate(urlStr, partToExtract);
+ if (query == null)
+ return null;
+
+
+ if (!key.equals(lastKey)){
+ p = Pattern.compile("(&|^)"+key+"=([^&]*)");
+ }
+
+ lastKey = key;
+ Matcher m = p.matcher(query);
+ if (m.find()){
+ return m.group(2);
+ }
+ return null;
+ }
+}
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udf_parse_url.q Thu Jun 18 23:05:08 2009
@@ -0,0 +1,26 @@
+EXPLAIN
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+ FROM src WHERE key = 86;
+
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+ FROM src WHERE key = 86;
\ No newline at end of file
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out?rev=786332&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udf_parse_url.q.out Thu Jun 18 23:05:08 2009
@@ -0,0 +1,80 @@
+query: EXPLAIN
+SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+ FROM src WHERE key = 86
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'HOST')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'PATH')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'REF')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k2')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k1')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'QUERY' 'k3')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'FILE')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' '
PROTOCOL')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'USERINFO')) (TOK_SELEXPR (TOK_FUNCTION parse_url 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' 'AUTHORITY'))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 86))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ Filter Operator
+ predicate:
+ expr: (UDFToDouble(key) = UDFToDouble(86))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO')
+ type: string
+ expr: parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+ type: string
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+query: SELECT parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k2') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k3') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'USERINFO') ,
+parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY')
+ FROM src WHERE key = 86
+Input: default/src
+Output: file:/mnt/vol/devrs008.snc1/suresh/hive_external/build/ql/tmp/384013469/10000
+facebook.com /path1/p.php k1=v1&k2=v2 Ref1 v2 v1 NULL /path1/p.php?k1=v1&k2=v2 http NULL facebook.com