You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/04/21 03:36:29 UTC
svn commit: r766939 - in /hadoop/hive/trunk: ./
ql/src/java/org/apache/hadoop/hive/ql/parse/
ql/src/test/queries/clientnegative/ ql/src/test/queries/clientpositive/
ql/src/test/results/clientnegative/ ql/src/test/results/clientpositive/
Author: zshao
Date: Tue Apr 21 01:36:28 2009
New Revision: 766939
URL: http://svn.apache.org/viewvc?rev=766939&view=rev
Log:
HIVE-61. Implement Order-By. (Namit Jain via zshao)
Added:
hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q
hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q
hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q
hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q
hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q
hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out
hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out
hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out
hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out
hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out
Modified:
hadoop/hive/trunk/CHANGES.txt
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Apr 21 01:36:28 2009
@@ -77,6 +77,8 @@
HIVE-435. Fix Null pointer exception if password is null.
(Prasad Chakka via namit)
+ HIVE-61. Implement Group-By. (Namit Jain via zshao)
+
Release 0.3.0 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java Tue Apr 21 01:36:28 2009
@@ -65,7 +65,11 @@
TARGET_TABLE_COLUMN_MISMATCH("Cannot insert into target table because column number/types are different"),
TABLE_ALIAS_NOT_ALLOWED("Table Alias not Allowed in Sampling Clause"),
CLUSTERBY_DISTRIBUTEBY_CONFLICT("Cannot have both Cluster By and Distribute By Clauses"),
+ ORDERBY_DISTRIBUTEBY_CONFLICT("Cannot have both Order By and Distribute By Clauses"),
CLUSTERBY_SORTBY_CONFLICT("Cannot have both Cluster By and Sort By Clauses"),
+ ORDERBY_SORTBY_CONFLICT("Cannot have both Order By and Sort By Clauses"),
+ CLUSTERBY_ORDERBY_CONFLICT("Cannot have both Cluster By and Order By Clauses"),
+ NO_LIMIT_WITH_ORDERBY("In strict mode, limit must be specified if ORDER BY is present"),
UNION_NOTIN_SUBQ("Top level Union is not supported currently; use a subquery for the union"),
INVALID_INPUT_FORMAT_TYPE("Input Format must implement InputFormat"),
INVALID_OUTPUT_FORMAT_TYPE("Output Format must implement HiveOutputFormat, otherwise it should be either IgnoreKeyTextOutputFormat or SequenceFileOutputFormat"),
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java Tue Apr 21 01:36:28 2009
@@ -53,7 +53,11 @@
* SortBy controls the reduce keys, which affects the order of rows
* that the reducer receives.
*/
+
private HashMap<String, ASTNode> destToSortby;
+
+ /* Order by clause */
+ private HashMap<String, ASTNode> destToOrderby;
private HashMap<String, Integer> destToLimit;
private int outerQueryLimit;
@@ -74,6 +78,7 @@
this.destToClusterby = new HashMap<String, ASTNode>();
this.destToDistributeby = new HashMap<String, ASTNode>();
this.destToSortby = new HashMap<String, ASTNode>();
+ this.destToOrderby = new HashMap<String, ASTNode>();
this.destToLimit = new HashMap<String, Integer>();
this.destToAggregationExprs = new LinkedHashMap<String, LinkedHashMap<String, ASTNode> >();
@@ -143,6 +148,10 @@
this.destToSortby.put(clause, ast);
}
+ public void setOrderByExprForClause(String clause, ASTNode ast) {
+ this.destToOrderby.put(clause, ast);
+ }
+
public void setSrcForAlias(String alias, ASTNode ast) {
this.aliasToSrc.put(alias.toLowerCase(), ast);
}
@@ -210,10 +219,17 @@
return this.destToSortby.get(clause);
}
+ public ASTNode getOrderByForClause(String clause) {
+ return this.destToOrderby.get(clause);
+ }
+
public HashMap<String, ASTNode> getDestToSortBy() {
return destToSortby;
}
+ public HashMap<String, ASTNode> getDestToOrderBy() {
+ return destToOrderby;
+ }
public ASTNode getSrcForAlias(String alias) {
return this.aliasToSrc.get(alias.toLowerCase());
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Tue Apr 21 01:36:28 2009
@@ -470,6 +470,9 @@
if (qbp.getClusterByForClause(ctx_1.dest) != null) {
throw new SemanticException(ErrorMsg.CLUSTERBY_DISTRIBUTEBY_CONFLICT.getMsg(ast));
}
+ else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(ErrorMsg.ORDERBY_DISTRIBUTEBY_CONFLICT.getMsg(ast));
+ }
}
break;
@@ -480,6 +483,20 @@
if (qbp.getClusterByForClause(ctx_1.dest) != null) {
throw new SemanticException(ErrorMsg.CLUSTERBY_SORTBY_CONFLICT.getMsg(ast));
}
+ else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(ErrorMsg.ORDERBY_SORTBY_CONFLICT.getMsg(ast));
+ }
+
+ }
+ break;
+
+ case HiveParser.TOK_ORDERBY: {
+ // Get the order by aliases - these are aliased to the entries in the
+ // select list
+ qbp.setOrderByExprForClause(ctx_1.dest, ast);
+ if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(ErrorMsg.CLUSTERBY_ORDERBY_CONFLICT.getMsg(ast));
+ }
}
break;
@@ -2344,15 +2361,16 @@
}
@SuppressWarnings("nls")
- private Operator genLimitMapRedPlan(String dest, QB qb, Operator input, int limit, boolean isOuterQuery) throws SemanticException {
+ private Operator genLimitMapRedPlan(String dest, QB qb, Operator input, int limit, boolean extraMRStep)
+ throws SemanticException {
// A map-only job can be optimized - instead of converting it to a map-reduce job, we can have another map
// job to do the same to avoid the cost of sorting in the map-reduce phase. A better approach would be to
// write into a local file and then have a map-only job.
// Add the limit operator to get the value fields
Operator curr = genLimitPlan(dest, qb, input, limit);
- // If it is a outer most query and no sorting is specified, exact limit is applied by the fetch task
- if (isOuterQuery && !sortRequired(dest, qb))
+ // the client requested that an extra map-reduce step be performed
+ if (!extraMRStep)
return curr;
// Create a reduceSink operator followed by another limit
@@ -2360,21 +2378,6 @@
return genLimitPlan(dest, qb, curr, limit);
}
- /*
- * Is sorting reuired ?
- * If there are no cluster by/sort by keys, then an additional map-reduce job is not needed.
- * Else, sort the output by the relevant key (via another map-reduce job).
- */
- private boolean sortRequired(String dest, QB qb) {
- if (qb.getParseInfo().getClusterByForClause(dest) != null)
- return true;
-
- if (qb.getParseInfo().getSortByForClause(dest) != null)
- return true;
-
- return false;
- }
-
@SuppressWarnings("nls")
private Operator genReduceSinkPlan(String dest, QB qb,
Operator input, int numReducers) throws SemanticException {
@@ -2401,6 +2404,17 @@
sortExprs = qb.getParseInfo().getSortByForClause(dest);
}
+ if (sortExprs == null) {
+ sortExprs = qb.getParseInfo().getOrderByForClause(dest);
+ if (sortExprs != null) {
+ assert numReducers == 1;
+ // in strict mode, in the presence of order by, limit must be specified
+ Integer limit = qb.getParseInfo().getDestLimit(dest);
+ if (conf.getVar(HiveConf.ConfVars.HIVEPARTITIONPRUNER).equalsIgnoreCase("strict") && limit == null)
+ throw new SemanticException(ErrorMsg.NO_LIMIT_WITH_ORDERBY.getMsg(sortExprs));
+ }
+ }
+
ArrayList<exprNodeDesc> sortCols = new ArrayList<exprNodeDesc>();
StringBuilder order = new StringBuilder();
if (sortExprs != null) {
@@ -2946,19 +2960,35 @@
if (qbp.getClusterByForClause(dest) != null
|| qbp.getDistributeByForClause(dest) != null
+ || qbp.getOrderByForClause(dest) != null
|| qbp.getSortByForClause(dest) != null) {
- curr = genReduceSinkPlan(dest, qb, curr, -1);
+
+ int numReducers = -1;
+
+ // Use only 1 reducer if order by is present
+ if (qbp.getOrderByForClause(dest) != null)
+ numReducers = 1;
+
+ curr = genReduceSinkPlan(dest, qb, curr, numReducers);
}
if (qbp.getIsSubQ()) {
if (limit != null) {
- curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), false);
+ // In case of order by, only 1 reducer is used, so no need of another shuffle
+ curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), qbp.getOrderByForClause(dest) != null ? false : true);
}
} else {
curr = genConversionOps(dest, qb, curr);
// exact limit can be taken care of by the fetch operator
if (limit != null) {
- curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), qb.getIsQuery());
+ boolean extraMRStep = true;
+
+ if (qb.getIsQuery() &&
+ qbp.getClusterByForClause(dest) == null &&
+ qbp.getSortByForClause(dest) == null)
+ extraMRStep = false;
+
+ curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), extraMRStep);
qb.getParseInfo().setOuterQueryLimit(limit.intValue());
}
curr = genFileSinkPlan(dest, qb, curr);
@@ -3353,6 +3383,7 @@
if (qb.isSelectStarQuery()
&& qbParseInfo.getDestToClusterBy().isEmpty()
&& qbParseInfo.getDestToDistributeBy().isEmpty()
+ && qbParseInfo.getDestToOrderBy().isEmpty()
&& qbParseInfo.getDestToSortBy().isEmpty()) {
Iterator<Map.Entry<String, Table>> iter = qb.getMetaData().getAliasToTable().entrySet().iterator();
Table tab = ((Map.Entry<String, Table>)iter.next()).getValue();
Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,5 @@
+FROM src
+MAP src.key, CAST(src.key / 10 AS INT), CAST(src.key % 10 AS INT), src.value
+USING '/bin/cat' AS (tkey, ten, one, tvalue)
+CLUSTER BY tvalue, tkey
+ORDER BY ten, one;
Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,8 @@
+CREATE TABLE dest1(key INT, ten INT, one INT, value STRING) STORED AS TEXTFILE;
+
+FROM src
+INSERT OVERWRITE TABLE dest1
+MAP src.key, CAST(src.key / 10 AS INT), CAST(src.key % 10 AS INT), src.value
+USING '/bin/cat' AS (tkey, ten, one, tvalue)
+ORDER BY tvalue, tkey
+SORT BY ten, one;
Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,7 @@
+set hive.partition.pruning=strict;
+
+EXPLAIN
+SELECT src.key, src.value from src order by src.key;
+
+SELECT src.key, src.value from src order by src.key;
+
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,4 @@
+EXPLAIN
+SELECT x.* FROM SRC x ORDER BY key limit 10;
+
+SELECT x.* FROM SRC x ORDER BY key limit 10;
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,8 @@
+EXPLAIN
+SELECT subq.key, subq.value FROM
+(SELECT x.* FROM SRC x ORDER BY key limit 10) subq
+where subq.key < 10;
+
+SELECT subq.key, subq.value FROM
+(SELECT x.* FROM SRC x ORDER BY key limit 10) subq
+where subq.key < 10;
Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,2 @@
+FAILED: Parse Error: line 5:0 mismatched input 'ORDER' expecting EOF
+
Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: line 8:8 Cannot have both Order By and Sort By Clauses one
Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: line 4:44 In strict mode, limit must be specified if ORDER BY is present key
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,54 @@
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF SRC x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF x))) (TOK_ORDERBY (TOK_COLREF key)) (TOK_LIMIT 10)))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ x
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ Reduce Output Operator
+ key expressions:
+ expr: 0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: 0
+ type: string
+ expr: 1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ Limit
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 10
+
+
+0 val_0
+0 val_0
+0 val_0
+10 val_10
+100 val_100
+100 val_100
+103 val_103
+103 val_103
+104 val_104
+104 val_104
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,57 @@
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF SRC x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF x))) (TOK_ORDERBY (TOK_COLREF key)) (TOK_LIMIT 10))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_COLREF subq key)) (TOK_SELEXPR (TOK_COLREF subq value))) (TOK_WHERE (< (TOK_COLREF subq key) 10))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq:x
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ Reduce Output Operator
+ key expressions:
+ expr: 0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: 0
+ type: string
+ expr: 1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ Limit
+ Filter Operator
+ predicate:
+ expr: (UDFToDouble(0) < UDFToDouble(10))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: 0
+ type: string
+ expr: 1
+ type: string
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+0 val_0
+0 val_0
+0 val_0