You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by zs...@apache.org on 2009/04/21 03:36:29 UTC

svn commit: r766939 - in /hadoop/hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/parse/ ql/src/test/queries/clientnegative/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientnegative/ ql/src/test/results/clientpositive/

Author: zshao
Date: Tue Apr 21 01:36:28 2009
New Revision: 766939

URL: http://svn.apache.org/viewvc?rev=766939&view=rev
Log:
HIVE-61. Implement Order-By [log as committed reads "Implement Group-By"]. (Namit Jain via zshao)

Added:
    hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q
    hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q
    hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q
    hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out
    hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out
    hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out
    hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out
    hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Apr 21 01:36:28 2009
@@ -77,6 +77,8 @@
     HIVE-435. Fix Null pointer exception if password is null.
     (Prasad Chakka via namit)
 
+    HIVE-61. Implement Group-By. (Namit Jain via zshao)
+
 Release 0.3.0 - Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ErrorMsg.java Tue Apr 21 01:36:28 2009
@@ -65,7 +65,11 @@
   TARGET_TABLE_COLUMN_MISMATCH("Cannot insert into target table because column number/types are different"),
   TABLE_ALIAS_NOT_ALLOWED("Table Alias not Allowed in Sampling Clause"),
   CLUSTERBY_DISTRIBUTEBY_CONFLICT("Cannot have both Cluster By and Distribute By Clauses"),
+  ORDERBY_DISTRIBUTEBY_CONFLICT("Cannot have both Order By and Distribute By Clauses"),
   CLUSTERBY_SORTBY_CONFLICT("Cannot have both Cluster By and Sort By Clauses"),
+  ORDERBY_SORTBY_CONFLICT("Cannot have both Order By and Sort By Clauses"),
+  CLUSTERBY_ORDERBY_CONFLICT("Cannot have both Cluster By and Order By Clauses"),
+  NO_LIMIT_WITH_ORDERBY("In strict mode, limit must be specified if ORDER BY is present"),
   UNION_NOTIN_SUBQ("Top level Union is not supported currently; use a subquery for the union"),
   INVALID_INPUT_FORMAT_TYPE("Input Format must implement InputFormat"),
   INVALID_OUTPUT_FORMAT_TYPE("Output Format must implement HiveOutputFormat, otherwise it should be either IgnoreKeyTextOutputFormat or SequenceFileOutputFormat"),

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/QBParseInfo.java Tue Apr 21 01:36:28 2009
@@ -53,7 +53,11 @@
    * SortBy controls the reduce keys, which affects the order of rows 
    * that the reducer receives. 
    */
+
   private HashMap<String, ASTNode> destToSortby;
+
+  /* Order by clause */
+  private HashMap<String, ASTNode> destToOrderby;
   private HashMap<String, Integer>    destToLimit;
   private int outerQueryLimit;
 
@@ -74,6 +78,7 @@
     this.destToClusterby = new HashMap<String, ASTNode>();
     this.destToDistributeby = new HashMap<String, ASTNode>();
     this.destToSortby = new HashMap<String, ASTNode>();
+    this.destToOrderby = new HashMap<String, ASTNode>();
     this.destToLimit = new HashMap<String, Integer>();
     
     this.destToAggregationExprs = new LinkedHashMap<String, LinkedHashMap<String, ASTNode> >();
@@ -143,6 +148,10 @@
     this.destToSortby.put(clause, ast);
   }
 
+  public void setOrderByExprForClause(String clause, ASTNode ast) {
+    this.destToOrderby.put(clause, ast);
+  }
+
   public void setSrcForAlias(String alias, ASTNode ast) {
     this.aliasToSrc.put(alias.toLowerCase(), ast);
   }
@@ -210,10 +219,17 @@
     return this.destToSortby.get(clause);
   }
 
+  public ASTNode getOrderByForClause(String clause) {
+    return this.destToOrderby.get(clause);
+  }
+
   public HashMap<String, ASTNode> getDestToSortBy() {
     return destToSortby;
   }
   
+  public HashMap<String, ASTNode> getDestToOrderBy() {
+    return destToOrderby;
+  }
   
   public ASTNode getSrcForAlias(String alias) {
     return this.aliasToSrc.get(alias.toLowerCase());

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=766939&r1=766938&r2=766939&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Tue Apr 21 01:36:28 2009
@@ -470,6 +470,9 @@
         if (qbp.getClusterByForClause(ctx_1.dest) != null) {
           throw new SemanticException(ErrorMsg.CLUSTERBY_DISTRIBUTEBY_CONFLICT.getMsg(ast));
         }
+        else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(ErrorMsg.ORDERBY_DISTRIBUTEBY_CONFLICT.getMsg(ast));
+        }
       }
         break;
 
@@ -480,6 +483,20 @@
         if (qbp.getClusterByForClause(ctx_1.dest) != null) {
           throw new SemanticException(ErrorMsg.CLUSTERBY_SORTBY_CONFLICT.getMsg(ast));
         }
+        else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(ErrorMsg.ORDERBY_SORTBY_CONFLICT.getMsg(ast));
+        }
+        
+      }
+        break;
+
+      case HiveParser.TOK_ORDERBY: {
+        // Get the order by aliases - these are aliased to the entries in the
+        // select list
+        qbp.setOrderByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(ErrorMsg.CLUSTERBY_ORDERBY_CONFLICT.getMsg(ast));
+        }
       }
         break;
 
@@ -2344,15 +2361,16 @@
   }
 
   @SuppressWarnings("nls")
-  private Operator genLimitMapRedPlan(String dest, QB qb, Operator input, int limit, boolean isOuterQuery) throws SemanticException {
+  private Operator genLimitMapRedPlan(String dest, QB qb, Operator input, int limit, boolean extraMRStep) 
+    throws SemanticException {
     // A map-only job can be optimized - instead of converting it to a map-reduce job, we can have another map
     // job to do the same to avoid the cost of sorting in the map-reduce phase. A better approach would be to
     // write into a local file and then have a map-only job.
     // Add the limit operator to get the value fields
     Operator curr = genLimitPlan(dest, qb, input, limit);
 
-    // If it is a outer most query and no sorting is specified, exact limit is applied by the fetch task
-    if (isOuterQuery && !sortRequired(dest, qb))
+    // the client requested that an extra map-reduce step be performed
+    if (!extraMRStep)
       return curr;
 
     // Create a reduceSink operator followed by another limit
@@ -2360,21 +2378,6 @@
     return genLimitPlan(dest, qb, curr, limit);
   }
 
-  /*
-   * Is sorting reuired ?
-   * If there are no cluster by/sort by keys, then an additional map-reduce job is not needed.
-   * Else, sort the output by the relevant key (via another map-reduce job).
-   */
-  private boolean sortRequired(String dest, QB qb) {
-    if (qb.getParseInfo().getClusterByForClause(dest) != null)
-      return true;
-
-    if (qb.getParseInfo().getSortByForClause(dest) != null)
-      return true;
-    
-    return false;
-  }
-
   @SuppressWarnings("nls")
   private Operator genReduceSinkPlan(String dest, QB qb,
                                      Operator input, int numReducers) throws SemanticException {
@@ -2401,6 +2404,17 @@
       sortExprs = qb.getParseInfo().getSortByForClause(dest);
     }
 
+    if (sortExprs == null) {
+      sortExprs = qb.getParseInfo().getOrderByForClause(dest);
+      if (sortExprs != null) {
+        assert numReducers == 1;
+        // in strict mode, in the presence of order by, limit must be specified
+        Integer limit = qb.getParseInfo().getDestLimit(dest);
+        if (conf.getVar(HiveConf.ConfVars.HIVEPARTITIONPRUNER).equalsIgnoreCase("strict") && limit == null)
+          throw new SemanticException(ErrorMsg.NO_LIMIT_WITH_ORDERBY.getMsg(sortExprs));
+      }
+    }
+
     ArrayList<exprNodeDesc> sortCols = new ArrayList<exprNodeDesc>();
     StringBuilder order = new StringBuilder();
     if (sortExprs != null) {
@@ -2946,19 +2960,35 @@
 
       if (qbp.getClusterByForClause(dest) != null
           || qbp.getDistributeByForClause(dest) != null
+          || qbp.getOrderByForClause(dest) != null
           || qbp.getSortByForClause(dest) != null) {
-        curr = genReduceSinkPlan(dest, qb, curr, -1);
+
+        int numReducers = -1;
+
+        // Use only 1 reducer if order by is present
+        if (qbp.getOrderByForClause(dest) != null)
+          numReducers = 1;
+
+        curr = genReduceSinkPlan(dest, qb, curr, numReducers);
       }
 
       if (qbp.getIsSubQ()) {
         if (limit != null) {
-          curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), false);
+          // In case of order by, only 1 reducer is used, so no need of another shuffle
+          curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), qbp.getOrderByForClause(dest) != null ? false : true);
         }
       } else {
         curr = genConversionOps(dest, qb, curr);
         // exact limit can be taken care of by the fetch operator
         if (limit != null) {
-          curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), qb.getIsQuery());
+          boolean extraMRStep = true;
+
+          if (qb.getIsQuery() &&
+              qbp.getClusterByForClause(dest) == null &&
+              qbp.getSortByForClause(dest) == null)
+            extraMRStep = false;
+
+          curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), extraMRStep);
           qb.getParseInfo().setOuterQueryLimit(limit.intValue());
         }
         curr = genFileSinkPlan(dest, qb, curr);
@@ -3353,6 +3383,7 @@
     if (qb.isSelectStarQuery()
         && qbParseInfo.getDestToClusterBy().isEmpty()
         && qbParseInfo.getDestToDistributeBy().isEmpty()
+        && qbParseInfo.getDestToOrderBy().isEmpty()
         && qbParseInfo.getDestToSortBy().isEmpty()) {
       Iterator<Map.Entry<String, Table>> iter = qb.getMetaData().getAliasToTable().entrySet().iterator();
       Table tab = ((Map.Entry<String, Table>)iter.next()).getValue();

Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/clusterbyorderby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,5 @@
+FROM src
+MAP src.key, CAST(src.key / 10 AS INT), CAST(src.key % 10 AS INT), src.value
+USING '/bin/cat' AS (tkey, ten, one, tvalue)
+CLUSTER BY tvalue, tkey
+ORDER BY ten, one;

Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/orderbysortby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,8 @@
+CREATE TABLE dest1(key INT, ten INT, one INT, value STRING) STORED AS TEXTFILE;
+
+FROM src
+INSERT OVERWRITE TABLE dest1
+MAP src.key, CAST(src.key / 10 AS INT), CAST(src.key % 10 AS INT), src.value
+USING '/bin/cat' AS (tkey, ten, one, tvalue)
+ORDER BY tvalue, tkey
+SORT BY ten, one;

Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/strict_orderby.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,7 @@
+set hive.partition.pruning=strict;
+
+EXPLAIN
+SELECT src.key, src.value from src order by src.key;
+
+SELECT src.key, src.value from src order by src.key;
+

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/order.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,4 @@
+EXPLAIN
+SELECT x.* FROM SRC x ORDER BY key limit 10;
+
+SELECT x.* FROM SRC x ORDER BY key limit 10;

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/order2.q Tue Apr 21 01:36:28 2009
@@ -0,0 +1,8 @@
+EXPLAIN
+SELECT subq.key, subq.value FROM 
+(SELECT x.* FROM SRC x ORDER BY key limit 10) subq
+where subq.key < 10;
+
+SELECT subq.key, subq.value FROM 
+(SELECT x.* FROM SRC x ORDER BY key limit 10) subq
+where subq.key < 10;

Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/clusterbyorderby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,2 @@
+FAILED: Parse Error: line 5:0 mismatched input 'ORDER' expecting EOF
+

Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/orderbysortby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: line 8:8 Cannot have both Order By and Sort By Clauses one

Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/strict_orderby.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: line 4:44 In strict mode, limit must be specified if ORDER BY is present key

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/order.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,54 @@
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF SRC x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF x))) (TOK_ORDERBY (TOK_COLREF key)) (TOK_LIMIT 10)))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        x 
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              Reduce Output Operator
+                key expressions:
+                      expr: 0
+                      type: string
+                sort order: +
+                tag: -1
+                value expressions:
+                      expr: 0
+                      type: string
+                      expr: 1
+                      type: string
+      Reduce Operator Tree:
+        Extract
+          Limit
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+
+
+0	val_0
+0	val_0
+0	val_0
+10	val_10
+100	val_100
+100	val_100
+103	val_103
+103	val_103
+104	val_104
+104	val_104

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out?rev=766939&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/order2.q.out Tue Apr 21 01:36:28 2009
@@ -0,0 +1,57 @@
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF SRC x)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF x))) (TOK_ORDERBY (TOK_COLREF key)) (TOK_LIMIT 10))) subq)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_COLREF subq key)) (TOK_SELEXPR (TOK_COLREF subq value))) (TOK_WHERE (< (TOK_COLREF subq key) 10))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        subq:x 
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              Reduce Output Operator
+                key expressions:
+                      expr: 0
+                      type: string
+                sort order: +
+                tag: -1
+                value expressions:
+                      expr: 0
+                      type: string
+                      expr: 1
+                      type: string
+      Reduce Operator Tree:
+        Extract
+          Limit
+            Filter Operator
+              predicate:
+                  expr: (UDFToDouble(0) < UDFToDouble(10))
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: 0
+                      type: string
+                      expr: 1
+                      type: string
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+0	val_0
+0	val_0
+0	val_0